From 4d1827485acecac0016eaaec199e97697afdaa84 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 31 Jul 2023 14:02:11 +0800 Subject: tools/power/turbostat: Fix failure with new uncore sysfs On some platforms, turbostat fails during launch time like below, turbostat version 2023.03.17 - Len Brown ... cpu40: MSR_IA32_PACKAGE_THERM_STATUS: 0x884c0000 (24 C) cpu40: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x00000003 (100 C, 100 C) turbostat: snapshot_sysfs_counter(/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz): No data available This is because new uncore sysfs is used on these platforms as introduced by commit 9b8dea80e3cb ("platform/x86/intel-uncore-freq: Support for cluster level controls"). With the new uncore sysfs interface, /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz is still available, but reading it fails. How to support the fabric cluster level uncore sysfs is not settled yet, as a short term fix, clear the BIC_UNCORE_MHZ bit when new sysfs I/F is detected. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9a10512e3407..9de1ff6f82ce 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4163,6 +4163,10 @@ static void intel_uncore_frequency_probe(void) if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) return; + /* Cluster level sysfs not supported yet. */ + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) + return; + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) BIC_PRESENT(BIC_UNCORE_MHZ); -- cgit From 137f01b3529d292a68d22e9681e2f903c768f790 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 25 Mar 2023 21:57:07 +0800 Subject: tools/power/turbostat: Fix a knl bug MSR_KNL_CORE_C6_RESIDENCY should be evaluated only if 1. this is KNL platform AND 2. need to get C6 residency or need to calculate C1 residency Fix the broken logic introduced by commit 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value"). Fixes: 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value") Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9de1ff6f82ce..fb6c6ddbdb60 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2180,7 +2180,7 @@ retry: if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates || soft_c1_residency_display(BIC_CPU_c6)) { + } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } -- cgit From b61b7d8c4c22c4298a50ae5d0ee88facb85ce665 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Mon, 27 Mar 2023 11:17:44 +0800 Subject: tools/power/turbostat: Enable the C-state Pre-wake printing Currently the C-state Pre-wake will not be printed due to the probe has not been invoked. Invoke the probe function accordingly. 
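For illustration, the one-line fix below simply adds the missing prewake_cstate_probe() call; a minimal sketch of the probe-then-print pattern it depends on (symbol names here are hypothetical stand-ins, not turbostat's actual code): a setting is only reported if its probe ran during CPU enumeration, so a missing probe call silently hides the output.

	#include <stdbool.h>
	#include <stdio.h>

	static bool has_prewake_bit;	/* set by the probe, consumed by the dump */

	/* hypothetical probe: record whether this model supports the feature */
	static void prewake_probe_example(unsigned int family, unsigned int model)
	{
		if (family == 6 && model == 0x6A)	/* example model only */
			has_prewake_bit = true;
	}

	static void dump_prewake_example(void)
	{
		if (!has_prewake_bit)
			return;			/* never probed => never printed */
		printf("C-state Pre-wake: enabled\n");
	}
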
Fixes: aeb01e6d71ff ("tools/power turbostat: Print the C-state Pre-wake settings") Signed-off-by: Chen Yu Reviewed-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fb6c6ddbdb60..03d4f09b103a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5794,6 +5794,7 @@ void process_cpuid() rapl_probe(family, model); perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); + prewake_cstate_probe(family, model); check_tcc_offset(model_orig); -- cgit From b98a6d78768ec459d394db8bc086d071eb6556c8 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:30:55 +0800 Subject: tools/power/turbostat: Enable TCC Offset on more models All Models that duplicate INTEL_FAM6_CANNONLAKE_L support TCC Offset. Enable this feature on all these models. Delete obsolete model_orig. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 03d4f09b103a..d7880870ef68 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -211,9 +211,6 @@ int *fd_instr_count_percpu; struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; -/* Save original CPU model */ -unsigned int model_orig; - unsigned int num_iterations; unsigned int header_iterations; unsigned int debug; @@ -4046,14 +4043,7 @@ void check_tcc_offset(int model) switch (model) { case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_COMETLAKE: + case INTEL_FAM6_CANNONLAKE_L: if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { msr = (msr >> 30) & 1; if (msr) @@ -5573,10 +5563,9 @@ void process_cpuid() edx_flags & (1 << 22) ? "ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } - if (genuine_intel) { - model_orig = model; + + if (genuine_intel) model = intel_model_duplicates(model); - } if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -5796,7 +5785,7 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model_orig); + check_tcc_offset(model); if (!quiet) dump_cstate_pstate_config_info(family, model); -- cgit From 2c019d657968bdd93e11615e0919d8181a54742d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 21 Aug 2022 22:39:16 +0800 Subject: tools/power/turbostat: Support alternative graphics sysfs knobs /sys/class/graphics/fb0/device/drm/card0/ and /sys/class/drm/card0/ point to the same device node. But in some cases, one exists and the other one does not. Prefer to use /sys/class/drm/card0/, and fall back to /sys/class/graphics/fb0/device/drm/card0/. This recovers the "GFXMHz" and "GFXAMHz" columns on some platforms like a SPR server. 
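The diff below implements a straightforward open-with-fallback idiom; as a self-contained sketch of the same pattern (sysfs paths taken from the commit, the helper name is made up):

	#include <stdio.h>

	/* Prefer the new drm sysfs node, fall back to the legacy graphics one. */
	static FILE *open_gfx_cur_freq(void)
	{
		FILE *fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r");

		if (!fp)
			fp = fopen("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
		return fp;	/* caller treats NULL as "neither node exists" */
	}
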
Reviewed-by: Rodrigo Vivi Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d7880870ef68..f9bc2230db73 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3188,8 +3188,8 @@ int snapshot_gfx_rc6_ms(void) /* * snapshot_gfx_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3198,9 +3198,11 @@ int snapshot_gfx_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -3215,8 +3217,8 @@ int snapshot_gfx_mhz(void) /* * snapshot_gfx_cur_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * when /sys/class/drm/card0/gt_act_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3225,9 +3227,11 @@ int snapshot_gfx_act_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -5804,10 +5808,12 @@ void process_cpuid() if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXMHz); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXACTMHz); if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) -- cgit From 6d306d6ec7e0f9a5e90f5afd70e3b6c32ae50ce6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:26:25 +0800 Subject: tools/power/turbostat: Replace raw value cpu model with Macro Kernel already has #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ Use standard Macro for CPU Model instead of raw value. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9bc2230db73..8d3a8af3692a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5413,7 +5413,7 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; -- cgit From bbfc33b1e49f443296e56d8b76c77373f700aedc Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:27:30 +0800 Subject: tools/power/turbostat: Remove redundant duplicates Remove redundant duplicates in intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d3a8af3692a..2420300939da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5412,24 +5412,20 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ return INTEL_FAM6_NEHALEM_EX; case INTEL_FAM6_XEON_PHI_KNM: return INTEL_FAM6_XEON_PHI_KNL; - case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ return INTEL_FAM6_BROADWELL_X; - case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: -- cgit From 48674c1bb6124fe392e8fed80a39fcb3f62e6551 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 12:36:55 +0800 Subject: tools/power/turbostat: Remove pseudo check for two models INTEL_FAM6_ATOM_SILVERMONT_MID/INTEL_FAM6_ATOM_AIRMONT_MID are not listed in probe_nhm_msrs(). This means that most of the turbostat features are not available on these two platforms. Further more, checking for these two models in has_slv_msrs() is dead code. Because has_slv_msrs() is called by the code guarded by probe_nhm_msrs(). For these two reasons, remove pseudo check for INTEL_FAM6_ATOM_SILVERMONT_MID and INTEL_FAM6_ATOM_AIRMONT_MID. Will add back the support when we can access these two platforms. 
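A condensed illustration of the dead-code argument above (model lists abbreviated, bodies simplified, not the real functions): has_slv_msrs() only runs for models that probe_nhm_msrs() accepted, so a model missing from the outer guard can never match in the inner check.

	/* outer guard: only these models proceed (abbreviated) */
	static int outer_guard(unsigned int model)
	{
		switch (model) {
		case 0x37:	/* INTEL_FAM6_ATOM_SILVERMONT */
		case 0x4D:	/* INTEL_FAM6_ATOM_SILVERMONT_D */
			return 1;
		}
		return 0;	/* SILVERMONT_MID / AIRMONT_MID stop here */
	}

	/* inner check: reached only when outer_guard() returned 1, so listing
	 * SILVERMONT_MID / AIRMONT_MID here could never match -> dead code
	 */
	static int inner_check(unsigned int model)
	{
		return model == 0x37;
	}
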
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2420300939da..b8874d3dc83e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3792,8 +3792,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_MID: return 1; } return 0; -- cgit From 45232ab168a3c5abad86eafaef2beed8d7037666 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:52:53 +0800 Subject: tools/power/turbostat: Add skeleton support for table driven feature enumeration Turbostat supports a series of features that may diverge among different CPU models. Current code uses various of CPU model checks in different places to handle this, which makes the code hard to maintain. Add skeleton support for table driven feature enumeration to replace the current error-prone CPU model checks and global variables. Note: by comparing the CPU models with intel-family.h, it is found that turbostat support for below four Models are missing, including INTEL_FAM6_ICELAKE, INTEL_FAM6_ATOM_SILVERMONT_MID, INTEL_FAM6_ATOM_AIRMONT_MID and INTEL_FAM6_ATOM_AIRMONT_NP. Adding support for these models is a different work, thus it is not covered in this patch set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 192 ++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b8874d3dc83e..4deea374188a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -281,6 +281,197 @@ unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; +/* Model specific support Start */ + +/* List of features that may diverge among different platforms */ +struct platform_features { +}; + +struct platform_data { + unsigned int model; + const struct platform_features *features; +}; + +static const struct platform_features nhm_features = { +}; + +static const struct platform_features nhx_features = { +}; + +static const struct platform_features snb_features = { +}; + +static const struct platform_features snx_features = { +}; + +static const struct platform_features ivb_features = { +}; + +static const struct platform_features ivx_features = { +}; + +static const struct platform_features hsw_features = { +}; + +static const struct platform_features hsx_features = { +}; + +static const struct platform_features hswl_features = { +}; + +static const struct platform_features hswg_features = { +}; + +static const struct platform_features bdw_features = { +}; + +static const struct platform_features bdwg_features = { +}; + +static const struct platform_features bdx_features = { +}; + +static const struct platform_features skl_features = { +}; + +static const struct platform_features cnl_features = { +}; + +static const struct platform_features skx_features = { +}; + +static const struct platform_features icx_features = { +}; + +static const struct platform_features spr_features = { +}; + +static const struct platform_features slv_features = { +}; + +static const struct platform_features slvd_features = { +}; + +static const struct platform_features amt_features = { +}; + +static const struct 
platform_features gmt_features = { +}; + +static const struct platform_features gmtd_features = { +}; + +static const struct platform_features gmtp_features = { +}; + +static const struct platform_features tmt_features = { +}; + +static const struct platform_features tmtd_features = { +}; + +static const struct platform_features knl_features = { +}; + +static const struct platform_features default_features = { +}; + +static const struct platform_features amd_features = { +}; + +static const struct platform_data turbostat_pdata[] = { + { INTEL_FAM6_NEHALEM, &nhm_features }, + { INTEL_FAM6_NEHALEM_G, &nhm_features }, + { INTEL_FAM6_NEHALEM_EP, &nhm_features }, + { INTEL_FAM6_NEHALEM_EX, &nhx_features }, + { INTEL_FAM6_WESTMERE, &nhm_features }, + { INTEL_FAM6_WESTMERE_EP, &nhm_features }, + { INTEL_FAM6_WESTMERE_EX, &nhx_features }, + { INTEL_FAM6_SANDYBRIDGE, &snb_features }, + { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, + { INTEL_FAM6_IVYBRIDGE, &ivb_features }, + { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, + { INTEL_FAM6_HASWELL, &hsw_features }, + { INTEL_FAM6_HASWELL_X, &hsx_features }, + { INTEL_FAM6_HASWELL_L, &hswl_features }, + { INTEL_FAM6_HASWELL_G, &hswg_features }, + { INTEL_FAM6_BROADWELL, &bdw_features }, + { INTEL_FAM6_BROADWELL_G, &bdwg_features }, + { INTEL_FAM6_BROADWELL_X, &bdx_features }, + { INTEL_FAM6_BROADWELL_D, &bdx_features }, + { INTEL_FAM6_SKYLAKE_L, &skl_features }, + { INTEL_FAM6_SKYLAKE, &skl_features }, + { INTEL_FAM6_SKYLAKE_X, &skx_features }, + { INTEL_FAM6_KABYLAKE_L, &skl_features }, + { INTEL_FAM6_KABYLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE_L, &skl_features }, + { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_X, &icx_features }, + { INTEL_FAM6_ICELAKE_D, &icx_features }, + { INTEL_FAM6_ICELAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, + { INTEL_FAM6_ROCKETLAKE, &cnl_features }, + { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, + { INTEL_FAM6_TIGERLAKE, &cnl_features }, + { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_LAKEFIELD, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &cnl_features }, + { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_METEORLAKE, &cnl_features }, + { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, + { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, + { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, + { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, + { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, + { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + /* + * Missing support for + * INTEL_FAM6_ICELAKE + * INTEL_FAM6_ATOM_SILVERMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_NP + */ + { 0, NULL }, +}; + +static const struct platform_features *platform; + +void probe_platform_features(unsigned int family, unsigned int model) +{ + int i; + + if (authentic_amd || hygon_genuine) { + platform = &amd_features; + return; + } + + platform = &default_features; + + if (!genuine_intel || family != 6) + return; + + for (i = 
0; turbostat_pdata[i].features; i++) { + if (turbostat_pdata[i].model == model) { + platform = turbostat_pdata[i].features; + return; + } + } +} + +/* Model specific support End */ + #define RAPL_PKG (1 << 0) /* 0x610 MSR_PKG_POWER_LIMIT */ /* 0x611 MSR_PKG_ENERGY_STATUS */ @@ -5562,6 +5753,7 @@ void process_cpuid() edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } + probe_platform_features(family, model); if (genuine_intel) model = intel_model_duplicates(model); -- cgit From 778fc34a7a3db2811e28ce570318dd047f278cb2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 15:26:32 +0800 Subject: tools/power/turbostat: Abstract MSR_MISC_FEATURE_CONTROL support Abstract MSR_MISC_FEATURE_CONTROL support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4deea374188a..7eaa0adf72e0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -277,7 +277,6 @@ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ -unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { + bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ }; struct platform_data { @@ -299,51 +299,67 @@ static const struct platform_features nhx_features = { }; static const struct platform_features snb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features snx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdwg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features cnl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features icx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features spr_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features slv_features = { @@ -3883,7 +3899,6 @@ void check_permissions(void) * * Side effect: * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL - * sets has_misc_feature_control */ int 
probe_nhm_msrs(unsigned int family, unsigned int model) { @@ -3909,7 +3924,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ pkg_cstate_limits = snb_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ @@ -3921,16 +3935,13 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ pkg_cstate_limits = hsw_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ pkg_cstate_limits = skx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ICELAKE_X: /* ICX */ pkg_cstate_limits = icx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ no_MSR_MISC_PWR_MGMT = 1; @@ -5541,7 +5552,7 @@ void decode_misc_feature_control(void) { unsigned long long msr; - if (!has_misc_feature_control) + if (!platform->has_msr_misc_feature_control) return; if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) -- cgit From 3dd0e7547d11e770bafb40ad41f2631cc4b16649 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:12:57 +0800 Subject: tools/power/turbostat: Abstract MSR_MISC_PWR_MGMT support Abstract MSR_MISC_PWR_MGMT support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7eaa0adf72e0..9507f310e212 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -238,7 +238,6 @@ unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; unsigned int do_nhm_platform_info; -unsigned int no_MSR_MISC_PWR_MGMT; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ + bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ }; struct platform_data { @@ -293,100 +293,125 @@ struct platform_data { }; static const struct platform_features nhm_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features nhx_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, + 
.has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features slv_features = { }; static const struct platform_features slvd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features amt_features = { }; static const struct platform_features gmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtp_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features knl_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features default_features = { @@ -3944,14 +3969,12 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) pkg_cstate_limits = icx_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - no_MSR_MISC_PWR_MGMT = 1; /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ pkg_cstate_limits = slv_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ pkg_cstate_limits = amt_pkg_cstate_limits; - no_MSR_MISC_PWR_MGMT = 1; break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ pkg_cstate_limits = phi_pkg_cstate_limits; @@ -5576,7 +5599,7 @@ void decode_misc_pwr_mgmt_msr(void) if (!do_nhm_platform_info) return; - if (no_MSR_MISC_PWR_MGMT) + if (!platform->has_msr_misc_pwr_mgmt) return; if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) -- cgit From 71e841293c715797d8c6ae8cdc3f74b4396c5570 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:22:48 +0800 Subject: tools/power/turbostat: Abstract BCLK frequency support Abstract CPU base clock frequency support. Note that bclk is used by 1. calculate base_hz using MSR_PLATFORM_INFO, which is guarded by probe_nhm_msrs(). 2. dump MSR_PLATFORM_INFO and Turbo Ratio Limit MSRs, which are also guarded by probe_nhm_msrs(). Thus probe_bclk() works for probe_nhm_msrs() models only. 
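For reference, the arithmetic that the new probe_bclk() in the diff below performs, with a worked example (the MSR value here is made up):

	#include <stdio.h>

	int main(void)
	{
		/* MSR_PLATFORM_INFO[15:8] holds the base (non-turbo) ratio */
		unsigned long long msr = 0x2400;		/* hypothetical register value */
		unsigned int base_ratio = (msr >> 8) & 0xFF;	/* = 0x24 = 36 */
		double bclk = 100.00;				/* MHz, on a BCLK_100MHZ platform */
		double base_hz = base_ratio * bclk * 1000000;	/* 36 * 100 MHz = 3.6 GHz */

		printf("base frequency: %.0f Hz\n", base_hz);
		return 0;
	}
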
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 131 ++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 44 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9507f310e212..66ba70017d53 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -269,7 +269,6 @@ unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; -double discover_bclk(unsigned int family, unsigned int model); unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ @@ -279,12 +278,15 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +int get_msr(int cpu, off_t offset, unsigned long long *msr); + /* Model specific support Start */ /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + int bclk_freq; /* CPU base clock */ }; struct platform_data { @@ -292,126 +294,185 @@ struct platform_data { const struct platform_features *features; }; +/* For BCLK */ +enum bclk_freq { + BCLK_100MHZ = 1, + BCLK_133MHZ, + BCLK_SLV, +}; + +#define SLM_BCLK_FREQS 5 +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; + +double slm_bclk(void) +{ + unsigned long long msr = 3; + unsigned int i; + double freq; + + if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) + fprintf(outf, "SLM BCLK: unknown\n"); + + i = msr & 0xf; + if (i >= SLM_BCLK_FREQS) { + fprintf(outf, "SLM BCLK[%d] invalid\n", i); + i = 3; + } + freq = slm_freq_table[i]; + + if (!quiet) + fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); + + return freq; +} + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features 
bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features slv_features = { + .bclk_freq = BCLK_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_SLV, }; static const struct platform_features amt_features = { + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features default_features = { @@ -3907,6 +3968,30 @@ void check_permissions(void) exit(-6); } +void probe_bclk(void) +{ + unsigned long long msr; + unsigned int base_ratio; + + if (!do_nhm_platform_info) + return; + + if (platform->bclk_freq == BCLK_100MHZ) + bclk = 100.00; + else if (platform->bclk_freq == BCLK_133MHZ) + bclk = 133.33; + else if (platform->bclk_freq == BCLK_SLV) + bclk = slm_bclk(); + else + return; + + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); + base_ratio = (msr >> 8) & 0xFF; + + base_hz = base_ratio * bclk * 1000000; + has_base_hz = 1; +} + /* * NHM adds support for additional MSRs: * @@ -3928,7 +4013,6 @@ void check_permissions(void) int probe_nhm_msrs(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int base_ratio; int *pkg_cstate_limits; if (!genuine_intel) @@ -3937,8 +4021,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) if (family != 6) return 0; - bclk = discover_bclk(family, model); - switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ @@ -3992,11 +4074,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; - get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); - base_ratio = (msr >> 8) & 0xFF; - - base_hz = base_ratio * bclk * 1000000; - has_base_hz = 1; return 1; } @@ -5403,41 +5480,6 @@ unsigned int 
get_aperf_mperf_multiplier(unsigned int family, unsigned int model) return 1; } -#define SLM_BCLK_FREQS 5 -double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; - -double slm_bclk(void) -{ - unsigned long long msr = 3; - unsigned int i; - double freq; - - if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) - fprintf(outf, "SLM BCLK: unknown\n"); - - i = msr & 0xf; - if (i >= SLM_BCLK_FREQS) { - fprintf(outf, "SLM BCLK[%d] invalid\n", i); - i = 3; - } - freq = slm_freq_table[i]; - - if (!quiet) - fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); - - return freq; -} - -double discover_bclk(unsigned int family, unsigned int model) -{ - if (has_snb_msrs(family, model) || is_knl(family, model)) - return 100.00; - else if (is_slm(family, model)) - return slm_bclk(); - else - return 133.33; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5929,6 +5971,7 @@ void process_cpuid() BIC_PRESENT(BIC_CPU_c6); BIC_PRESENT(BIC_SMI); } + probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); if (do_snb_cstates) -- cgit From 3989fc890782c8002477895e9f24ffb98a132293 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:06 +0800 Subject: tools/power/turbostat: Abstract Package cstate limit decoding support Abstract the support for decoding package cstate limit from MSR_PKG_CST_CONFIG_CONTROL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 112 ++++++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 66ba70017d53..bcad9332a3b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -287,6 +287,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ int bclk_freq; /* CPU base clock */ + int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; struct platform_data { @@ -326,153 +327,193 @@ double slm_bclk(void) return freq; } +/* For Package cstate limit */ +enum package_cstate_limit { + CST_LIMIT_NHM = 1, + CST_LIMIT_SNB, + CST_LIMIT_HSW, + CST_LIMIT_SKX, + CST_LIMIT_ICX, + CST_LIMIT_SLV, + CST_LIMIT_AMT, + CST_LIMIT_KNL, + CST_LIMIT_GMT, +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct 
platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_ICX, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_KNL, }; static const struct platform_features default_features = { @@ -2704,6 +2745,50 @@ int icx_pkg_cstate_limits[16] = PCLRSV, PCLRSV }; +void probe_cst_limit(void) +{ + unsigned long long msr; + int *pkg_cstate_limits; + + if (!do_nhm_platform_info) + return; + + switch (platform->cst_limit) { + case CST_LIMIT_NHM: + pkg_cstate_limits = nhm_pkg_cstate_limits; + break; + case CST_LIMIT_SNB: + 
pkg_cstate_limits = snb_pkg_cstate_limits; + break; + case CST_LIMIT_HSW: + pkg_cstate_limits = hsw_pkg_cstate_limits; + break; + case CST_LIMIT_SKX: + pkg_cstate_limits = skx_pkg_cstate_limits; + break; + case CST_LIMIT_ICX: + pkg_cstate_limits = icx_pkg_cstate_limits; + break; + case CST_LIMIT_SLV: + pkg_cstate_limits = slv_pkg_cstate_limits; + break; + case CST_LIMIT_AMT: + pkg_cstate_limits = amt_pkg_cstate_limits; + break; + case CST_LIMIT_KNL: + pkg_cstate_limits = phi_pkg_cstate_limits; + break; + case CST_LIMIT_GMT: + pkg_cstate_limits = glm_pkg_cstate_limits; + break; + default: + return; + } + + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); + pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; +} + static void calculate_tsc_tweak() { tsc_tweak = base_hz / tsc_hz; @@ -4006,15 +4091,9 @@ void probe_bclk(void) * MSR_PKG_C6_RESIDENCY 0x000003f9 * MSR_CORE_C3_RESIDENCY 0x000003fc * MSR_CORE_C6_RESIDENCY 0x000003fd - * - * Side effect: - * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL */ int probe_nhm_msrs(unsigned int family, unsigned int model) { - unsigned long long msr; - int *pkg_cstate_limits; - if (!genuine_intel) return 0; @@ -4024,14 +4103,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - pkg_cstate_limits = nhm_pkg_cstate_limits; - break; case INTEL_FAM6_SANDYBRIDGE: /* SNB */ case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - pkg_cstate_limits = snb_pkg_cstate_limits; - break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_HASWELL_X: /* HSX */ @@ -4041,38 +4116,22 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - pkg_cstate_limits = hsw_pkg_cstate_limits; - break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - pkg_cstate_limits = skx_pkg_cstate_limits; - break; case INTEL_FAM6_ICELAKE_X: /* ICX */ - pkg_cstate_limits = icx_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - pkg_cstate_limits = slv_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - pkg_cstate_limits = amt_pkg_cstate_limits; - break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - pkg_cstate_limits = phi_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - pkg_cstate_limits = glm_pkg_cstate_limits; break; default: return 0; } - get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); - pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; return 1; } @@ -5964,6 +6023,7 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_cst_limit(); if (probe_nhm_msrs(family, model)) { do_nhm_platform_info = 1; BIC_PRESENT(BIC_CPU_c1); -- cgit From fcfa1ce074ab76272639961d4d7900b91657a8d5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 16:52:23 +0800 Subject: tools/power/turbostat: Abstract Nehalem MSRs support MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, and the TRL MSRs are always 
available for platforms since Nehalem. Support for these msrs can be described altogether. Abstract the support for these MSRs. Delete probe_nhm_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 101 ++++++++++++---------------------- 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bcad9332a3b2..bc221e800aec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -237,7 +237,6 @@ unsigned int authentic_amd; unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; -unsigned int do_nhm_platform_info; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -286,6 +285,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; @@ -342,12 +342,14 @@ enum package_cstate_limit { static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; @@ -355,6 +357,7 @@ static const struct platform_features nhx_features = { static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -362,6 +365,7 @@ static const struct platform_features snb_features = { static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -369,6 +373,7 @@ static const struct platform_features snx_features = { static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -376,6 +381,7 @@ static const struct platform_features ivb_features = { static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -383,6 +389,7 @@ static const struct platform_features ivx_features = { static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -390,6 +397,7 @@ static const struct platform_features hsw_features = { static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -397,6 +405,7 @@ static const struct platform_features hsx_features = { static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -404,6 +413,7 
@@ static const struct platform_features hswl_features = { static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -411,6 +421,7 @@ static const struct platform_features hswg_features = { static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -418,6 +429,7 @@ static const struct platform_features bdw_features = { static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -425,6 +437,7 @@ static const struct platform_features bdwg_features = { static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -432,6 +445,7 @@ static const struct platform_features bdx_features = { static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -439,6 +453,7 @@ static const struct platform_features skl_features = { static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -446,6 +461,7 @@ static const struct platform_features cnl_features = { static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; @@ -453,6 +469,7 @@ static const struct platform_features skx_features = { static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, }; @@ -460,58 +477,68 @@ static const struct platform_features icx_features = { static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { 
.has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, }; @@ -2750,7 +2777,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; switch (platform->cst_limit) { @@ -4058,7 +4085,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4077,65 +4104,6 @@ void probe_bclk(void) has_base_hz = 1; } -/* - * NHM adds support for additional MSRs: - * - * MSR_SMI_COUNT 0x00000034 - * - * MSR_PLATFORM_INFO 0x000000ce - * MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 - * - * MSR_MISC_PWR_MGMT 0x000001aa - * - * MSR_PKG_C3_RESIDENCY 0x000003f8 - * MSR_PKG_C6_RESIDENCY 0x000003f9 - * MSR_CORE_C3_RESIDENCY 0x000003fc - * MSR_CORE_C6_RESIDENCY 0x000003fd - */ -int probe_nhm_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_SANDYBRIDGE: /* SNB */ - case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - break; - default: - return 0; - } - - return 1; -} - /* * SLV client has support for unique MSRs: * @@ -4461,7 +4429,7 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; dump_nhm_platform_info(); @@ -5606,7 +5574,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5697,7 +5665,7 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (!platform->has_msr_misc_pwr_mgmt) @@ -6024,8 +5992,7 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); probe_cst_limit(); - if (probe_nhm_msrs(family, model)) { - do_nhm_platform_info = 1; + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); BIC_PRESENT(BIC_CPU_c6); 
-- cgit From c2c25e85df316a624e4a8ee85d65ea29265f486d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:57:06 +0800 Subject: tools/power/turbostat: Remove a redundant check Platforms with has_msr_misc_pwr_mgmt set are a subset of platforms with has_nhm_msrs set. Thus remove the redundant check for platform->has_nhm_msrs. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc221e800aec..4eb10491f714 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5665,9 +5665,6 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) - return; - if (!platform->has_msr_misc_pwr_mgmt) return; -- cgit From 8b7199c0855e3b22532a21aaaed78e699431e4e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:58:35 +0800 Subject: tools/power/turbostat: Rename some functions Rename dump_nhm_platform_info() and dump_nhm_cst_cfg() to dump_platform_info() and dump_cst_cfg() because these MSRs' behavior is consistent when they're available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4eb10491f714..654ae1ce130c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2823,7 +2823,7 @@ static void calculate_tsc_tweak() void prewake_cstate_probe(unsigned int family, unsigned int model); -static void dump_nhm_platform_info(void) +static void dump_platform_info(void) { unsigned long long msr; unsigned int ratio; @@ -3059,7 +3059,7 @@ static void dump_knl_turbo_ratio_limits(void) ratio[i], bclk, ratio[i] * bclk, cores[i]); } -static void dump_nhm_cst_cfg(void) +static void dump_cst_cfg(void) { unsigned long long msr; @@ -4432,9 +4432,9 @@ static void dump_cstate_pstate_config_info(unsigned int family, unsigned int mod if (!platform->has_nhm_msrs) return; - dump_nhm_platform_info(); + dump_platform_info(); dump_turbo_ratio_info(family, model); - dump_nhm_cst_cfg(); + dump_cst_cfg(); } static int read_sysfs_int(char *path) -- cgit From 10d85d85ab4f3ae7faca4450ce7b0a166fd396f0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:00 +0800 Subject: tools/power/turbostat: Abstract Turbo Ratio Limit MSRs support Abstract the support for MSR_TURBO_RATIO_LIMIT, MSR_TURBO_RATIO_LIMIT1, MSR_TURBO_RATIO_LIMIT2, MSR_SECONDARY_TURBO_RATIO_LIMIT, MSR_ATOM_CORE_RATIOS and MSR_ATOM_CORE_TURBO_RATIOS. Delete has_turbo_ratio_group_limits(), has_turbo_ratio_limit(), has_atom_turbo_ratio_limit(), has_ivt_turbo_ratio_limit(), has_hsw_turbo_ratio_limit(), has_knl_turbo_ratio_limit() and has_glm_turbo_ratio_limit() CPU model checks.
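The replacement is a bitmask in the platform table: each turbo-ratio-limit MSR family becomes one flag, and the dump code tests the mask where it used to call a per-model helper. A standalone sketch of that flag style follows, with BIT() defined locally and flag values taken from the enum introduced in the diff below; the code itself is illustrative, not the turbostat source.

#include <stdio.h>

#define BIT(n)  (1U << (n))

/* One flag per turbo-ratio-limit MSR family, as in turbo_ratio_limit_msrs. */
enum turbo_ratio_limit_msrs {
    TRL_BASE      = BIT(0),     /* MSR_TURBO_RATIO_LIMIT */
    TRL_LIMIT1    = BIT(1),     /* MSR_TURBO_RATIO_LIMIT1 */
    TRL_LIMIT2    = BIT(2),     /* MSR_TURBO_RATIO_LIMIT2 */
    TRL_CORECOUNT = BIT(5),     /* ratio limits are grouped by core count */
};

struct platform_features {
    int trl_msrs;
};

/* One call site, gated purely by the mask -- no family/model arguments needed. */
static void dump_turbo_ratio_info(const struct platform_features *p)
{
    if (p->trl_msrs & TRL_LIMIT2)
        printf("dump MSR_TURBO_RATIO_LIMIT2\n");
    if (p->trl_msrs & TRL_LIMIT1)
        printf("dump MSR_TURBO_RATIO_LIMIT1\n");
    if (p->trl_msrs & TRL_BASE)
        printf("dump MSR_TURBO_RATIO_LIMIT%s\n",
               (p->trl_msrs & TRL_CORECOUNT) ? " plus per-group core counts" : "");
}

int main(void)
{
    const struct platform_features hsx_like = { .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2 };

    dump_turbo_ratio_info(&hsx_like);
    return 0;
}

With the mask carried in the table, adding a new platform means filling in one .trl_msrs initializer instead of touching every has_*_turbo_ratio_limit() switch.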
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 171 +++++++++------------------------- 1 file changed, 46 insertions(+), 125 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 654ae1ce130c..b6c53ec740d3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ }; struct platform_data { @@ -340,11 +341,22 @@ enum package_cstate_limit { CST_LIMIT_GMT, }; +/* For Turbo Ratio Limit MSRs */ +enum turbo_ratio_limit_msrs { + TRL_BASE = BIT(0), + TRL_LIMIT1 = BIT(1), + TRL_LIMIT2 = BIT(2), + TRL_ATOM = BIT(3), + TRL_KNL = BIT(4), + TRL_CORECOUNT = BIT(5), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, + .trl_msrs = TRL_BASE, }; static const struct platform_features nhx_features = { @@ -360,6 +372,7 @@ static const struct platform_features snb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features snx_features = { @@ -368,6 +381,7 @@ static const struct platform_features snx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivb_features = { @@ -376,6 +390,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivx_features = { @@ -384,6 +399,7 @@ static const struct platform_features ivx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE | TRL_LIMIT1, }; static const struct platform_features hsw_features = { @@ -392,6 +408,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hsx_features = { @@ -400,6 +417,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, }; static const struct platform_features hswl_features = { @@ -408,6 +426,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hswg_features = { @@ -416,6 +435,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdw_features = { @@ -424,6 +444,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdwg_features = { @@ -432,6 +453,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = 
TRL_BASE, }; static const struct platform_features bdx_features = { @@ -440,6 +462,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skl_features = { @@ -448,6 +471,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features cnl_features = { @@ -456,6 +480,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skx_features = { @@ -464,6 +489,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features icx_features = { @@ -472,6 +498,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features spr_features = { @@ -480,12 +507,14 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_ATOM, }; static const struct platform_features slvd_features = { @@ -493,12 +522,14 @@ static const struct platform_features slvd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_BASE, }; static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features gmt_features = { @@ -506,6 +537,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtd_features = { @@ -513,6 +545,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtp_features = { @@ -520,6 +553,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmt_features = { @@ -527,6 +561,7 @@ static const struct platform_features tmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmtd_features = { @@ -534,6 +569,7 @@ static const struct platform_features tmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features knl_features = { @@ -541,6 +577,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, + .trl_msrs = TRL_KNL, }; static const struct platform_features default_features = { @@ -2911,29 +2948,7 @@ static void dump_ivt_turbo_ratio_limits(void) return; } 
-int has_turbo_ratio_group_limits(int family, int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - default: - return 0; - } -} - -static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) +static void dump_turbo_ratio_limits(int trl_msr_offset) { unsigned long long msr, core_counts; int shift; @@ -2942,7 +2957,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr); - if (has_turbo_ratio_group_limits(family, model)) { + if (platform->trl_msrs & TRL_CORECOUNT) { get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts); } else { @@ -4236,100 +4251,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 0; - - if (family != 6) - return 0; - - switch (model) { - /* Nehalem compatible, but do not include turbo-ratio limit support */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI - Knights Landing (different MSR definition) */ - return 0; - default: - return 1; - } -} - -int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 1; - - return 0; -} - -int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - -int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - default: - return 0; - } -} - int has_config_tdp(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -4404,23 +4325,23 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (!has_turbo) return; - if (has_hsw_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT2) dump_hsw_turbo_ratio_limits(); - if (has_ivt_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT1) dump_ivt_turbo_ratio_limits(); - if (has_turbo_ratio_limit(family, model)) { - dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT, family, model); + if (platform->trl_msrs & TRL_BASE) { + dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); if (is_hybrid) - 
dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT, family, model); + dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT); } - if (has_atom_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_ATOM) dump_atom_turbo_ratio_limits(); - if (has_knl_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); if (has_config_tdp(family, model)) -- cgit From a3943deaf98f713c819dd7e67af734d4ed4da030 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 13:54:25 +0800 Subject: tools/power/turbostat: Rename some TRL functions Rename dump_hsw_turbo_ratio_limits() and dump_ivt_turbo_ratio_limits() to dump_turbo_ratio_limit2() and dump_turbo_ratio_limit1() because they dump MSR_TURBO_RATIO_LIMIT1/LIMIT2, and the MSRs' behavior is consistent when they are available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b6c53ec740d3..abcc055ea0e1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2886,7 +2886,7 @@ static void dump_platform_info(void) return; } -static void dump_hsw_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit2(void) { unsigned long long msr; unsigned int ratio; @@ -2905,7 +2905,7 @@ static void dump_hsw_turbo_ratio_limits(void) return; } -static void dump_ivt_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit1(void) { unsigned long long msr; unsigned int ratio; @@ -4326,10 +4326,10 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) return; if (platform->trl_msrs & TRL_LIMIT2) - dump_hsw_turbo_ratio_limits(); + dump_turbo_ratio_limit2(); if (platform->trl_msrs & TRL_LIMIT1) - dump_ivt_turbo_ratio_limits(); + dump_turbo_ratio_limit1(); if (platform->trl_msrs & TRL_BASE) { dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); -- cgit From a61c9cb478c0bf31a50d1829834e822e92876c95 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:19:46 +0800 Subject: tools/power/turbostat: Abstract Config TDP MSRs support Abstract the support for MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL and MSR_TURBO_ACTIVATION_RATIO. Delete has_config_tdp() CPU model check. 
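Here the capability is a single boolean, has_config_tdp, consulted before the MSR_CONFIG_TDP_* dump. For readers without the rest of the tree at hand, a minimal sketch of what such a gated dump boils down to, reading the MSR through the /dev/cpu/N/msr interface; the helper name and the trimmed error handling are this sketch's own, not the turbostat implementation.

#include <stdio.h>
#include <stdbool.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_CONFIG_TDP_NOMINAL  0x648   /* architectural MSR address */

/* Minimal stand-alone MSR read via the msr driver. */
static int read_msr(int cpu, unsigned int offset, unsigned long long *val)
{
    char path[64];
    int fd, ret = 0;

    snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
    fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;
    if (pread(fd, val, sizeof(*val), offset) != (ssize_t)sizeof(*val))
        ret = -1;
    close(fd);
    return ret;
}

/* The feature-table boolean replaces the old has_config_tdp() model switch. */
static void dump_config_tdp_nominal(bool has_config_tdp)
{
    unsigned long long msr;

    if (!has_config_tdp)
        return;
    if (read_msr(0, MSR_CONFIG_TDP_NOMINAL, &msr))
        return;
    printf("MSR_CONFIG_TDP_NOMINAL: 0x%08llx (base ratio %llu)\n", msr, msr & 0xff);
}

int main(void)
{
    dump_config_tdp_nominal(true);      /* needs root and the msr module loaded */
    return 0;
}

Turbostat's own get_msr() reads the same /dev/cpu/<cpu>/msr node; the point of the patch is only that the decision to dump now comes from the feature table.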
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 54 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index abcc055ea0e1..bb9d8c2605c8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -388,6 +389,7 @@ static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, @@ -406,6 +408,7 @@ static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -415,6 +418,7 @@ static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, @@ -424,6 +428,7 @@ static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -433,6 +438,7 @@ static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -442,6 +448,7 @@ static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -451,6 +458,7 @@ static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -460,6 +468,7 @@ static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -469,6 +478,7 @@ static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -478,6 +488,7 @@ static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ 
-487,6 +498,7 @@ static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -496,6 +508,7 @@ static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -505,6 +518,7 @@ static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -575,6 +589,7 @@ static const struct platform_features tmtd_features = { static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, @@ -4251,35 +4266,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_config_tdp(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - /* * tcc_offset_bits: * 0: Tcc Offset not supported (Default) @@ -4320,7 +4306,7 @@ static void remove_underbar(char *s) *to = 0; } -static void dump_turbo_ratio_info(unsigned int family, unsigned int model) +static void dump_turbo_ratio_info(void) { if (!has_turbo) return; @@ -4344,17 +4330,17 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); - if (has_config_tdp(family, model)) + if (platform->has_config_tdp) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) +static void dump_cstate_pstate_config_info(void) { if (!platform->has_nhm_msrs) return; dump_platform_info(); - dump_turbo_ratio_info(family, model); + dump_turbo_ratio_info(); dump_cst_cfg(); } @@ -6000,7 +5986,7 @@ void process_cpuid() check_tcc_offset(model); if (!quiet) - dump_cstate_pstate_config_info(family, model); + dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); if (!quiet) -- cgit From d8e1623baa0b49aa90cf5801cfaac9e3d3aa1e19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 23:04:44 +0800 Subject: tools/power/turbostat: Abstract TCC Offset bits support Abstract the support for different TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET. Delete check_tcc_offset() CPU model check. 
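What varies per platform is only the width of the offset field in MSR_IA32_TEMPERATURE_TARGET, so an integer tcc_offset_bits in the feature table is enough. A small, self-contained example of the resulting extraction follows, with GENMASK() defined locally and a made-up register value; the enable check against MSR_PLATFORM_INFO that the real code performs is omitted here.

#include <stdio.h>

#define GENMASK(h, l)   (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

/*
 * Extract the TCC activation offset from an MSR_IA32_TEMPERATURE_TARGET value.
 * The field starts at bit 24; its width is the per-platform tcc_offset_bits
 * (0 means the offset is not supported).
 */
static unsigned int tcc_offset(unsigned long long msr, int tcc_offset_bits)
{
    if (!tcc_offset_bits)
        return 0;
    return (msr >> 24) & GENMASK(tcc_offset_bits - 1, 0);
}

int main(void)
{
    /* Made-up register value: Tj_max 100 C in bits 23:16, offset 5 in bits 29:24. */
    unsigned long long msr = (5ULL << 24) | (100ULL << 16);
    unsigned int tj_max = (msr >> 16) & 0xff;
    unsigned int off = tcc_offset(msr, 6);

    printf("target %u C (Tj_max %u - offset %u)\n", tj_max - off, tj_max, off);
    return 0;
}

A width of 0 doubles as "not supported", which is why table entries that never set .tcc_offset_bits need no special handling.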
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 53 ++++++++--------------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bb9d8c2605c8..c39beb4078da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -256,7 +256,6 @@ unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; -int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; @@ -290,6 +289,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; struct platform_data { @@ -482,6 +482,7 @@ static const struct platform_features skl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features cnl_features = { @@ -492,6 +493,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features skx_features = { @@ -4266,33 +4268,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -/* - * tcc_offset_bits: - * 0: Tcc Offset not supported (Default) - * 6: Bit 29:24 of MSR_PLATFORM_INFO - * 4: Bit 27:24 of MSR_PLATFORM_INFO - */ -void check_tcc_offset(int model) -{ - unsigned long long msr; - - if (!genuine_intel) - return; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_CANNONLAKE_L: - if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { - msr = (msr >> 30) & 1; - if (msr) - tcc_offset_bits = 6; - } - return; - default: - return; - } -} - static void remove_underbar(char *s) { char *to = s; @@ -5490,20 +5465,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk tcc_default = (msr >> 16) & 0xFF; if (!quiet) { - switch (tcc_offset_bits) { - case 4: - tcc_offset = (msr >> 24) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - case 6: - tcc_offset = (msr >> 24) & 0x3F; + int bits = platform->tcc_offset_bits; + unsigned long long enabled = 0; + + if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled)) + enabled = (enabled >> 30) & 1; + + if (bits && enabled) { + tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0); fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - default: + } else { fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default); - break; } } @@ -5983,8 +5956,6 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model); - if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); -- cgit From 0c057cf7a0e163bf9631b83c002b3a55691674c7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 14:25:17 +0800 Subject: tools/power/turbostat: Abstract Perf Limit Reasons MSRs support Abstract the support for 
MSR_CORE/GFX/RING_PERF_LIMIT_REASONS MSRs. Delete perf_limit_reasons_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 ++++++++++++----------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c39beb4078da..1207845340ad 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,11 +259,8 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int do_core_perf_limit_reasons; unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; -unsigned int do_gfx_perf_limit_reasons; -unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -289,6 +286,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -352,6 +350,13 @@ enum turbo_ratio_limit_msrs { TRL_CORECOUNT = BIT(5), }; +/* For Perf Limit Reason MSRs */ +enum perf_limit_reason_msrs { + PLR_CORE = BIT(0), + PLR_GFX = BIT(1), + PLR_RING = BIT(2), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -412,6 +417,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hsx_features = { @@ -422,6 +428,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, + .plr_msrs = PLR_CORE | PLR_RING, }; static const struct platform_features hswl_features = { @@ -432,6 +439,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hswg_features = { @@ -442,6 +450,7 @@ static const struct platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features bdw_features = { @@ -4657,7 +4666,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return -1; } - if (do_core_perf_limit_reasons) { + if (platform->plr_msrs & PLR_CORE) { get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", @@ -4690,7 +4699,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } - if (do_gfx_perf_limit_reasons) { + if (platform->plr_msrs & PLR_GFX) { get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)", @@ -4710,7 +4719,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 25) ? 
"GFXPwr, " : "", (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } - if (do_ring_perf_limit_reasons) { + if (platform->plr_msrs & PLR_RING) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s)", @@ -5002,28 +5011,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void perf_limit_reasons_probe(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return; - - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - do_gfx_perf_limit_reasons = 1; - /* FALLTHRU */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - do_core_perf_limit_reasons = 1; - do_ring_perf_limit_reasons = 1; - default: - return; - } -} - void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { if (family != 6) @@ -5952,7 +5939,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); -- cgit From d90120bf9f111bec8ba0b8ef86c46ccbcd9df188 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:29:18 +0800 Subject: tools/power/turbostat: Abstract Automatic Cstate Conversion support Abstract the support for AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL. Delete automatic_cstate_conversion_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1207845340ad..a235cbf7b581 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,7 +259,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; @@ -285,6 +284,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ @@ -480,6 +480,7 @@ static const struct platform_features bdx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, }; @@ -512,6 +513,7 @@ static const struct platform_features skx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -3116,7 +3118,7 @@ static void dump_cst_cfg(void) (msr & (1 << 15)) ? 
"" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); #define AUTOMATIC_CSTATE_CONVERSION (1UL << 16) - if (has_automatic_cstate_conversion) { + if (platform->has_cst_auto_convension) { fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); } @@ -5011,18 +5013,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) -{ - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - has_automatic_cstate_conversion = 1; - } -} - void prewake_cstate_probe(unsigned int family, unsigned int model) { if (is_icx(family, model) || is_spr(family, model)) @@ -5939,7 +5929,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); if (!quiet) -- cgit From a5d1ab93a0993616efbf61378e491d6673f4684d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:39:49 +0800 Subject: tools/power/turbostat: Abstract hardcoded Crystal Clock frequency Abstract the support for hardcoded Crystal Clock frequency, which is used when crystal clock is not available from CPUID.15. Delete CPU model checks in process_cpuid(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a235cbf7b581..c76baa10f4eb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ + int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -490,6 +491,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 24000000, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -563,6 +565,7 @@ static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -571,6 +574,7 @@ static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -579,6 +583,7 @@ static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, }; @@ -5796,26 +5801,12 @@ void process_cpuid() __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx); if (ebx_tsc != 0) { - if (!quiet && (ebx != 0)) fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d 
ecx_crystal_hz: %d\n", eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - crystal_hz = 24000000; /* 24.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - crystal_hz = 25000000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - crystal_hz = 19200000; /* 19.2 MHz */ - break; - default: - crystal_hz = 0; - } + crystal_hz = platform->crystal_freq; if (crystal_hz) { tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; -- cgit From b9cd66833d3a651cea10666674e9abcf2182e8ad Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:38:38 +0800 Subject: tools/power/turbostat: Redefine RAPL macros Redefine RAPL macros to make the code more readable. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 102 +++++++++++++++------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c76baa10f4eb..4829e8289feb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -358,6 +358,40 @@ enum perf_limit_reason_msrs { PLR_RING = BIT(2), }; +/* For RAPL MSRs */ +enum rapl_msrs { + RAPL_PKG_POWER_LIMIT = BIT(0), /* 0x610 MSR_PKG_POWER_LIMIT */ + RAPL_PKG_ENERGY_STATUS = BIT(1), /* 0x611 MSR_PKG_ENERGY_STATUS */ + RAPL_PKG_PERF_STATUS = BIT(2), /* 0x613 MSR_PKG_PERF_STATUS */ + RAPL_PKG_POWER_INFO = BIT(3), /* 0x614 MSR_PKG_POWER_INFO */ + RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */ + RAPL_DRAM_ENERGY_STATUS = BIT(5), /* 0x619 MSR_DRAM_ENERGY_STATUS */ + RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */ + RAPL_DRAM_POWER_INFO = BIT(7), /* 0x61c MSR_DRAM_POWER_INFO */ + RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */ + RAPL_CORE_ENERGY_STATUS = BIT(9), /* 0x639 MSR_PP0_ENERGY_STATUS */ + RAPL_CORE_POLICY = BIT(10), /* 0x63a MSR_PP0_POLICY */ + RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */ + RAPL_GFX_ENERGY_STATUS = BIT(12), /* 0x641 MSR_PP1_ENERGY_STATUS */ + RAPL_GFX_POLICY = BIT(13), /* 0x642 MSR_PP1_POLICY */ + RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ + RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ + RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ + RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. 
*/ +}; + +#define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) +#define RAPL_DRAM (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT) +#define RAPL_CORE (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT) +#define RAPL_GFX (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS) + +#define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO) +#define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO) +#define RAPL_CORE_ALL (RAPL_CORE | RAPL_CORE_POLICY) +#define RAPL_GFX_ALL (RAPL_GFX | RAPL_GFX_POLIGY) + +#define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -712,42 +746,6 @@ void probe_platform_features(unsigned int family, unsigned int model) /* Model specific support End */ -#define RAPL_PKG (1 << 0) - /* 0x610 MSR_PKG_POWER_LIMIT */ - /* 0x611 MSR_PKG_ENERGY_STATUS */ -#define RAPL_PKG_PERF_STATUS (1 << 1) - /* 0x613 MSR_PKG_PERF_STATUS */ -#define RAPL_PKG_POWER_INFO (1 << 2) - /* 0x614 MSR_PKG_POWER_INFO */ - -#define RAPL_DRAM (1 << 3) - /* 0x618 MSR_DRAM_POWER_LIMIT */ - /* 0x619 MSR_DRAM_ENERGY_STATUS */ -#define RAPL_DRAM_PERF_STATUS (1 << 4) - /* 0x61b MSR_DRAM_PERF_STATUS */ -#define RAPL_DRAM_POWER_INFO (1 << 5) - /* 0x61c MSR_DRAM_POWER_INFO */ - -#define RAPL_CORES_POWER_LIMIT (1 << 6) - /* 0x638 MSR_PP0_POWER_LIMIT */ -#define RAPL_CORE_POLICY (1 << 7) - /* 0x63a MSR_PP0_POLICY */ - -#define RAPL_GFX (1 << 8) - /* 0x640 MSR_PP1_POWER_LIMIT */ - /* 0x641 MSR_PP1_ENERGY_STATUS */ - /* 0x642 MSR_PP1_POLICY */ - -#define RAPL_CORES_ENERGY_STATUS (1 << 9) - /* 0x639 MSR_PP0_ENERGY_STATUS */ -#define RAPL_PER_CORE_ENERGY (1 << 10) - /* Indicates cores energy collection is per-core, - * not per-package. */ -#define RAPL_AMD_F17H (1 << 11) - /* 0xc0010299 MSR_RAPL_PWR_UNIT */ - /* 0xc001029a MSR_CORE_ENERGY_STAT */ - /* 0xc001029b MSR_PKG_ENERGY_STAT */ -#define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT) #define TJMAX_DEFAULT 100 /* MSRs that are not yet in the kernel-provided header. 
*/ @@ -948,7 +946,7 @@ int idx_valid(int idx) case IDX_DRAM_ENERGY: return do_rapl & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORES_ENERGY_STATUS; + return do_rapl & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: return do_rapl & RAPL_GFX; case IDX_PKG_PERF: @@ -2710,7 +2708,7 @@ retry: return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORES_ENERGY_STATUS) { + if (do_rapl & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; @@ -4810,7 +4808,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4830,9 +4828,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4846,7 +4842,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL; BIC_PRESENT(BIC_PKG__); if (rapl_joules) BIC_PRESENT(BIC_Pkg_J); @@ -4855,9 +4851,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4878,9 +4872,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = - RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4893,9 +4885,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | - RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4910,7 +4900,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORES; + do_rapl = RAPL_PKG | RAPL_CORE; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4920,9 +4910,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = - RAPL_PKG | RAPL_DRAM | 
RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -5195,7 +5183,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORES_POWER_LIMIT) { + if (do_rapl & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", -- cgit From a98f886035d5f7e0ec66036dd6bf98b40e75b692 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:57:12 +0800 Subject: tools/power/turbostat: Simplify the logic for RAPL enumeration The support for each RAPL domains, as well as the support for the perf status of each RAPL domains, can be detected by checking the availabilities of the corresponding RAPL MSRs. Change the code accordingly and remove the hardcoded logic for each model. Note that this also fixes the INTEL_FAM6_ATOM_TREMONT model, which has RAPL_PKG_PERF_STATUS and MSR_DRAM_PERF_STATUS but doesn't have BIC_PKG__ and BIC_RAM__ set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 ++++++++--------------------------- 1 file changed, 22 insertions(+), 77 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4829e8289feb..b2da36437b12 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4809,62 +4809,20 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ do_rapl = RAPL_PKG_ALL; - BIC_PRESENT(BIC_PKG__); - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ @@ -4873,60 +4831,47 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case 
INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ do_rapl = RAPL_PKG | RAPL_CORE; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { + break; + default: + return; + } + + if (rapl_joules) { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - } else { + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFX_J); + } else { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - } - break; - default: - return; + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFXWatt); } + if (do_rapl & RAPL_PKG_PERF_STATUS) + BIC_PRESENT(BIC_PKG__); + if (do_rapl & RAPL_DRAM_PERF_STATUS) + BIC_PRESENT(BIC_RAM__); + /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) return; -- cgit From 86ba263d9b72b7766a99059e9b3bd104d089b7fa Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 28 Aug 2023 15:09:40 +0800 Subject: tools/power/turbostat: Abstract RAPL MSRs support Abstract the support for RAPL MSRs. Delete CPU model checks in rapl_probe_intel(). 
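Combined with the previous patch, RAPL enumeration reduces to two steps: take the platform's RAPL mask from the feature table, then map mask bits to output columns according to the joules/watts preference. A standalone sketch of that mapping is below, printing column names instead of calling BIC_PRESENT(); the flag values follow the enum added earlier in the series, the rest is illustrative only.

#include <stdio.h>
#include <stdbool.h>

#define BIT(n)  (1U << (n))

/* Subset of the RAPL capability flags, named after the MSRs they stand for. */
enum rapl_msrs {
    RAPL_PKG_ENERGY_STATUS  = BIT(1),   /* 0x611 MSR_PKG_ENERGY_STATUS */
    RAPL_DRAM_ENERGY_STATUS = BIT(5),   /* 0x619 MSR_DRAM_ENERGY_STATUS */
    RAPL_CORE_ENERGY_STATUS = BIT(9),   /* 0x639 MSR_PP0_ENERGY_STATUS */
    RAPL_GFX_ENERGY_STATUS  = BIT(12),  /* 0x641 MSR_PP1_ENERGY_STATUS */
};

/*
 * Derive the columns to show from the capability mask alone: one generic
 * mapping instead of a per-model list of BIC_PRESENT() calls.
 */
static void enable_rapl_columns(int rapl_msrs, bool rapl_joules)
{
    if (rapl_msrs & RAPL_PKG_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "Pkg_J" : "PkgWatt");
    if (rapl_msrs & RAPL_CORE_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "Cor_J" : "CorWatt");
    if (rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "RAM_J" : "RAMWatt");
    if (rapl_msrs & RAPL_GFX_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "GFX_J" : "GFXWatt");
}

int main(void)
{
    /* e.g. a server-style mask: package and DRAM energy only */
    enable_rapl_columns(RAPL_PKG_ENERGY_STATUS | RAPL_DRAM_ENERGY_STATUS, false);
    return 0;
}

The per-core AMD case (RAPL_PER_CORE_ENERGY) and the PKG_%/RAM_% perf-status columns follow the same pattern with additional flags.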
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 189 +++++++++++++++------------------- 1 file changed, 85 insertions(+), 104 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b2da36437b12..90e1abe96dd5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ + int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -414,6 +415,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features snx_features = { @@ -423,6 +425,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features ivb_features = { @@ -433,6 +436,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features ivx_features = { @@ -442,6 +446,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hsw_features = { @@ -453,6 +458,7 @@ static const struct platform_features hsw_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hsx_features = { @@ -464,6 +470,7 @@ static const struct platform_features hsx_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hswl_features = { @@ -475,6 +482,7 @@ static const struct platform_features hswl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hswg_features = { @@ -486,6 +494,7 @@ static const struct platform_features hswg_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdw_features = { @@ -496,6 +505,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdwg_features = { @@ -506,6 +516,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, 
.trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdx_features = { @@ -517,6 +528,7 @@ static const struct platform_features bdx_features = { .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features skl_features = { @@ -529,6 +541,7 @@ static const struct platform_features skl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features cnl_features = { @@ -540,6 +553,7 @@ static const struct platform_features cnl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features skx_features = { @@ -551,6 +565,7 @@ static const struct platform_features skx_features = { .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features icx_features = { @@ -561,6 +576,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features spr_features = { @@ -571,6 +587,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features slv_features = { @@ -578,6 +595,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features slvd_features = { @@ -586,6 +604,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features amt_features = { @@ -602,6 +621,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features gmtd_features = { @@ -611,6 +631,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; static const struct platform_features gmtp_features = { @@ -620,6 +641,7 @@ static const struct platform_features gmtp_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features tmt_features = { @@ -628,6 +650,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features tmtd_features = { @@ -636,6 +659,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = 
CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL, }; static const struct platform_features knl_features = { @@ -645,6 +669,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features default_features = { @@ -653,6 +678,10 @@ static const struct platform_features default_features = { static const struct platform_features amd_features = { }; +static const struct platform_features amd_features_with_rapl = { + .rapl_msrs = RAPL_AMD_F17H, +}; + static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_NEHALEM, &nhm_features }, { INTEL_FAM6_NEHALEM_G, &nhm_features }, @@ -728,6 +757,17 @@ void probe_platform_features(unsigned int family, unsigned int model) if (authentic_amd || hygon_genuine) { platform = &amd_features; + + if (max_extended_level >= 0x80000007) { + unsigned int eax, ebx, ecx, edx; + + __cpuid(0x80000007, eax, ebx, ecx, edx); + /* RAPL (Fam 17h+) */ + if ((edx & (1 << 14)) && family >= 0x17) { + platform = &amd_features_with_rapl; + do_rapl = RAPL_PER_CORE_ENERGY; + } + } return; } @@ -882,7 +922,7 @@ off_t idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - if (do_rapl & RAPL_AMD_F17H) + if (platform->rapl_msrs & RAPL_AMD_F17H) offset = MSR_PKG_ENERGY_STAT; else offset = MSR_PKG_ENERGY_STATUS; @@ -942,17 +982,17 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return do_rapl & (RAPL_PKG | RAPL_AMD_F17H); + return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: - return do_rapl & RAPL_DRAM; + return platform->rapl_msrs & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORE_ENERGY_STATUS; + return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: - return do_rapl & RAPL_GFX; + return platform->rapl_msrs & RAPL_GFX; case IDX_PKG_PERF: - return do_rapl & RAPL_PKG_PERF_STATUS; + return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: - return do_rapl & RAPL_DRAM_PERF_STATUS; + return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; default: return 0; } @@ -1330,10 +1370,10 @@ void print_header(char *delim) if (DO_BIC(BIC_CORE_THROT_CNT)) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1392,7 +1432,7 @@ void print_header(char *delim) if (DO_BIC(BIC_SYS_LPI)) outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -1405,7 +1445,7 @@ void print_header(char *delim) outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : "")); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? 
delim : "")); if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -2638,7 +2678,7 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2703,37 +2743,37 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORE_ENERGY_STATUS) { + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) return -15; p->energy_dram = msr; } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) return -16; p->energy_gfx = msr; } - if (do_rapl & RAPL_PKG_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) return -16; p->rapl_pkg_perf_status = msr; } - if (do_rapl & RAPL_DRAM_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) return -16; p->rapl_dram_perf_status = msr; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) return -13; p->energy_pkg = msr; @@ -4750,7 +4790,7 @@ double get_tdp_intel(unsigned int model) { unsigned long long msr; - if (do_rapl & RAPL_PKG_POWER_INFO) + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; @@ -4791,85 +4831,35 @@ static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) } } -void rapl_probe_intel(unsigned int family, unsigned int model) +void rapl_probe_intel(unsigned int model) { unsigned long long msr; unsigned int time_unit; double tdp; - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG_ALL; - break; - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - break; - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - break; - case 
INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORE; - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - break; - default: - return; - } - if (rapl_joules) { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFX_J); } else { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFXWatt); } - if (do_rapl & RAPL_PKG_PERF_STATUS) + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) BIC_PRESENT(BIC_PKG__); - if (do_rapl & RAPL_DRAM_PERF_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) BIC_PRESENT(BIC_RAM__); /* units on package 0, verify later other packages match */ @@ -4900,22 +4890,10 @@ void rapl_probe_intel(unsigned int family, unsigned int model) void rapl_probe_amd(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int eax, ebx, ecx, edx; - unsigned int has_rapl = 0; double tdp; UNUSED(model); - if (max_extended_level >= 0x80000007) { - __cpuid(0x80000007, eax, ebx, ecx, edx); - /* RAPL (Fam 17h+) */ - has_rapl = edx & (1 << 14); - } - - if (!has_rapl || family < 0x17) - return; - - do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4941,12 +4919,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) /* * rapl_probe() * - * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units + * sets rapl_power_units, rapl_energy_units, rapl_time_units */ void rapl_probe(unsigned int family, unsigned int model) { + if (!platform->rapl_msrs) + return; + if (genuine_intel) - rapl_probe_intel(family, model); + rapl_probe_intel(model); if (authentic_amd || hygon_genuine) rapl_probe_amd(family, model); } @@ -5040,7 +5021,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); - if (!do_rapl) + if (!platform->rapl_msrs) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ @@ -5053,7 +5034,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { msr_name = "MSR_RAPL_PWR_UNIT"; if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr)) return -1; @@ -5066,7 +5047,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units); - if (do_rapl & RAPL_PKG_POWER_INFO) { + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) return -5; @@ -5079,7 +5060,7 @@ int 
print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; @@ -5103,7 +5084,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } - if (do_rapl & RAPL_DRAM_POWER_INFO) { + if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) return -6; @@ -5114,7 +5095,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", @@ -5122,20 +5103,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) print_power_limit_msr(cpu, msr, "DRAM Limit"); } - if (do_rapl & RAPL_CORE_POLICY) { + if (platform->rapl_msrs & RAPL_CORE_POLICY) { if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORE_POWER_LIMIT) { + if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr(cpu, MSR_PP1_POLICY, &msr)) return -8; -- cgit From e338831b14d2da921348c8c4055b9f2c94effe73 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:37:46 +0800 Subject: tools/power/turbostat: Abstract Per Core RAPL support Abstract the support for Per Core RAPL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 90e1abe96dd5..6314c40dc15b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -245,7 +245,6 @@ double tsc_tweak = 1.0; unsigned int show_pkg_only; unsigned int show_core_only; char *output_buffer, *outp; -unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; @@ -289,6 +288,7 @@ struct platform_features { int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ + bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -378,7 +378,6 @@ enum rapl_msrs { RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ - RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. 
*/ }; #define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) @@ -680,6 +679,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, + .has_per_core_rapl = 1, }; static const struct platform_data turbostat_pdata[] = { @@ -763,10 +763,8 @@ void probe_platform_features(unsigned int family, unsigned int model) __cpuid(0x80000007, eax, ebx, ecx, edx); /* RAPL (Fam 17h+) */ - if ((edx & (1 << 14)) && family >= 0x17) { + if ((edx & (1 << 14)) && family >= 0x17) platform = &amd_features_with_rapl; - do_rapl = RAPL_PER_CORE_ENERGY; - } } return; } @@ -1371,10 +1369,10 @@ void print_header(char *delim) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); if (platform->rapl_msrs && !rapl_joules) { - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); } else if (platform->rapl_msrs && rapl_joules) { - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1435,7 +1433,7 @@ void print_header(char *delim) if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_GFXWatt)) outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : "")); @@ -1448,7 +1446,7 @@ void print_header(char *delim) } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : "")); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : "")); @@ -1750,10 +1748,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data fmt8 = "%s%.2f"; - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ @@ -1818,7 +1816,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) @@ -1830,7 +1828,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), p->energy_pkg * rapl_energy_units); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); -- cgit From 6d35b8c4a661c849361239fe316035ea952606a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:59:04 +0800 Subject: tools/power/turbostat: Abstract RAPL divisor support INTEL_FAM6_ATOM_SILVERMONT model needs a divisor to convert the raw Energy Units value from MSR_RAPL_POWER_UNIT. Abstract the support for RAPL divisor. Delete CPU model check in rapl_probe_intel(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6314c40dc15b..a31724335671 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -289,6 +289,7 @@ struct platform_features { int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ + bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -595,6 +596,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .has_rapl_divisor = 1, }; static const struct platform_features slvd_features = { @@ -4865,7 +4867,7 @@ void rapl_probe_intel(unsigned int model) return; rapl_power_units = 1.0 / (1 << (msr & 0xF)); - if (model == INTEL_FAM6_ATOM_SILVERMONT) + if (platform->has_rapl_divisor) rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000; else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); -- cgit From 9e6f35159cdef148c711d2fb7d5fd2b2b6fb772d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 12:21:58 +0800 Subject: tools/power/turbostat: Abstract fixed DRAM Energy unit support Abstract the support for fixed Dram domain energy unit. Delete rapl_dram_energy_units_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a31724335671..a26ae5a2e2bd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -290,6 +290,7 @@ struct platform_features { int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. 
AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ + bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -471,6 +472,7 @@ static const struct platform_features hsx_features = { .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features hswl_features = { @@ -529,6 +531,7 @@ static const struct platform_features bdx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features skl_features = { @@ -566,6 +569,7 @@ static const struct platform_features skx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features icx_features = { @@ -577,6 +581,7 @@ static const struct platform_features icx_features = { .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features spr_features = { @@ -671,6 +676,7 @@ static const struct platform_features knl_features = { .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features default_features = { @@ -4811,26 +4817,6 @@ double get_tdp_amd(unsigned int family) return 280.0; } -/* - * rapl_dram_energy_units_probe() - * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. - */ -static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) -{ - /* only called for genuine_intel, family 6 */ - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - return (rapl_dram_energy_units = 15.3 / 1000000); - default: - return (rapl_energy_units); - } -} - void rapl_probe_intel(unsigned int model) { unsigned long long msr; @@ -4872,7 +4858,10 @@ void rapl_probe_intel(unsigned int model) else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - rapl_dram_energy_units = rapl_dram_energy_units_probe(model, rapl_energy_units); + if (platform->has_fixed_rapl_unit) + rapl_dram_energy_units = (15.3 / 1000000); + else + rapl_dram_energy_units = rapl_energy_units; time_unit = msr >> 16 & 0xF; if (time_unit == 0) -- cgit From 7c60409382a4be05d601e0b45db7b0166845b0cf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:21:10 +0800 Subject: tools/power/turbostat: Abstract hardcoded TDP value Different hardcoded TDP values are used when TDP can not be retrieved from the hardware. Abstract hardcoded TDP value. Delete CPU model checks in get_tdp_intel(). 
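For illustration, a minimal standalone sketch of the fallback order this change introduces; the *_sketch names and the main() harness are hypothetical, only the 30 W / 135 W values and the quirk-before-default logic mirror the patch:

    #include <stdio.h>

    struct features_sketch {
        int rapl_quirk_tdp;    /* hardcoded TDP in Watts, 0 when no quirk is needed */
    };

    static const struct features_sketch slv_sketch = { .rapl_quirk_tdp = 30 };
    static const struct features_sketch generic_sketch = { 0 };

    /* per-platform quirk first, generic 135 W default last */
    static double get_quirk_tdp_sketch(const struct features_sketch *p)
    {
        if (p->rapl_quirk_tdp)
            return p->rapl_quirk_tdp;
        return 135.0;
    }

    int main(void)
    {
        printf("Silvermont-like platform: %.0f W\n", get_quirk_tdp_sketch(&slv_sketch));
        printf("platform without a quirk: %.0f W\n", get_quirk_tdp_sketch(&generic_sketch));
        return 0;
    }

In the real tool this path is only reached when MSR_PKG_POWER_INFO cannot supply the TDP, as get_tdp_intel() in the diff below shows.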
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a26ae5a2e2bd..45698c3a9e72 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -291,6 +291,7 @@ struct platform_features { bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ + int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -602,6 +603,7 @@ static const struct platform_features slv_features = { .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, + .rapl_quirk_tdp = 30, }; static const struct platform_features slvd_features = { @@ -611,6 +613,7 @@ static const struct platform_features slvd_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .rapl_quirk_tdp = 30, }; static const struct platform_features amt_features = { @@ -688,6 +691,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, + .rapl_quirk_tdp = 280, /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ }; static const struct platform_data turbostat_pdata[] = { @@ -4792,29 +4796,31 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +double get_quirk_tdp(void) +{ + if (platform->rapl_quirk_tdp) + return platform->rapl_quirk_tdp; + + return 135.0; +} + double get_tdp_intel(unsigned int model) { unsigned long long msr; + UNUSED(model); + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_D: - return 30.0; - default: - return 135.0; - } + return get_quirk_tdp(); } double get_tdp_amd(unsigned int family) { UNUSED(family); - /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ - return 280.0; + return get_quirk_tdp(); } void rapl_probe_intel(unsigned int model) -- cgit From bf1ad57c3f92c551dbfdcecd49797253f55cb7c1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:28:04 +0800 Subject: tools/power/turbostat: Remove unused family/model parameters for RAPL functions RAPL probing can be done without family/model checking. Remove these parameters in rapl probe functions. 
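The reason no family/model input is needed is that rapl_probe_intel() derives every unit from MSR_RAPL_POWER_UNIT plus the platform table flags. A hedged, self-contained sketch of that decoding, using a made-up raw value purely for the arithmetic (the real probe also special-cases a zero time-unit field and the has_rapl_divisor quirk):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long msr = 0x000A0E03ULL;    /* hypothetical MSR_RAPL_POWER_UNIT reading */

        double power_units  = 1.0 / (1 << (msr & 0xF));        /* 1/2^3  = 0.125 W        */
        double energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));  /* 1/2^14 ~ 61 microjoules */
        double time_units   = 1.0 / (1 << (msr >> 16 & 0xF));  /* 1/2^10 ~ 977 microsec   */

        printf("power %g W, energy %g J, time %g s\n", power_units, energy_units, time_units);
        return 0;
    }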
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 45698c3a9e72..44d7321b004e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4804,26 +4804,22 @@ double get_quirk_tdp(void) return 135.0; } -double get_tdp_intel(unsigned int model) +double get_tdp_intel(void) { unsigned long long msr; - UNUSED(model); - if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; return get_quirk_tdp(); } -double get_tdp_amd(unsigned int family) +double get_tdp_amd(void) { - UNUSED(family); - return get_quirk_tdp(); } -void rapl_probe_intel(unsigned int model) +void rapl_probe_intel(void) { unsigned long long msr; unsigned int time_unit; @@ -4875,20 +4871,18 @@ void rapl_probe_intel(unsigned int model) rapl_time_units = 1.0 / (1 << (time_unit)); - tdp = get_tdp_intel(model); + tdp = get_tdp_intel(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -void rapl_probe_amd(unsigned int family, unsigned int model) +void rapl_probe_amd(void) { unsigned long long msr; double tdp; - UNUSED(model); - if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4904,7 +4898,7 @@ void rapl_probe_amd(unsigned int family, unsigned int model) rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f)); rapl_power_units = ldexp(1.0, -(msr & 0xf)); - tdp = get_tdp_amd(family); + tdp = get_tdp_amd(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) @@ -4916,15 +4910,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(unsigned int family, unsigned int model) +void rapl_probe(void) { if (!platform->rapl_msrs) return; if (genuine_intel) - rapl_probe_intel(model); + rapl_probe_intel(); if (authentic_amd || hygon_genuine) - rapl_probe_amd(family, model); + rapl_probe_amd(); } void prewake_cstate_probe(unsigned int family, unsigned int model) @@ -5828,7 +5822,7 @@ void process_cpuid() if (!quiet && has_slv_msrs(family, model)) decode_c6_demotion_policy_msr(); - rapl_probe(family, model); + rapl_probe(); prewake_cstate_probe(family, model); if (!quiet) -- cgit From 485a017c45200ed82518f6cdeea554f77e9a0562 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:37:23 +0800 Subject: tools/power/turbostat: Abstract TSC tweak support On some models, the CPU base frequency is different from the TSC frequency, and the aperf/mperf counters are running at CPU base frequency instead of TSC frequency. Abstract support for TSC tweak. Given that tsc_tweak depends on base_hz, move the code to probe_bclk() after base_hz is available. 
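A minimal sketch of the ratio that probe_bclk() now computes; the two frequencies below are hypothetical, only the base_hz / tsc_hz formula comes from the patch:

    #include <stdio.h>

    int main(void)
    {
        double base_hz = 2100000000.0;    /* hypothetical 2.1 GHz CPU base frequency */
        double tsc_hz  = 2112000000.0;    /* hypothetical TSC frequency */

        /* on enable_tsc_tweak platforms aperf/mperf tick at base_hz, so
         * TSC-relative utilization is rescaled by this factor */
        double tsc_tweak = base_hz / tsc_hz;

        printf("tsc_tweak = %.6f\n", tsc_tweak);
        return 0;
    }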
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44d7321b004e..05385fabc83a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -293,6 +293,7 @@ struct platform_features { bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ + bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ }; struct platform_data { @@ -546,6 +547,7 @@ static const struct platform_features skl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features cnl_features = { @@ -558,6 +560,7 @@ static const struct platform_features cnl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features skx_features = { @@ -660,6 +663,7 @@ static const struct platform_features tmt_features = { .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features tmtd_features = { @@ -2934,11 +2938,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -static void calculate_tsc_tweak() -{ - tsc_tweak = base_hz / tsc_hz; -} - void prewake_cstate_probe(unsigned int family, unsigned int model); static void dump_platform_info(void) @@ -4198,6 +4197,9 @@ void probe_bclk(void) base_hz = base_ratio * bclk * 1000000; has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; } /* @@ -5836,9 +5838,6 @@ void process_cpuid() if (!quiet) dump_sysfs_pstate_config(); - if (has_skl_msrs(family, model) || is_ehl(family, model)) - calculate_tsc_tweak(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 3c6a17b8ae44b0116e303402803c173fe2a3da92 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:37 +0800 Subject: tools/power/turbostat: Add skeleton support for cstate enumeration Add skeleton support for cstate enumeration. Note that the previous logic may override the cstate setting for multiple times for different reasons. The conversion to new cstate enumeration must be done step by step following the previous code order strictly. 
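The new enumeration is a plain bitmask; a self-contained sketch of how a .supported_cstates entry is meant to be tested (the flag subset and the 'supported' value below are hypothetical, the BIT() encoding mirrors the enum added in the diff):

    #include <stdio.h>

    #define BIT(x) (1U << (x))

    enum cstates_sketch {
        CC1 = BIT(0),
        CC3 = BIT(1),
        CC6 = BIT(2),
        CC7 = BIT(3),
        PC2 = BIT(4),
    };

    int main(void)
    {
        unsigned int supported = CC1 | CC6 | CC7 | PC2;    /* hypothetical platform entry */

        printf("CC6: %s\n", (supported & CC6) ? "supported" : "not supported");
        printf("CC3: %s\n", (supported & CC3) ? "supported" : "not supported");
        return 0;
    }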
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 57 ++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 05385fabc83a..6a49eb941fe0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ + int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -396,6 +397,21 @@ enum rapl_msrs { #define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) +/* For Cstates */ +enum cstates { + CC1 = BIT(0), + CC3 = BIT(1), + CC6 = BIT(2), + CC7 = BIT(3), + PC2 = BIT(4), + PC3 = BIT(5), + PC6 = BIT(6), + PC7 = BIT(7), + PC8 = BIT(8), + PC9 = BIT(9), + PC10 = BIT(10), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -5560,6 +5576,44 @@ void linux_perf_init(void) BIC_PRESENT(BIC_IPC); } +void probe_cstates(void) +{ + probe_cst_limit(); + + if (platform->supported_cstates & CC1) + BIC_PRESENT(BIC_CPU_c1); + + if (platform->supported_cstates & CC3) + BIC_PRESENT(BIC_CPU_c3); + + if (platform->supported_cstates & CC6) + BIC_PRESENT(BIC_CPU_c6); + + if (platform->supported_cstates & CC7) + BIC_PRESENT(BIC_CPU_c7); + + if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) + BIC_PRESENT(BIC_Pkgpc2); + + if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) + BIC_PRESENT(BIC_Pkgpc3); + + if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) + BIC_PRESENT(BIC_Pkgpc6); + + if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) + BIC_PRESENT(BIC_Pkgpc7); + + if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) + BIC_PRESENT(BIC_Pkgpc8); + + if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) + BIC_PRESENT(BIC_Pkgpc9); + + if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) + BIC_PRESENT(BIC_Pkgpc10); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5741,7 +5795,8 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); - probe_cst_limit(); + probe_cstates(); + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); -- cgit From ce7ddf8af2f96a8a7af4cc2273843518c5810166 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:27 +0800 Subject: tools/power/turbostat: Adjust cstate for models with .has_nhm_msrs set Enable CC1/CC3/CC6 for platforms with .has_nhm_msrs set. 
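To make the step easier to review in isolation, a hypothetical sketch of the invariant it relies on: every toy entry with .has_nhm_msrs set also lists CC1 | CC3 | CC6, so the table-driven probe_cstates() selects the same core-cstate columns the removed has_nhm_msrs branch did:

    #include <assert.h>
    #include <stdio.h>

    #define BIT(x) (1U << (x))
    enum { CC1 = BIT(0), CC3 = BIT(1), CC6 = BIT(2) };

    struct feat_sketch {
        int has_nhm_msrs;
        unsigned int supported_cstates;
    };

    static const struct feat_sketch table_sketch[] = {
        { 1, CC1 | CC3 | CC6 },    /* nhm-like entry */
        { 1, CC1 | CC3 | CC6 },    /* slv-like entry */
        { 0, 0 },                  /* entry without Nehalem-class MSRs */
    };

    int main(void)
    {
        unsigned int i;

        for (i = 0; i < sizeof(table_sketch) / sizeof(table_sketch[0]); i++)
            if (table_sketch[i].has_nhm_msrs)
                assert((table_sketch[i].supported_cstates & (CC1 | CC3 | CC6)) == (CC1 | CC3 | CC6));

        printf("every has_nhm_msrs entry still advertises CC1/CC3/CC6\n");
        return 0;
    }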
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6a49eb941fe0..c7345e0c5185 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,6 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -424,6 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, }; @@ -432,6 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -442,6 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -453,6 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -463,6 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -474,6 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -486,6 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -499,6 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -511,6 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -523,6 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -534,6 +545,7 @@ static const struct platform_features 
bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,6 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -559,6 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -572,6 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,6 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -598,6 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -610,6 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -618,6 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -629,6 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -638,6 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -647,6 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -657,6 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -667,6 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, 
.trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -676,6 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -686,6 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -696,6 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5797,12 +5824,8 @@ void process_cpuid() probe_cstates(); - if (platform->has_nhm_msrs) { - BIC_PRESENT(BIC_CPU_c1); - BIC_PRESENT(BIC_CPU_c3); - BIC_PRESENT(BIC_CPU_c6); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - } probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); -- cgit From 942c854d8d0f6c6fc0864a9da5f5e374a8e146e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:56 +0800 Subject: tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable CC7 and PC2 for has_snb_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c7345e0c5185..174a8d0750da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, 
.cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - 
.supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5829,12 +5829,7 @@ void process_cpuid() probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); - if (do_snb_cstates) - BIC_PRESENT(BIC_CPU_c7); - do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); if (pkg_cstate_limit >= PCL__3) BIC_PRESENT(BIC_Pkgpc3); if (pkg_cstate_limit >= PCL__6) -- cgit From 6f1935c036f79b56b6a1dc6e51c8c6fe483983ec Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:23 +0800 Subject: tools/power/turbostat: Adjust cstate for models with .cst_limit set Enable PC3/PC6 for platforms with .cst_limit set because package cstates are guarded by pkg_cstate_limit. 
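A hedged sketch of the guard the sentence above describes: a package cstate column is shown only when the platform lists the state in .supported_cstates and the package cstate limit is deep enough. The limit values below are simple stand-ins for turbostat's PCL_* indices:

    #include <stdio.h>

    #define BIT(x) (1U << (x))
    enum { PC3 = BIT(5), PC6 = BIT(6) };

    /* stand-ins for the PCL__3/PCL__6 ordering used by turbostat */
    enum { LIMIT_PC3 = 3, LIMIT_PC6 = 6 };

    static int show_pkg_cstate(unsigned int supported, unsigned int flag,
                               int pkg_cstate_limit, int needed_limit)
    {
        return (supported & flag) && (pkg_cstate_limit >= needed_limit);
    }

    int main(void)
    {
        unsigned int supported = PC3 | PC6;    /* hypothetical platform entry */
        int pkg_cstate_limit = LIMIT_PC3;      /* firmware caps the package at PC3 */

        printf("Pkg%%pc3 column: %s\n",
               show_pkg_cstate(supported, PC3, pkg_cstate_limit, LIMIT_PC3) ? "shown" : "hidden");
        printf("Pkg%%pc6 column: %s\n",
               show_pkg_cstate(supported, PC6, pkg_cstate_limit, LIMIT_PC6) ? "shown" : "hidden");
        return 0;
    }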
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 174a8d0750da..2bfbf4ccf5ac 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,7 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -425,7 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, }; @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, 
@@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -636,7 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ 
-648,7 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -658,7 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -722,7 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5830,10 +5830,6 @@ void process_cpuid() do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (pkg_cstate_limit >= PCL__3) - BIC_PRESENT(BIC_Pkgpc3); - if (pkg_cstate_limit >= PCL__6) - BIC_PRESENT(BIC_Pkgpc6); if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { -- cgit From 192cbf0468ae31062526287e257f5b56214d2da5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:58 +0800 Subject: tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable PC7 for has_snb_msrs() models. 
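The hunks above fold the old pkg_cstate_limit-based BIC_PRESENT() calls into per-platform supported_cstates bits, and the patch below does the same for PC7. A minimal sketch of the idea, reusing turbostat.c's existing BIC_PRESENT() macros and global platform pointer; neither the bit values nor the probe function below are taken from these patches, they only illustrate how such a bitmask is meant to be consumed:

	/* Assumed bit layout: one flag per core/package C-state, OR-ed into supported_cstates. */
	#define CC1	(1 << 0)
	#define CC3	(1 << 1)
	#define CC6	(1 << 2)
	#define CC7	(1 << 3)
	#define PC2	(1 << 4)
	#define PC3	(1 << 5)
	#define PC6	(1 << 6)
	#define PC7	(1 << 7)

	/* Hypothetical consumer: declare a column only if the platform table says the state exists. */
	static void probe_pkg_cstate_columns(void)
	{
		if (platform->supported_cstates & PC3)
			BIC_PRESENT(BIC_Pkgpc3);
		if (platform->supported_cstates & PC6)
			BIC_PRESENT(BIC_Pkgpc6);
		if (platform->supported_cstates & PC7)
			BIC_PRESENT(BIC_Pkgpc7);
	}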
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 46 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2bfbf4ccf5ac..f3d44e81d7d5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_snb_cstates; unsigned int do_knl_cstates; unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; @@ -434,7 +433,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +444,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +456,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +467,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +479,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +492,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +519,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 
| PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +544,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | 
RAPL_PKG_POWER_INFO, @@ -679,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5827,11 +5826,8 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); probe_bclk(); - do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { BIC_NOT_PRESENT(BIC_Pkgpc2); BIC_NOT_PRESENT(BIC_Pkgpc3); -- cgit From ff206149551f09117f44883650a45ae692745703 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:07:08 +0800 Subject: tools/power/turbostat: Adjust cstate for has_slv_msrs() models Disable PC2/PC3/PC7 and enable PC6 for has_slv_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3d44e81d7d5..972f5a9b14e6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -5829,10 +5829,6 @@ void process_cpuid() do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } -- cgit From 3d982ac0dafeab860b9d42f5cc41a78753275fdd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:09:20 +0800 Subject: tools/power/turbostat: Adjust cstate for is_jvl() models Disable CC3/CC7/PC2/PC3/PC6/PC7 for is_jvl() models. Delete is_jvl() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 972f5a9b14e6..e95972edde3c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -710,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -4361,21 +4361,6 @@ int is_ehl(unsigned int family, unsigned int model) return 0; } -int is_jvl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5832,14 +5817,6 @@ void process_cpuid() BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } - if (is_jvl(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_dnv(family, model)) { BIC_PRESENT(BIC_CPU_c1); BIC_NOT_PRESENT(BIC_CPU_c3); -- cgit From 8e20ced057423f0edaf40b650facc221e8030b33 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:12:29 +0800 Subject: tools/power/turbostat: Adjust cstate for is_dnv() models Enable CC1 and disable CC3/CC7/PC3/PC7 for is_dnv() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e95972edde3c..069704ef3c80 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -678,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -5818,11 +5818,6 @@ void process_cpuid() use_c1_residency_msr = 1; } if (is_dnv(family, model)) { - BIC_PRESENT(BIC_CPU_c1); - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { -- cgit From 24d16bec379db6eea2d72e18c97c6c80e486a5e1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:27 +0800 Subject: tools/power/turbostat: Adjust cstate for is_skx()/is_icx()/is_spr() models Disable CC3/CC7/PC3/PC7 for is_skx()/is_icx()/is_spr() models. Delete is_skx() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 069704ef3c80..262af40fe35d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -599,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -613,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -626,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4298,22 +4298,6 @@ int is_bdx(unsigned int family, unsigned int model) return 0; } -int is_skx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5820,12 +5804,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_bdx(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c7); BIC_NOT_PRESENT(BIC_Pkgpc7); -- cgit From 1109694817fb4fcdfccb1a86fd58e69fb60f4eab Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:16:52 +0800 Subject: tools/power/turbostat: Adjust cstate for is_bdx() models Disable CC7/PC7 for is_bdx() models. Delete is_bdx() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 262af40fe35d..27ca29f0545a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -556,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -4282,22 +4282,6 @@ int is_dnv(unsigned int family, unsigned int model) return 0; } -int is_bdx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5804,10 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_bdx(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (has_c8910_msrs(family, model)) { if (pkg_cstate_limit >= PCL__8) BIC_PRESENT(BIC_Pkgpc8); -- cgit From 4d2c95d40a90877ffd8f961055419f1f550a7ed9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:20:02 +0800 Subject: tools/power/turbostat: Adjust cstate for has_c8910_msrs() models Enable PC8/PC9/PC10 for has_c8910_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 27ca29f0545a..b0bc973c077d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -506,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -532,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -571,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = 
TRL_BASE, .tcc_offset_bits = 6, @@ -667,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -689,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5788,14 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (has_c8910_msrs(family, model)) { - if (pkg_cstate_limit >= PCL__8) - BIC_PRESENT(BIC_Pkgpc8); - if (pkg_cstate_limit >= PCL__9) - BIC_PRESENT(BIC_Pkgpc9); - if (pkg_cstate_limit >= PCL_10) - BIC_PRESENT(BIC_Pkgpc10); - } do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); -- cgit From cd7a2b6a61100a0fe8a40916d1afbd72d8833d57 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:24:19 +0800 Subject: tools/power/turbostat: Adjust cstate for is_slm()/is_knl()/is_cnl()/is_ehl() models Disable CC3 for is_slm()/is_knl()/is_cnl()/is_ehl() models. Delete is_cnl()/is_ehl() CPU model checks. 
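The shape of this whole group of patches is visible in the hunk that follows: a check keyed off a list of CPU models becomes a check keyed off the platform description. Roughly, where the "after" form is only a sketch (the centralized consumer is not part of these diffs):

	/* before: per-model special case, removed in the hunk below */
	if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model))
		BIC_NOT_PRESENT(BIC_CPU_c3);

	/* after (sketch): one declarative test against the platform table */
	if (!(platform->supported_cstates & CC3))
		BIC_NOT_PRESENT(BIC_CPU_c3);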
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 ++++------------------------------- 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b0bc973c077d..24f470883ca2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC6, + .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -647,7 +647,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -721,7 +721,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4314,21 +4314,6 @@ int is_spr(unsigned int family, unsigned int model) return 0; } -int is_ehl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5248,22 +5233,6 @@ int is_knl(unsigned int family, unsigned int model) return 0; } -int is_cnl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - - return 0; -} - unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) { if (is_knl(family, model)) @@ -5798,9 +5767,6 @@ void process_cpuid() do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model)) - BIC_NOT_PRESENT(BIC_CPU_c3); - if (!quiet) decode_misc_pwr_mgmt_msr(); -- cgit From 8c382f9e74663072805565036026d4e79de96425 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:27:26 +0800 Subject: tools/power/turbostat: Use fine grained IRTL output It is pointless to dump the IRTL register for a package 
cstate that is not supported by the platform. Print IRTL only for states that are available in platform->supported_cstates. Delete has_c8910_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 +++++++++++++---------------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 24f470883ca2..680373010b01 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -229,7 +229,6 @@ unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; unsigned int do_irtl_snb; -unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -3269,39 +3268,47 @@ void print_irtl(void) { unsigned long long msr; - get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC3) { + get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - if (!do_irtl_hsw) - return; + if (platform->supported_cstates & PC6) { + get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC7) { + get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC8) { + get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? 
"" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC9) { + get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } + if (platform->supported_cstates & PC10) { + get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } } void free_fd_percpu(void) @@ -5145,39 +5152,6 @@ int has_snb_msrs(unsigned int family, unsigned int model) return 0; } -/* - * HSW ULT added support for C8/C9/C10 MSRs: - * - * MSR_PKG_C8_RESIDENCY 0x00000630 - * MSR_PKG_C9_RESIDENCY 0x00000631 - * MSR_PKG_C10_RESIDENCY 0x00000632 - * - * MSR_PKGC8_IRTL 0x00000633 - * MSR_PKGC9_IRTL 0x00000634 - * MSR_PKGC10_IRTL 0x00000635 - * - */ -int has_c8910_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5757,7 +5731,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); -- cgit From 148df4fd04a98fb24198ecb4419c87e07d38af30 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:51 +0800 Subject: tools/power/turbostat: Abstract IRTL support Abstract the support for MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL. Delete has_snb_msrs() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 72 +++++++++++++---------------------- 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 680373010b01..44be06b763b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -228,7 +228,6 @@ unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; -unsigned int do_irtl_snb; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -284,6 +283,7 @@ struct platform_features { int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ + bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -434,6 +434,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -445,6 +446,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -457,6 +459,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -468,6 +471,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -480,6 +484,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -493,6 +498,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -507,6 +513,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -520,6 +527,7 @@ static const struct 
platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -533,6 +541,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -545,6 +554,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -557,6 +567,7 @@ static const struct platform_features bdx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -572,6 +583,7 @@ static const struct platform_features skl_features = { .crystal_freq = 24000000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -586,6 +598,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -600,6 +613,7 @@ static const struct platform_features skx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -614,6 +628,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -627,6 +642,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -668,6 +684,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -679,6 +696,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -690,6 +708,7 @@ static 
const struct platform_features gmtp_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -700,6 +719,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, .enable_tsc_tweak = 1, @@ -711,6 +731,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, }; @@ -3268,6 +3289,9 @@ void print_irtl(void) { unsigned long long msr; + if (!platform->has_irtl_msrs) + return; + if (platform->supported_cstates & PC3) { get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); @@ -5109,49 +5133,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SNB adds support for additional MSRs: - * - * MSR_PKG_C7_RESIDENCY 0x000003fa - * MSR_CORE_C7_RESIDENCY 0x000003fe - * MSR_PKG_C2_RESIDENCY 0x0000060d - */ - -int has_snb_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5723,7 +5704,6 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; @@ -6098,7 +6078,7 @@ void turbostat_init() if (!quiet) for_all_cpus(print_thermal, ODD_COUNTERS); - if (!quiet && do_irtl_snb) + if (!quiet) print_irtl(); if (DO_BIC(BIC_IPC)) -- cgit From 76d83d2ae8e3099d9a6bd67fba918108824d7d4d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:41:30 +0800 Subject: tools/power/turbostat: Abstract MSR_CORE_C1_RES support Abstract the support for MSR_CORE_C1_RES. Delete is_dnv() CPU model check. 
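When no dedicated MSR_CORE_C1_RES exists, turbostat derives C1 from other counters: whatever part of the TSC interval is not covered by non-halted cycles (MPERF) or by the deeper core C-states is attributed to C1. A simplified, hedged sketch of that fallback; the real delta_thread() additionally applies the tsc_tweak scaling, and the function below is illustrative only:

	/* Sketch: estimate C1 residency for one measurement interval from counter deltas. */
	static unsigned long long derive_c1(unsigned long long tsc_delta, unsigned long long mperf_delta,
					    unsigned long long c3_delta, unsigned long long c6_delta,
					    unsigned long long c7_delta)
	{
		unsigned long long accounted = mperf_delta + c3_delta + c6_delta + c7_delta;

		/* Counters are not snapshotted atomically, so clamp instead of underflowing. */
		if (accounted > tsc_delta)
			return 0;

		return tsc_delta - accounted;
	}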
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44be06b763b2..de9260c96678 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -223,7 +223,6 @@ unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; unsigned int do_slm_cstates; -unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -284,6 +283,7 @@ struct platform_features { int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ + bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -652,6 +652,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -697,6 +698,7 @@ static const struct platform_features gmtd_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -2069,7 +2071,7 @@ void delta_core(struct core_data *new, struct core_data *old) int soft_c1_residency_display(int bic) { - if (!DO_BIC(BIC_CPU_c1) || use_c1_residency_msr) + if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res) return 0; return DO_BIC_READ(bic); @@ -2118,7 +2120,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d } } - if (use_c1_residency_msr) { + if (platform->has_msr_core_c1_res) { /* * Some models have a dedicated C1 residency MSR, * which should be more accurate than the derivation below. @@ -2700,7 +2702,7 @@ retry: return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && use_c1_residency_msr) { + if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) return -6; } @@ -4297,22 +4299,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) return 0; } -int is_dnv(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT_D: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5706,10 +5692,6 @@ void process_cpuid() if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); - use_c1_residency_msr = 1; - } - if (is_dnv(family, model)) { - use_c1_residency_msr = 1; } if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); -- cgit From 9cc1c1038526a5b6c9a57397def80ba79c260ff2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:47:30 +0800 Subject: tools/power/turbostat: Abstract MSR_MODULE_C6_RES_MS support Abstract MSR_MODULE_C6_RES_MS support. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index de9260c96678..5f90d96bd9e3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -284,6 +284,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ + bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -653,6 +654,7 @@ static const struct platform_features slv_features = { .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -5690,9 +5692,9 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (has_slv_msrs(family, model)) { + if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - } + if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); -- cgit From 6c36882e09dbc9a44d64180ba7972838d3f45488 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:52:24 +0800 Subject: tools/power/turbostat: Abstract MSR_CC6/MC6_DEMOTION_POLICY_CONFIG support Abstract the support for MSR_CC6/MC6_DEMOTION_POLICY_CONFIG. Delete has_slv_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f90d96bd9e3..f8d7ba87f968 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -285,6 +285,7 @@ struct platform_features { bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ + bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -655,6 +656,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, + .has_msr_c6_demotion_policy_config = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -4279,28 +4281,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -/* - * SLV client has support for unique MSRs: - * - * MSR_CC6_DEMOTION_POLICY_CONFIG - * MSR_MC6_DEMOTION_POLICY_CONFIG - */ - -int has_slv_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5358,6 +5338,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if 
(!platform->has_msr_c6_demotion_policy_config) + return; + if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr)) fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); @@ -5707,7 +5690,7 @@ void process_cpuid() if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet && has_slv_msrs(family, model)) + if (!quiet) decode_c6_demotion_policy_msr(); rapl_probe(); -- cgit From c8202a6c3acf7bbde42b5e389eec40fd8e1b8358 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:26:14 +0800 Subject: tools/power/turbostat: Abstract MSR_ATOM_PKG_C6_RESIDENCY support Abstract the support for MSR_ATOM_PKG_C6_RESIDENCY. Delete is_slm() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f8d7ba87f968..a04861846d33 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -222,7 +222,6 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; -unsigned int do_slm_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ + bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -657,6 +657,7 @@ static const struct platform_features slv_features = { .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, .has_msr_c6_demotion_policy_config = 1, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -669,6 +670,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, .rapl_quirk_tdp = 30, @@ -2795,7 +2797,7 @@ retry: if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) return -9; if (DO_BIC(BIC_Pkgpc6)) { - if (do_slm_cstates) { + if (platform->has_msr_atom_pkg_c6_residency) { if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) return -10; } else { @@ -5125,22 +5127,6 @@ int has_skl_msrs(unsigned int family, unsigned int model) return 0; } -int is_slm(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5684,7 +5670,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); if (!quiet) -- cgit From 80d132cb45f2cc171395bfaacd74567a183ab160 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:00:58 +0800 Subject: tools/power/turbostat: Abstract 
MSR_KNL_CORE_C6_RESIDENCY support Abstract the support for MSR_KNL_CORE_C6_RESIDENCY. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a04861846d33..f0a99e092fa7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_knl_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ + bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -751,6 +751,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, + .has_msr_knl_core_c6_residency = 1, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -2727,10 +2728,10 @@ retry: return -6; } - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { + if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { + } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } @@ -5670,7 +5671,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_knl_cstates = is_knl(family, model); if (!quiet) decode_misc_pwr_mgmt_msr(); -- cgit From 58ddb691d8d8a281535406766da4e23e0f011126 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:07:43 +0800 Subject: tools/power/turbostat: Abstract extended cstate MSRs support Abstract the support for MSR_PKG_WEIGHTED_CORE_C0_RES, MSR_PKG_ANY_CORE_C0_RES, MSR_PKG_ANY_GFXE_C0_RES and MSR_PKG_BOTH_CORE_GFXE_C0_RES. Delete has_skl_msrs() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f0a99e092fa7..613b284f2f09 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ + bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -586,6 +587,7 @@ static const struct platform_features skl_features = { .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -601,6 +603,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5104,30 +5107,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SKL adds support for additional MSRS: - * - * MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 - * MSR_PKG_ANY_CORE_C0_RES 0x00000659 - * MSR_PKG_ANY_GFXE_C0_RES 0x0000065A - * MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B - */ -int has_skl_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5665,7 +5644,7 @@ void process_cpuid() if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (has_skl_msrs(family, model)) { + if (platform->has_ext_cst_msrs) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); -- cgit From ed43247b15a4fb4df01c3408fcab6e206f93ab87 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:49:10 +0800 Subject: tools/power/turbostat: Abstract aperf/mperf multiplier support Abstract aperf/mperf multiplier support. Delete is_knl() CPU model check. 
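As a self-contained illustration of what this buys: the 1024 multiplier that the deleted is_knl() helper used to select becomes a single data-driven ternary. The values below are the ones from the patch; the surrounding program is just a harness:

    #include <stdbool.h>
    #include <stdio.h>

    struct platform_features {
        bool need_perf_multiplier;  /* mperf/aperf multiplier */
    };

    static const struct platform_features knl_features = { .need_perf_multiplier = 1 };
    static const struct platform_features default_features = { 0 };

    int main(void)
    {
        const struct platform_features *platform = &knl_features;
        unsigned int aperf_mperf_multiplier;

        /* was: aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model) */
        aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
        printf("KNL-style platform: multiplier = %u\n", aperf_mperf_multiplier);

        platform = &default_features;
        aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
        printf("other platforms:    multiplier = %u\n", aperf_mperf_multiplier);
        return 0;
    }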
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 613b284f2f09..65a507d82fc4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -296,6 +296,7 @@ struct platform_features { int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ + bool need_perf_multiplier; /* mperf/aperf multiplier */ }; struct platform_data { @@ -758,6 +759,7 @@ static const struct platform_features knl_features = { .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, + .need_perf_multiplier = 1, }; static const struct platform_features default_features = { @@ -5107,28 +5109,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int is_knl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - return 1; - } - return 0; -} - -unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) -{ - if (is_knl(family, model)) - return 1024; - return 1; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5630,7 +5610,7 @@ void process_cpuid() } if (has_aperf) - aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model); + aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1; BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); -- cgit From 7d0ebe6f7eaf8c0ac069ddab1fe3793401139fb3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:57:44 +0800 Subject: tools/power/turbostat: Abstract cstate prewake bit support Abstract cstate prewake bit support. Delete is_icx()/is_spr() CPU model checks. 
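The bit being abstracted is bit 30 of MSR_IA32_POWER_CTL. A small sketch of decoding it from a raw value, mirroring the dump code in the hunk below; the sample MSR values are invented for illustration:

    #include <stdio.h>

    /* bit 30 of MSR_IA32_POWER_CTL is the C-state pre-wake disable bit */
    static void show_prewake(unsigned long long msr)
    {
        printf("MSR_IA32_POWER_CTL: 0x%08llx (C-state Pre-wake: %sabled)\n",
               msr, msr & 0x40000000 ? "DIS" : "EN");
    }

    int main(void)
    {
        show_prewake(0x00000000ULL);    /* invented value: pre-wake left enabled */
        show_prewake(0x40000000ULL);    /* invented value: pre-wake disabled */
        return 0;
    }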
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 +++-------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 65a507d82fc4..580cc2a3b947 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -252,7 +252,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -287,6 +286,7 @@ struct platform_features { bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ + bool has_cst_prewake_bit; /* Cstate prewake bit in MSR_IA32_POWER_CTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -635,6 +635,7 @@ static const struct platform_features icx_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -649,6 +650,7 @@ static const struct platform_features spr_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -3014,8 +3016,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -void prewake_cstate_probe(unsigned int family, unsigned int model); - static void dump_platform_info(void) { unsigned long long msr; @@ -3036,7 +3036,7 @@ static void dump_platform_info(void) base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ - if (dis_cstate_prewake) + if (platform->has_cst_prewake_bit) fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? 
"DIS" : "EN"); return; @@ -4289,38 +4289,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -int is_icx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ICELAKE_X: - return 1; - } - return 0; -} - -int is_spr(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -4910,12 +4878,6 @@ void rapl_probe(void) rapl_probe_amd(); } -void prewake_cstate_probe(unsigned int family, unsigned int model) -{ - if (is_icx(family, model) || is_spr(family, model)) - dis_cstate_prewake = 1; -} - int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5638,7 +5600,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(); - prewake_cstate_probe(family, model); if (!quiet) dump_cstate_pstate_config_info(); -- cgit From d085b3b0f11af7d23601b832e0d9e446d18df968 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:53:21 +0800 Subject: tools/power/turbostat: Delete intel_model_duplicates() Now CPU model checks have been cleaned up, no code depends on the duplicated CPU model value. Delete intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 580cc2a3b947..89f53e1ac63a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5258,63 +5258,6 @@ void decode_c6_demotion_policy_msr(void) base_cpu, msr, msr & (1 << 0) ? 
"EN" : "DIS"); } -/* - * When models are the same, for the purpose of turbostat, reuse - */ -unsigned int intel_model_duplicates(unsigned int model) -{ - - switch (model) { - case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ - case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ - case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ - return INTEL_FAM6_NEHALEM; - - case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ - return INTEL_FAM6_NEHALEM_EX; - - case INTEL_FAM6_XEON_PHI_KNM: - return INTEL_FAM6_XEON_PHI_KNL; - - case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ - return INTEL_FAM6_BROADWELL_X; - - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_COMETLAKE_L: - case INTEL_FAM6_COMETLAKE: - return INTEL_FAM6_SKYLAKE_L; - - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE_NNPI: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_ROCKETLAKE: - case INTEL_FAM6_LAKEFIELD: - case INTEL_FAM6_ALDERLAKE: - case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ATOM_GRACEMONT: - case INTEL_FAM6_RAPTORLAKE: - case INTEL_FAM6_RAPTORLAKE_P: - case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: - return INTEL_FAM6_CANNONLAKE_L; - - case INTEL_FAM6_ATOM_TREMONT_L: - return INTEL_FAM6_ATOM_TREMONT; - - case INTEL_FAM6_ICELAKE_D: - return INTEL_FAM6_ICELAKE_X; - - case INTEL_FAM6_EMERALDRAPIDS_X: - return INTEL_FAM6_SAPPHIRERAPIDS_X; - } - return model; -} - void print_dev_latency(void) { char *path = "/dev/cpu_dma_latency"; @@ -5457,8 +5400,6 @@ void process_cpuid() } probe_platform_features(family, model); - if (genuine_intel) - model = intel_model_duplicates(model); if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); -- cgit From 32e8c6169af7ef7c938b1bd996d27ab171e27d80 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:44:06 +0800 Subject: tools/power/turbostat: Improve probe_platform_features() logic AMD/Hygon platforms that don't have RAPL use 'amd_features' to describe the platform features. Unknown Intel platforms use 'default_features' to describe the platform features. As none of the platform feature is set for 'amd_features' or 'default_features', there is no need to maintain both of them. Remove 'amd_features' structure and improve the logic in probe_platform_features(). 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 89f53e1ac63a..102ba515cf4b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -767,9 +767,6 @@ static const struct platform_features knl_features = { static const struct platform_features default_features = { }; -static const struct platform_features amd_features = { -}; - static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, @@ -849,9 +846,9 @@ void probe_platform_features(unsigned int family, unsigned int model) { int i; - if (authentic_amd || hygon_genuine) { - platform = &amd_features; + platform = &default_features; + if (authentic_amd || hygon_genuine) { if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -863,8 +860,6 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - platform = &default_features; - if (!genuine_intel || family != 6) return; -- cgit From 045acf6064c5567011163e97c28906e0a7791414 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 29 Aug 2023 13:47:58 +0800 Subject: tools/power/turbostat: Relocate cstate probing code Move all cstate probing related code into probe_cstates(). Note that dump_platform_info() actually dumps both MSR_PLATFORM_INFO and MSR_IA32_POWER_CTL. MSR_PLATFORM_INFO is for pstate and MSR_IA32_POWER_CTL is for cstate. So split dump_platform_info() and dump MSR_IA32_POWER_CTL in probe_cstates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 50 ++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 102ba515cf4b..e5ca1586961e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -272,7 +272,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ - bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ @@ -3025,6 +3025,14 @@ static void dump_platform_info(void) ratio = (msr >> 8) & 0xFF; fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); +} + +static void dump_power_ctl(void) +{ + unsigned long long msr; + + if (!platform->has_nhm_msrs) + return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", @@ -3229,6 +3237,9 @@ static void dump_cst_cfg(void) { unsigned long long msr; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr); @@ -4332,7 +4343,6 @@ static void dump_cstate_pstate_config_info(void) dump_platform_info(); 
dump_turbo_ratio_info(); - dump_cst_cfg(); } static int read_sysfs_int(char *path) @@ -5332,6 +5342,25 @@ void probe_cstates(void) if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) BIC_PRESENT(BIC_Pkgpc10); + + if (platform->has_msr_module_c6_res_ms) + BIC_PRESENT(BIC_Mod_c6); + + if (platform->has_ext_cst_msrs) { + BIC_PRESENT(BIC_Totl_c0); + BIC_PRESENT(BIC_Any_c0); + BIC_PRESENT(BIC_GFX_c0); + BIC_PRESENT(BIC_CPUGFX); + } + + if (quiet) + return; + + dump_power_ctl(); + dump_cst_cfg(); + decode_c6_demotion_policy_msr(); + print_dev_latency(); + dump_sysfs_cstate_config(); } void process_cpuid() @@ -5519,32 +5548,15 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (platform->has_msr_module_c6_res_ms) - BIC_PRESENT(BIC_Mod_c6); - - if (platform->has_ext_cst_msrs) { - BIC_PRESENT(BIC_Totl_c0); - BIC_PRESENT(BIC_Any_c0); - BIC_PRESENT(BIC_GFX_c0); - BIC_PRESENT(BIC_CPUGFX); - } - if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet) - decode_c6_demotion_policy_msr(); - rapl_probe(); if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - print_dev_latency(); - if (!quiet) - dump_sysfs_cstate_config(); if (!quiet) dump_sysfs_pstate_config(); -- cgit From 11cd9a09f3e827605cf8fc7c343ddb8c9f4ee95d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:58:10 +0800 Subject: tools/power/turbostat: Relocate pstate probing code Introduce probe_pstates() and move all pstate probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 +++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5ca1586961e..1bcaee6d0ead 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3016,6 +3016,9 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); @@ -4313,6 +4316,9 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; + if (!platform->has_nhm_msrs) + return; + if (platform->trl_msrs & TRL_LIMIT2) dump_turbo_ratio_limit2(); @@ -4336,15 +4342,6 @@ static void dump_turbo_ratio_info(void) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(void) -{ - if (!platform->has_nhm_msrs) - return; - - dump_platform_info(); - dump_turbo_ratio_info(); -} - static int read_sysfs_int(char *path) { FILE *input; @@ -5363,6 +5360,19 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_pstates(void) +{ + probe_bclk(); + + if (quiet) + return; + + dump_platform_info(); + dump_turbo_ratio_info(); + dump_sysfs_pstate_config(); + decode_misc_pwr_mgmt_msr(); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5542,24 +5552,17 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_pstates(); + probe_cstates(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - probe_bclk(); - - if (!quiet) - decode_misc_pwr_mgmt_msr(); rapl_probe(); - if (!quiet) - dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - dump_sysfs_pstate_config(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 622c8f23556266a5afdd657aa0518b7d70d5dfc7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 
Aug 2023 16:00:15 +0800 Subject: tools/power/turbostat: Rename uncore probing function Rename intel_uncore_frequency_probe() to probe_intel_uncore_frequency() to be consistent with other probing function names. Probe uncore frequency right after probing cstates. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1bcaee6d0ead..a956a30d81de 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4378,7 +4378,7 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void intel_uncore_frequency_probe(void) +static void probe_intel_uncore_frequency(void) { int i, j; char path[128]; @@ -5556,13 +5556,13 @@ void process_cpuid() probe_cstates(); + probe_intel_uncore_frequency(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); rapl_probe(); - intel_uncore_frequency_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 6cb13609a07ba466b4613fdce7da8c98508069b7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:00:33 +0800 Subject: tools/power/turbostat: Rename rapl probing function Rename rapl_probe() to probe_rapl() to be consistent with other probing function names. Probe rapl after probing uncore frequency. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a956a30d81de..bf2b1d1b2627 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4865,11 +4865,11 @@ void rapl_probe_amd(void) } /* - * rapl_probe() + * probe_rapl() * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(void) +void probe_rapl(void) { if (!platform->rapl_msrs) return; @@ -5558,11 +5558,11 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_rapl(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - rapl_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 2538d1673d02f66f6bdf01eebf36e271228778e9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:12 +0800 Subject: tools/power/turbostat: Relocate graphics probing code Introduce probe_graphics(), and move all graphics probing related code into it. 
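The probe itself is nothing more than access() checks on sysfs nodes. A tiny runnable sketch of the same presence-test pattern, using the paths and fallbacks from the hunk below:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
            printf("BIC_GFX_rc6 would be enabled\n");

        if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) ||
            !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
            printf("BIC_GFXMHz would be enabled\n");

        if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) ||
            !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
            printf("BIC_GFXACTMHz would be enabled\n");

        return 0;
    }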
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bf2b1d1b2627..feff9ecff368 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4424,6 +4424,20 @@ static void probe_intel_uncore_frequency(void) } } +static void probe_graphics(void) +{ + if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) + BIC_PRESENT(BIC_GFX_rc6); + + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXMHz); + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXACTMHz); +} + static void dump_sysfs_cstate_config(void) { char path[64]; @@ -5558,22 +5572,13 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_graphics(); + probe_rapl(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); - - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); - - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXACTMHz); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) BIC_PRESENT(BIC_CPU_LPI); else -- cgit From e7d7b82de192464b733fb2bcc9e305ea6f6ea47e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:36 +0800 Subject: tools/power/turbostat: Relocate lpi probing code Introduce probe_lpi(), and move all lpi probing related code into it. 
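For readers skimming the relocation, a standalone sketch of the sysfs-then-debugfs fallback that moves into probe_lpi(); the path strings below are placeholders, since the real sys_lpi_file_* strings are defined elsewhere in turbostat:

    #include <stdio.h>
    #include <unistd.h>

    /* placeholders: the real sys_lpi_file_* paths are defined elsewhere in turbostat */
    static char *sys_lpi_file_sysfs = "/sys/placeholder/system_lpi_residency_us";
    static char *sys_lpi_file_debugfs = "/sys/kernel/debug/placeholder/system_lpi_residency_us";
    static char *sys_lpi_file;

    int main(void)
    {
        if (!access(sys_lpi_file_sysfs, R_OK)) {
            sys_lpi_file = sys_lpi_file_sysfs;
            printf("BIC_SYS_LPI via sysfs: %s\n", sys_lpi_file);
        } else if (!access(sys_lpi_file_debugfs, R_OK)) {
            sys_lpi_file = sys_lpi_file_debugfs;
            printf("BIC_SYS_LPI via debugfs: %s\n", sys_lpi_file);
        } else {
            printf("BIC_SYS_LPI not present\n");
        }
        return 0;
    }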
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feff9ecff368..ad9147757d5a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5374,6 +5374,26 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_lpi(void) +{ + if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) + BIC_PRESENT(BIC_CPU_LPI); + else + BIC_NOT_PRESENT(BIC_CPU_LPI); + + if (!access(sys_lpi_file_sysfs, R_OK)) { + sys_lpi_file = sys_lpi_file_sysfs; + BIC_PRESENT(BIC_SYS_LPI); + } else if (!access(sys_lpi_file_debugfs, R_OK)) { + sys_lpi_file = sys_lpi_file_debugfs; + BIC_PRESENT(BIC_SYS_LPI); + } else { + sys_lpi_file_sysfs = NULL; + BIC_NOT_PRESENT(BIC_SYS_LPI); + } + +} + void probe_pstates(void) { probe_bclk(); @@ -5570,6 +5590,8 @@ void process_cpuid() probe_cstates(); + probe_lpi(); + probe_intel_uncore_frequency(); probe_graphics(); @@ -5579,27 +5601,11 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) - BIC_PRESENT(BIC_CPU_LPI); - else - BIC_NOT_PRESENT(BIC_CPU_LPI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!access(sys_lpi_file_sysfs, R_OK)) { - sys_lpi_file = sys_lpi_file_sysfs; - BIC_PRESENT(BIC_SYS_LPI); - } else if (!access(sys_lpi_file_debugfs, R_OK)) { - sys_lpi_file = sys_lpi_file_debugfs; - BIC_PRESENT(BIC_SYS_LPI); - } else { - sys_lpi_file_sysfs = NULL; - BIC_NOT_PRESENT(BIC_SYS_LPI); - } - if (!quiet) decode_misc_feature_control(); -- cgit From db735f8ba78bf7692579b78f0ca1e40563ef79fd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:02:16 +0800 Subject: tools/power/turbostat: Relocate thermal probing code Introduce probe_thermal(), and move all thermal probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ad9147757d5a..8dae576234a4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4894,6 +4894,14 @@ void probe_rapl(void) rapl_probe_amd(); } +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5598,14 +5606,11 @@ void process_cpuid() probe_rapl(); + probe_thermal(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!quiet) decode_misc_feature_control(); -- cgit From ce7a32c2a4cdba06ed5ead4ed8f980c893e20cd2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:36:06 +0800 Subject: tools/power/turbostat: Reorder some functions Reorder some functions to solve code depdency introduced by next patch. 
No functional change. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 214 +++++++++++++++++----------------- 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8dae576234a4..eb333612bdec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4878,92 +4878,6 @@ void rapl_probe_amd(void) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -/* - * probe_rapl() - * - * sets rapl_power_units, rapl_energy_units, rapl_time_units - */ -void probe_rapl(void) -{ - if (!platform->rapl_msrs) - return; - - if (genuine_intel) - rapl_probe_intel(); - if (authentic_amd || hygon_genuine) - rapl_probe_amd(); -} - -void probe_thermal(void) -{ - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); -} - -int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) -{ - unsigned long long msr; - unsigned int dts, dts2; - int cpu; - - UNUSED(c); - UNUSED(p); - - if (!(do_dts || do_ptm)) - return 0; - - cpu = t->cpu_id; - - /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) - return 0; - - if (cpu_migrate(cpu)) { - fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); - return -1; - } - - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); - - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - if (do_dts && debug) { - unsigned int resolution; - - if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - resolution = (msr >> 27) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tj_max - dts, resolution); - - if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - return 0; -} - void print_power_limit_msr(int cpu, unsigned long long msr, char *label) { fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n", @@ -5095,29 +5009,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +/* + * probe_rapl() + * + * sets rapl_power_units, rapl_energy_units, rapl_time_units + */ +void probe_rapl(void) { - unsigned int eax, ebx, ecx, edx; - - UNUSED(c); - UNUSED(p); - - if (!genuine_intel) - return 0; - - if (cpu_migrate(t->cpu_id)) { - fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); - return -1; - } - - if (max_level < 0x1a) - return 0; + if (!platform->rapl_msrs) + return; - __cpuid(0x1a, eax, ebx, ecx, edx); - eax = (eax >> 24) & 0xFF; - if (eax == 0x20) - t->is_atom = true; - return 0; + if (genuine_intel) + rapl_probe_intel(); + if (authentic_amd || hygon_genuine) + 
rapl_probe_amd(); } /* @@ -5200,6 +5105,101 @@ guess: return 0; } +int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int dts, dts2; + int cpu; + + UNUSED(c); + UNUSED(p); + + if (!(do_dts || do_ptm)) + return 0; + + cpu = t->cpu_id; + + /* DTS is per-core, no need to print for each thread */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); + + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + if (do_dts && debug) { + unsigned int resolution; + + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + resolution = (msr >> 27) & 0xF; + fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + cpu, msr, tj_max - dts, resolution); + + if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + return 0; +} + +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + +int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned int eax, ebx, ecx, edx; + + UNUSED(c); + UNUSED(p); + + if (!genuine_intel) + return 0; + + if (cpu_migrate(t->cpu_id)) { + fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); + return -1; + } + + if (max_level < 0x1a) + return 0; + + __cpuid(0x1a, eax, ebx, ecx, edx); + eax = (eax >> 24) & 0xFF; + if (eax == 0x20) + t->is_atom = true; + return 0; +} + void decode_feature_control_msr(void) { unsigned long long msr; -- cgit From 5612b2c89bd0a3a2f7c9e8756e6ead971b03f8a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:54:51 +0800 Subject: tools/power/turbostat: Relocate more probing related code Relocate more feature probing code outside of process_cpuids() into the corresponding probing functions. This improves the readability of code and the turbostat output. 
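Concretely, each probe function now owns both detection and its verbose output, so turbostat_init() no longer has to sprinkle print calls around. A runnable sketch of that shape, with stubs standing in for the real platform checks and for_all_cpus() loop:

    #include <stdbool.h>
    #include <stdio.h>

    static bool quiet;  /* set by command-line parsing in the real tool */

    static int print_rapl_stub(int cpu)
    {
        printf("cpu%d: (verbose RAPL dump would go here)\n", cpu);
        return 0;
    }

    /* shape of a relocated probe: detect first, then dump unless quiet is set */
    static void probe_rapl_shape(void)
    {
        bool rapl_msrs_present = true;  /* stand-in for platform->rapl_msrs */

        if (!rapl_msrs_present)
            return;

        /* ... unit/energy probing would happen here ... */

        if (quiet)
            return;

        for (int cpu = 0; cpu < 4; cpu++)   /* stand-in for for_all_cpus(print_rapl, ...) */
            print_rapl_stub(cpu);
    }

    int main(void)
    {
        probe_rapl_shape();
        return 0;
    }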
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 37 ++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index eb333612bdec..ffeee48e8d85 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5023,6 +5023,11 @@ void probe_rapl(void) rapl_probe_intel(); if (authentic_amd || hygon_genuine) rapl_probe_amd(); + + if (quiet) + return; + + for_all_cpus(print_rapl, ODD_COUNTERS); } /* @@ -5173,6 +5178,13 @@ void probe_thermal(void) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); + + for_all_cpus(set_temperature_target, ODD_COUNTERS); + + if (quiet) + return; + + for_all_cpus(print_thermal, ODD_COUNTERS); } int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -5380,6 +5392,7 @@ void probe_cstates(void) decode_c6_demotion_policy_msr(); print_dev_latency(); dump_sysfs_cstate_config(); + print_irtl(); } void probe_lpi(void) @@ -5413,6 +5426,10 @@ void probe_pstates(void) dump_turbo_ratio_info(); dump_sysfs_pstate_config(); decode_misc_pwr_mgmt_msr(); + + for_all_cpus(print_hwp, ODD_COUNTERS); + for_all_cpus(print_epb, ODD_COUNTERS); + for_all_cpus(print_perf_limit, ODD_COUNTERS); } void process_cpuid() @@ -5897,29 +5914,9 @@ void turbostat_init() process_cpuid(); linux_perf_init(); - if (!quiet) - for_all_cpus(print_hwp, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_epb, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_perf_limit, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_rapl, ODD_COUNTERS); - - for_all_cpus(set_temperature_target, ODD_COUNTERS); - for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (!quiet) - for_all_cpus(print_thermal, ODD_COUNTERS); - - if (!quiet) - print_irtl(); - if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); } -- cgit From 7ee39d8d593e3d28eced0fa1a8c8c6bdcbd4156e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Sep 2023 23:32:39 +0800 Subject: tools/power/turbostat: Introduce probe_pm_features() Feature probe has nothing to do with CPUID, thus it should not be in process_cpuids(). Introduce probe_pm_features() and move all feature probing functions into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ffeee48e8d85..607152b36c1a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5610,7 +5610,10 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); +} +void probe_pm_features(void) +{ probe_pstates(); probe_cstates(); @@ -5630,8 +5633,6 @@ void process_cpuid() if (!quiet) decode_misc_feature_control(); - - return; } /* @@ -5912,6 +5913,7 @@ void turbostat_init() check_dev_msr(); check_permissions(); process_cpuid(); + probe_pm_features(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); -- cgit From 05ad96ff0fb9d1b16abb5022b9c62636c6780fc2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:33:27 +0800 Subject: tools/power/turbostat: Enable MSR_CORE_C1_RES on recent Intel client platforms All recent Intel client platforms have MSR_CORE_C1_RES. Enable the support on these platforms, including CNL/ICL/LKF/RKL/TGL/ADL/RPL/MTL. 
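For context, the residency counters gated by these flags are read through the msr driver, the same interface turbostat's get_msr() sits on. A minimal sketch of such a read; 0x660 is assumed here to be the MSR_CORE_C1_RES address and should be verified against the kernel's MSR headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        unsigned long long c1_res;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/msr (needs the msr module and root)");
            return 1;
        }
        /* pread offset selects the MSR address; 0x660 assumed = MSR_CORE_C1_RES */
        if (pread(fd, &c1_res, sizeof(c1_res), 0x660) != sizeof(c1_res)) {
            perror("pread MSR_CORE_C1_RES");
            close(fd);
            return 1;
        }
        printf("cpu0: MSR_CORE_C1_RES: %llu\n", c1_res);
        close(fd);
        return 0;
    }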
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 607152b36c1a..9895f348b637 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -604,6 +604,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, -- cgit From 6b74a30b767e362eda7deeac52edcd546c5f6d8f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:26:51 +0800 Subject: tools/power/turbostat: Remove PC7/PC9 support on ADL/RPL Compared with other platforms that share cnl_features, ADL/RPL don't have PC7/PC9. Clone a new platform feature set from cnl_features for ADL/RPL, with PC7/PC9 removed. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9895f348b637..a769daa59b12 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -612,6 +612,23 @@ static const struct platform_features cnl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features adl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -812,11 +829,11 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &cnl_features }, - { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &adl_features }, + { INTEL_FAM6_ALDERLAKE_L, &adl_features }, + { INTEL_FAM6_RAPTORLAKE, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, @@ -828,7 +845,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 71cfd1da9f0635ccd7124ad8b67b9aed596be491 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 11:01:17 +0800 Subject: tools/power/turbostat: Introduce cpu_allowed_set 
Turbostat supports "-c" parameter which limits output to system summary plus the specified cpu-set. But some code still uses cpu_present_set to read and dump the counters. Introduce cpu_allowed_set for code that should obey the specified cpu-set. No functional change. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a769daa59b12..d8f44ea5b4bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -1157,6 +1157,11 @@ int cpu_is_not_present(int cpu) return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); } +int cpu_is_not_allowed(int cpu) +{ + return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set); +} + /* * run func(thread, core, package) in topology order * skip non-present cpus @@ -3396,6 +3401,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_allowed_set); + cpu_allowed_set = NULL; + cpu_allowed_setsize = 0; + CPU_FREE(cpu_affinity_set); cpu_affinity_set = NULL; cpu_affinity_setsize = 0; @@ -5697,13 +5706,29 @@ void topology_probe() CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_allowed_set + */ + cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_allowed_set == NULL) + err(3, "CPU_ALLOC"); + cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); + /* * Validate that all cpus in cpu_subset are also in cpu_present_set */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) + if (!cpu_subset) { + if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + continue; + } + if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) err(1, "cpu%d not present", i); + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } /* -- cgit From 4ede6d1ce7acba9cafe7df4e935b174623cd2181 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 13:52:02 +0800 Subject: tools/power/turbostat: Obey allowed CPUs when accessing CPU counters for_all_cpus/for_all_cpus_2 are used for accessing the per CPU counters, and they should follow the cpu_allowed_set instead of cpu_present_set. 
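The allowed-set plumbing is plain glibc dynamic CPU sets. A runnable sketch of the same CPU_*_S macros used by cpu_is_not_allowed() and the for_all_cpus loops; the max CPU number and the "-c 2,5" selection are made up for illustration:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        int max_cpu_num = 7;    /* hypothetical; turbostat derives this from topology */
        cpu_set_t *cpu_allowed_set = CPU_ALLOC(max_cpu_num + 1);
        size_t cpu_allowed_setsize = CPU_ALLOC_SIZE(max_cpu_num + 1);
        int cpu;

        CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);
        CPU_SET_S(2, cpu_allowed_setsize, cpu_allowed_set);    /* e.g. "-c 2,5" */
        CPU_SET_S(5, cpu_allowed_setsize, cpu_allowed_set);

        for (cpu = 0; cpu <= max_cpu_num; cpu++)
            if (CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set))
                printf("cpu%d: counters would be read\n", cpu);
            else
                printf("cpu%d: skipped (not in cpu_allowed_set)\n", cpu);

        CPU_FREE(cpu_allowed_set);
        return 0;
    }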
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d8f44ea5b4bf..202cf5231d7a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1182,7 +1182,7 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; c = GET_CORE(core_base, core_no, node_no, pkg_no); @@ -3618,7 +3618,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); -- cgit From 7bb3fe27ad4f62039b2ac80a2147452a608b474f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 14:25:25 +0800 Subject: tools/power/turbostat: Obey allowed CPUs during startup Set turbostat CPU affinity to make sure turbostat is running on one of the allowed CPUs. Set base_cpu to the first allowed CPU so that some platform information is dumped using one of the allowed CPUs. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 202cf5231d7a..f81ce832d17a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5731,6 +5731,10 @@ void topology_probe() } } + if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) + err(-ENODEV, "No valid cpus found"); + sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set); + /* * Allocate and initialize cpu_affinity_set */ @@ -5941,12 +5945,17 @@ void setup_all_buffers(void) void set_base_cpu(void) { - base_cpu = sched_getcpu(); - if (base_cpu < 0) - err(-ENODEV, "No valid cpus found"); + int i; - if (debug > 1) - fprintf(outf, "base_cpu = %d\n", base_cpu); + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (cpu_is_not_allowed(i)) + continue; + base_cpu = i; + if (debug > 1) + fprintf(outf, "base_cpu = %d\n", base_cpu); + return; + } + err(-ENODEV, "No valid cpus found"); } void turbostat_init() @@ -5976,8 +5985,6 @@ int fork_it(char **argv) first_counter_read = 0; if (status) exit(status); - /* clear affinity side-effect of get_counters() */ - sched_setaffinity(0, cpu_present_setsize, cpu_present_set); gettimeofday(&tv_even, (struct timezone *)NULL); child_pid = fork(); -- cgit From 74318add132365db3026e281ac06836a26cda857 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 15:13:23 +0800 Subject: tools/power/turbostat: Abstract several functions When detecting the primary thread/core in a core/package, current code doesn't handle the allowed CPUs. Abstract several functions for further fix of this issue. No functional change. 
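A compact standalone sketch of the predicate-style helpers this patch introduces: callers ask is_cpu_first_thread_in_core() instead of testing t->flags directly, so the definition of "first" can later change in exactly one place. The types here are pared down from the real ones:

    #include <stdio.h>

    #define CPU_IS_FIRST_THREAD_IN_CORE     0x2
    #define CPU_IS_FIRST_CORE_IN_PACKAGE    0x4

    struct thread_data { unsigned int cpu_id; unsigned int flags; };
    struct core_data { int core_id; };
    struct pkg_data { int package_id; };

    static int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c,
                                           struct pkg_data *p)
    {
        (void)c; (void)p;   /* stands in for turbostat's UNUSED() */
        return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE);
    }

    int main(void)
    {
        struct thread_data t = { .cpu_id = 4, .flags = CPU_IS_FIRST_THREAD_IN_CORE };
        struct core_data c = { .core_id = 2 };
        struct pkg_data p = { .package_id = 0 };

        if (is_cpu_first_thread_in_core(&t, &c, &p))
            printf("cpu%u carries the per-core counters\n", t.cpu_id);
        return 0;
    }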
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f81ce832d17a..5f6bc076e1dd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1198,6 +1198,30 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk return 0; } +int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); +} + +int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + +int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + int cpu_migrate(int cpu) { CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set); @@ -1682,11 +1706,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data int printed = 0; /* if showing only 1st thread in core and this isn't one, bail out */ - if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) return 0; /* if showing only 1st thread in pkg and this isn't one, bail out */ - if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p)) return 0; /*if not summary line and --cpu is used */ @@ -1820,7 +1844,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); /* print per-core data only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3)) @@ -1867,7 +1891,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; /* PkgTmp */ @@ -2202,7 +2226,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, int retval = 0; /* calculate core delta only for 1st thread in core */ - if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) + if (is_cpu_first_thread_in_core(t, c, p)) delta_core(c, c2); /* always calculate thread delta */ @@ -2211,7 +2235,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; /* calculate package delta only for 1st core in package */ - if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE) + if (is_cpu_first_core_in_package(t, c, p)) retval = delta_package(p, p2); return retval; @@ -2325,7 +2349,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-core values only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; average.cores.c3 += c->c3; @@ -2345,7 +2369,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-pkg values only for 1st core in pkg */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) return 0; if (DO_BIC(BIC_Totl_c0)) @@ -2745,7 +2769,7 @@ retry: } /* collect core counters only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { @@ -2800,7 +2824,7 @@ retry: } /* collect package counters only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; if (DO_BIC(BIC_Totl_c0)) { @@ -4581,7 +4605,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* EPB is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4630,7 +4654,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* MSR_HWP_CAPABILITIES is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4713,7 +4737,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data cpu = t->cpu_id; /* per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4930,7 +4954,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5083,7 +5107,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return 0; /* this is a per-package concept */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5152,7 +5176,7 @@ int 
print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p cpu = t->cpu_id; /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -5160,7 +5184,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return -1; } - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (do_ptm && is_cpu_first_core_in_package(t, c, p)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return 0; -- cgit From ccf8a0528061b4ca5f5c0e73c8d888e7e6d8b054 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 09:59:14 +0800 Subject: tools/power/turbostat: Obey allowed CPUs for primary thread/core detection Thread_id doesn't tell if a CPU is allowed or not. Detect allowed CPUs only and use the first detected thread/core as the primary thread/core of a core/package. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f6bc076e1dd..e586164906fa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -926,12 +926,11 @@ struct thread_data { unsigned int x2apic_id; unsigned int flags; bool is_atom; -#define CPU_IS_FIRST_THREAD_IN_CORE 0x2 -#define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { + int base_cpu; unsigned long long c3; unsigned long long c6; unsigned long long c7; @@ -944,6 +943,7 @@ struct core_data { } *core_even, *core_odd; struct pkg_data { + int base_cpu; unsigned long long pc2; unsigned long long pc3; unsigned long long pc6; @@ -1200,26 +1200,21 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); UNUSED(p); - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); + return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); } int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { UNUSED(c); - UNUSED(p); - return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); } int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); - UNUSED(p); - - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p); } int cpu_migrate(int cpu) @@ -2263,9 +2258,6 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->irq_count = 0; t->smi_count = 0; - /* tells format_counters to dump all fields from this set */ - t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE; - c->c3 = 0; c->c6 = 0; c->c7 = 0; @@ -5872,15 +5864,19 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_ if (*c == NULL) goto error; - for (i = 0; i < num_cores; i++) + for (i = 0; i < num_cores; i++) { (*c)[i].core_id = -1; + (*c)[i].base_cpu = -1; + } *p = calloc(topo.num_packages, sizeof(struct pkg_data)); if (*p == NULL) goto error; - for (i = 0; i < topo.num_packages; i++) + for (i = 0; i < topo.num_packages; i++) { (*p)[i].package_id = i; + 
(*p)[i].base_cpu = -1; + } return; error: @@ -5913,10 +5909,11 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, p = GET_PKG(pkg_base, pkg_id); t->cpu_id = cpu_id; - if (thread_id == 0) { - t->flags |= CPU_IS_FIRST_THREAD_IN_CORE; - if (cpu_is_first_core_in_package(cpu_id)) - t->flags |= CPU_IS_FIRST_CORE_IN_PACKAGE; + if (!cpu_is_not_allowed(cpu_id)) { + if (c->base_cpu < 0) + c->base_cpu = t->cpu_id; + if (p->base_cpu < 0) + p->base_cpu = t->cpu_id; } c->core_id = core_id; -- cgit From 0fe3752901370b83b63d4ddbce708b23c8e41ea9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 18:35:26 +0800 Subject: tools/power/turbostat: Obey allowed CPUs for system summary System summary should summarize the information for allowed CPUs instead of all the present CPUs. Introduce topology information for allowed CPUs, and use them to get system summary. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 71 +++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e586164906fa..91aa03989317 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1134,6 +1134,9 @@ struct topo_params { int num_die; int num_cpus; int num_cores; + int allowed_packages; + int allowed_cpus; + int allowed_cores; int max_cpu_num; int max_node_num; int nodes_per_pkg; @@ -1179,7 +1182,6 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk struct thread_data *t; struct core_data *c; struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_allowed(t->cpu_id)) @@ -2426,40 +2428,40 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data /* Use the global time delta for the average. 
*/ average.threads.tv_delta = tv_delta; - average.threads.tsc /= topo.num_cpus; - average.threads.aperf /= topo.num_cpus; - average.threads.mperf /= topo.num_cpus; - average.threads.instr_count /= topo.num_cpus; - average.threads.c1 /= topo.num_cpus; + average.threads.tsc /= topo.allowed_cpus; + average.threads.aperf /= topo.allowed_cpus; + average.threads.mperf /= topo.allowed_cpus; + average.threads.instr_count /= topo.allowed_cpus; + average.threads.c1 /= topo.allowed_cpus; if (average.threads.irq_count > 9999999) sums_need_wide_columns = 1; - average.cores.c3 /= topo.num_cores; - average.cores.c6 /= topo.num_cores; - average.cores.c7 /= topo.num_cores; - average.cores.mc6_us /= topo.num_cores; + average.cores.c3 /= topo.allowed_cores; + average.cores.c6 /= topo.allowed_cores; + average.cores.c7 /= topo.allowed_cores; + average.cores.mc6_us /= topo.allowed_cores; if (DO_BIC(BIC_Totl_c0)) - average.packages.pkg_wtd_core_c0 /= topo.num_packages; + average.packages.pkg_wtd_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_Any_c0)) - average.packages.pkg_any_core_c0 /= topo.num_packages; + average.packages.pkg_any_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_GFX_c0)) - average.packages.pkg_any_gfxe_c0 /= topo.num_packages; + average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages; if (DO_BIC(BIC_CPUGFX)) - average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages; + average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages; - average.packages.pc2 /= topo.num_packages; + average.packages.pc2 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc3)) - average.packages.pc3 /= topo.num_packages; + average.packages.pc3 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc6)) - average.packages.pc6 /= topo.num_packages; + average.packages.pc6 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc7)) - average.packages.pc7 /= topo.num_packages; + average.packages.pc7 /= topo.allowed_packages; - average.packages.pc8 /= topo.num_packages; - average.packages.pc9 /= topo.num_packages; - average.packages.pc10 /= topo.num_packages; + average.packages.pc8 /= topo.allowed_packages; + average.packages.pc9 /= topo.allowed_packages; + average.packages.pc10 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2469,7 +2471,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data sums_need_wide_columns = 1; continue; } - average.threads.counter[i] /= topo.num_cpus; + average.threads.counter[i] /= topo.allowed_cpus; } for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2478,7 +2480,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.cores.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.cores.counter[i] /= topo.num_cores; + average.cores.counter[i] /= topo.allowed_cores; } for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2487,7 +2489,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.packages.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.packages.counter[i] /= topo.num_packages; + average.packages.counter[i] /= topo.allowed_packages; } } @@ -3690,7 +3692,7 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(); - fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } void set_max_cpu_num(void) @@ 
-5953,6 +5955,24 @@ void allocate_irq_buffers(void) err(-1, "calloc %d", topo.max_cpu_num + 1); } +int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + topo.allowed_cpus++; + if ((int)t->cpu_id == c->base_cpu) + topo.allowed_cores++; + if ((int)t->cpu_id == p->base_cpu) + topo.allowed_packages++; + + return 0; +} + +void topology_update(void) +{ + topo.allowed_cpus = 0; + topo.allowed_cores = 0; + topo.allowed_packages = 0; + for_all_cpus(update_topo, ODD_COUNTERS); +} void setup_all_buffers(void) { topology_probe(); @@ -5962,6 +5982,7 @@ void setup_all_buffers(void) allocate_counters(&thread_odd, &core_odd, &package_odd); allocate_output_buffer(); for_all_proc_cpus(initialize_counters); + topology_update(); } void set_base_cpu(void) -- cgit From c25ef0e5d9d7d5fb9e1679286fc7a11e70f16c70 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 14:20:47 +0800 Subject: tools/power/turbostat: Handle offlined CPUs in cpu_subset It is possible that the cpu_subset contains offlined CPUs. If this happens during start, exit immediately because this is likely an operator error that is best fixed by re-invoking. If this happens at runtime, give a warning only because turbostat should do its best effort to continue running. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 91aa03989317..f9ea07961a22 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1149,7 +1149,7 @@ struct timeval tv_even, tv_odd, tv_delta; int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ -void setup_all_buffers(void); +void setup_all_buffers(bool startup); char *sys_lpi_file; char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us"; @@ -3691,7 +3691,7 @@ int for_all_proc_cpus(int (func) (int)) void re_initialize(void) { free_all_buffers(); - setup_all_buffers(); + setup_all_buffers(false); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5692,7 +5692,7 @@ int dir_filter(const struct dirent *dirp) return 0; } -void topology_probe() +void topology_probe(bool startup) { int i; int max_core_id = 0; @@ -5734,7 +5734,12 @@ void topology_probe() CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate that all cpus in cpu_subset are also in cpu_present_set + * Validate cpu_subset and update cpu_allowed_set. + * + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, + * and give a warning when cpus in cpu_subset become unavailable at runtime. + * + * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. 
*/ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { if (!cpu_subset) { @@ -5743,9 +5748,15 @@ void topology_probe() continue; } if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - err(1, "cpu%d not present", i); - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + /* all cpus in cpu_subset must be in cpu_present_set during startup */ + if (startup) + err(1, "cpu%d not present", i); + else + fprintf(stderr, "cpu%d not present\n", i); + } else { + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } } @@ -5973,9 +5984,9 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } -void setup_all_buffers(void) +void setup_all_buffers(bool startup) { - topology_probe(); + topology_probe(startup); allocate_irq_buffers(); allocate_fd_percpu(); allocate_counters(&thread_even, &core_even, &package_even); @@ -6002,7 +6013,7 @@ void set_base_cpu(void) void turbostat_init() { - setup_all_buffers(); + setup_all_buffers(true); set_base_cpu(); check_dev_msr(); check_permissions(); -- cgit From 8c3dd2c9e54273922ea71b2a4c0e77fc624c396b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:45:21 +0800 Subject: tools/power/turbostat: Abstrct function for parsing cpu string Abstract parse_cpu_str() which can update any specified cpu_set by a given cpu string. This can be used to handle further CPU limitations from other sources like cgroup. The cpu string parsing code is also enhanced to handle the strings that have an extra '\n' before string terminator. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 104 ++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9ea07961a22..3a759b49f25e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3565,6 +3565,59 @@ int get_physical_node_id(struct cpu_topology *thiscpu) return -1; } +static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size) +{ + unsigned int start, end; + char *next = cpu_str; + + while (next && *next) { + + if (*next == '-') /* no negative cpu numbers */ + return 1; + + start = strtoul(next, &next, 10); + + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + + if (*next == '\0' || *next == '\n') + break; + + if (*next == ',') { + next += 1; + continue; + } + + if (*next == '-') { + next += 1; /* start range */ + } else if (*next == '.') { + next += 1; + if (*next == '.') + next += 1; /* start range */ + else + return 1; + } + + end = strtoul(next, &next, 10); + if (end <= start) + return 1; + + while (++start <= end) { + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + } + + if (*next == ',') + next += 1; + else if (*next != '\0' && *next != '\n') + return 1; + } + + return 0; +} + int get_thread_siblings(struct cpu_topology *thiscpu) { char path[80], character; @@ -6384,9 +6437,6 @@ void probe_sysfs(void) */ void parse_cpu_command(char *optarg) { - unsigned int start, end; - char *next; - if (!strcmp(optarg, "core")) { if (cpu_subset) goto error; @@ -6409,52 +6459,8 @@ void parse_cpu_command(char *optarg) CPU_ZERO_S(cpu_subset_size, cpu_subset); - next = optarg; - - while (next && *next) { - - if (*next == '-') /* no negative cpu numbers */ - goto error; - - start = strtoul(next, 
&next, 10); - - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - - if (*next == '\0') - break; - - if (*next == ',') { - next += 1; - continue; - } - - if (*next == '-') { - next += 1; /* start range */ - } else if (*next == '.') { - next += 1; - if (*next == '.') - next += 1; /* start range */ - else - goto error; - } - - end = strtoul(next, &next, 10); - if (end <= start) - goto error; - - while (++start <= end) { - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - } - - if (*next == ',') - next += 1; - else if (*next != '\0') - goto error; - } + if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size)) + goto error; return; -- cgit From f638858da0925b29122f05135663013dc240eaf9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:39:22 +0800 Subject: tools/power/turbostat: Handle cgroup v2 cpu limitation CPUs can be isolated via cgroup settings and turbostat should avoid migrating to these CPUs, just like it does for the '-c' cpus. Introduce cpu_effective_set to save the cgroup cpu limitation info from /sys/fs/cgroup/cpuset.cpus.effective. And use cpu_allowed_set as the intersection of cpu_present_set, cpu_effective_set and cpu_subset. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 95 +++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3a759b49f25e..0ef6fba118b1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -3419,6 +3419,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_effective_set); + cpu_effective_set = NULL; + cpu_effective_setsize = 0; + CPU_FREE(cpu_allowed_set); cpu_allowed_set = NULL; cpu_allowed_setsize = 0; @@ -3741,6 +3745,46 @@ int for_all_proc_cpus(int (func) (int)) return 0; } +#define PATH_EFFECTIVE_CPUS "/sys/fs/cgroup/cpuset.cpus.effective" + +static char cpu_effective_str[1024]; + +static int update_effective_str(bool startup) +{ + FILE *fp; + char *pos; + char buf[1024]; + int ret; + + if (cpu_effective_str[0] == '\0' && !startup) + return 0; + + fp = fopen(PATH_EFFECTIVE_CPUS, "r"); + if (!fp) + return 0; + + pos = fgets(buf, 1024, fp); + if (!pos) + err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS); + + fclose(fp); + + ret = strncmp(cpu_effective_str, buf, 1024); + if (!ret) + return 0; + + strncpy(cpu_effective_str, buf, 1024); + return 1; +} + +static void update_effective_set(bool startup) +{ + update_effective_str(startup); + + if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize)) + err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); +} + void re_initialize(void) { free_all_buffers(); @@ -4257,6 +4301,10 @@ restart: re_initialize(); goto restart; } + if 
(update_effective_str(false)) { + re_initialize(); + goto restart; + } do_sleep(); if (snapshot_proc_sysfs_files()) goto restart; @@ -5777,6 +5825,16 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_effective_set + */ + cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_effective_set == NULL) + err(3, "CPU_ALLOC"); + cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set); + update_effective_set(startup); + /* * Allocate and initialize cpu_allowed_set */ @@ -5787,30 +5845,37 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate cpu_subset and update cpu_allowed_set. + * Validate and update cpu_allowed_set. * - * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, - * and give a warning when cpus in cpu_subset become unavailable at runtime. + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup. + * Give a warning when cpus in cpu_subset become unavailable at runtime. + * Give a warning when cpus are not effective because of cgroup setting. * - * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. + * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset. */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (!cpu_subset) { - if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) continue; - } - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { - /* all cpus in cpu_subset must be in cpu_present_set during startup */ + + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + if (cpu_subset) { + /* cpus in cpu_subset must be in cpu_present_set during startup */ if (startup) err(1, "cpu%d not present", i); else fprintf(stderr, "cpu%d not present\n", i); - } else { - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } + continue; } + + if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) { + if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) { + fprintf(stderr, "cpu%d not effective\n", i); + continue; + } + } + + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) -- cgit From 37f68a2940558b4f6f8e51b7b1d00f084b4bdde2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 24 Jan 2023 10:39:53 -0800 Subject: tools/power/turbostat: Move process to root cgroup When available CPUs are reduced via cgroup cpuset controller, turbostat will exit with errors (For example): get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 Move the turbostat to root cgroup, which has every CPU. Writing the value 0 to a cgroup.procs file causes the writing process to be moved to the corresponding cgroup. 
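As a rough stand-alone illustration (a sketch with a hypothetical function name, assuming a cgroup v2 mount at /sys/fs/cgroup; it simply mirrors what the hunk below adds to main()), the move is a best-effort write of "0" to the root cgroup.procs file:

	/* sketch: move the calling process to the cgroup v2 root, ignoring failures */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static void move_self_to_root_cgroup(void)
	{
		int fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);

		if (fd < 0)
			return;		/* no cgroup v2 root; nothing to do */

		if (write(fd, "0\n", 2) == -1)
			perror("Can't update cgroup");

		close(fd);
	}

	int main(void)
	{
		move_self_to_root_cgroup();
		return 0;
	}
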
Signed-off-by: Srinivas Pandruvada Tested-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0ef6fba118b1..fea63d9d8e02 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6666,6 +6666,19 @@ void cmdline(int argc, char **argv) int main(int argc, char **argv) { + int fd, ret; + + fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY); + if (fd < 0) + goto skip_cgroup_setting; + + ret = write(fd, "0\n", 2); + if (ret == -1) + perror("Can't update cgroup\n"); + + close(fd); + +skip_cgroup_setting: outf = stderr; cmdline(argc, argv); -- cgit From 0e3f10e6aa97b0134b526ec9cdc3ccdac2239b43 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:04:33 +0800 Subject: tools/power/turbostat: Add MSR_CORE_C1_RES support for spr_features Add MSR_CORE_C1_RES support for spr_features because both Sapphirerapids and Emeraldrapids support this MSR. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fea63d9d8e02..bbeeec02bf5b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -667,6 +667,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, -- cgit From 5feab4a6b8a730438a0fe8758dfa0700f951edde Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:28:07 +0800 Subject: tools/power/turbostat: Add initial support for GraniteRapids Add initial support for GraniteRapids. It shares the same features with SapphireRapids. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbeeec02bf5b..981d39454b49 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -829,6 +829,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_TIGERLAKE, &cnl_features }, { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, { INTEL_FAM6_ALDERLAKE, &adl_features }, { INTEL_FAM6_ALDERLAKE_L, &adl_features }, -- cgit From d33605f367414c7e0009978d1fbe9af01a36e221 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 28 Sep 2023 13:09:02 +0800 Subject: tools/power/turbostat: Add initial support for SierraForest Add initial support for SierraForest. It shares the same features with SapphireRapids, except that it has MSR_MODULE_C6_RES_MS support. 
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 981d39454b49..a50b6d071f6a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -674,6 +674,22 @@ static const struct platform_features spr_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features srf_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | PC2 | PC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -848,6 +864,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, + { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 5a6efcb9102af4210d5a59182dbfbc594ae50fd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:00:07 +0800 Subject: tools/power/turbostat: Add initial support for GrandRidge Add initial support for GrandRidge. It shares the same features as SierraForest, except that it does not support PC2/PC6. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a50b6d071f6a..3407f08593f8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -690,6 +690,22 @@ static const struct platform_features srf_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features grr_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -865,6 +881,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 7b57e7b683e3872b02117f46bd7dc7ad765888a8 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: tools/power/turbostat: Add initial support for ArrowLake Add initial support for ArrowLake platform. It shares the same features with CannonLake. 
Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3407f08593f8..6c52cceae1d7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -870,6 +870,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ARROWLAKE, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, -- cgit From 956dbd3de400a5665faf08a8588556db9c1bb56e Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: tools/power/turbostat: Add initial support for LunarLake Add initial support for LunarLake platform. It shares the same features with CannonLake. Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6c52cceae1d7..8a311d7272e7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -871,6 +871,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ARROWLAKE, &cnl_features }, + { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, -- cgit From 3503895788d402d6a3814085ed582c364ec3e903 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 31 Oct 2023 12:02:06 -0400 Subject: virtio_pci: move structure to a header These are guest/host interfaces, so they belong in the header where e.g. qemu will know to find them. Note: we added a new structure as opposed to extending existing one because someone might be relying on the size of the existing structure staying unchanged. Add a warning to avoid using sizeof. Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_modern_dev.c | 7 ++++--- include/linux/virtio_pci_modern.h | 7 ------- include/uapi/linux/virtio_pci.h | 11 +++++++++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index e2a1fe7bb66c..7de8b1ebabac 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -294,9 +294,10 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) err = -EINVAL; mdev->common = vp_modern_map_capability(mdev, common, - sizeof(struct virtio_pci_common_cfg), 4, - 0, sizeof(struct virtio_pci_modern_common_cfg), - &mdev->common_len, NULL); + sizeof(struct virtio_pci_common_cfg), 4, 0, + offsetofend(struct virtio_pci_modern_common_cfg, + queue_reset), + &mdev->common_len, NULL); if (!mdev->common) goto err_map_common; mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index d0f2797420f7..a09e13a577a9 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -5,13 +5,6 @@ #include #include -struct virtio_pci_modern_common_cfg { - struct virtio_pci_common_cfg cfg; - - __le16 queue_notify_data; /* read-write */ - __le16 queue_reset; /* read-write */ -}; - /** * struct virtio_pci_modern_device - info for modern PCI virtio * @pci_dev: Ptr to the PCI device struct diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index f703afc7ad31..44f4dd2add18 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { __le32 queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. + */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + __le16 queue_notify_data; /* read-write */ + __le16 queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; -- cgit From 0d82410252ea324f0064e75b9865bb74cccc1dda Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 31 Oct 2023 15:43:39 +0100 Subject: vdpa_sim_blk: allocate the buffer zeroed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleting and recreating a device can lead to having the same content as the old device, so let's always allocate buffers completely zeroed out. Fixes: abebb16254b3 ("vdpa_sim_blk: support shared backend") Suggested-by: Qing Wang Signed-off-by: Stefano Garzarella Message-Id: <20231031144339.121453-1-sgarzare@redhat.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Eugenio Pérez Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index b3a3cb165795..b137f3679343 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -437,7 +437,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (blk->shared_backend) { blk->buffer = shared_buffer; } else { - blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!blk->buffer) { ret = -ENOMEM; @@ -495,7 +495,7 @@ static int __init vdpasim_blk_init(void) goto parent_err; if (shared_backend) { - shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!shared_buffer) { ret = -ENOMEM; -- cgit From f2de37a572853d340f945a7748f74e3ed8c6b743 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 12 Oct 2023 12:28:52 +0200 Subject: riscv, qemu_fw_cfg: Add support for RISC-V architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qemu fw_cfg support was missing for RISC-V, which made it hard to do proper vmcore dumps from qemu. Add the missing RISC-V arch-defines. You can now do vmcore dumps from qemu. Add "-device vmcoreinfo" to the qemu command-line. From the qemu monitor: (qemu) dump-guest-memory vmcore The vmcore can now be used, e.g., with the "crash" utility. Acked-by: "Michael S. Tsirkin" Acked-by: Alistair Francis Tested-by: Clément Léger Signed-off-by: Björn Töpel Message-Id: <20231012102852.234442-1-bjorn@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/firmware/Kconfig | 2 +- drivers/firmware/qemu_fw_cfg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index b59e3041fd62..f05ff56629b3 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -155,7 +155,7 @@ config RASPBERRYPI_FIRMWARE config FW_CFG_SYSFS tristate "QEMU fw_cfg device support in sysfs" - depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || SPARC || X86) + depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || RISCV || SPARC || X86) depends on HAS_IOPORT_MAP default n help diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index a69399a6b7c0..1448f61173b3 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -211,7 +211,7 @@ static void fw_cfg_io_cleanup(void) /* arch-specific ctrl & data register offsets are not available in ACPI, DT */ #if !(defined(FW_CFG_CTRL_OFF) && defined(FW_CFG_DATA_OFF)) -# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64)) +# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV)) # define FW_CFG_CTRL_OFF 0x08 # define FW_CFG_DATA_OFF 0x00 # define FW_CFG_DMA_OFF 0x10 -- cgit From b2c8b644fac1087dfe69a1762c04df090178a5ae Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 25 Oct 2023 16:53:19 +0200 Subject: virtio_pci: Switch away from deprecated irq_set_affinity_hint Since commit 65c7cdedeb30 ("genirq: Provide new interfaces for affinity hints") irq_set_affinity_hint is being phased out. Switch to new interfaces for setting and applying irq affinity hints. Signed-off-by: Jakub Sitnicki Message-Id: <20231025145319.380775-1-jakub@cloudflare.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index c2524a7207cf..7a5593997e0e 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev) if (v != VIRTIO_MSI_NO_VECTOR) { int irq = pci_irq_vector(vp_dev->pci_dev, v); - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } @@ -443,10 +443,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); - irq_set_affinity_hint(irq, mask); + irq_set_affinity_and_hint(irq, mask); } } return 0; -- cgit From e07754e0a1ea2d63fb29574253d1fd7405607343 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Oct 2023 15:12:54 +0300 Subject: vhost-vdpa: fix use after free in vhost_vdpa_probe() The put_device() calls vhost_vdpa_release_dev() which calls ida_simple_remove() and frees "v". So this call to ida_simple_remove() is a use after free and a double free. Fixes: ebe6a354fa7e ("vhost-vdpa: Call ida_simple_remove() when failed") Signed-off-by: Dan Carpenter Message-Id: Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 30df5c58db73..da7ec77cdaff 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1582,7 +1582,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) err: put_device(&v->dev); - ida_simple_remove(&vhost_vdpa_ida, v->minor); return r; } -- cgit From dec96fc2dcb59723e041416b8dc53e011b4bfc2e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Oct 2023 10:05:48 +0100 Subject: btrfs: use u64 for buffer sizes in the tree search ioctls In the tree search v2 ioctl we use the type size_t, which is an unsigned long, to track the buffer size in the local variable 'buf_size'. An unsigned long is 32 bits wide on a 32 bits architecture. The buffer size defined in struct btrfs_ioctl_search_args_v2 is a u64, so when we later try to copy the local variable 'buf_size' to the argument struct, when the search returns -EOVERFLOW, we copy only 32 bits which will be a problem on big endian systems. Fix this by using a u64 type for the buffer sizes, not only at btrfs_ioctl_tree_search_v2(), but also everywhere down the call chain so that we can use the u64 at btrfs_ioctl_tree_search_v2(). 
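A simplified stand-alone demo of the endian hazard described above (userspace illustration, not the kernel code): copying only sizeof(size_t) == 4 bytes into an 8-byte field fills the low half on little-endian machines but the high half on big-endian ones, so the size reported back to user space ends up wrong by a factor of 2^32.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		uint64_t reported = 0;			/* stands in for the __u64 buf_size copied back to user space */
		uint32_t partial = 16 * 1024 * 1024;	/* what a 32-bit size_t local would hold */

		/* only 4 of the 8 bytes are written, as in the buggy copy */
		memcpy(&reported, &partial, sizeof(partial));

		/* prints 16777216 on little-endian, 72057594037927936 on big-endian */
		printf("user-visible buf_size: %llu\n", (unsigned long long)reported);
		return 0;
	}
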
Fixes: cc68a8a5a433 ("btrfs: new ioctl TREE_SEARCH_V2") Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/ce6f4bd6-9453-4ffe-ba00-cee35495e10f@moroto.mountain/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89cd212735ea..f7e94aff41ae 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1660,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit From b8212814d1e8428a082234223105e4071b844fab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Oct 2023 12:42:55 +0300 Subject: btrfs: directly return 0 on no error code in btrfs_insert_raid_extent() It's more obvious to return a literal zero instead of "return ret;". Plus Smatch complains that ret could be uninitialized if the ordered_extent->bioc_list list is empty and this silences that warning. Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 944e8f1862aa..9589362acfbf 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, btrfs_put_bioc(bioc); } - return ret; + return 0; } int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, -- cgit From dfcb03ae8a341600d72fbf3c79429f306764d653 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 16:23:22 +0900 Subject: btrfs: zoned: drop no longer valid write pointer check There is a check of the write pointer vs the zone size to reject an invalid write pointer. However, as of now, we have RAID0/RAID10 on the zoned mode, we can have a block group whose size is larger than the zone size. As an equivalent check against the block group's zone_capacity is already there, we can just drop this invalid check. 
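(Illustrative numbers, not taken from the patch: with a 2 GiB zone size and a RAID0 block group striped across two zones, the block group's zone_capacity is roughly 4 GiB, so a perfectly valid alloc_offset of, say, 3 GiB would have tripped the dropped zone_size check while still passing the remaining zone_capacity check.)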
Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3504ade30cb0..188378ca19c7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", -- cgit From 776a838f1fa95670c1c1cf7109a898090b473fa3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 17:00:31 +0900 Subject: btrfs: zoned: wait for data BG to be finished on direct IO allocation Running the fio command below on a ZNS device results in "Resource temporarily unavailable" error. $ sudo fio --name=w --directory=/mnt --filesize=1GB --bs=16MB --numjobs=16 \ --rw=write --ioengine=libaio --iodepth=128 --direct=1 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=117440512, buflen=16777216 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=134217728, buflen=16777216 ... This happens because -EAGAIN error returned from btrfs_reserve_extent() called from btrfs_new_extent_direct() is spilling over to the userland. btrfs_reserve_extent() returns -EAGAIN when there is no active zone available. Then, the caller should wait for some other on-going IO to finish a zone and retry the allocation. This logic is already implemented for buffered write in cow_file_range(), but it is missing for the direct IO counterpart. Implement the same logic for it. Reported-by: Shinichiro Kawasaki Fixes: 2ce543f47843 ("btrfs: zoned: wait until zone is finished when allocation didn't progress") CC: stable@vger.kernel.org # 6.1+ Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b388505c91cc..137c4824ff91 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6979,8 +6979,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); -- cgit From d8ba2a91fc3cd0347823435971f58f473cbba7aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Oct 2023 15:18:17 -0400 Subject: btrfs: get correct owning_root when dropping snapshot Dave reported a bug where we were aborting the transaction while trying to cleanup the squota reservation for an extent. This turned out to be because we're doing btrfs_header_owner(next) in do_walk_down when we decide to free the block. However in this code block we haven't explicitly read next, so it could be stale. We would then get whatever garbage happened to be in the pages at this point. 
The commit that introduced that is "btrfs: track owning root in btrfs_ref". Fix this by saving the owner_root when we do the btrfs_lookup_extent_info(). We always do this in do_walk_down, it is how we make the decision of whether or not to delete the block. This is cheap because we've already done the extent item lookup at this point, so it's straightforward to just grab the owner root as well. Then we can use this when deleting the metadata block without needing to force a read of the extent buffer to find the owner. This fixes the problem that Dave reported. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- fs/btrfs/extent-tree.h | 3 ++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c0c5f2239820..14cefeaf9622 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -421,7 +421,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c8e5b4715b49..0455935ff558 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -167,6 +169,8 @@ search_again: struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { ret = -EUCLEAN; btrfs_err(fs_info, @@ -226,6 +230,8 @@ out: *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. 
*/ if (ret < 0) continue; @@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5567,8 +5576,7 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, - btrfs_header_owner(next)); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0716f65d9753..2e066035ccee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, -- cgit From 47e2b06b7b5cb356a987ba3429550c3a89ea89d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 28 Oct 2023 13:28:45 +1030 Subject: btrfs: make found_logical_ret parameter mandatory for function queue_scrub_stripe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a compilation warning reported on commit ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO"), where gcc (14.0.0 20231022 experimental) is reporting the following uninitialized variable: fs/btrfs/scrub.c: In function ‘scrub_simple_mirror.isra’: fs/btrfs/scrub.c:2075:29: error: ‘found_logical’ may be used uninitialized [-Werror=maybe-uninitialized[https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wmaybe-uninitialized]] 2075 | cur_logical = found_logical + BTRFS_STRIPE_LEN; fs/btrfs/scrub.c:2040:21: note: ‘found_logical’ was declared here 2040 | u64 found_logical; | ^~~~~~~~~~~~~ [CAUSE] This is a false alert, as @found_logical is passed as parameter @found_logical_ret of function 
queue_scrub_stripe(). As long as queue_scrub_stripe() returned 0, we would update @found_logical_ret. And if queue_scrub_stripe() returned >0 or <0, the caller would not utilized @found_logical, thus there should be nothing wrong. Although the triggering gcc is still experimental, it looks like the extra check on "if (found_logical_ret)" can sometimes confuse the compiler. Meanwhile the only caller of queue_scrub_stripe() is always passing a valid pointer, there is no need for such check at all. [FIX] Although the report itself is a false alert, we can still make it more explicit by: - Replace the check for @found_logical_ret with ASSERT() - Initialize @found_logical to U64_MAX - Add one extra ASSERT() to make sure @found_logical got updated Link: https://lore.kernel.org/linux-btrfs/87fs1x1p93.fsf@gentoo.org/ Fixes: ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ce5be21b036..f62a408671cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * */ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, @@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; - if (found_logical_ret) - *found_logical_ret = stripe->logical; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; /* We filled one group, submit it. */ @@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 found_logical; + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ + ASSERT(found_logical != U64_MAX); cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ -- cgit From cd63ffbd23edc176f09cac5c9287db732d7cbb73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 30 Oct 2023 11:54:23 +0000 Subject: btrfs: fix error pointer dereference after failure to allocate fs devices At device_list_add() we allocate a btrfs_fs_devices structure and then before checking if the allocation failed (pointer is ERR_PTR(-ENOMEM)), we dereference the error pointer in a memcpy() argument if the feature BTRFS_FEATURE_INCOMPAT_METADATA_UUID is enabled. Fix this by checking for an allocation error before trying the memcpy(). 
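The general rule being applied here, shown as a hedged sketch with hypothetical types and allocator names (the real fix is in the hunk below): a pointer that may be ERR_PTR(-ENOMEM) must be tested with IS_ERR() before any of its fields are read or written.

	#include <linux/err.h>
	#include <linux/string.h>
	#include <linux/types.h>

	/* hypothetical example type and allocator, for illustration only */
	struct demo_devs { u8 metadata_uuid[16]; };
	struct demo_devs *demo_alloc(const u8 *fsid);	/* may return ERR_PTR(-ENOMEM) */

	static struct demo_devs *demo_add(const u8 *fsid, const u8 *meta_uuid)
	{
		struct demo_devs *d = demo_alloc(fsid);

		if (IS_ERR(d))			/* check first ... */
			return ERR_CAST(d);

		if (meta_uuid)			/* ... dereference only afterwards */
			memcpy(d->metadata_uuid, meta_uuid, sizeof(d->metadata_uuid));

		return d;
	}
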
Fixes: f7361d8c3fc3 ("btrfs: sipmlify uuid parameters of alloc_fs_devices()") Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1fdfa9153e30..dd279241f78c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -746,13 +746,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return ERR_CAST(fs_devices); + if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); - if (IS_ERR(fs_devices)) - return ERR_CAST(fs_devices); - if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; -- cgit From c692800cb2ef7a4f4940c68d765cd4649aff3e46 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Nov 2023 02:33:14 +0300 Subject: MAINTAINERS: Add Intel TDX entry Add myself as Intel TDX maintainer. I drove upstreaming most of TDX code so far and I will continue working on TDX for foreseeable future. [ dhansen: * Add myself as a reviewer too * Swap Maintained=>Supported. I double checked Kirill is still being paid * Add drivers/virt/coco/tdx-guest ] Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Acked-by: Dave Hansen Acked-by: Kuppuswamy Sathyanarayanan Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231101233314.2567-1-kirill.shutemov%40linux.intel.com --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index dd5de540ec0b..b697020eca1c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23460,6 +23460,20 @@ F: arch/x86/kernel/dumpstack.c F: arch/x86/kernel/stacktrace.c F: arch/x86/kernel/unwind_*.c +X86 TRUST DOMAIN EXTENSIONS (TDX) +M: Kirill A. Shutemov +R: Dave Hansen +L: x86@kernel.org +L: linux-coco@lists.linux.dev +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx +F: arch/x86/boot/compressed/tdx* +F: arch/x86/coco/tdx/ +F: arch/x86/include/asm/shared/tdx.h +F: arch/x86/include/asm/tdx.h +F: arch/x86/virt/vmx/tdx/ +F: drivers/virt/coco/tdx-guest + X86 VDSO M: Andy Lutomirski L: linux-kernel@vger.kernel.org -- cgit From d3badb15613c14dd35d3495b1dde5c90fcd616dd Mon Sep 17 00:00:00 2001 From: Fang Xiang Date: Mon, 30 Oct 2023 16:32:56 +0800 Subject: irqchip/gic-v3-its: Flush ITS tables correctly in non-coherent GIC designs In non-coherent GIC designs, the ITS tables must be flushed before writing to the GITS_BASER registers, otherwise the ITS could read dirty tables, which results in unpredictable behavior. Flush the tables right at the begin of its_setup_baser() to prevent that. 
[ tglx: Massage changelog ] Fixes: a8707f553884 ("irqchip/gic-v3: Add Rockchip 3588001 erratum workaround") Suggested-by: Marc Zyngier Signed-off-by: Fang Xiang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20231030083256.4345-1-fangxiang3@xiaomi.com --- drivers/irqchip/irq-gic-v3-its.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index a8c89df1a997..9a7a74239eab 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2379,12 +2379,12 @@ retry_baser: break; } + if (!shr) + gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); + its_write_baser(its, baser, val); tmp = baser->val; - if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) - tmp &= ~GITS_BASER_SHAREABILITY_MASK; - if ((val ^ tmp) & GITS_BASER_SHAREABILITY_MASK) { /* * Shareability didn't stick. Just use @@ -2394,10 +2394,9 @@ retry_baser: * non-cacheable as well. */ shr = tmp & GITS_BASER_SHAREABILITY_MASK; - if (!shr) { + if (!shr) cache = GITS_BASER_nC; - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); - } + goto retry_baser; } @@ -2609,6 +2608,11 @@ static int its_alloc_tables(struct its_node *its) /* erratum 24313: ignore memory access type */ cache = GITS_BASER_nCnB; + if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) { + cache = GITS_BASER_nC; + shr = 0; + } + for (i = 0; i < GITS_BASER_NR_REGS; i++) { struct its_baser *baser = its->tables + i; u64 val = its_read_baser(its, baser); -- cgit From 18216762bcf618c52b85719d3563243f80e4a2d4 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 6 Nov 2023 17:12:04 +0700 Subject: x86/Documentation: Indent 'note::' directive for protocol version number note The protocol version number note is between the protocol version table and the memory layout section. As such, Sphinx renders the note directive not only on the actual note, but until the end of doc. Indent the directive so that only the actual protocol version number note is rendered as such. Fixes: 2c33c27fd603 ("x86/boot: Introduce kernel_info") Signed-off-by: Bagas Sanjaya Signed-off-by: Ingo Molnar Cc: Jonathan Corbet Link: https://lore.kernel.org/r/20231106101206.76487-2-bagasdotme@gmail.com Signed-off-by: Ingo Molnar --- Documentation/arch/x86/boot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index f5d2f2414de8..22cc7a040dae 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -77,7 +77,7 @@ Protocol 2.14 BURNT BY INCORRECT COMMIT Protocol 2.15 (Kernel 5.5) Added the kernel_info and kernel_info.setup_type_max. ============= ============================================================ -.. note:: + .. note:: The protocol version number should be changed only if the setup header is changed. There is no need to update the version number if boot_params or kernel_info are changed. Additionally, it is recommended to use -- cgit From f2c1dba31133233697fc96e808c6005fc304a8e9 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 28 Jun 2023 09:40:53 -0400 Subject: tools/power/turbostat: bugfix "--show IPC" turbostat --show IPC displays "inf" for the IPC column turbostat was missing the explicit dependency of IPC on APERF, and thus neglected to collect APERF when only IPC was requested. 
typcial use: turbostat --quiet --show CPU,IPC Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8a311d7272e7..5aa6598a40a9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2202,7 +2202,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->c1 = new->c1 - old->c1; - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { old->aperf = new->aperf - old->aperf; old->mperf = new->mperf - old->mperf; @@ -2724,7 +2725,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; /* -- cgit From b8337e6a780dad9505f9d44da07c0a5c52fa0a04 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 7 Nov 2023 23:28:30 -0500 Subject: tools/power turbostat: version 2023.11.07 Turbostat features are now table-driven (Rui Zhang) Add support for some new platforms (Sumeet Pawnikar, Rui Zhang) Gracefully run in configs when CPUs are limited (Rui Zhang, Srinivas Pandruvada) misc minor fixes. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5aa6598a40a9..7a334377f92b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6259,7 +6259,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.03.17 - Len Brown \n"); + fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- cgit From 31255e072b2e91f97645d792d25b2db744186dd1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 7 Nov 2023 10:22:51 -0800 Subject: x86/shstk: Delay signal entry SSP write until after user accesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a signal is being delivered, the kernel needs to make accesses to userspace. These accesses could encounter an access error, in which case the signal delivery itself will trigger a segfault. Usually this would result in the kernel killing the process. But in the case of a SEGV signal handler being configured, the failure of the first signal delivery will result in *another* signal getting delivered. The second signal may succeed if another thread has resolved the issue that triggered the segfault (i.e. a well timed mprotect()/mmap()), or the second signal is being delivered to another stack (i.e. an alt stack). On x86, in the non-shadow stack case, all the accesses to userspace are done before changes to the registers (in pt_regs). 
The operation is aborted when an access error occurs, so although there may be writes done for the first signal, control flow changes for the signal (regs->ip, regs->sp, etc) are not committed until all the accesses have already completed successfully. This means that the second signal will be delivered as if it happened at the time of the first signal. It will effectively replace the first aborted signal, overwriting the half-written frame of the aborted signal. So on sigreturn from the second signal, control flow will resume happily from the point of control flow where the original signal was delivered. The problem is, when shadow stack is active, the shadow stack SSP register/MSR is updated *before* some of the userspace accesses. This means if the earlier accesses succeed and the later ones fail, the second signal will not be delivered at the same spot on the shadow stack as the first one. So on sigreturn from the second signal, the SSP will be pointing to the wrong location on the shadow stack (off by a frame). Pengfei privately reported that while using a shadow stack enabled glibc, the “signal06” test in the LTP test-suite hung. It turns out it is testing the above described double signal scenario. When this test was compiled with shadow stack, the first signal pushed a shadow stack sigframe, then the second pushed another. When the second signal was handled, the SSP was at the first shadow stack signal frame instead of the original location. The test then got stuck as the #CP from the twice incremented SSP was incorrect and generated segfaults in a loop. Fix this by adjusting the SSP register only after any userspace accesses, such that there can be no failures after the SSP is adjusted. Do this by moving the shadow stack sigframe push logic to happen after all other userspace accesses. Note, sigreturn (as opposed to the signal delivery dealt with in this patch) has ordering behavior that could lead to similar failures. The ordering issues there extend beyond shadow stack to include the alt stack restoration. Fixing that would require cross-arch changes, and the ordering today does not cause any known test or apps breakages. So leave it as is, for now. 
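In rough pseudo-order (a sketch of the intent, not the literal code):

  before: push shadow stack sigframe (adjust SSP) -> write sigframe to user memory (may fault) -> update regs->ip/sp
  after:  write sigframe to user memory (may fault) -> push shadow stack sigframe (adjust SSP) -> update regs->ip/sp

so nothing is committed, on either stack, until every access that can fail has already succeeded.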
[ dhansen: minor changelog/subject tweak ] Fixes: 05e36022c054 ("x86/shstk: Handle signals for shadow stack") Reported-by: Pengfei Xu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Pengfei Xu Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20231107182251.91276-1-rick.p.edgecombe%40intel.com Link: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/signal/signal06.c --- arch/x86/kernel/signal_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cacf2ede6217..23d8aaf8d9fd 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); - if (setup_signal_shadow_stack(ksig)) - return -EFAULT; - if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) return -EFAULT; } + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ -- cgit From 65120498aaf8d7320647a8b6d6de7db42e74ea52 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:27 +0100 Subject: stackleak: add declarations for global functions With -Wmissing-prototypes enabled, the stackleak code produces a couple of warnings that have no declarations because they are only called from assembler: stackleak.c:127:25: error: no previous prototype for 'stackleak_erase' [-Werror=missing-prototypes] stackleak.c:139:25: error: no previous prototype for 'stackleak_erase_on_task_stack' [-Werror=missing-prototypes] stackleak.c:151:25: error: no previous prototype for 'stackleak_erase_off_task_stack' [-Werror=missing-prototypes] stackleak.c:159:49: error: no previous prototype for 'stackleak_track_stack' [-Werror=missing-prototypes] Add declarations to the stackleak header to shut up the warnings. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231108125843.3806765-7-arnd@kernel.org Signed-off-by: Kees Cook --- include/linux/stackleak.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index c36e7a3b45e7..3be2cb564710 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -14,6 +14,7 @@ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK #include +#include /* * The lowest address on tsk's stack which we can plausibly erase. @@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } +asmlinkage void noinstr stackleak_erase(void); +asmlinkage void noinstr stackleak_erase_on_task_stack(void); +asmlinkage void noinstr stackleak_erase_off_task_stack(void); +void __no_caller_saved_registers noinstr stackleak_track_stack(void); + #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif -- cgit From 1ee60356c2dca938362528404af95b8ef3e49b6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Nov 2023 13:43:37 -0700 Subject: gcc-plugins: randstruct: Only warn about true flexible arrays The randstruct GCC plugin tried to discover "fake" flexible arrays to issue warnings about them in randomized structs. 
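For illustration, with hypothetical struct names, the distinction the plugin now draws is roughly:

struct fake0 { int n; int data[0]; };	/* 0-sized array: no longer treated as a flexible array */
struct fake1 { int n; int data[1]; };	/* 1-element array: no longer treated as a flexible array */
struct flex  { int n; int data[]; };	/* true flexible array member: still recognized and checked */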
In the future LSM overhead reduction series, it would be legal to have a randomized struct with a 1-element array, and this should _not_ be treated as a flexible array, especially since commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3"). Disable the 0-sized and 1-element array discovery logic in the plugin, but keep the "true" flexible array check. Cc: KP Singh Cc: linux-hardening@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311021532.iBwuZUZ0-lkp@intel.com/ Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Reviewed-by: Bill Wendling Acked-by: "Gustavo A. R. Silva" Link: https://lore.kernel.org/r/20231104204334.work.160-kees@kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/randomize_layout_plugin.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 366395cab490..910bd21d08f4 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -278,8 +278,6 @@ static bool is_flexible_array(const_tree field) { const_tree fieldtype; const_tree typesize; - const_tree elemtype; - const_tree elemsize; fieldtype = TREE_TYPE(field); typesize = TYPE_SIZE(fieldtype); @@ -287,20 +285,12 @@ static bool is_flexible_array(const_tree field) if (TREE_CODE(fieldtype) != ARRAY_TYPE) return false; - elemtype = TREE_TYPE(fieldtype); - elemsize = TYPE_SIZE(elemtype); - /* size of type is represented in bits */ if (typesize == NULL_TREE && TYPE_DOMAIN(fieldtype) != NULL_TREE && TYPE_MAX_VALUE(TYPE_DOMAIN(fieldtype)) == NULL_TREE) return true; - if (typesize != NULL_TREE && - (TREE_CONSTANT(typesize) && (!tree_to_uhwi(typesize) || - tree_to_uhwi(typesize) == tree_to_uhwi(elemsize)))) - return true; - return false; } -- cgit From 19597cad64d608aa8ac2f8aef50a50187a565223 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Mon, 30 Oct 2023 12:19:12 +0530 Subject: scsi: qla2xxx: Fix system crash due to bad pointer access User experiences system crash when running AER error injection. The perturbation causes the abort-all-I/O path to trigger. The driver assumes all I/O on this path is FCP only. If there is both NVMe & FCP traffic, a system crash happens. Add additional check to see if I/O is FCP or not before access. 
PID: 999019 TASK: ff35d769f24722c0 CPU: 53 COMMAND: "kworker/53:1" 0 [ff3f78b964847b58] machine_kexec at ffffffffae86973d 1 [ff3f78b964847ba8] __crash_kexec at ffffffffae9be29d 2 [ff3f78b964847c70] crash_kexec at ffffffffae9bf528 3 [ff3f78b964847c78] oops_end at ffffffffae8282ab 4 [ff3f78b964847c98] exc_page_fault at ffffffffaf2da502 5 [ff3f78b964847cc0] asm_exc_page_fault at ffffffffaf400b62 [exception RIP: qla2x00_abort_srb+444] RIP: ffffffffc07b5f8c RSP: ff3f78b964847d78 RFLAGS: 00010046 RAX: 0000000000000282 RBX: ff35d74a0195a200 RCX: ff35d76886fd03a0 RDX: 0000000000000001 RSI: ffffffffc07c5ec8 RDI: ff35d74a0195a200 RBP: ff35d76913d22080 R8: ff35d7694d103200 R9: ff35d7694d103200 R10: 0000000100000000 R11: ffffffffb05d6630 R12: 0000000000010000 R13: ff3f78b964847df8 R14: ff35d768d8754000 R15: ff35d768877248e0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 6 [ff3f78b964847d70] qla2x00_abort_srb at ffffffffc07b5f84 [qla2xxx] 7 [ff3f78b964847de0] __qla2x00_abort_all_cmds at ffffffffc07b6238 [qla2xxx] 8 [ff3f78b964847e38] qla2x00_abort_all_cmds at ffffffffc07ba635 [qla2xxx] 9 [ff3f78b964847e58] qla2x00_terminate_rport_io at ffffffffc08145eb [qla2xxx] 10 [ff3f78b964847e70] fc_terminate_rport_io at ffffffffc045987e [scsi_transport_fc] 11 [ff3f78b964847e88] process_one_work at ffffffffae914f15 12 [ff3f78b964847ed0] worker_thread at ffffffffae9154c0 13 [ff3f78b964847f10] kthread at ffffffffae91c456 14 [ff3f78b964847f50] ret_from_fork at ffffffffae8036ef Cc: stable@vger.kernel.org Fixes: f45bca8c5052 ("scsi: qla2xxx: Fix double scsi_done for abort path") Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20231030064912.37912-1-njavali@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 7e103d711825..d24410944f7d 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1837,8 +1837,16 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, } spin_lock_irqsave(qp->qp_lock_ptr, *flags); - if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) - sp->done(sp, res); + switch (sp->type) { + case SRB_SCSI_CMD: + if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) + sp->done(sp, res); + break; + default: + if (ret_cmd) + sp->done(sp, res); + break; + } } else { sp->done(sp, res); } -- cgit From defde5a50d91c74e1ce71a7f0bce7fb1ae311d84 Mon Sep 17 00:00:00 2001 From: Naomi Chu Date: Thu, 2 Nov 2023 13:24:24 +0800 Subject: scsi: ufs: core: Expand MCQ queue slot to DeviceQueueDepth + 1 The UFSHCI 4.0 specification mandates that there should always be at least one empty slot in each queue for distinguishing between full and empty states. Enlarge 'hwq->max_entries' to 'DeviceQueueDepth + 1' to allow UFSHCI 4.0 controllers to fully utilize MCQ queue slots. Fixes: 4682abfae2eb ("scsi: ufs: core: mcq: Allocate memory for MCQ mode") Signed-off-by: Naomi Chu Link: https://lore.kernel.org/r/20231102052426.12006-2-naomi.chu@mediatek.com Reviewed-by: Stanley Chu Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Reviewed-by: Chun-Hung Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufs-mcq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 2ba8ec254dce..5c75ab9d6bb5 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -436,7 +436,7 @@ int ufshcd_mcq_init(struct ufs_hba *hba) for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; - hwq->max_entries = hba->nutrs; + hwq->max_entries = hba->nutrs + 1; spin_lock_init(&hwq->sq_lock); spin_lock_init(&hwq->cq_lock); mutex_init(&hwq->sq_mutex); -- cgit From 27900d7119c464b43cd9eac69c85884d17bae240 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 6 Nov 2023 15:51:17 +0800 Subject: scsi: ufs: core: Fix racing issue between ufshcd_mcq_abort() and ISR If command timeout happens and cq complete IRQ is raised at the same time, ufshcd_mcq_abort clears lprb->cmd and a NULL pointer deref happens in the ISR. Error log: ufshcd_abort: Device abort task at tag 18 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000108 pc : [0xffffffe27ef867ac] scsi_dma_unmap+0xc/0x44 lr : [0xffffffe27f1b898c] ufshcd_release_scsi_cmd+0x24/0x114 Fixes: f1304d442077 ("scsi: ufs: mcq: Added ufshcd_mcq_abort()") Cc: stable@vger.kernel.org Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20231106075117.8995-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 5c75ab9d6bb5..0787456c2b89 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -630,6 +630,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) int tag = scsi_cmd_to_rq(cmd)->tag; struct ufshcd_lrb *lrbp = &hba->lrb[tag]; struct ufs_hw_queue *hwq; + unsigned long flags; int err = FAILED; if (!ufshcd_cmd_inflight(lrbp->cmd)) { @@ -670,8 +671,10 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) } err = SUCCESS; + spin_lock_irqsave(&hwq->cq_lock, flags); if (ufshcd_cmd_inflight(lrbp->cmd)) ufshcd_release_scsi_cmd(hba, lrbp); + spin_unlock_irqrestore(&hwq->cq_lock, flags); out: return err; -- cgit From 860c3d03bbc3f17aef8600662c488f27fd093142 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:04:33 +0300 Subject: scsi: scsi_debug: Fix some bugs in sdebug_error_write() There are two bug in this code: 1) If count is zero, then it will lead to a NULL dereference. The kmalloc() will successfully allocate zero bytes and the test for "if (buf[0] == '-')" will read beyond the end of the zero size buffer and Oops. 2) The code does not ensure that the user's string is properly NUL terminated which could lead to a read overflow. Fixes: a9996d722b11 ("scsi: scsi_debug: Add interface to manage error injection for a single device") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/7733643d-e102-4581-8d29-769472011c97@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 67922e2c4c19..0dd21598f7b6 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1019,7 +1019,7 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf, struct sdebug_err_inject *inject; struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private; - buf = kmalloc(count, GFP_KERNEL); + buf = kzalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; -- cgit From 037fbd3fcfbd99145f9310d93f6637012807cfd0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:05:04 +0300 Subject: scsi: scsi_debug: Delete some bogus error checking Smatch complains that "dentry" is never initialized. These days everyone initializes all their stack variables to zero so this means that it will trigger a warning every time this function is run. Really, debugfs functions are not supposed to be checked for errors in normal code. For example, if we updated this code to check the correct variable then it would print a warning if CONFIG_DEBUGFS was disabled. We don't want that. Just delete the check. Fixes: f084fe52c640 ("scsi: scsi_debug: Add debugfs interface to fail target reset") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/c602c9ad-5e35-4e18-a47f-87ed956a9ec2@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 0dd21598f7b6..6d8218a44122 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1132,7 +1132,6 @@ static const struct file_operations sdebug_target_reset_fail_fops = { static int sdebug_target_alloc(struct scsi_target *starget) { struct sdebug_target_info *targetip; - struct dentry *dentry; targetip = kzalloc(sizeof(struct sdebug_target_info), GFP_KERNEL); if (!targetip) @@ -1140,15 +1139,9 @@ static int sdebug_target_alloc(struct scsi_target *starget) targetip->debugfs_entry = debugfs_create_dir(dev_name(&starget->dev), sdebug_debugfs_root); - if (IS_ERR_OR_NULL(targetip->debugfs_entry)) - pr_info("%s: failed to create debugfs directory for target %s\n", - __func__, dev_name(&starget->dev)); debugfs_create_file("fail_reset", 0600, targetip->debugfs_entry, starget, &sdebug_target_reset_fail_fops); - if (IS_ERR_OR_NULL(dentry)) - pr_info("%s: failed to create fail_reset file for target %s\n", - __func__, dev_name(&starget->dev)); starget->hostdata = targetip; -- cgit From 3b83486399a6a9feb9c681b74c21a227d48d7020 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 6 Nov 2023 17:13:04 -0600 Subject: scsi: sd: Fix sshdr use in sd_suspend_common() If scsi_execute_cmd() returns < 0, it doesn't initialize the sshdr, so we shouldn't access the sshdr. If it returns 0, then the cmd executed successfully, so there is no need to check the sshdr. sd_sync_cache() will only access the sshdr if it's been setup because it calls scsi_status_is_check_condition() before accessing it. However, the sd_sync_cache() caller, sd_suspend_common(), does not check. sd_suspend_common() is only checking for ILLEGAL_REQUEST which it's using to determine if the command is supported. If it's not it just ignores the error. So to fix its sshdr use this patch just moves that check to sd_sync_cache() where it converts ILLEGAL_REQUEST to success/0. 
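The resulting rule inside sd_sync_cache() is roughly the following (a simplified sketch with arguments elided, not the full function):

	res = scsi_execute_cmd(...);		/* arguments omitted in this sketch */
	if (res < 0)
		return res;			/* sshdr was never filled in, so it must not be read */
	if (res == 0)
		return 0;			/* command succeeded, nothing in sshdr to evaluate */
	if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr) &&
	    sshdr.sense_key == ILLEGAL_REQUEST)
		return 0;			/* sync not supported, treat as success for shutdown/suspend */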
sd_suspend_common() was ignoring that error and sd_shutdown() doesn't check for errors so there will be no behavior changes. Signed-off-by: Mike Christie Link: https://lore.kernel.org/r/20231106231304.5694-2-michael.christie@oracle.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 53 +++++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 864db30b9c8e..3ef3b00397fe 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1565,24 +1565,21 @@ out: return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } -static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) +static int sd_sync_cache(struct scsi_disk *sdkp) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - struct scsi_sense_hdr my_sshdr; + struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, - /* caller might not be interested in sense, but we need it */ - .sshdr = sshdr ? : &my_sshdr, + .sshdr = &sshdr, }; if (!scsi_device_online(sdp)) return -ENODEV; - sshdr = exec_args.sshdr; - for (retries = 3; retries > 0; --retries) { unsigned char cmd[16] = { 0 }; @@ -1607,15 +1604,23 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) return res; if (scsi_status_is_check_condition(res) && - scsi_sense_valid(sshdr)) { - sd_print_sense_hdr(sdkp, sshdr); + scsi_sense_valid(&sshdr)) { + sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ - if (sshdr->asc == 0x3a || /* medium not present */ - sshdr->asc == 0x20 || /* invalid command */ - (sshdr->asc == 0x74 && sshdr->ascq == 0x71)) /* drive is password locked */ + if (sshdr.asc == 0x3a || /* medium not present */ + sshdr.asc == 0x20 || /* invalid command */ + (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; + /* + * This drive doesn't support sync and there's not much + * we can do because this is called during shutdown + * or suspend so just return success so those operations + * can proceed. 
+ */ + if (sshdr.sense_key == ILLEGAL_REQUEST) + return 0; } switch (host_byte(res)) { @@ -3774,7 +3779,7 @@ static void sd_shutdown(struct device *dev) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - sd_sync_cache(sdkp, NULL); + sd_sync_cache(sdkp); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { @@ -3786,7 +3791,6 @@ static void sd_shutdown(struct device *dev) static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ @@ -3795,24 +3799,13 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - ret = sd_sync_cache(sdkp, &sshdr); - - if (ret) { - /* ignore OFFLINE device */ - if (ret == -ENODEV) - return 0; - - if (!scsi_sense_valid(&sshdr) || - sshdr.sense_key != ILLEGAL_REQUEST) - return ret; + ret = sd_sync_cache(sdkp); + /* ignore OFFLINE device */ + if (ret == -ENODEV) + return 0; - /* - * sshdr.sense_key == ILLEGAL_REQUEST means this drive - * doesn't support sync. There's not much to do and - * suspend shouldn't fail. - */ - ret = 0; - } + if (ret) + return ret; } if (sdkp->device->manage_start_stop) { -- cgit From e439e4a62a8ea3c39d65c546de3af7d1c594077c Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:43:11 +0100 Subject: scsi: ufs: qcom-ufs: dt-bindings: Document the SM8650 UFS Controller Document the UFS Controller on the SM8650 Platform. Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-bindings-ufs-v3-1-a96364463fd5@linaro.org Reviewed-by: Krzysztof Kozlowski Signed-off-by: Martin K. Petersen --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 462ead5a1cec..2cf3d016db42 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -36,6 +36,7 @@ properties: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc - const: qcom,ufshc - const: jedec,ufs-2.0 @@ -122,6 +123,7 @@ allOf: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc then: properties: clocks: -- cgit From 6c8e69e4a702b072206f166111c003d704de15d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 3 Nov 2023 12:26:25 +0000 Subject: btrfs: fix race between accounting qgroup extents and removing a qgroup When doing qgroup accounting for an extent, we take the spinlock fs_info->qgroup_lock and then add qgroups to the local list (iterator) named "qgroups". These qgroups are found in the fs_info->qgroup_tree rbtree. After we're done, we unlock fs_info->qgroup_lock and then call qgroup_iterator_nested_clean(), which will iterate over all the qgroups added to the local list "qgroups" and then delete them from the list. Deleting a qgroup from the list can however result in a use-after-free if a qgroup remove operation happens after we unlock fs_info->qgroup_lock and before or while we are at qgroup_iterator_nested_clean(). 
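Schematically, the window looks like this (a sketch of the race, not actual code):

  accounting task                               btrfs_remove_qgroup()
  spin_lock(&fs_info->qgroup_lock)
  collect qgroups into the local "qgroups" list
  spin_unlock(&fs_info->qgroup_lock)
                                                spin_lock(&fs_info->qgroup_lock)
                                                remove the qgroup from the rbtree and free it
                                                spin_unlock(&fs_info->qgroup_lock)
  qgroup_iterator_nested_clean(&qgroups)  <--   list_del on the freed qgroup (use-after-free)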
Fix this by calling qgroup_iterator_nested_clean() while still holding the lock fs_info->qgroup_lock - we don't need it under the 'out' label since before taking the lock the "qgroups" list is always empty. This guarantees safety because btrfs_remove_qgroup() takes that lock before removing a qgroup from the rbtree fs_info->qgroup_tree. This was reported by syzbot with the following stack traces: BUG: KASAN: slab-use-after-free in __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 Read of size 8 at addr ffff888027e420b0 by task kworker/u4:3/48 CPU: 1 PID: 48 Comm: kworker/u4:3 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Workqueue: btrfs-qgroup-rescan btrfs_work_helper Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x163/0x540 mm/kasan/report.c:475 kasan_report+0x175/0x1b0 mm/kasan/report.c:588 __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_init include/linux/list.h:287 [inline] qgroup_iterator_nested_clean fs/btrfs/qgroup.c:2623 [inline] btrfs_qgroup_account_extent+0x18b/0x1150 fs/btrfs/qgroup.c:2883 qgroup_rescan_leaf fs/btrfs/qgroup.c:3543 [inline] btrfs_qgroup_rescan_worker+0x1078/0x1c60 fs/btrfs/qgroup.c:3604 btrfs_work_helper+0x37c/0xbd0 fs/btrfs/async-thread.c:315 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x90f/0x1400 kernel/workqueue.c:2703 worker_thread+0xa5f/0xff0 kernel/workqueue.c:2784 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Allocated by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:383 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] btrfs_quota_enable+0xee9/0x2060 fs/btrfs/qgroup.c:1209 btrfs_ioctl_quota_ctl+0x143/0x190 fs/btrfs/ioctl.c:3705 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 kasan_save_free_info+0x28/0x40 mm/kasan/generic.c:522 ____kasan_slab_free+0xd6/0x120 mm/kasan/common.c:236 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook mm/slub.c:1826 [inline] slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0x263/0x3a0 mm/slub.c:3822 btrfs_remove_qgroup+0x764/0x8c0 fs/btrfs/qgroup.c:1787 btrfs_ioctl_qgroup_create+0x185/0x1e0 fs/btrfs/ioctl.c:3811 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] 
call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Second to last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 The buggy address belongs to the object at ffff888027e42000 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 176 bytes inside of freed 512-byte region [ffff888027e42000, ffff888027e42200) The buggy address belongs to the physical page: page:ffffea00009f9000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27e40 head:ffffea00009f9000 order:2 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000840(slab|head|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000840 ffff888012c41c80 ffffea0000a5ba00 dead000000000002 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 4514, tgid 4514 (udevadm), ts 24598439480, free_ts 23755696267 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1e6/0x210 mm/page_alloc.c:1536 prep_new_page mm/page_alloc.c:1543 [inline] get_page_from_freelist+0x31db/0x3360 mm/page_alloc.c:3170 __alloc_pages+0x255/0x670 mm/page_alloc.c:4426 alloc_slab_page+0x6a/0x160 mm/slub.c:1870 allocate_slab mm/slub.c:2017 [inline] new_slab+0x84/0x2f0 mm/slub.c:2070 ___slab_alloc+0xc85/0x1310 mm/slub.c:3223 __slab_alloc mm/slub.c:3322 [inline] __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] __kmem_cache_alloc_node+0x19d/0x270 mm/slub.c:3517 kmalloc_trace+0x2a/0xe0 mm/slab_common.c:1098 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] kernfs_fop_open+0x3e7/0xcc0 fs/kernfs/file.c:670 do_dentry_open+0x8fd/0x1590 fs/open.c:948 do_open fs/namei.c:3622 [inline] path_openat+0x2845/0x3280 fs/namei.c:3779 do_filp_open+0x234/0x490 fs/namei.c:3809 do_sys_openat2+0x13e/0x1d0 fs/open.c:1440 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1136 [inline] free_unref_page_prepare+0x8c3/0x9f0 mm/page_alloc.c:2312 free_unref_page+0x37/0x3f0 mm/page_alloc.c:2405 discard_slab mm/slub.c:2116 [inline] __unfreeze_partials+0x1dc/0x220 mm/slub.c:2655 put_cpu_partial+0x17b/0x250 mm/slub.c:2731 __slab_free+0x2b6/0x390 mm/slub.c:3679 qlink_free mm/kasan/quarantine.c:166 [inline] qlist_free_all+0x75/0xe0 mm/kasan/quarantine.c:185 kasan_quarantine_reduce+0x14b/0x160 mm/kasan/quarantine.c:292 __kasan_slab_alloc+0x23/0x70 mm/kasan/common.c:305 
kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook+0x67/0x3d0 mm/slab.h:762 slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x104/0x2c0 mm/slub.c:3502 getname_flags+0xbc/0x4f0 fs/namei.c:140 do_sys_openat2+0xd2/0x1d0 fs/open.c:1434 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff888027e41f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888027e42000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888027e42080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888027e42100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888027e42180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Reported-by: syzbot+e0b615318f8fcfc01ceb@syzkaller.appspotmail.com Fixes: dce28769a33a ("btrfs: qgroup: use qgroup_iterator_nested to in qgroup_update_refcnt()") CC: stable@vger.kernel.org # 6.6 Link: https://lore.kernel.org/linux-btrfs/00000000000091a5b2060936bf6d@google.com/ Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index edb84cc03237..e48eba7e9379 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2874,13 +2874,19 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); + /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. + */ + qgroup_iterator_nested_clean(&qgroups); + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: - qgroup_iterator_nested_clean(&qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; -- cgit From 609d99379736aa6c5b0658654084198aa808035a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Nov 2023 20:17:37 +0000 Subject: btrfs: fix qgroup record leaks when using simple quotas When using simple quotas we are not supposed to allocate qgroup records when adding delayed references. However we allocate them if either mode of quotas is enabled (the new simple one or the old one), but then we never free them because running the accounting, which frees the records, is only run when using the old quotas (at btrfs_qgroup_account_extents()), resulting in a memory leak of the records allocated when adding delayed references. Fix this by allocating the records only if the old quotas mode is enabled. Also fix btrfs_qgroup_trace_extent_nolock() to return 1 if the old quotas mode is not enabled - meaning the caller has to free the record. 
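The expected caller pattern then becomes roughly this (a simplified sketch of the btrfs_qgroup_trace_extent() side):

	record = kzalloc(sizeof(*record), GFP_NOFS);
	...
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	if (ret > 0)
		kfree(record);	/* record was not inserted, so the caller must free it */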
Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") Reported-by: syzbot+d3ddc6dcc6386dea398b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000004769106097f9a34@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/qgroup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9223934d95f4..891ea2fa263c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e48eba7e9379..ce446d9d7f23 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, u64 bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) - return 0; + return 1; lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); -- cgit From d3933152442b7f94419e9ea71835d71b620baf0e Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 3 Nov 2023 11:38:04 -0700 Subject: btrfs: make OWNER_REF_KEY type value smallest among inline refs BTRFS_EXTENT_OWNER_REF_KEY is the type of simple quotas extent owner refs. This special inline ref goes in front of all other inline refs. In general, inline refs have a required sorted order s.t. type never decreases (among other requirements). This was recently reified into a tree-checker and fsck rule, which broke simple quotas. To be fair, though, in a sense, the new owner ref item had also violated that not yet fully enforced requirement. This fix brings the owner ref item into compliance with the requirement that inline ref type never decrease. btrfs/301 exercises this behavior and should pass again with this fix. Fixes: d9a620f77e33 ("btrfs: new inline ref storing owning subvol of data extents") Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index c25fc9614594..d24e8e121507 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -219,6 +219,22 @@ */ #define BTRFS_METADATA_ITEM_KEY 169 +/* + * Special inline ref key which stores the id of the subvolume which originally + * created the extent. This subvolume owns the extent permanently from the + * perspective of simple quotas. Needed to know which subvolume to free quota + * usage from when the extent is deleted. + * + * Stored as an inline ref rather to avoid wasting space on a separate item on + * top of the existing extent item. 
However, unlike the other inline refs, + * there is one one owner ref per extent rather than one per extent. + * + * Because of this, it goes at the front of the list of inline refs, and thus + * must have a lower type value than any other inline ref type (to satisfy the + * disk format rule that inline refs have non-decreasing type). + */ +#define BTRFS_EXTENT_OWNER_REF_KEY 172 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -233,14 +249,6 @@ #define BTRFS_SHARED_DATA_REF_KEY 184 -/* - * Special inline ref key which stores the id of the subvolume which originally - * created the extent. This subvolume owns the extent permanently from the - * perspective of simple quotas. Needed to know which subvolume to free quota - * usage from when the extent is deleted. - */ -#define BTRFS_EXTENT_OWNER_REF_KEY 188 - /* * block groups give us hints into the extent allocation trees. Which * blocks are free etc etc -- cgit From ec9aedb2aa1ab7ac420c00b31f5edc5be15ec167 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 3 Jul 2023 00:28:02 +0800 Subject: x86/acpi: Ignore invalid x2APIC entries Currently, the kernel enumerates the possible CPUs by parsing both ACPI MADT Local APIC entries and x2APIC entries. So CPUs with "valid" APIC IDs, even if they have duplicated APIC IDs in Local APIC and x2APIC, are always enumerated. Below is what ACPI MADT Local APIC and x2APIC describes on an Ivebridge-EP system, [02Ch 0044 1] Subtable Type : 00 [Processor Local APIC] [02Fh 0047 1] Local Apic ID : 00 ... [164h 0356 1] Subtable Type : 00 [Processor Local APIC] [167h 0359 1] Local Apic ID : 39 [16Ch 0364 1] Subtable Type : 00 [Processor Local APIC] [16Fh 0367 1] Local Apic ID : FF ... [3ECh 1004 1] Subtable Type : 09 [Processor Local x2APIC] [3F0h 1008 4] Processor x2Apic ID : 00000000 ... [B5Ch 2908 1] Subtable Type : 09 [Processor Local x2APIC] [B60h 2912 4] Processor x2Apic ID : 00000077 As a result, kernel shows "smpboot: Allowing 168 CPUs, 120 hotplug CPUs". And this wastes significant amount of memory for the per-cpu data. Plus this also breaks https://lore.kernel.org/all/87edm36qqb.ffs@tglx/, because __max_logical_packages is over-estimated by the APIC IDs in the x2APIC entries. According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure: "[Compatibility note] On some legacy OSes, Logical processors with APIC ID values less than 255 (whether in XAPIC or X2APIC mode) must use the Processor Local APIC structure to convey their APIC information to OSPM, and those processors must be declared in the DSDT using the Processor() keyword. Logical processors with APIC ID values 255 and greater must use the Processor Local x2APIC structure and be declared using the Device() keyword." Therefore prevent the registration of x2APIC entries with an APIC ID less than 255 if the local APIC table enumerates valid APIC IDs. 
[ tglx: Simplify the logic ] Signed-off-by: Zhang Rui Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra Link: https://lore.kernel.org/r/20230702162802.344176-1-rui.zhang@intel.com --- arch/x86/kernel/acpi/boot.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c55c0ef47a18..fc5bce1b5047 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif @@ -232,6 +233,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) if (!acpi_is_processor_usable(processor->lapic_flags)) return 0; + /* + * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure + * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID + * in x2APIC must be equal or greater than 0xff. + */ + if (has_lapic_cpus && apic_id < 0xff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1114,10 +1123,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { - int count; - int x2count = 0; - int ret; - struct acpi_subtable_proc madt_proc[2]; + int count, x2count = 0; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1126,21 +1132,11 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - memset(madt_proc, 0, sizeof(madt_proc)); - madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; - madt_proc[0].handler = acpi_parse_lapic; - madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; - madt_proc[1].handler = acpi_parse_x2apic; - ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, - sizeof(struct acpi_table_madt), - madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); - if (ret < 0) { - pr_err("Error parsing LAPIC/X2APIC entries\n"); - return ret; - } - - count = madt_proc[0].count; - x2count = madt_proc[1].count; + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, + acpi_parse_lapic, MAX_LOCAL_APIC); + has_lapic_cpus = count > 0; + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_LOCAL_APIC); } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); -- cgit From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 9 Nov 2023 18:43:28 +0100 Subject: selftests: bpf: xskxceiver: ksft_print_msg: fix format type error Crossbuilding selftests/bpf for architecture arm64, format specifies type error show up like. xskxceiver.c:912:34: error: format specifies type 'int' but the argument has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", ~~ %llu __func__, pkt->pkt_nb, meta->count); ^~~~~~~~~~~ xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ~~~~ ^~~~ Fixing the issues by casting to (unsigned long long) and changing the specifiers to be %llu from %d and %u, since with u64s it might be %llx or %lx, depending on architecture. 
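The portable pattern is simply to cast the value to the widest matching standard type before printing, for example:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0xdeadbeefULL;	/* __u64/u64 may be 'long' or 'long long' depending on the arch */

	printf("addr: %llx len: %u\n", (unsigned long long)addr, 64u);
	return 0;
}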
Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 591ca9637b23..b604c570309a 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) struct xdp_info *meta = data - sizeof(struct xdp_info); if (meta->count != pkt->pkt_nb) { - ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", - __func__, pkt->pkt_nb, meta->count); + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", + __func__, pkt->pkt_nb, + (unsigned long long)meta->count); return false; } @@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp if (addr >= umem->num_frames * umem->frame_size || addr + len > umem->num_frames * umem->frame_size) { - ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag invalid addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { - ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } @@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", addr); + ksft_print_msg("Last completion address: %llx\n", + (unsigned long long)addr); return TEST_FAILURE; } @@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) } if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", - __func__, stats.tx_invalid_descs, + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", + __func__, + (unsigned long long)stats.tx_invalid_descs, ifobject->xsk->pkt_stream->nb_pkts); return TEST_FAILURE; } -- cgit From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:36 -0800 Subject: bpf: handle ldimm64 properly in check_cfg() ldimm64 instructions are 16-byte long, and so have to be handled appropriately in check_cfg(), just like the rest of BPF verifier does. This has implications in three places: - when determining next instruction for non-jump instructions; - when determining next instruction for callback address ldimm64 instructions (in visit_func_call_insn()); - when checking for unreachable instructions, where second half of ldimm64 is expected to be unreachable; We take this also as an opportunity to report jump into the middle of ldimm64. And adjust few test_verifier tests accordingly. 
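For reference, the double-word load occupies two consecutive instruction slots, roughly:

/* BPF_LD_IMM64(dst, imm64) expands to two struct bpf_insn entries:
 *   insn[t]	code = BPF_LD | BPF_DW | BPF_IMM	imm = lower 32 bits of imm64
 *   insn[t+1]	code = 0 (pseudo second half)		imm = upper 32 bits of imm64
 * so the fall-through successor of insn t is t + 2, and any jump that lands on
 * t + 1 targets the middle of the instruction.
 */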
Acked-by: Eduard Zingerman Reported-by: Hao Sun Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++-- kernel/bpf/verifier.c | 27 ++++++++++++++++++------- tools/testing/selftests/bpf/verifier/ld_imm64.c | 8 ++++---- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb29..35bff17396c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static bool bpf_is_ldimm64(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { - return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && - insn->src_reg == BPF_PSEUDO_FUNC; + return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd1c42eb540f..b87715b364fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); @@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -15607,11 +15610,21 @@ walk_cfg: } for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index f9297900cea6..78f19c255f20 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -9,8 +9,8 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { @@ -23,8 +23,8 @@ BPF_LD_IMM64(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { -- cgit From 4bb7ea946a370707315ab774432963ce47291946 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:37 -0800 Subject: bpf: fix precision backtracking instruction iteration Fix an edge case in __mark_chain_precision() which prematurely stops backtracking instructions in a state if it happens that state's first and last instruction indexes are the same. This situations doesn't necessarily mean that there were no instructions simulated in a state, but rather that we starting from the instruction, jumped around a bit, and then ended up at the same instruction before checkpointing or marking precision. To distinguish between these two possible situations, we need to consult jump history. If it's empty or contain a single record "bridging" parent state and first instruction of processed state, then we indeed backtracked all instructions in this state. But if history is not empty, we are definitely not done yet. Move this logic inside get_prev_insn_idx() to contain it more nicely. Use -ENOENT return code to denote "we are out of instructions" situation. This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once the next fix in this patch set is applied. Acked-by: Eduard Zingerman Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b87715b364fd..484c742f733e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3516,12 +3516,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. 
+ * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -4401,10 +4418,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask -- cgit From 62ccdb11d3c63dc697dea1fd92b3496fe43dcc1e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:38 -0800 Subject: selftests/bpf: add edge case backtracking logic test Add a dedicated selftests to try to set up conditions to have a state with same first and last instruction index, but it actually is a loop 3->4->1->2->3. This confuses mark_chain_precision() if verifier doesn't take into account jump history. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_precision.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 193c0f8272d0..6b564d4c0986 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -91,3 +91,43 @@ __naked int bpf_end_bswap(void) } #endif /* v4 instruction */ + +SEC("?raw_tp") +__success __log_level(2) +/* + * Without the bug fix there will be no history between "last_idx 3 first_idx 3" + * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * expected log messages to the one specific mark_chain_precision operation. + * + * This is quite fragile: if verifier checkpointing heuristic changes, this + * might need adjusting. 
+ */ +__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") +__msg("3: R0_w=6") +__naked int state_loop_first_last_equal(void) +{ + asm volatile ( + "r0 = 0;" + "l0_%=:" + "r0 += 1;" + "r0 += 1;" + /* every few iterations we'll have a checkpoint here with + * first_idx == last_idx, potentially confusing precision + * backtracking logic + */ + "if r0 >= 10 goto l1_%=;" /* checkpoint + mark_precise */ + "goto l0_%=;" + "l1_%=:" + "exit;" + ::: __clobber_common + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit From 10e14e9652bf9e8104151bfd9200433083deae3d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:10 -0800 Subject: bpf: fix control-flow graph checking in privileged mode When BPF program is verified in privileged mode, BPF verifier allows bounded loops. This means that from CFG point of view there are definitely some back-edges. Original commit adjusted check_cfg() logic to not detect back-edges in control flow graph if they are resulting from conditional jumps, which the idea that subsequent full BPF verification process will determine whether such loops are bounded or not, and either accept or reject the BPF program. At least that's my reading of the intent. Unfortunately, the implementation of this idea doesn't work correctly in all possible situations. Conditional jump might not result in immediate back-edge, but just a few unconditional instructions later we can arrive at back-edge. In such situations check_cfg() would reject BPF program even in privileged mode, despite it might be bounded loop. Next patch adds one simple program demonstrating such scenario. To keep things simple, instead of trying to detect back edges in privileged mode, just assume every back edge is valid and let subsequent BPF verification prove or reject bounded loops. Note a few test changes. For unknown reason, we have a few tests that are specified to detect a back-edge in a privileged mode, but looking at their code it seems like the right outcome is passing check_cfg() and letting subsequent verification to make a decision about bounded or not bounded looping. Bounded recursion case is also interesting. The example should pass, as recursion is limited to just a few levels and so we never reach maximum number of nested frames and never exhaust maximum stack depth. But the way that max stack depth logic works today it falsely detects this as exceeding max nested frame count. This patch series doesn't attempt to fix this orthogonal problem, so we just adjust expected verifier failure. 
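One shape that trips the old logic is a bounded loop whose back-edge is a plain fall-through rather than the conditional jump itself; in the selftests' inline-asm style it looks roughly like this (a sketch, not the actual conditional_loop2 test):

SEC("socket")
__naked void fallthrough_backedge_sketch(void)
{
	asm volatile (
	"r0 = 0;"
	"goto l1_%=;"
"l0_%=:"
	"r0 += 1;"			/* l0 falls through into l1, which is still on the DFS stack */
"l1_%=:"
	"r0 += 1;"
	"if r0 < 10 goto l0_%=;"	/* bounded conditional loop back to l0 */
	"exit;"
	::: __clobber_common
	);
}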
Suggested-by: Alexei Starovoitov Fixes: 2589726d12a1 ("bpf: introduce bounded loops") Reported-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 ++++++++-------------- .../testing/selftests/bpf/progs/verifier_loops1.c | 9 ++++++--- tools/testing/selftests/bpf/verifier/calls.c | 6 +++--- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 484c742f733e..a2267d5ed14e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15403,8 +15403,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -15436,7 +15435,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -15459,7 +15458,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, int ret, insn_sz; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; @@ -15469,12 +15468,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -15496,7 +15490,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) != BPF_JMP && BPF_CLASS(insn->code) != BPF_JMP32) { insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + return push_insn(t, t + insn_sz, FALLTHROUGH, env); } switch (BPF_OP(insn->code)) { @@ -15543,8 +15537,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) off = insn->imm; /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; @@ -15557,11 +15550,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c index 5bc86af80a9a..71735dbf33d4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_loops1.c +++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c @@ -75,9 +75,10 @@ l0_%=: r0 += 1; \ " ::: __clobber_all); } -SEC("tracepoint") +SEC("socket") __description("bounded loop, start in the middle") -__failure __msg("back-edge") +__success +__failure_unpriv __msg_unpriv("back-edge") __naked void loop_start_in_the_middle(void) { asm volatile (" \ @@ -136,7 +137,9 @@ l0_%=: exit; \ SEC("tracepoint") __description("bounded recursion") -__failure __msg("back-edge") +__failure +/* verifier limitation in detecting max stack depth */ +__msg("the call stack of 8 frames is too deep !") __naked void bounded_recursion(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 1bdf2b43e49e..3d5cd51071f0 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -442,7 +442,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge from insn 0 to 0", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -799,7 +799,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -811,7 +811,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { -- cgit From e2e57d637aa5da0a2f49d83ad44e9febf95df7b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:11 -0800 Subject: selftests/bpf: add more test cases for check_cfg() Add a few more simple cases to validate proper privileged vs unprivileged loop detection behavior. conditional_loop2 is the one reported by Hao Sun that triggered this set of fixes. 
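The expectations in these tests are encoded with the selftest loader annotations visible in
the diff below; roughly (comments are illustrative):

    SEC("socket")                               /* socket filter programs can, with unprivileged
                                                 * BPF enabled, be loaded without privileges */
    __success                                   /* privileged load: accepted, check_cfg() no
                                                 * longer rejects the back-edge */
    __failure_unpriv __msg_unpriv("back-edge")  /* unprivileged load: check_cfg() still reports
                                                 * a back-edge */

This is also why the previous patch switched the "bounded loop, start in the middle" test
from SEC("tracepoint") to SEC("socket"): it lets a single test exercise both the privileged
and the unprivileged outcome.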
Acked-by: Eduard Zingerman Suggested-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_cfg.c | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_cfg.c b/tools/testing/selftests/bpf/progs/verifier_cfg.c index df7697b94007..c1f55e1d80a4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_cfg.c +++ b/tools/testing/selftests/bpf/progs/verifier_cfg.c @@ -97,4 +97,66 @@ l0_%=: r2 = r0; \ " ::: __clobber_all); } +SEC("socket") +__description("conditional loop (2)") +__success +__failure_unpriv __msg_unpriv("back-edge from insn 10 to 11") +__naked void conditional_loop2(void) +{ + asm volatile (" \ + r9 = 2 ll; \ + r3 = 0x20 ll; \ + r4 = 0x35 ll; \ + r8 = r4; \ + goto l1_%=; \ +l0_%=: r9 -= r3; \ + r9 -= r4; \ + r9 -= r8; \ +l1_%=: r8 += r4; \ + if r8 < 0x64 goto l0_%=; \ + r0 = r9; \ + exit; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +__failure __msg("infinite loop detected") +__failure_unpriv __msg_unpriv("back-edge from insn 3 to 2") +__naked void uncond_loop_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 = 1; \ + goto l0_%=; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + + +__naked __noinline __used +static unsigned long never_ending_subprog() +{ + asm volatile (" \ + r0 = r1; \ + goto -1; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +/* infinite loop is detected *after* check_cfg() */ +__failure __msg("infinite loop detected") +__naked void uncond_loop_in_subprog_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 += 1; \ + call never_ending_subprog; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit From 8a4f030dbced6fc255cbe67b2d0a129947e18493 Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Wed, 8 Nov 2023 02:18:36 +0530 Subject: ptp: Fixes a null pointer dereference in ptp_ioctl Syzkaller found a null pointer dereference in ptp_ioctl originating from the lack of a null check for tsevq. ``` general protection fault, probably for non-canonical address 0xdffffc000000020b: 0000 [#1] PREEMPT SMP KASAN KASAN: probably user-memory-access in range [0x0000000000001058-0x000000000000105f] CPU: 0 PID: 5053 Comm: syz-executor353 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:ptp_ioctl+0xcb7/0x1d10 drivers/ptp/ptp_chardev.c:476 ... Call Trace: posix_clock_ioctl+0xf8/0x160 kernel/time/posix-clock.c:86 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b ``` This patch fixes the issue by adding a check for tsevq and ensuring ptp_ioctl returns with an error if tsevq is null. Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a78ecea7ac1a2ea26e5 Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks") Signed-off-by: Yuran Pereira Reviewed-by: Przemek Kitszel Signed-off-by: David S. 
Miller --- drivers/ptp/ptp_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a74788802..e95a6ed130ef 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,6 +176,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; + if (!tsevq) + return -EINVAL; switch (cmd) { -- cgit From 871019b22d1bcc9fab2d1feba1b9a564acbb6e99 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Nov 2023 13:13:25 -0800 Subject: net: set SOCK_RCU_FREE before inserting socket into hashtable We've started to see the following kernel traces: WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0 Call Trace: __bpf_skc_lookup+0x10d/0x120 bpf_sk_lookup+0x48/0xd0 bpf_sk_lookup_tcp+0x19/0x20 bpf_prog_+0x37c/0x16a3 cls_bpf_classify+0x205/0x2e0 tcf_classify+0x92/0x160 __netif_receive_skb_core+0xe52/0xf10 __netif_receive_skb_list_core+0x96/0x2b0 napi_complete_done+0x7b5/0xb70 _poll+0x94/0xb0 net_rx_action+0x163/0x1d70 __do_softirq+0xdc/0x32e asm_call_irq_on_stack+0x12/0x20 do_softirq_own_stack+0x36/0x50 do_softirq+0x44/0x70 __inet_hash can race with lockless (rcu) readers on the other cpus: __inet_hash __sk_nulls_add_node_rcu <- (bpf triggers here) sock_set_flag(SOCK_RCU_FREE) Let's move the SOCK_RCU_FREE part up a bit, before we are inserting the socket into hashtables. Note, that the race is really harmless; the bpf callers are handling this situation (where listener socket doesn't have SOCK_RCU_FREE set) correctly, so the only annoyance is a WARN_ONCE. More details from Eric regarding SOCK_RCU_FREE timeline: Commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") added SOCK_RCU_FREE. At that time, the precise location of sock_set_flag(sk, SOCK_RCU_FREE) did not matter, because the thread calling __inet_hash() owns a reference on sk. SOCK_RCU_FREE was only tested at dismantle time. Commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") started checking SOCK_RCU_FREE _after_ the lookup to infer whether the refcount has been taken care of. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Reviewed-by: Eric Dumazet Signed-off-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 598c1b114d2c..a532f749e477 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (err) goto unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); - sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); -- cgit From cbe9e68e1e0f965fd3b1caf975eb29083c65e6b0 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Fri, 10 Nov 2023 14:57:49 +0530 Subject: MAINTAINERS: net: Update reviewers for TI's Ethernet drivers Grygorii is no longer associated with TI and messages addressed to him bounce. Add Siddharth, Roger and myself as reviewers. Signed-off-by: Ravi Gunasekaran Signed-off-by: David S. 
Miller --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 350d00657f6b..e3acb36989f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21769,7 +21769,9 @@ F: Documentation/devicetree/bindings/counter/ti-eqep.yaml F: drivers/counter/ti-eqep.c TI ETHERNET SWITCH DRIVER (CPSW) -R: Grygorii Strashko +R: Siddharth Vadapalli +R: Ravi Gunasekaran +R: Roger Quadros L: linux-omap@vger.kernel.org L: netdev@vger.kernel.org S: Maintained -- cgit From 18f039428c7df183b09c69ebf10ffd4e521035d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 15:22:41 +0000 Subject: ipvlan: add ipvlan_route_v6_outbound() helper Inspired by syzbot reports using a stack of multiple ipvlan devices. Reduce stack size needed in ipvlan_process_v6_outbound() by moving the flowi6 struct used for the route lookup in an non inlined helper. ipvlan_route_v6_outbound() needs 120 bytes on the stack, immediately reclaimed. Also make sure ipvlan_process_v4_outbound() is not inlined. We might also have to lower MAX_NEST_DEV, because only syzbot uses setups with more than four stacked devices. BUG: TASK stack guard page was hit at ffffc9000e803ff8 (stack is ffffc9000e804000..ffffc9000e808000) stack guard page: 0000 [#1] SMP KASAN CPU: 0 PID: 13442 Comm: syz-executor.4 Not tainted 6.1.52-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:kasan_check_range+0x4/0x2a0 mm/kasan/generic.c:188 Code: 48 01 c6 48 89 c7 e8 db 4e c1 03 31 c0 5d c3 cc 0f 0b eb 02 0f 0b b8 ea ff ff ff 5d c3 cc 00 00 cc cc 00 00 cc cc 55 48 89 e5 <41> 57 41 56 41 55 41 54 53 b0 01 48 85 f6 0f 84 a4 01 00 00 48 89 RSP: 0018:ffffc9000e804000 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817e5bf2 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff887c6568 RBP: ffffc9000e804000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff92001d0080c R13: dffffc0000000000 R14: ffffffff87e6b100 R15: 0000000000000000 FS: 00007fd0c55826c0(0000) GS:ffff8881f6800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000e803ff8 CR3: 0000000170ef7000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <#DF> [] __kasan_check_read+0x11/0x20 mm/kasan/shadow.c:31 [] instrument_atomic_read include/linux/instrumented.h:72 [inline] [] _test_bit include/asm-generic/bitops/instrumented-non-atomic.h:141 [inline] [] cpumask_test_cpu include/linux/cpumask.h:506 [inline] [] cpu_online include/linux/cpumask.h:1092 [inline] [] trace_lock_acquire include/trace/events/lock.h:24 [inline] [] lock_acquire+0xe2/0x590 kernel/locking/lockdep.c:5632 [] rcu_lock_acquire+0x2e/0x40 include/linux/rcupdate.h:306 [] rcu_read_lock include/linux/rcupdate.h:747 [inline] [] ip6_pol_route+0x15d/0x1440 net/ipv6/route.c:2221 [] ip6_pol_route_output+0x50/0x80 net/ipv6/route.c:2606 [] pol_lookup_func include/net/ip6_fib.h:584 [inline] [] fib6_rule_lookup+0x265/0x620 net/ipv6/fib6_rules.c:116 [] ip6_route_output_flags_noref+0x2d9/0x3a0 net/ipv6/route.c:2638 [] ip6_route_output_flags+0xca/0x340 net/ipv6/route.c:2651 [] ip6_route_output include/net/ip6_route.h:100 [inline] [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:473 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 
drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0xc33/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 
drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_resolve_output+0x64e/0x750 net/core/neighbour.c:1560 [] neigh_output include/net/neighbour.h:545 [inline] [] ip6_finish_output2+0x1643/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] NF_HOOK include/linux/netfilter.h:309 [inline] [] ip6_xmit+0x11a4/0x1b20 net/ipv6/ip6_output.c:352 [] sctp_v6_xmit+0x9ae/0x1230 net/sctp/ipv6.c:250 [] sctp_packet_transmit+0x25de/0x2bc0 net/sctp/output.c:653 [] sctp_packet_singleton+0x202/0x310 net/sctp/outqueue.c:783 [] sctp_outq_flush_ctrl net/sctp/outqueue.c:914 [inline] [] sctp_outq_flush+0x661/0x3d40 net/sctp/outqueue.c:1212 [] sctp_outq_uncork+0x79/0xb0 net/sctp/outqueue.c:764 [] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline] [] sctp_do_sm+0x55c0/0x5c30 net/sctp/sm_sideeffect.c:1170 [] sctp_primitive_ASSOCIATE+0x97/0xc0 net/sctp/primitive.c:73 [] sctp_sendmsg_to_asoc+0xf62/0x17b0 net/sctp/socket.c:1839 [] sctp_sendmsg+0x212e/0x33b0 net/sctp/socket.c:2029 [] inet_sendmsg+0x149/0x310 net/ipv4/af_inet.c:849 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] ____sys_sendmsg+0x572/0x8c0 net/socket.c:2504 [] ___sys_sendmsg net/socket.c:2558 [inline] [] __sys_sendmsg+0x271/0x360 net/socket.c:2587 [] __do_sys_sendmsg net/socket.c:2596 [inline] [] __se_sys_sendmsg net/socket.c:2594 [inline] [] __x64_sys_sendmsg+0x7f/0x90 net/socket.c:2594 [] do_syscall_x64 arch/x86/entry/common.c:51 [inline] [] do_syscall_64+0x53/0x80 arch/x86/entry/common.c:84 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ipvlan/ipvlan_core.c | 41 ++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 21e9cac73121..2d5b021b4ea6 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -411,7 +411,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, return addr; } -static int ipvlan_process_v4_outbound(struct sk_buff *skb) +static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; @@ -453,13 +453,11 @@ out: } #if IS_ENABLED(CONFIG_IPV6) -static int ipvlan_process_v6_outbound(struct sk_buff *skb) + +static noinline_for_stack int +ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); - struct dst_entry *dst; - int err, ret = NET_XMIT_DROP; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, @@ -469,27 +467,38 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; + struct dst_entry *dst; + int err; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - ret = dst->error; + dst = ip6_route_output(dev_net(dev), NULL, &fl6); + err = dst->error; + if (err) { dst_release(dst); - goto err; + return err; } skb_dst_set(skb, dst); + return 0; +} + +static int ipvlan_process_v6_outbound(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + int err, ret = NET_XMIT_DROP; + + err = ipvlan_route_v6_outbound(dev, skb); + if (unlikely(err)) { + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); + return err; + } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - err = ip6_local_out(net, skb->sk, skb); + err = ip6_local_out(dev_net(dev), skb->sk, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; - goto out; -err: - DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); -out: return ret; } #else -- cgit From 719639853d88071dfdfd8d9971eca9c283ff314c Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 9 Nov 2023 00:44:20 +0900 Subject: tty: Fix uninit-value access in ppp_sync_receive() KMSAN reported the following uninit-value access issue: ===================================================== BUG: KMSAN: uninit-value in ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] BUG: KMSAN: uninit-value in ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: __alloc_pages+0x75d/0xe80 mm/page_alloc.c:4591 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] __page_frag_cache_refill+0x9a/0x2c0 mm/page_alloc.c:4691 page_frag_alloc_align+0x91/0x5d0 mm/page_alloc.c:4722 page_frag_alloc include/linux/gfp.h:322 [inline] __netdev_alloc_skb+0x215/0x6d0 net/core/skbuff.c:728 
netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0x237/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 12950 Comm: syz-executor.1 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== ppp_sync_input() checks the first 2 bytes of the data are PPP_ALLSTATIONS and PPP_UI. However, if the data length is 1 and the first byte is PPP_ALLSTATIONS, an access to an uninitialized value occurs when checking PPP_UI. This patch resolves this issue by checking the data length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ebcdffdf4f0e..ea261a628786 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -687,7 +687,7 @@ ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count) /* strip address/control field if present */ p = skb->data; - if (p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { + if (skb->len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto err; -- cgit From 4b21a669ca21ed8f24ef4530b2918be5730114de Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 10 Nov 2023 15:16:06 +0800 Subject: ALSA: hda/realtek - Add Dell ALC295 to pin fall back table Add ALC295 to pin fall back table. Remove 5 pin quirks for Dell ALC295. ALC295 was only support MIC2 for external MIC function. ALC295 assigned model "ALC269_FIXUP_DELL1_MIC_NO_PRESENCE" for pin fall back table. It was assigned wrong model. So, let's remove it. 
Fixes: fbc571290d9f ("ALSA: hda/realtek - Fixed Headphone Mic can't record on Dell platform") Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/7c1998e873834df98d59bd7e0d08c72e@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 669ae3d6e447..d689f0050aae 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10821,22 +10821,6 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60130}, {0x17, 0x90170110}, {0x21, 0x03211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014020}, - {0x18, 0x21a19030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014040}, - {0x18, 0x21a19050}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_STANDARD_PINS, {0x17, 0x90170110}), @@ -10880,6 +10864,9 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1b, 0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + {0x19, 0x40000000}, + {0x1b, 0x40000000}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1a, 0x40000000}), -- cgit From 8384c0baf223e1c3bc7b1c711d80a4c6106d210e Mon Sep 17 00:00:00 2001 From: Eymen Yigit Date: Fri, 10 Nov 2023 18:07:15 +0300 Subject: ALSA: hda/realtek: Enable Mute LED on HP 255 G8 This HP Notebook uses ALC236 codec with COEF 0x07 idx 1 controlling the mute LED. Enable already existing quirk for this device. 
Signed-off-by: Eymen Yigit Cc: Luka Guzenko Cc: Link: https://lore.kernel.org/r/20231110150715.5141-1-eymenyg01@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d689f0050aae..c7de74396185 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9832,6 +9832,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8902, "HP OMEN 16", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x890e, "HP 255 G8 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8919, "HP Pavilion Aero Laptop 13-be0xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896d, "HP ZBook Firefly 16 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), -- cgit From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Nov 2023 15:57:13 +0100 Subject: hrtimers: Push pending hrtimers away from outgoing CPU earlier 2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug") solved the straight forward CPU hotplug deadlock vs. the scheduler bandwidth timer. Yu discovered a more involved variant where a task which has a bandwidth timer started on the outgoing CPU holds a lock and then gets throttled. If the lock required by one of the CPU hotplug callbacks the hotplug operation deadlocks because the unthrottling timer event is not handled on the dying CPU and can only be recovered once the control CPU reaches the hotplug state which pulls the pending hrtimers from the dead CPU. Solve this by pushing the hrtimers away from the dying CPU in the dying callbacks. Nothing can queue a hrtimer on the dying CPU at that point because all other CPUs spin in stop_machine() with interrupts disabled and once the operation is finished the CPU is marked offline. 
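The scenario above can be sketched as the following sequence (illustrative only; "lock L"
stands for whichever lock the affected hotplug callback needs):

    outgoing CPU                               control CPU (drives the hotplug machinery)
    ------------                               ------------------------------------------
    task T arms its bandwidth hrtimer
      on this CPU
    task T takes lock L
    task T is throttled; unthrottling
      waits for that hrtimer to fire
                                               a hotplug callback blocks on lock L
    the CPU goes dead with the hrtimer
      still queued; before this patch it
      would only be migrated once the
      control CPU reached the hrtimers
      teardown state ...
                                               ... which is never reached, because the
                                               machinery is still blocked on lock L
    => deadlock

Doing the migration in a dying-state callback on the outgoing CPU itself, as this patch
does, removes the dependency on the control CPU making further progress.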
Reported-by: Yu Liao Signed-off-by: Thomas Gleixner Tested-by: Liu Tie Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx --- include/linux/cpuhotplug.h | 1 + include/linux/hrtimer.h | 4 ++-- kernel/cpu.c | 8 +++++++- kernel/time/hrtimer.c | 33 ++++++++++++--------------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 068f7738be22..448f5f995adc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -193,6 +193,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, + CPUHP_AP_HRTIMERS_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f10..f2044d5a652b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU -int hrtimers_dead_cpu(unsigned int cpu); +int hrtimers_cpu_dying(unsigned int cpu); #else -#define hrtimers_dead_cpu NULL +#define hrtimers_cpu_dying NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..2e69a1deaa31 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2098,7 +2098,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, - .teardown.single = hrtimers_dead_cpu, + .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", @@ -2190,6 +2190,12 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, + [CPUHP_AP_HRTIMERS_DYING] = { + .name = "hrtimers:dying", + .startup.single = NULL, + .teardown.single = hrtimers_cpu_dying, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 238262e4aba7..760793998cdd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, ncpu = cpumask_first(cpu_active_mask); - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], @@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu) * The migration might have changed the first expiring softirq * timer on this CPU. Update it. 
*/ - hrtimer_update_softirq_timer(new_base, false); + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } -- cgit From e409d7346648c9acff84c3cc8d291767ee2d5326 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:02 +0100 Subject: net: ti: icssg-prueth: Add missing icss_iep_put to error path Analogously to prueth_remove, just also taking care for NULL'ing the iep pointers. Fixes: 186734c15886 ("net: ti: icssg-prueth: add packet timestamping and ptp support") Fixes: 443a2367ba3c ("net: ti: icssg-prueth: am65x SR2.0 add 10M full duplex support") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: MD Danish Anwar Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c4b64227ac8..3abbeba26f1b 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2105,10 +2105,7 @@ static int prueth_probe(struct platform_device *pdev) prueth->iep1 = icss_iep_get_idx(np, 1); if (IS_ERR(prueth->iep1)) { ret = dev_err_probe(dev, PTR_ERR(prueth->iep1), "iep1 get failed\n"); - icss_iep_put(prueth->iep0); - prueth->iep0 = NULL; - prueth->iep1 = NULL; - goto free_pool; + goto put_iep0; } if (prueth->pdata.quirk_10m_link_issue) { @@ -2205,6 +2202,12 @@ netdev_exit: exit_iep: if (prueth->pdata.quirk_10m_link_issue) icss_iep_exit_fw(prueth->iep1); + icss_iep_put(prueth->iep1); + +put_iep0: + icss_iep_put(prueth->iep0); + prueth->iep0 = NULL; + prueth->iep1 = NULL; free_pool: gen_pool_free(prueth->sram_pool, -- cgit From 2bd5b559a1f391f05927bbb0b31381fa71c61e26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:08 +0100 Subject: net: ti: icssg-prueth: Fix error cleanup on failing pruss_request_mem_region We were just continuing in this case, surely not desired. Fixes: 128d5874c082 ("net: ti: icssg-prueth: Add ICSSG ethernet driver") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: Roger Quadros Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 3abbeba26f1b..411898a4f38c 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2063,7 +2063,7 @@ static int prueth_probe(struct platform_device *pdev) &prueth->shram); if (ret) { dev_err(dev, "unable to get PRUSS SHRD RAM2: %d\n", ret); - pruss_put(prueth->pruss); + goto put_pruss; } prueth->sram_pool = of_gen_pool_get(np, "sram", 0); @@ -2215,6 +2215,8 @@ free_pool: put_mem: pruss_release_mem_region(prueth->pruss, &prueth->shram); + +put_pruss: pruss_put(prueth->pruss); put_cores: -- cgit From 713f040cd22285fcc506f40a0d259566e6758c3c Mon Sep 17 00:00:00 2001 From: Chandradeep Dey Date: Sat, 11 Nov 2023 19:25:49 +0100 Subject: ALSA: hda/realtek - Enable internal speaker of ASUS K6500ZC Apply the already existing quirk chain ALC294_FIXUP_ASUS_SPK to enable the internal speaker of ASUS K6500ZC. Signed-off-by: Chandradeep Dey Cc: Link: https://lore.kernel.org/r/NizcVHQ--3-9@chandradeepdey.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c7de74396185..cdd808e02b44 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9907,6 +9907,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x10a1, "ASUS UX391UA", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x10c0, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x10d0, "ASUS X540LA/X540LJ", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x10d3, "ASUS K6500ZC", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1043, 0x11c0, "ASUS X556UR", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x125e, "ASUS Q524UQK", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), -- cgit From 7b211c7671212cad0b83603c674838c7e824d845 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 10 Nov 2023 10:30:11 +0100 Subject: Revert "i2c: pxa: move to generic GPIO recovery" This reverts commit 0b01392c18b9993a584f36ace1d61118772ad0ca. Conversion of PXA to generic I2C recovery, makes the I2C bus completely lock up if recovery pinctrl is present in the DT and I2C recovery is enabled. So, until the generic I2C recovery can also work with PXA lets revert to have working I2C and I2C recovery again. 
Signed-off-by: Robert Marko Cc: stable@vger.kernel.org # 5.11+ Acked-by: Andi Shyti Acked-by: Russell King (Oracle) Acked-by: Linus Walleij Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pxa.c | 76 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 1d7648242749..76f79b68cef8 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -265,6 +265,9 @@ struct pxa_i2c { u32 hs_mask; struct i2c_bus_recovery_info recovery; + struct pinctrl *pinctrl; + struct pinctrl_state *pinctrl_default; + struct pinctrl_state *pinctrl_recovery; }; #define _IBMR(i2c) ((i2c)->reg_ibmr) @@ -1299,12 +1302,13 @@ static void i2c_pxa_prepare_recovery(struct i2c_adapter *adap) */ gpiod_set_value(i2c->recovery.scl_gpiod, ibmr & IBMR_SCLS); gpiod_set_value(i2c->recovery.sda_gpiod, ibmr & IBMR_SDAS); + + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery)); } static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) { struct pxa_i2c *i2c = adap->algo_data; - struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; u32 isr; /* @@ -1318,7 +1322,7 @@ static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) i2c_pxa_do_reset(i2c); } - WARN_ON(pinctrl_select_state(bri->pinctrl, bri->pins_default)); + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default)); dev_dbg(&i2c->adap.dev, "recovery: IBMR 0x%08x ISR 0x%08x\n", readl(_IBMR(i2c)), readl(_ISR(i2c))); @@ -1340,20 +1344,76 @@ static int i2c_pxa_init_recovery(struct pxa_i2c *i2c) if (IS_ENABLED(CONFIG_I2C_PXA_SLAVE)) return 0; - bri->pinctrl = devm_pinctrl_get(dev); - if (PTR_ERR(bri->pinctrl) == -ENODEV) { - bri->pinctrl = NULL; + i2c->pinctrl = devm_pinctrl_get(dev); + if (PTR_ERR(i2c->pinctrl) == -ENODEV) + i2c->pinctrl = NULL; + if (IS_ERR(i2c->pinctrl)) + return PTR_ERR(i2c->pinctrl); + + if (!i2c->pinctrl) + return 0; + + i2c->pinctrl_default = pinctrl_lookup_state(i2c->pinctrl, + PINCTRL_STATE_DEFAULT); + i2c->pinctrl_recovery = pinctrl_lookup_state(i2c->pinctrl, "recovery"); + + if (IS_ERR(i2c->pinctrl_default) || IS_ERR(i2c->pinctrl_recovery)) { + dev_info(dev, "missing pinmux recovery information: %ld %ld\n", + PTR_ERR(i2c->pinctrl_default), + PTR_ERR(i2c->pinctrl_recovery)); + return 0; + } + + /* + * Claiming GPIOs can influence the pinmux state, and may glitch the + * I2C bus. Do this carefully. + */ + bri->scl_gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); + if (bri->scl_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + if (IS_ERR(bri->scl_gpiod)) { + dev_info(dev, "missing scl gpio recovery information: %pe\n", + bri->scl_gpiod); + return 0; + } + + /* + * We have SCL. Pull SCL low and wait a bit so that SDA glitches + * have no effect. + */ + gpiod_direction_output(bri->scl_gpiod, 0); + udelay(10); + bri->sda_gpiod = devm_gpiod_get(dev, "sda", GPIOD_OUT_HIGH_OPEN_DRAIN); + + /* Wait a bit in case of a SDA glitch, and then release SCL. 
*/ + udelay(10); + gpiod_direction_output(bri->scl_gpiod, 1); + + if (bri->sda_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + + if (IS_ERR(bri->sda_gpiod)) { + dev_info(dev, "missing sda gpio recovery information: %pe\n", + bri->sda_gpiod); return 0; } - if (IS_ERR(bri->pinctrl)) - return PTR_ERR(bri->pinctrl); bri->prepare_recovery = i2c_pxa_prepare_recovery; bri->unprepare_recovery = i2c_pxa_unprepare_recovery; + bri->recover_bus = i2c_generic_scl_recovery; i2c->adap.bus_recovery_info = bri; - return 0; + /* + * Claiming GPIOs can change the pinmux state, which confuses the + * pinctrl since pinctrl's idea of the current setting is unaffected + * by the pinmux change caused by claiming the GPIO. Work around that + * by switching pinctrl to the GPIO state here. We do it this way to + * avoid glitching the I2C bus. + */ + pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery); + + return pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default); } static int i2c_pxa_probe(struct platform_device *dev) -- cgit From 2a5db859c6825b5d50377dda9c3cc729c20cad43 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:17 +0800 Subject: xfs: factor out xfs_defer_pending_abort Factor out xfs_defer_pending_abort() from xfs_defer_trans_abort(), which not use transaction parameter, so it can be used after the transaction life cycle. Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cd..88388e12f8e7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,21 +245,18 @@ xfs_defer_create_intents( return ret; } -/* Abort all the intents that were committed. */ STATIC void -xfs_defer_trans_abort( - struct xfs_trans *tp, - struct list_head *dop_pending) +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct list_head *dop_list) { struct xfs_defer_pending *dfp; const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { + list_for_each_entry(dfp, dop_list, dfp_list) { ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; @@ -267,6 +264,16 @@ xfs_defer_trans_abort( } } +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct list_head *dop_pending) +{ + trace_xfs_defer_trans_abort(tp, _RET_IP_); + xfs_defer_pending_abort(tp->t_mountp, dop_pending); +} + /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. -- cgit From f8f9d952e42dd49ae534f61f2fa7ca0876cb9848 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:18 +0800 Subject: xfs: abort intent items when recovery intents fail When recovering intents, we capture newly created intent items as part of committing recovered intent items. 
If intent recovery fails at a later point, we forget to remove those newly created intent items from the AIL and hang: [root@localhost ~]# cat /proc/539/stack [<0>] xfs_ail_push_all_sync+0x174/0x230 [<0>] xfs_unmount_flush_inodes+0x8d/0xd0 [<0>] xfs_mountfs+0x15f7/0x1e70 [<0>] xfs_fs_fill_super+0x10ec/0x1b20 [<0>] get_tree_bdev+0x3c8/0x730 [<0>] vfs_get_tree+0x89/0x2c0 [<0>] path_mount+0xecf/0x1800 [<0>] do_mount+0xf3/0x110 [<0>] __x64_sys_mount+0x154/0x1f0 [<0>] do_syscall_64+0x39/0x80 [<0>] entry_SYSCALL_64_after_hwframe+0x63/0xcd When newly created intent items fail to commit via transaction, intent recovery hasn't created done items for these newly created intent items, so the capture structure is the sole owner of the captured intent items. We must release them explicitly or else they leak: unreferenced object 0xffff888016719108 (size 432): comm "mount", pid 529, jiffies 4294706839 (age 144.463s) hex dump (first 32 bytes): 08 91 71 16 80 88 ff ff 08 91 71 16 80 88 ff ff ..q.......q..... 18 91 71 16 80 88 ff ff 18 91 71 16 80 88 ff ff ..q.......q..... backtrace: [] xfs_efi_init+0x18f/0x1d0 [] xfs_extent_free_create_intent+0x50/0x150 [] xfs_defer_create_intents+0x16a/0x340 [] xfs_defer_ops_capture_and_commit+0x8e/0xad0 [] xfs_cui_item_recover+0x819/0x980 [] xlog_recover_process_intents+0x246/0xb70 [] xlog_recover_finish+0x8a/0x9a0 [] xfs_log_mount_finish+0x2bb/0x4a0 [] xfs_mountfs+0x14bf/0x1e70 [] xfs_fs_fill_super+0x10d0/0x1b20 [] get_tree_bdev+0x3d2/0x6d0 [] vfs_get_tree+0x89/0x2c0 [] path_mount+0xecf/0x1800 [] do_mount+0xf3/0x110 [] __x64_sys_mount+0x154/0x1f0 [] do_syscall_64+0x39/0x80 Fix the problem above by abort intent items that don't have a done item when recovery intents fail. Fixes: e6fff81e4870 ("xfs: proper replay of deferred ops queued during log recovery") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 5 +++-- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 88388e12f8e7..f71679ce23b9 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -763,12 +763,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -809,7 +810,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. 
*/ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3..8788ad5f6a73 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13b94d2e605b..a1e18b24971a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2511,7 +2511,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } -- cgit From 00080503612f61d1ad67be641ed9cb4f9f6ba40e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 19 Sep 2023 17:12:45 -0400 Subject: XFS: Update MAINTAINERS to catch all XFS documentation Assumes that all XFS documentation will be prefixed with xfs-, which seems like a good policy anyway. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Chandan Babu R --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..a78795dc9aee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23872,8 +23872,7 @@ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git P: Documentation/filesystems/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-delayed-logging-design.rst -F: Documentation/filesystems/xfs-self-describing-metadata.rst +F: Documentation/filesystems/xfs-* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h -- cgit From 55f669f34184ecb25b8353f29c7f6f1ae5b313d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Oct 2023 17:28:52 +0200 Subject: xfs: only remap the written blocks in xfs_reflink_end_cow_extent xfs_reflink_end_cow_extent looks up the COW extent and the data fork extent at offset_fsb, and then proceeds to remap the common subset between the two. It does however not limit the remapped extent to the passed in [*offset_fsbm end_fsb] range and thus potentially remaps more blocks than the one handled by the current I/O completion. This means that with sufficiently large data and COW extents we could be remapping COW fork mappings that have not been written to, leading to a stale data exposure on a powerfail event. We use to have a xfs_trim_range to make the remap fit the I/O completion range, but that got (apparently accidentally) removed in commit df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents"). Note that I've only found this by code inspection, and a test case would probably require very specific delay and error injection. Fixes: df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents") Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 658edee8381d..e5b62dc28466 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; -- cgit From 471de20303dda0b67981e06d59cc6c4a83fd2a3c Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Mon, 30 Oct 2023 13:33:49 -0700 Subject: xfs: up(ic_sema) if flushing data device fails We flush the data device cache before we issue external log IO. If the flush fails, we shut down the log immediately and return. However, the iclog->ic_sema is left in a decremented state so let's add an up(). Prior to this patch, xfs/438 would fail consistently when running with an external log device: sync -> xfs_log_force -> xlog_write_iclog -> down(&iclog->ic_sema) -> blkdev_issue_flush (fail causes us to intiate shutdown) -> xlog_force_shutdown -> return unmount -> xfs_log_umount -> xlog_wait_iclog_completion -> down(&iclog->ic_sema) --------> HANG There is a second early return / shutdown. Make sure the up() happens for it as well. Also make sure we cleanup the iclog state, xlog_state_done_syncing, before dropping the iclog lock. Fixes: b5d721eaae47 ("xfs: external logs need to flush data device") Fixes: 842a42d126b4 ("xfs: shutdown on failure to add page to log bio") Fixes: 7d839e325af2 ("xfs: check return codes when flushing block devices") Signed-off-by: Leah Rumancik Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 51c100c86177..ee206facf0dc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1893,9 +1893,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1925,20 +1923,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1959,6 +1954,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* -- cgit From f63a5b3769ad7659da4c0420751d78958ab97675 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 1 Nov 2023 09:41:45 -0700 Subject: xfs: fix internal error from AGFL exhaustion We've been seeing XFS errors like the following: XFS: Internal error i != 1 at line 3526 of file fs/xfs/libxfs/xfs_btree.c. Caller xfs_btree_insert+0x1ec/0x280 ... 
Call Trace: xfs_corruption_error+0x94/0xa0 xfs_btree_insert+0x221/0x280 xfs_alloc_fixup_trees+0x104/0x3e0 xfs_alloc_ag_vextent_size+0x667/0x820 xfs_alloc_fix_freelist+0x5d9/0x750 xfs_free_extent_fix_freelist+0x65/0xa0 __xfs_free_extent+0x57/0x180 ... This is the XFS_IS_CORRUPT() check in xfs_btree_insert() when xfs_btree_insrec() fails. After converting this into a panic and dissecting the core dump, I found that xfs_btree_insrec() is failing because it's trying to split a leaf node in the cntbt when the AG free list is empty. In particular, it's failing to get a block from the AGFL _while trying to refill the AGFL_. If a single operation splits every level of the bnobt and the cntbt (and the rmapbt if it is enabled) at once, the free list will be empty. Then, when the next operation tries to refill the free list, it allocates space. If the allocation does not use a full extent, it will need to insert records for the remaining space in the bnobt and cntbt. And if those new records go in full leaves, the leaves (and potentially more nodes up to the old root) need to be split. Fix it by accounting for the additional splits that may be required to refill the free list in the calculation for the minimum free list size. P.S. As far as I can tell, this bug has existed for a long time -- maybe back to xfs-history commit afdf80ae7405 ("Add XFS_AG_MAXLEVELS macros ...") in April 1994! It requires a very unlucky sequence of events, and in fact we didn't hit it until a particular sparse mmap workload updated from 5.12 to 5.19. But this bug existed in 5.12, so it must've been exposed by some other change in allocation or writeback patterns. It's also much less likely to be hit with the rmapbt enabled, since that increases the minimum free list size and is unlikely to split at the same time as the bnobt and cntbt. Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Omar Sandoval Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd..100ab5931b31 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist( ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. 
This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + mp->m_rmap_maxlevels) * 2 - 2; return min_free; } -- cgit From a2e4388adfa44684c7c428a5a5980efe0d75e13e Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Sun, 5 Nov 2023 20:23:18 +0100 Subject: xfs: fix again select in kconfig XFS_ONLINE_SCRUB_STATS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 57c0f4a8ea3a attempted to fix the select in the kconfig entry XFS_ONLINE_SCRUB_STATS by selecting XFS_DEBUG, but the original intention was to select DEBUG_FS, since the feature relies on debugfs to export the related scrub statistics. Fixes: 57c0f4a8ea3a ("xfs: fix select in config XFS_ONLINE_SCRUB_STATS") Reported-by: Holger Hoffstätte Signed-off-by: Anthony Iliopoulos Reviewed-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ed0bc8cbc703..567fb37274d3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select XFS_DEBUG + select DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number -- cgit From 038ca189c0d2c1570b4d922f25b524007c85cf94 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:13 +1100 Subject: xfs: inode recovery does not validate the recovered inode Discovered when trying to track down a weird recovery corruption issue that wasn't detected at recovery time. The specific corruption was a zero extent count field when big extent counts are in use, and it turns out the dinode verifier doesn't detect that specific corruption case, either. So fix it too. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +++ fs/xfs/xfs_inode_item_recover.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 543f3748c2a3..137a65bda95d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -510,6 +510,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 0e5dba2343ea..1c5ba2a732b0 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; @@ -528,8 +529,19 @@ out_owner_change: (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. */ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; -- cgit From 7930d9e103700cde15833638855b750715c12091 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:14 +1100 Subject: xfs: recovery should not clear di_flushiter unconditionally Because on v3 inodes, di_flushiter doesn't exist. It overlaps with zero padding in the inode, except when NREXT64=1 configurations are in use and the zero padding is no longer padding but holds the 64 bit extent counter. This manifests obviously on big endian platforms (e.g. s390) because the log dinode is in host order and the overlap is the LSBs of the extent count field. It is not noticed on little endian machines because the overlap is at the MSB end of the extent count field and we need to get more than 2^^48 extents in the inode before it manifests. i.e. the heat death of the universe will occur before we see the problem in little endian machines. This is a zero-day issue for NREXT64=1 configuraitons on big endian machines. Fix it by only clearing di_flushiter on v2 inodes during recovery. Fixes: 9b7d16e34bbe ("xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers") cc: stable@kernel.org # 5.19+ Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode_item_recover.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 1c5ba2a732b0..144198a6b270 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -370,24 +370,26 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_has_v3inodes(mp) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; + if (!xfs_has_v3inodes(mp)) { + if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; } - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && -- cgit From e64e7c74b99ec9e439abca75f522f4b98f220bd1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 24 Oct 2023 13:51:36 +0200 Subject: xen/events: avoid using info_for_irq() in xen_send_IPI_one() xen_send_IPI_one() is being used by cpuhp_report_idle_dead() after it calls rcu_report_dead(), meaning that any RCU usage by xen_send_IPI_one() is a bad idea. Unfortunately xen_send_IPI_one() is using notify_remote_via_irq() today, which is using irq_get_chip_data() via info_for_irq(). And irq_get_chip_data() in turn is using a maple-tree lookup requiring RCU. Avoid this problem by caching the ipi event channels in another percpu variable, allowing the use notify_remote_via_evtchn() in xen_send_IPI_one(). Fixes: 721255b9826b ("genirq: Use a maple tree for interrupt descriptor management") Reported-by: David Woodhouse Signed-off-by: Juergen Gross Tested-by: David Woodhouse Acked-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6de6b084ea60..237e6b884f72 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -164,6 +164,8 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */ +static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... 
XEN_NR_IPIS-1] = 0}; /* Event channel distribution data */ static atomic_t channels_on_cpu[NR_CPUS]; @@ -366,6 +368,7 @@ static int xen_irq_info_ipi_setup(unsigned cpu, info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } @@ -981,6 +984,7 @@ static void __unbind_from_irq(unsigned int irq) break; case IRQT_IPI: per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1632,7 +1636,7 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq; + evtchn_port_t evtchn; #ifdef CONFIG_X86 if (unlikely(vector == XEN_NMI_VECTOR)) { @@ -1643,9 +1647,9 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) return; } #endif - irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + BUG_ON(evtchn == 0); + notify_remote_via_evtchn(evtchn); } struct evtchn_loop_ctrl { -- cgit From bfa993b355d33a438a746523e7129391c8664e8a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 8 Nov 2023 16:25:15 -0500 Subject: acpi/processor: sanitize _OSC/_PDC capabilities for Xen dom0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Processor capability bits notify ACPI of the OS capabilities, and so ACPI can adjust the return of other Processor methods taking the OS capabilities into account. When Linux is running as a Xen dom0, the hypervisor is the entity in charge of processor power management, and hence Xen needs to make sure the capabilities reported by _OSC/_PDC match the capabilities of the driver in Xen. Introduce a small helper to sanitize the buffer when running as Xen dom0. When Xen supports HWP, this serves as the equivalent of commit a21211672c9a ("ACPI / processor: Request native thermal interrupt handling via _OSC") to avoid SMM crashes. Xen will set bit ACPI_PROC_CAP_COLLAB_PROC_PERF (bit 12) in the capability bits and the _OSC/_PDC call will apply it. [ jandryuk: Mention Xen HWP's need. Support _OSC & _PDC ] Signed-off-by: Roger Pau Monné Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Michal Wilczynski Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20231108212517.72279-1-jandryuk@gmail.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/acpi.h | 14 ++++++++++++++ arch/x86/include/asm/xen/hypervisor.h | 9 +++++++++ drivers/xen/pcpu.c | 22 ++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8a7fc23f63c..f896eed4516c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -16,6 +16,9 @@ #include #include #include +#include + +#include #ifdef CONFIG_ACPI_APEI # include @@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap) if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_option_idle_override == IDLE_NOMWAIT) *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. 
+ */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 7048dfacc04b..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode) enum xen_lazy_mode xen_get_lazy_mode(void); +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index b3e3d1bb37f3..508655273145 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -47,6 +47,9 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif /* * @cpu_id: Xen physical cpu logic number @@ -400,4 +403,23 @@ bool __init xen_processor_present(uint32_t acpi_id) return online; } + +void xen_sanitize_proc_cap_bits(uint32_t *cap) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap }; + int ret; + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + ret = HYPERVISOR_platform_op(&op); + if (ret) + pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n", + ret); + else + *cap = buf[2]; +} #endif -- cgit From 50e865a56876bd7a74a79c1778025631150f104a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 5 Nov 2023 21:56:31 -0800 Subject: xen/shbuf: eliminate 17 kernel-doc warnings Don't use kernel-doc markers ("/**") for comments that are not in kernel-doc format. This prevents multiple kernel-doc warnings: xen-front-pgdir-shbuf.c:25: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * This structure represents the structure of a shared page xen-front-pgdir-shbuf.c:37: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Shared buffer ops which are differently implemented xen-front-pgdir-shbuf.c:65: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get granted reference to the very first page of the xen-front-pgdir-shbuf.c:85: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map granted references of the shared buffer. xen-front-pgdir-shbuf.c:106: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap granted references of the shared buffer. xen-front-pgdir-shbuf.c:127: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Free all the resources of the shared buffer. xen-front-pgdir-shbuf.c:154: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get the number of pages the page directory consumes itself. xen-front-pgdir-shbuf.c:164: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:176: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:194: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap the buffer previously mapped with grant references xen-front-pgdir-shbuf.c:242: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map the buffer with grant references provided by the backend. xen-front-pgdir-shbuf.c:324: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:354: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:393: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant references to the frontend's buffer pages. xen-front-pgdir-shbuf.c:422: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant all the references needed to share the buffer. xen-front-pgdir-shbuf.c:470: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate all required structures to mange shared buffer. xen-front-pgdir-shbuf.c:510: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate a new instance of a shared buffer. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: lore.kernel.org/r/202311060203.yQrpPZhm-lkp@intel.com Acked-by: Juergen Gross Cc: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20231106055631.21520-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- drivers/xen/xen-front-pgdir-shbuf.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index b52e0fa595a9..223870a0111b 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -21,7 +21,7 @@ #include -/** +/* * This structure represents the structure of a shared page * that contains grant references to the pages of the shared * buffer. This structure is common to many Xen para-virtualized @@ -33,7 +33,7 @@ struct xen_page_directory { grant_ref_t gref[]; /* Variable length */ }; -/** +/* * Shared buffer ops which are differently implemented * depending on the allocation mode, e.g. if the buffer * is allocated by the corresponding backend or frontend. @@ -61,7 +61,7 @@ struct xen_front_pgdir_shbuf_ops { int (*unmap)(struct xen_front_pgdir_shbuf *buf); }; -/** +/* * Get granted reference to the very first page of the * page directory. Usually this is passed to the backend, * so it can find/fill the grant references to the buffer's @@ -81,7 +81,7 @@ xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); -/** +/* * Map granted references of the shared buffer. 
* * Depending on the shared buffer mode of allocation @@ -102,7 +102,7 @@ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); -/** +/* * Unmap granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -123,7 +123,7 @@ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap); -/** +/* * Free all the resources of the shared buffer. * * \param buf shared buffer which resources to be freed. @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free); offsetof(struct xen_page_directory, \ gref)) / sizeof(grant_ref_t)) -/** +/* * Get the number of pages the page directory consumes itself. * * \param buf shared buffer. @@ -160,7 +160,7 @@ static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf) return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when backend allocates the buffer. * @@ -172,7 +172,7 @@ static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) buf->num_grefs = get_num_pages_dir(buf); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when frontend allocates the buffer. * @@ -190,7 +190,7 @@ static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) #define xen_page_to_vaddr(page) \ ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page))) -/** +/* * Unmap the buffer previously mapped with grant references * provided by the backend. * @@ -238,7 +238,7 @@ static int backend_unmap(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Map the buffer with grant references provided by the backend. * * \param buf shared buffer. @@ -320,7 +320,7 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Fill page directory with grant references to the pages of the * page directory itself. * @@ -350,7 +350,7 @@ static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf) page_dir->gref_dir_next_page = XEN_GREF_LIST_END; } -/** +/* * Fill page directory with grant references to the pages of the * page directory and the buffer we share with the backend. * @@ -389,7 +389,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) } } -/** +/* * Grant references to the frontend's buffer pages. * * These will be shared with the backend, so it can @@ -418,7 +418,7 @@ static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf, return 0; } -/** +/* * Grant all the references needed to share the buffer. * * Grant references to the page directory pages and, if @@ -466,7 +466,7 @@ static int grant_references(struct xen_front_pgdir_shbuf *buf) return 0; } -/** +/* * Allocate all required structures to mange shared buffer. * * \param buf shared buffer. @@ -506,7 +506,7 @@ static const struct xen_front_pgdir_shbuf_ops local_ops = { .grant_refs_for_buffer = guest_grant_refs_for_buffer, }; -/** +/* * Allocate a new instance of a shared buffer. * * \param cfg configuration to be used while allocating a new shared buffer. -- cgit From 472a2ff63efb30234cbf6b2cdaf8117f21b4f8bc Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:07 +0800 Subject: net: hns3: fix add VLAN fail issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hclge_sync_vlan_filter is called in periodic task, trying to remove VLAN from vlan_del_fail_bmap. 
It can be concurrence with VLAN adding operation from user. So once user failed to delete a VLAN id, and add it again soon, it may be removed by the periodic task, which may cause the software configuration being inconsistent with hardware. So add mutex handling to avoid this. user hns3 driver periodic task │ add vlan 10 ───── hns3_vlan_rx_add_vid │ │ (suppose success) │ │ │ del vlan 10 ───── hns3_vlan_rx_kill_vid │ │ (suppose fail,add to │ │ vlan_del_fail_bmap) │ │ │ add vlan 10 ───── hns3_vlan_rx_add_vid │ (suppose success) │ foreach vlan_del_fail_bmp del vlan 10 Fixes: fe4144d47eef ("net: hns3: sync VLAN filter entries when kill VLAN ID failed") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 28 +++++++++++++++------- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 11 +++++++-- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 66e5807903a0..e22279e5d43f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10025,8 +10025,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back; - mutex_lock(&hdev->vport_lock); - list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (vlan->vlan_id == vlan_id) { if (is_write_tbl && vlan->hd_tbl_status) @@ -10041,8 +10039,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, break; } } - - mutex_unlock(&hdev->vport_lock); } void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) @@ -10451,11 +10447,16 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, * handle mailbox. Just record the vlan id, and remove it after * reset finished. */ + mutex_lock(&hdev->vport_lock); if ((test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || test_bit(HCLGE_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, vport->vlan_del_fail_bmap)) { + clear_bit(vlan_id, vport->vlan_del_fail_bmap); } + mutex_unlock(&hdev->vport_lock); /* when port base vlan enabled, we use port base vlan as the vlan * filter entry. 
In this case, we don't update vlan filter table @@ -10470,17 +10471,22 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, } if (!ret) { - if (!is_kill) + if (!is_kill) { hclge_add_vport_vlan_table(vport, vlan_id, writen_to_tbl); - else if (is_kill && vlan_id != 0) + } else if (is_kill && vlan_id != 0) { + mutex_lock(&hdev->vport_lock); hclge_rm_vport_vlan_table(vport, vlan_id, false); + mutex_unlock(&hdev->vport_lock); + } } else if (is_kill) { /* when remove hw vlan filter failed, record the vlan id, * and try to remove it from hw later, to be consistence * with stack */ + mutex_lock(&hdev->vport_lock); set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); } hclge_set_vport_vlan_fltr_change(vport); @@ -10520,6 +10526,7 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) int i, ret, sync_cnt = 0; u16 vlan_id; + mutex_lock(&hdev->vport_lock); /* start from vport 1 for PF is always alive */ for (i = 0; i < hdev->num_alloc_vport; i++) { struct hclge_vport *vport = &hdev->vport[i]; @@ -10530,21 +10537,26 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), vport->vport_id, vlan_id, true); - if (ret && ret != -EINVAL) + if (ret && ret != -EINVAL) { + mutex_unlock(&hdev->vport_lock); return; + } clear_bit(vlan_id, vport->vlan_del_fail_bmap); hclge_rm_vport_vlan_table(vport, vlan_id, false); hclge_set_vport_vlan_fltr_change(vport); sync_cnt++; - if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) + if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) { + mutex_unlock(&hdev->vport_lock); return; + } vlan_id = find_first_bit(vport->vlan_del_fail_bmap, VLAN_N_VID); } } + mutex_unlock(&hdev->vport_lock); hclge_sync_vlan_fltr_state(hdev); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a4d68fb216fb..1c62e58ff6d8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1206,6 +1206,8 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle, test_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, hdev->vlan_del_fail_bmap); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, hdev->vlan_del_fail_bmap)) { + clear_bit(vlan_id, hdev->vlan_del_fail_bmap); } hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_VLAN, @@ -1233,20 +1235,25 @@ static void hclgevf_sync_vlan_filter(struct hclgevf_dev *hdev) int ret, sync_cnt = 0; u16 vlan_id; + if (bitmap_empty(hdev->vlan_del_fail_bmap, VLAN_N_VID)) + return; + + rtnl_lock(); vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); while (vlan_id != VLAN_N_VID) { ret = hclgevf_set_vlan_filter(handle, htons(ETH_P_8021Q), vlan_id, true); if (ret) - return; + break; clear_bit(vlan_id, hdev->vlan_del_fail_bmap); sync_cnt++; if (sync_cnt >= HCLGEVF_MAX_SYNC_COUNT) - return; + break; vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); } + rtnl_unlock(); } static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) -- cgit From ac92c0a9a0603fb448e60f38e63302e4eebb8035 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:08 +0800 Subject: net: hns3: add barrier in vf mailbox reply process In hclgevf_mbx_handler() and hclgevf_get_mbx_resp() functions, there is a typical store-store and load-load scenario between received_resp and additional_info. This patch adds barrier to fix the problem. 
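The ordering the patch enforces is the standard publish/consume pairing: the writer must make the payload (additional_info) visible before the flag (received_resp), and the reader must check the flag before touching the payload. The sketch below is illustrative only; the structure and function names are invented rather than taken from the hclgevf driver, and it assumes a kernel context where smp_wmb()/smp_rmb() and READ_ONCE()/WRITE_ONCE() are available.

/* Illustrative pairing only; names are hypothetical, not from hclgevf. */
struct demo_resp {
        u8 data[32];            /* stands in for additional_info */
        bool ready;             /* stands in for received_resp */
};

/* producer side, e.g. the mailbox handler */
static void demo_publish(struct demo_resp *r, const u8 *src, size_t len)
{
        memcpy(r->data, src, len);
        smp_wmb();                      /* payload visible before the flag */
        WRITE_ONCE(r->ready, true);
}

/* consumer side, e.g. the polling reader */
static bool demo_consume(struct demo_resp *r, u8 *dst, size_t len)
{
        if (!READ_ONCE(r->ready))
                return false;
        smp_rmb();                      /* pairs with smp_wmb() above */
        memcpy(dst, r->data, len);
        return true;
}

Without the paired barriers, a weakly ordered CPU can observe ready == true while still reading stale bytes from data, which is the symptom the commit message describes.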
Fixes: 4671042f1ef0 ("net: hns3: add match_id to check mailbox response from PF to VF") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c index bbf7b14079de..85c2a634c8f9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c @@ -63,6 +63,9 @@ static int hclgevf_get_mbx_resp(struct hclgevf_dev *hdev, u16 code0, u16 code1, i++; } + /* ensure additional_info will be seen after received_resp */ + smp_rmb(); + if (i >= HCLGEVF_MAX_TRY_TIMES) { dev_err(&hdev->pdev->dev, "VF could not get mbx(%u,%u) resp(=%d) from PF in %d tries\n", @@ -178,6 +181,10 @@ static void hclgevf_handle_mbx_response(struct hclgevf_dev *hdev, resp->resp_status = hclgevf_resp_to_errno(resp_status); memcpy(resp->additional_info, req->msg.resp_data, HCLGE_MBX_MAX_RESP_DATA_SIZE * sizeof(u8)); + + /* ensure additional_info will be seen before setting received_resp */ + smp_wmb(); + if (match_id) { /* If match_id is not zero, it means PF support match_id. * if the match_id is right, VF get the right response, or -- cgit From 75b247b57d8b71bcb679e4cb37d0db104848806c Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:09 +0800 Subject: net: hns3: fix incorrect capability bit display for copper port Currently, the FEC capability bit is set by default for device version V2. It's incorrect for the copper port. Even though it doesn't make the NIC work abnormally, the capability information displayed in debugfs may confuse users. So clear it when the driver gets the port type information. Fixes: 433ccce83504 ("net: hns3: use FEC capability queried from firmware") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e22279e5d43f..c393b4ee4a32 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11663,6 +11663,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) goto err_msi_irq_uninit; if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) { + clear_bit(HNAE3_DEV_SUPPORT_FEC_B, ae_dev->caps); if (hnae3_dev_phy_imp_supported(hdev)) ret = hclge_update_tp_port_info(hdev); else -- cgit From 53aba458f23846112c0d44239580ff59bc5c36c3 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:10 +0800 Subject: net: hns3: fix out-of-bounds access may occur when coalesce info is read via debugfs The hns3 driver defines arrays of strings to show the coalesce info, but if the kernel adds a new mode or a new state, out-of-bounds access may occur when the coalesce info is read via debugfs; this patch fixes the problem. Fixes: c99fead7cb07 ("net: hns3: add debugfs support for interrupt coalesce") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 0b138635bafa..c083d1d10767 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -503,11 +503,14 @@ static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector, } sprintf(result[j++], "%d", i); - sprintf(result[j++], "%s", dim_state_str[dim->state]); + sprintf(result[j++], "%s", dim->state < ARRAY_SIZE(dim_state_str) ? + dim_state_str[dim->state] : "unknown"); sprintf(result[j++], "%u", dim->profile_ix); - sprintf(result[j++], "%s", dim_cqe_mode_str[dim->mode]); + sprintf(result[j++], "%s", dim->mode < ARRAY_SIZE(dim_cqe_mode_str) ? + dim_cqe_mode_str[dim->mode] : "unknown"); sprintf(result[j++], "%s", - dim_tune_stat_str[dim->tune_state]); + dim->tune_state < ARRAY_SIZE(dim_tune_stat_str) ? + dim_tune_stat_str[dim->tune_state] : "unknown"); sprintf(result[j++], "%u", dim->steps_left); sprintf(result[j++], "%u", dim->steps_right); sprintf(result[j++], "%u", dim->tired); -- cgit From dbd2f3b20c6ae425665b6975d766e3653d453e73 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:11 +0800 Subject: net: hns3: fix variable may not initialized problem in hns3_init_mac_addr() When a VF is calling hns3_init_mac_addr(), get_mac_addr() may return fail, then the value of mac_addr_temp is not initialized. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 06117502001f..b618797a7e8d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5139,7 +5139,7 @@ static int hns3_init_mac_addr(struct net_device *netdev) struct hns3_nic_priv *priv = netdev_priv(netdev); char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hnae3_handle *h = priv->ae_handle; - u8 mac_addr_temp[ETH_ALEN]; + u8 mac_addr_temp[ETH_ALEN] = {0}; int ret = 0; if (h->ae_algo->ops->get_mac_addr) -- cgit From 65e98bb56fa3ce2edb400930c05238c9b380500e Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:12 +0800 Subject: net: hns3: fix VF reset fail issue Currently the reset process in hns3 and firmware watchdog init process is asynchronous. We think firmware watchdog initialization is completed before VF clear the interrupt source. However, firmware initialization may not complete early. So VF will receive multiple reset interrupts and fail to reset. So we add delay before VF interrupt source and 5 ms delay is enough to avoid second reset interrupt. Fixes: 427900d27d86 ("net: hns3: fix the timing issue of VF clearing interrupt sources") Signed-off-by: Jijie Shao Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 14 +++++++++++++- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 1c62e58ff6d8..0aa9beefd1c7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1981,8 +1981,18 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, return HCLGEVF_VECTOR0_EVENT_OTHER; } +static void hclgevf_reset_timer(struct timer_list *t) +{ + struct hclgevf_dev *hdev = from_timer(hdev, t, reset_timer); + + hclgevf_clear_event_cause(hdev, HCLGEVF_VECTOR0_EVENT_RST); + hclgevf_reset_task_schedule(hdev); +} + static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) { +#define HCLGEVF_RESET_DELAY 5 + enum hclgevf_evt_cause event_cause; struct hclgevf_dev *hdev = data; u32 clearval; @@ -1994,7 +2004,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) switch (event_cause) { case HCLGEVF_VECTOR0_EVENT_RST: - hclgevf_reset_task_schedule(hdev); + mod_timer(&hdev->reset_timer, + jiffies + msecs_to_jiffies(HCLGEVF_RESET_DELAY)); break; case HCLGEVF_VECTOR0_EVENT_MBX: hclgevf_mbx_handler(hdev); @@ -2937,6 +2948,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 81c16b8c8da2..a73f2bf3a56a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -219,6 +219,7 @@ struct hclgevf_dev { enum hnae3_reset_type reset_level; unsigned long reset_pending; enum hnae3_reset_type reset_type; + struct timer_list reset_timer; #define HCLGEVF_RESET_REQUESTED 0 #define HCLGEVF_RESET_PENDING 1 -- cgit From dff655e82faffc287d4a72a59f66fa120bf904e4 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:13 +0800 Subject: net: hns3: fix VF wrong speed and duplex issue If PF is down, firmware will returns 10 Mbit/s rate and half-duplex mode when PF queries the port information from firmware. After imp reset command is executed, PF status changes to down, and PF will query link status and updates port information from firmware in a periodic scheduled task. However, there is a low probability that port information is updated when PF is down, and then PF link status changes to up. In this case, PF synchronizes incorrect rate and duplex mode to VF. This patch fixes it by updating port information before PF synchronizes the rate and duplex to the VF when PF changes to up. Fixes: 18b6e31f8bf4 ("net: hns3: PF add support for pushing link status to VFs") Signed-off-by: Jijie Shao Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c393b4ee4a32..5ea9e59569ef 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -61,6 +61,7 @@ static void hclge_sync_fd_table(struct hclge_dev *hdev); static void hclge_update_fec_stats(struct hclge_dev *hdev); static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, int wait_cnt); +static int hclge_update_port_info(struct hclge_dev *hdev); static struct hnae3_ae_algo ae_algo; @@ -3041,6 +3042,9 @@ static void hclge_update_link_status(struct hclge_dev *hdev) if (state != hdev->hw.mac.link) { hdev->hw.mac.link = state; + if (state == HCLGE_LINK_STATUS_UP) + hclge_update_port_info(hdev); + client->ops->link_status_change(handle, state); hclge_config_mac_tnl_int(hdev, state); if (rclient && rclient->ops->link_status_change) -- cgit From f726eaa787e9f9bc858c902d18a09af6bcbfcdaf Mon Sep 17 00:00:00 2001 From: Jan Bottorff Date: Thu, 9 Nov 2023 03:19:27 +0000 Subject: i2c: designware: Fix corrupted memory seen in the ISR When running on a many core ARM64 server, errors were happening in the ISR that looked like corrupted memory. These corruptions would fix themselves if small delays were inserted in the ISR. Errors reported by the driver included "i2c_designware APMC0D0F:00: i2c_dw_xfer_msg: invalid target address" and "i2c_designware APMC0D0F:00:controller timed out" during in-band IPMI SSIF stress tests. The problem was determined to be memory writes in the driver were not becoming visible to all cores when execution rapidly shifted between cores, like when a register write immediately triggers an ISR. Processors with weak memory ordering, like ARM64, make no guarantees about the order normal memory writes become globally visible, unless barrier instructions are used to control ordering. To solve this, regmap accessor functions configured by this driver were changed to use non-relaxed forms of the low-level register access functions, which include a barrier on platforms that require it. This assures memory writes before a controller register access are visible to all cores. The community concluded defaulting to correct operation outweighed defaulting to the small performance gains from using relaxed access functions. Being a low speed device added weight to this choice of default register access behavior. 
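For context, the distinction the commit relies on is that readl()/writel() include ordering barriers that the _relaxed variants omit, so ordinary memory stores issued before a register write are guaranteed to be visible to other cores by the time the device reacts, for example by raising the interrupt that runs the ISR. A hedged sketch of the two behaviours follows; the device structure and register offset are made up for illustration and are not the designware driver's.

/* Hypothetical device; only the ordering semantics matter here. */
struct demo_dev {
        void __iomem *base;
        u32 msg;                /* normal memory consumed by the ISR */
};

static void demo_kick_relaxed(struct demo_dev *dev)
{
        dev->msg = 0x1234;
        /* No barrier: on arm64 the store to dev->msg may not yet be
         * visible to the CPU that services the resulting interrupt. */
        writel_relaxed(0x1, dev->base + 0x10);
}

static void demo_kick_ordered(struct demo_dev *dev)
{
        dev->msg = 0x1234;
        /* writel() orders prior memory writes before the MMIO store,
         * which is what switching the regmap accessors achieves. */
        writel(0x1, dev->base + 0x10);
}

Using the non-relaxed accessors in the regmap callbacks trades a little performance for this guarantee, which is acceptable for a low speed bus such as I2C.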
Signed-off-by: Jan Bottorff Acked-by: Jarkko Nikula Tested-by: Serge Semin Reviewed-by: Serge Semin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-common.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index affcfb243f0f..35f762872b8a 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -63,7 +63,7 @@ static int dw_reg_read(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readl_relaxed(dev->base + reg); + *val = readl(dev->base + reg); return 0; } @@ -72,7 +72,7 @@ static int dw_reg_write(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(val, dev->base + reg); + writel(val, dev->base + reg); return 0; } @@ -81,7 +81,7 @@ static int dw_reg_read_swab(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = swab32(readl_relaxed(dev->base + reg)); + *val = swab32(readl(dev->base + reg)); return 0; } @@ -90,7 +90,7 @@ static int dw_reg_write_swab(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(swab32(val), dev->base + reg); + writel(swab32(val), dev->base + reg); return 0; } @@ -99,8 +99,8 @@ static int dw_reg_read_word(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readw_relaxed(dev->base + reg) | - (readw_relaxed(dev->base + reg + 2) << 16); + *val = readw(dev->base + reg) | + (readw(dev->base + reg + 2) << 16); return 0; } @@ -109,8 +109,8 @@ static int dw_reg_write_word(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writew_relaxed(val, dev->base + reg); - writew_relaxed(val >> 16, dev->base + reg + 2); + writew(val, dev->base + reg); + writew(val >> 16, dev->base + reg + 2); return 0; } -- cgit From 6979a51ecaec4dfb7c768eb2a77b77df73a74c8e Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 13 Nov 2023 15:16:56 +0530 Subject: MAINTAINERS: add entry for TI ICSSG Ethernet driver Add record for TI Industrial Communication Subsystem - Gigabit (ICSSG) Ethernet driver. Also add Roger and myself as maintainer. Signed-off-by: MD Danish Anwar Signed-off-by: David S. Miller --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e3acb36989f0..0443f4d9f736 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21795,6 +21795,15 @@ F: Documentation/devicetree/bindings/media/i2c/ti,ds90* F: drivers/media/i2c/ds90* F: include/media/i2c/ds90* +TI ICSSG ETHERNET DRIVER (ICSSG) +R: MD Danish Anwar +R: Roger Quadros +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: netdev@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/ti,icss*.yaml +F: drivers/net/ethernet/ti/icssg/* + TI J721E CSI2RX DRIVER M: Jai Luthra L: linux-media@vger.kernel.org -- cgit From 438cbcdf105d84449fceb39a2d0e16d0ec20708f Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 10 Nov 2023 13:05:46 +0100 Subject: net: mdio: fix typo in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quotes symbol in "EEE "link partner ability 1 should be at the end of the register name "EEE link partner ability 1" Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 8fa23bdcedbf..007fd9c3e4b6 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) * A function that translates value of following registers to the linkmode: * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20) * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60) - * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61) + * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61) */ static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val) { -- cgit From e6daf129ccb79d3781129f623f82bc676f2cb02c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Nov 2023 10:36:00 -0500 Subject: net: gso_test: support CONFIG_MAX_SKB_FRAGS up to 45 The test allocs a single page to hold all the frag_list skbs. This is insufficient on kernels with CONFIG_MAX_SKB_FRAGS=45, due to the increased skb_shared_info frags[] array length. gso_test_func: ASSERTION FAILED at net/core/gso_test.c:210 Expected alloc_size <= ((1UL) << 12), but alloc_size == 5075 (0x13d3) ((1UL) << 12) == 4096 (0x1000) Simplify the logic. Just allocate a page for each frag_list skb. Fixes: 4688ecb1385f ("net: expand skb_segment unit test with frag_list coverage") Signed-off-by: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/gso_test.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/core/gso_test.c b/net/core/gso_test.c index ceb684be4cbf..4c2e77bd12f4 100644 --- a/net/core/gso_test.c +++ b/net/core/gso_test.c @@ -180,18 +180,17 @@ static void gso_test_func(struct kunit *test) } if (tcase->frag_skbs) { - unsigned int total_size = 0, total_true_size = 0, alloc_size = 0; + unsigned int total_size = 0, total_true_size = 0; struct sk_buff *frag_skb, *prev = NULL; - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - page_ref_add(page, tcase->nr_frag_skbs - 1); - for (i = 0; i < tcase->nr_frag_skbs; i++) { unsigned int frag_size; + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + frag_size = tcase->frag_skbs[i]; - frag_skb = build_skb(page_address(page) + alloc_size, + frag_skb = build_skb(page_address(page), frag_size + shinfo_size); KUNIT_ASSERT_NOT_NULL(test, frag_skb); __skb_put(frag_skb, frag_size); @@ -204,11 +203,8 @@ static void gso_test_func(struct kunit *test) total_size += frag_size; total_true_size += frag_skb->truesize; - alloc_size += frag_size + shinfo_size; } - KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE); - skb->len += total_size; skb->data_len += total_size; skb->truesize += total_true_size; -- cgit From fb317eb23b5ee4c37b0656a9a52a3db58d9dd072 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 11 Nov 2023 01:39:47 +0900 Subject: tipc: Fix kernel-infoleak due to uninitialized TLV value KMSAN reported the following kernel-infoleak issue: ===================================================== BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in 
iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 copy_to_iter include/linux/uio.h:197 [inline] simple_copy_to_iter net/core/datagram.c:532 [inline] __skb_datagram_iter.5+0x148/0xe30 net/core/datagram.c:420 skb_copy_datagram_iter+0x52/0x210 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline] netlink_recvmsg+0x43d/0x1630 net/netlink/af_netlink.c:1967 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg net/socket.c:1066 [inline] __sys_recvfrom+0x476/0x860 net/socket.c:2246 __do_sys_recvfrom net/socket.c:2264 [inline] __se_sys_recvfrom net/socket.c:2260 [inline] __x64_sys_recvfrom+0x130/0x200 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x103/0x9e0 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5f7/0xb50 mm/slub.c:3523 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x2fd/0x770 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] tipc_tlv_alloc net/tipc/netlink_compat.c:156 [inline] tipc_get_err_tlv+0x90/0x5d0 net/tipc/netlink_compat.c:170 tipc_nl_compat_recv+0x1042/0x15d0 net/tipc/netlink_compat.c:1324 genl_family_rcv_msg_doit net/netlink/genetlink.c:972 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1052 [inline] genl_rcv_msg+0x1220/0x12c0 net/netlink/genetlink.c:1067 netlink_rcv_skb+0x4a4/0x6a0 net/netlink/af_netlink.c:2545 genl_rcv+0x41/0x60 net/netlink/genetlink.c:1076 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0xf4b/0x1230 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1242/0x1420 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x997/0xd60 net/socket.c:2588 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2642 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2680 [inline] __se_sys_sendmsg net/socket.c:2678 [inline] __x64_sys_sendmsg+0x2fa/0x4a0 net/socket.c:2678 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Bytes 34-35 of 36 are uninitialized Memory access of size 36 starts at ffff88802d464a00 Data copied to user address 00007ff55033c0a0 CPU: 0 PID: 30322 Comm: syz-executor.0 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== tipc_add_tlv() puts TLV descriptor and value onto `skb`. This size is calculated with TLV_SPACE() macro. It adds the size of struct tlv_desc and the length of TLV value passed as an argument, and aligns the result to a multiple of TLV_ALIGNTO, i.e., a multiple of 4 bytes. If the size of struct tlv_desc plus the length of TLV value is not aligned, the current implementation leaves the remaining bytes uninitialized. This is the cause of the above kernel-infoleak issue. This patch resolves this issue by clearing data up to an aligned size. 
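To spell out the mechanism: TLV_SPACE() rounds the descriptor-plus-value size up to a multiple of TLV_ALIGNTO (4 bytes), skb_put() reserves that rounded size, and nothing ever writes to the rounding slack, so those bytes reach user space uninitialized. The sketch below re-derives the macros and a simplified descriptor from the description above; it is not the exact TIPC layout.

/* Simplified TLV layout for illustration; not the exact TIPC definitions. */
struct demo_tlv_desc {
        __be16 tlv_type;
        __be16 tlv_len;
};

#define DEMO_TLV_ALIGNTO        4U
#define DEMO_TLV_ALIGN(x)       ALIGN(x, DEMO_TLV_ALIGNTO)
#define DEMO_TLV_LENGTH(len)    (sizeof(struct demo_tlv_desc) + (len))
#define DEMO_TLV_SPACE(len)     DEMO_TLV_ALIGN(DEMO_TLV_LENGTH(len))

static void demo_add_tlv(u8 *tail, u16 type, const void *data, u16 len)
{
        struct demo_tlv_desc *tlv = (struct demo_tlv_desc *)tail;

        /* The fix: clear the whole aligned span, including the padding
         * bytes that DEMO_TLV_SPACE() adds beyond DEMO_TLV_LENGTH(). */
        memset(tail, 0, DEMO_TLV_SPACE(len));

        tlv->tlv_type = htons(type);
        tlv->tlv_len = htons(DEMO_TLV_LENGTH(len));
        if (len && data)
                memcpy(tail + sizeof(*tlv), data, len);
}

Without the memset, bytes DEMO_TLV_LENGTH(len) through DEMO_TLV_SPACE(len) - 1 keep whatever the allocator left behind; for a 30-byte value that is bytes 34 and 35 of a 36-byte TLV, matching the "Bytes 34-35 of 36 are uninitialized" line in the KMSAN report.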
Fixes: d0796d1ef63d ("tipc: convert legacy nl bearer dump to nl compat") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5bc076f2fa74..c763008a8adb 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -102,6 +102,7 @@ static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) return -EMSGSIZE; skb_put(skb, TLV_SPACE(len)); + memset(tlv, 0, TLV_SPACE(len)); tlv->tlv_type = htons(type); tlv->tlv_len = htons(TLV_LENGTH(len)); if (len && data) -- cgit From ca8add922f9c7f6e2e3c71039da8e0dcc64b87ed Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 11 Nov 2023 05:41:12 +0100 Subject: net: mvneta: fix calls to page_pool_get_stats Calling page_pool_get_stats in the mvneta driver without checks leads to kernel crashes. First the page pool is only available if the bm is not used. The page pool is also not allocated when the port is stopped. It can also be not allocated in case of errors. The current implementation leads to the following crash calling ethstats on a port that is down or when calling it at the wrong moment: ble to handle kernel NULL pointer dereference at virtual address 00000070 [00000070] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Hardware name: Marvell Armada 380/385 (Device Tree) PC is at page_pool_get_stats+0x18/0x1cc LR is at mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] pc : [] lr : [] psr: a0000013 sp : f1439d48 ip : f1439dc0 fp : 0000001d r10: 00000100 r9 : c4816b80 r8 : f0d75150 r7 : bf0b400c r6 : c238f000 r5 : 00000000 r4 : f1439d68 r3 : c2091040 r2 : ffffffd8 r1 : f1439d68 r0 : 00000000 Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 066b004a DAC: 00000051 Register r0 information: NULL pointer Register r1 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r2 information: non-paged memory Register r3 information: slab kmalloc-2k start c2091000 pointer offset 64 size 2048 Register r4 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r5 information: NULL pointer Register r6 information: slab kmalloc-cg-4k start c238f000 pointer offset 0 size 4096 Register r7 information: 15-page vmalloc region starting at 0xbf0a8000 allocated at load_module+0xa30/0x219c Register r8 information: 1-page vmalloc region starting at 0xf0d75000 allocated at ethtool_get_stats+0x138/0x208 Register r9 information: slab task_struct start c4816b80 pointer offset 0 Register r10 information: non-paged memory Register r11 information: non-paged memory Register r12 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Process snmpd (pid: 733, stack limit = 0x38de3a88) Stack: (0xf1439d48 to 0xf143a000) 9d40: 000000c0 00000001 c238f000 bf0b400c f0d75150 c4816b80 9d60: 00000100 bf0a98d8 00000000 00000000 00000000 00000000 00000000 00000000 9d80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9da0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9dc0: 00000dc0 5335509c 00000035 c238f000 bf0b2214 01067f50 f0d75000 c0b9b9c8 9de0: 0000001d 00000035 c2212094 5335509c c4816b80 c238f000 c5ad6e00 01067f50 9e00: c1b0be80 c4816b80 00014813 c0b9d7f0 00000000 00000000 0000001d 0000001d 9e20: 00000000 00001200 00000000 00000000 c216ed90 c73943b8 00000000 00000000 9e40: 
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9e60: 00000000 c0ad9034 00000000 00000000 00000000 00000000 00000000 00000000 9e80: 00000000 00000000 00000000 5335509c c1b0be80 f1439ee4 00008946 c1b0be80 9ea0: 01067f50 f1439ee3 00000000 00000046 b6d77ae0 c0b383f0 00008946 becc83e8 9ec0: c1b0be80 00000051 0000000b c68ca480 c7172d00 c0ad8ff0 f1439ee3 cf600e40 9ee0: 01600e40 32687465 00000000 00000000 00000000 01067f50 00000000 00000000 9f00: 00000000 5335509c 00008946 00008946 00000000 c68ca480 becc83e8 c05e2de0 9f20: f1439fb0 c03002f0 00000006 5ac3c35a c4816b80 00000006 b6d77ae0 c030caf0 9f40: c4817350 00000014 f1439e1c 0000000c 00000000 00000051 01000000 00000014 9f60: 00003fec f1439edc 00000001 c0372abc b6d77ae0 c0372abc cf600e40 5335509c 9f80: c21e6800 01015c9c 0000000b 00008946 00000036 c03002f0 c4816b80 00000036 9fa0: b6d77ae0 c03000c0 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 40000030 0000000b 00000000 00000000 page_pool_get_stats from mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] mvneta_ethtool_get_stats [mvneta] from ethtool_get_stats+0x154/0x208 ethtool_get_stats from dev_ethtool+0xf48/0x2480 dev_ethtool from dev_ioctl+0x538/0x63c dev_ioctl from sock_ioctl+0x49c/0x53c sock_ioctl from sys_ioctl+0x134/0xbd8 sys_ioctl from ret_fast_syscall+0x0/0x1c Exception stack(0xf1439fa8 to 0xf1439ff0) 9fa0: 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 Code: e28dd004 e1a05000 e2514000 0a00006a (e5902070) This commit adds the proper checks before calling page_pool_get_stats. Fixes: b3fc79225f05 ("net: mvneta: add support for page_pool_get_stats") Signed-off-by: Sven Auhagen Reported-by: Paulo Da Silva Acked-by: Lorenzo Bianconi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvneta.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 90817136808d..29aac327574d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4790,14 +4790,17 @@ static void mvneta_ethtool_get_strings(struct net_device *netdev, u32 sset, u8 *data) { if (sset == ETH_SS_STATS) { + struct mvneta_port *pp = netdev_priv(netdev); int i; for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) memcpy(data + i * ETH_GSTRING_LEN, mvneta_statistics[i].name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); - page_pool_ethtool_stats_get_strings(data); + if (!pp->bm_priv) { + data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); + page_pool_ethtool_stats_get_strings(data); + } } } @@ -4915,8 +4918,10 @@ static void mvneta_ethtool_pp_stats(struct mvneta_port *pp, u64 *data) struct page_pool_stats stats = {}; int i; - for (i = 0; i < rxq_number; i++) - page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + for (i = 0; i < rxq_number; i++) { + if (pp->rxqs[i].page_pool) + page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + } page_pool_ethtool_stats_get(data, &stats); } @@ -4932,14 +4937,21 @@ static void mvneta_ethtool_get_stats(struct net_device *dev, for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) *data++ = pp->ethtool_stats[i]; - mvneta_ethtool_pp_stats(pp, data); + if (!pp->bm_priv) + mvneta_ethtool_pp_stats(pp, data); } static int mvneta_ethtool_get_sset_count(struct net_device *dev, int sset) { - if (sset == ETH_SS_STATS) - return ARRAY_SIZE(mvneta_statistics) + - page_pool_ethtool_stats_get_count(); + if (sset == ETH_SS_STATS) { + int count = ARRAY_SIZE(mvneta_statistics); + struct mvneta_port *pp = netdev_priv(dev); + + if (!pp->bm_priv) + count += page_pool_ethtool_stats_get_count(); + + return count; + } return -EOPNOTSUPP; } -- cgit From c0a2a1b0d631fc460d830f52d06211838874d655 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 12 Nov 2023 22:16:32 -0500 Subject: ppp: limit MRU to 64K ppp_sync_ioctl allows setting device MRU, but does not sanity check this input. Limit to a sane upper bound of 64KB. No implementation I could find generates larger than 64KB frames. RFC 2823 mentions an upper bound of PPP over SDL of 64KB based on the 16-bit length field. Other protocols will be smaller, such as PPPoE (9KB jumbo frame) and PPPoA (18190 maximum CPCS-SDU size, RFC 2364). PPTP and L2TP encapsulate in IP. 
Syzbot managed to trigger alloc warning in __alloc_pages: if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) WARNING: CPU: 1 PID: 37 at mm/page_alloc.c:4544 __alloc_pages+0x3ab/0x4a0 mm/page_alloc.c:4544 __alloc_skb+0x12b/0x330 net/core/skbuff.c:651 __netdev_alloc_skb+0x72/0x3f0 net/core/skbuff.c:715 netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0xff/0x680 drivers/net/ppp/ppp_synctty.c:334 tty_ldisc_receive_buf+0x14c/0x180 drivers/tty/tty_buffer.c:390 tty_port_default_receive_buf+0x70/0xb0 drivers/tty/tty_port.c:37 receive_buf drivers/tty/tty_buffer.c:444 [inline] flush_to_ldisc+0x261/0x780 drivers/tty/tty_buffer.c:494 process_one_work+0x884/0x15c0 kernel/workqueue.c:2630 With call ioctl$PPPIOCSMRU1(r1, 0x40047452, &(0x7f0000000100)=0x5e6417a8) Similar code exists in other drivers that implement ppp_channel_ops ioctl PPPIOCSMRU. Those might also be in scope. Notably excluded from this are pppol2tp_ioctl and pppoe_ioctl. This code goes back to the start of git history. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+6177e1f90d92583bcc58@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ea261a628786..52d05ce4a281 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -453,6 +453,10 @@ ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) case PPPIOCSMRU: if (get_user(val, (int __user *) argp)) break; + if (val > U16_MAX) { + err = -EINVAL; + break; + } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; -- cgit From 47d970204054f859f35a2237baa75c2d84fcf436 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 25 Sep 2023 17:54:13 +0200 Subject: xen/events: fix delayed eoi list handling When delaying eoi handling of events, the related elements are queued into the percpu lateeoi list. In case the list isn't empty, the elements should be sorted by the time when eoi handling is to happen. Unfortunately a new element will never be queued at the start of the list, even if it has a handling time lower than all other list elements. Fix that by handling that case the same way as for an empty list. 
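As a stand-alone illustration of the ordering rule described above, the sketch below shows a sorted-by-deadline insert that treats "new earliest deadline" exactly like the empty-list case. It is plain userspace C with invented types and names (not the Xen event-channel code or its list primitives):

#include <stdio.h>

struct item {
    unsigned long deadline;            /* stands in for the per-irq eoi time */
    struct item *next;
};

/* Keep the list sorted by ascending deadline.  The bug class addressed by
 * the patch is forgetting that a new item may belong at the very head of a
 * non-empty list. */
static void insert_sorted(struct item **head, struct item *new)
{
    struct item **pos = head;

    while (*pos && (*pos)->deadline <= new->deadline)
        pos = &(*pos)->next;

    new->next = *pos;                  /* empty-list and new-head cases land here too */
    *pos = new;
}

int main(void)
{
    struct item a = { .deadline = 30 }, b = { .deadline = 10 };
    struct item *head = NULL;

    insert_sorted(&head, &a);
    insert_sorted(&head, &b);          /* earlier deadline: must become the new head */

    for (struct item *i = head; i; i = i->next)
        printf("%lu\n", i->deadline);  /* prints 10 then 30 */
    return 0;
}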
Fixes: e99502f76271 ("xen/events: defer eoi in case of excessive number of events") Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 237e6b884f72..cd33a418344a 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -604,7 +604,9 @@ static void lateeoi_list_add(struct irq_info *info) spin_lock_irqsave(&eoi->eoi_list_lock, flags); - if (list_empty(&eoi->eoi_list)) { + elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + if (!elem || info->eoi_time < elem->eoi_time) { list_add(&info->eoi_list, &eoi->eoi_list); mod_delayed_work_on(info->eoi_cpu, system_wq, &eoi->delayed, delay); -- cgit From f96c6c588ca81255566a5168e51c9cbbe7b86def Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Sep 2023 14:29:54 +0200 Subject: xen/events: remove unused functions There are no users of xen_irq_from_pirq() and xen_set_irq_pending(). Remove those functions. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 30 ------------------------------ include/xen/events.h | 4 ---- 2 files changed, 34 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index cd33a418344a..c5d86128eb73 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1169,29 +1169,6 @@ out: return rc; } -int xen_irq_from_pirq(unsigned pirq) -{ - int irq; - - struct irq_info *info; - - mutex_lock(&irq_mapping_update_lock); - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - irq = info->irq; - if (info->u.pirq.pirq == pirq) - goto out; - } - irq = -1; -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - - int xen_pirq_from_irq(unsigned irq) { return pirq_from_irq(irq); @@ -2031,13 +2008,6 @@ void xen_clear_irq_pending(int irq) event_handler_exit(info); } EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ - evtchn_port_t evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - set_evtchn(evtchn); -} bool xen_test_irq_pending(int irq) { diff --git a/include/xen/events.h b/include/xen/events.h index 23932b0673dc..a129cafa80ed 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -88,7 +88,6 @@ void xen_irq_resume(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); -void xen_set_irq_pending(int irq); bool xen_test_irq_pending(int irq); /* Poll waiting for an irq to become pending. In the usual case, the @@ -122,9 +121,6 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, /* De-allocates the above mentioned physical interrupt. */ int xen_destroy_irq(int irq); -/* Return irq from pirq */ -int xen_irq_from_pirq(unsigned pirq); - /* Return the pirq allocated to the irq. 
*/ int xen_pirq_from_irq(unsigned irq); -- cgit From b0077e269f6c152e807fdac90b58caf012cdbaab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Nov 2023 11:52:31 +0800 Subject: blk-mq: make sure active queue usage is held for bio_integrity_prep() blk_integrity_unregister() can come if queue usage counter isn't held for one bio with integrity prepared, so this request may be completed with calling profile->complete_fn, then kernel panic. Another constraint is that bio_integrity_prep() needs to be called before bio merge. Fix the issue by: - call bio_integrity_prep() with one queue usage counter grabbed reliably - call bio_integrity_prep() before bio merge Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()") Reported-by: Yi Zhang Cc: Christoph Hellwig Signed-off-by: Ming Lei Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 75 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e2d11183f62e..900c1be1fee1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2858,11 +2858,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; + return NULL; rq_qos_throttle(q, bio); @@ -2878,35 +2875,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: - blk_queue_exit(q); return NULL; } -static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) +/* return true if this @rq can be used for @bio */ +static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) { - struct request *rq; - enum hctx_type type, hctx_type; + enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); + enum hctx_type hctx_type = rq->mq_hctx->type; - if (!plug) - return NULL; - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; - return NULL; - } - - type = blk_mq_get_hctx_type((*bio)->bi_opf); - hctx_type = rq->mq_hctx->type; if (type != hctx_type && !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) - return NULL; + return false; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return false; /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2914,12 +2899,12 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, * before we throttle. 
*/ plug->cached_rq = rq_list_next(rq); - rq_qos_throttle(q, *bio); + rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, 0); - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return rq; + return true; } static void bio_set_ioprio(struct bio *bio) @@ -2949,7 +2934,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq = NULL; unsigned int nr_segs = 1; blk_status_t ret; @@ -2960,20 +2945,36 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (!bio_integrity_prep(bio)) - return; - bio_set_ioprio(bio); - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); - if (!rq) { - if (!bio) + if (plug) { + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q != q) + rq = NULL; + } + if (rq) { + if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; + if (blk_mq_can_use_cached_rq(rq, plug, bio)) + goto done; + percpu_ref_get(&q->q_usage_counter); + } else { + if (unlikely(bio_queue_enter(bio))) + return; + if (!bio_integrity_prep(bio)) + goto fail; + } + + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) { +fail: + blk_queue_exit(q); + return; } +done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); -- cgit From d02ef87db9d6137fc2a98231b92f24ead4f7966d Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 13 Nov 2023 16:40:29 +0000 Subject: ALSA: hda: cs35l56: Enable low-power hibernation mode on i2c This can now be re-enabled as the sequence to reliably wake the device has been implemented in the shared ASoC code. This has a functional dependency on commit 3df761bdbc8b ("ASoC: cs35l56: Wake transactions need to be issued twice") To protect against this, enabling hibernation is conditional on CS35L56_WAKE_HOLD_TIME_US being defined, which indicates that the new hibernation sequences are available. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20231113164029.1156669-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda_i2c.c b/sound/pci/hda/cs35l56_hda_i2c.c index 757a4d193e0f..a9ef6d86de83 100644 --- a/sound/pci/hda/cs35l56_hda_i2c.c +++ b/sound/pci/hda/cs35l56_hda_i2c.c @@ -21,6 +21,10 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt) return -ENOMEM; cs35l56->base.dev = &clt->dev; + +#ifdef CS35L56_WAKE_HOLD_TIME_US + cs35l56->base.can_hibernate = true; +#endif cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); -- cgit From 58c09cad1754c56cb000ef07477e8781e3fad4d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Nov 2023 08:52:24 -0800 Subject: drm/ci: make github dependabot happy again The drm CI scripts for gitlab have a requirements file that makes the github 'dependabot' worry about a few of the required tooling versions. It wants to update the pip requirements from 23.2.1 to 23.3: "When installing a package from a Mercurial VCS URL, e.g. pip install hg+..., with pip prior to v23.3, the specified Mercurial revision could be used to inject arbitrary configuration options to the hg clone call (e.g. --config). 
Controlling the Mercurial configuration can modify how and which repository is installed. This vulnerability does not affect users who aren't installing from Mercurial" and upgrade the urllib3 requirements from 2.0.4 to 2.0.7 due to two issues: "urllib3's request body not stripped after redirect from 303 status changes request method to GET" "`Cookie` HTTP header isn't stripped on cross-origin redirects" The file also ends up not having a newline at the end, that my editor ends up wanting to fix automatically. Link: https://github.com/dependabot Tested-by: Helen Koike Signed-off-by: Linus Torvalds --- drivers/gpu/drm/ci/xfails/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/ci/xfails/requirements.txt b/drivers/gpu/drm/ci/xfails/requirements.txt index d8856d1581fd..e9994c9db799 100644 --- a/drivers/gpu/drm/ci/xfails/requirements.txt +++ b/drivers/gpu/drm/ci/xfails/requirements.txt @@ -5,7 +5,7 @@ termcolor==2.3.0 certifi==2023.7.22 charset-normalizer==3.2.0 idna==3.4 -pip==23.2.1 +pip==23.3 python-gitlab==3.15.0 requests==2.31.0 requests-toolbelt==1.0.0 @@ -13,5 +13,5 @@ ruamel.yaml==0.17.32 ruamel.yaml.clib==0.2.7 setuptools==68.0.0 tenacity==8.2.3 -urllib3==2.0.4 -wheel==0.41.1 \ No newline at end of file +urllib3==2.0.7 +wheel==0.41.1 -- cgit From 382561d16854a747e6df71034da08d20d6013dfe Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 12 Nov 2023 18:32:45 -0800 Subject: i2c: ocores: Move system PM hooks to the NOIRQ phase When an I2C device contains a wake IRQ subordinate to a regmap-irq chip, the regmap-irq code must be able to perform I2C transactions during suspend_device_irqs() and resume_device_irqs(). Therefore, the bus must be suspended/resumed during the NOIRQ phase. Signed-off-by: Samuel Holland Acked-by: Peter Korsgaard Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 041a76f71a49..e106af83cef4 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -771,8 +771,8 @@ static int ocores_i2c_resume(struct device *dev) return ocores_init(dev, i2c); } -static DEFINE_SIMPLE_DEV_PM_OPS(ocores_i2c_pm, - ocores_i2c_suspend, ocores_i2c_resume); +static DEFINE_NOIRQ_DEV_PM_OPS(ocores_i2c_pm, + ocores_i2c_suspend, ocores_i2c_resume); static struct platform_driver ocores_i2c_driver = { .probe = ocores_i2c_probe, -- cgit From 7a1aba89ac54ccf6cad23a91a34c0ab24b1d7997 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 13 Oct 2023 12:25:10 +0200 Subject: ice: dpll: fix initial lock status of dpll When dpll device is registered and dpll subsystem performs notify of a new device, the lock state value provided to dpll subsystem equals 0 which is invalid value for the `enum dpll_lock_status`. Provide correct value by obtaining it from firmware before registering the dpll device. 
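The underlying pitfall is ordinary C initialization: when an enum's first valid member is 1, a zero-initialized field holds a value outside the enum until something fills it in, which is what registering the device before the first state update amounted to. A minimal stand-alone sketch (the enum, struct and names below are invented for illustration and are not the real uapi dpll definitions):

#include <stdio.h>

enum lock_status {                     /* hypothetical: 0 is deliberately not a member */
    LOCK_STATUS_UNLOCKED = 1,
    LOCK_STATUS_LOCKED = 2,
};

struct dev_state {
    enum lock_status lock;             /* stays 0 until explicitly updated */
};

int main(void)
{
    struct dev_state s = { 0 };        /* "registered" before the first state update */

    printf("lock=%d valid=%d\n", s.lock, s.lock >= LOCK_STATUS_UNLOCKED);
    /* prints lock=0 valid=0: the reported state is not a legal enum value */
    return 0;
}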
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Signed-off-by: Aleksandr Loktionov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 835c419ccc74..607f534055b6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1756,6 +1756,7 @@ ice_dpll_init_dpll(struct ice_pf *pf, struct ice_dpll *d, bool cgu, } d->pf = pf; if (cgu) { + ice_dpll_update_state(pf, d, true); ret = dpll_device_register(d->dpll, type, &ice_dpll_ops, d); if (ret) { dpll_device_put(d->dpll); @@ -1796,8 +1797,6 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct ice_dplls *d = &pf->dplls; struct kthread_worker *kworker; - ice_dpll_update_state(pf, &d->eec, true); - ice_dpll_update_state(pf, &d->pps, true); kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); kworker = kthread_create_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); -- cgit From 4a4027f25dc3f39c2aafb3bf8926125c5378c9dc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:06:54 +0100 Subject: ice: dpll: fix check for dpll input priority range Supported priority value for input pins may differ with regard of NIC firmware version. E810T NICs with 3.20/4.00 FW versions would accept priority range 0-31, where firmware 4.10+ would support the range 0-9 and extra value of 255. Remove the in-range check as the driver has no information on supported values from the running firmware, let firmware decide if given value is correct and return extack error if the value is not supported. 
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Reviewed-by: Przemek Kitszel Reviewed-by: Jacob Keller Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 6 ------ drivers/net/ethernet/intel/ice/ice_dpll.h | 1 - 2 files changed, 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 607f534055b6..831ba6683962 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -815,12 +815,6 @@ ice_dpll_input_prio_set(const struct dpll_pin *pin, void *pin_priv, struct ice_pf *pf = d->pf; int ret; - if (prio > ICE_DPLL_PRIO_MAX) { - NL_SET_ERR_MSG_FMT(extack, "prio out of supported range 0-%d", - ICE_DPLL_PRIO_MAX); - return -EINVAL; - } - mutex_lock(&pf->dplls.lock); ret = ice_dpll_hw_input_prio_set(pf, d, p, prio, extack); mutex_unlock(&pf->dplls.lock); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h index bb32b6d88373..93172e93995b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.h +++ b/drivers/net/ethernet/intel/ice/ice_dpll.h @@ -6,7 +6,6 @@ #include "ice.h" -#define ICE_DPLL_PRIO_MAX 0xF #define ICE_DPLL_RCLK_NUM_MAX 4 /** ice_dpll_pin - store info about pins -- cgit From 6db5f2cd9ebb12e930a82c01714a6589576cd50f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:08:00 +0100 Subject: ice: dpll: fix output pin capabilities The dpll output pins which are used to feed clock signal of PHY and MAC circuits cannot be disconnected, those integrated circuits require clock signal for operation. By stopping assignment of DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE pin capability, prevent the user from invoking the state set callback on those pins, setting the state on those pins already returns error, as firmware doesn't allow the change of their state. 
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Fixes: 8a3a565ff210 ("ice: add admin commands to access cgu configuration") Reviewed-by: Andrii Staikov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 12 ++++--- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 54 +++++++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_ptp_hw.h | 2 ++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 831ba6683962..86b180cb32a0 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1823,6 +1823,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, int num_pins, i, ret = -EINVAL; struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; + unsigned long caps; u8 freq_supp_num; bool input; @@ -1842,6 +1843,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, } for (i = 0; i < num_pins; i++) { + caps = 0; pins[i].idx = i; pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input); pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input); @@ -1854,8 +1856,8 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, &dp->input_prio[i]); if (ret) return ret; - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE; + caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | + DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); pins[i].prop.phase_range.min = pf->dplls.input_phase_adj_max; pins[i].prop.phase_range.max = @@ -1865,9 +1867,11 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, pf->dplls.output_phase_adj_max; pins[i].prop.phase_range.max = -pf->dplls.output_phase_adj_max; + ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); + if (ret) + return ret; } - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + pins[i].prop.capabilities = caps; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) return ret; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index 6d573908de7a..a00b55e14aac 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -3961,3 +3961,57 @@ int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num) return ret; } + +/** + * ice_cgu_get_output_pin_state_caps - get output pin state capabilities + * @hw: pointer to the hw struct + * @pin_id: id of a pin + * @caps: capabilities to modify + * + * Return: + * * 0 - success, state capabilities were modified + * * negative - failure, capabilities were not modified + */ +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps) +{ + bool can_change = true; + + switch (hw->device_id) { + case ICE_DEV_ID_E810C_SFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3) + can_change = false; + break; + case ICE_DEV_ID_E810C_QSFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3 || pin_id == ZL_OUT4) + can_change = false; + break; + case ICE_DEV_ID_E823L_10G_BASE_T: + case ICE_DEV_ID_E823L_1GBE: + case ICE_DEV_ID_E823L_BACKPLANE: + case ICE_DEV_ID_E823L_QSFP: + case ICE_DEV_ID_E823L_SFP: + case ICE_DEV_ID_E823C_10G_BASE_T: + case ICE_DEV_ID_E823C_BACKPLANE: + case ICE_DEV_ID_E823C_QSFP: + case ICE_DEV_ID_E823C_SFP: + case ICE_DEV_ID_E823C_SGMII: + if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL30632_80032 && + pin_id == ZL_OUT2) + can_change = false; + else 
if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_SI5383_5384 && + pin_id == SI_OUT1) + can_change = false; + break; + default: + return -EINVAL; + } + if (can_change) + *caps |= DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + else + *caps &= ~DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h index 36aeeef99ec0..cf76701566c7 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -282,6 +282,8 @@ int ice_get_cgu_state(struct ice_hw *hw, u8 dpll_idx, int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num); void ice_ptp_init_phy_model(struct ice_hw *hw); +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps); #define PFTSYN_SEM_BYTES 4 -- cgit From a778616e4cc2d5e3a253c7d8959aafa5218fc5e4 Mon Sep 17 00:00:00 2001 From: Dan Nowlin Date: Tue, 7 Nov 2023 12:32:27 -0500 Subject: ice: fix DDP package download for packages without signature segment Commit 3cbdb0343022 ("ice: Add support for E830 DDP package segment") incorrectly removed support for package download for packages without a signature segment. These packages include the signature buffer inline in the configurations buffers, and not in a signature segment. Fix package download by providing download support for both packages with (ice_download_pkg_with_sig_seg()) and without signature segment (ice_download_pkg_without_sig_seg()). Fixes: 3cbdb0343022 ("ice: Add support for E830 DDP package segment") Reported-by: Maciej Fijalkowski Closes: https://lore.kernel.org/netdev/ZUT50a94kk2pMGKb@boxer/ Tested-by: Maciej Fijalkowski Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Signed-off-by: Dan Nowlin Signed-off-by: Paul Greenwalt Reviewed-by: Simon Horman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 103 ++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index cfb1580f5850..8b7504a9df31 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1479,14 +1479,14 @@ ice_post_dwnld_pkg_actions(struct ice_hw *hw) } /** - * ice_download_pkg + * ice_download_pkg_with_sig_seg * @hw: pointer to the hardware structure * @pkg_hdr: pointer to package header * * Handles the download of a complete package. */ static enum ice_ddp_state -ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) +ice_download_pkg_with_sig_seg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) { enum ice_aq_err aq_err = hw->adminq.sq_last_status; enum ice_ddp_state state = ICE_DDP_PKG_ERR; @@ -1519,6 +1519,103 @@ ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) state = ice_post_dwnld_pkg_actions(hw); ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_dwnld_cfg_bufs + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains global config lock and downloads the package configuration buffers + * to the firmware. 
+ */ +static enum ice_ddp_state +ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_ddp_state state; + struct ice_buf_hdr *bh; + int status; + + if (!bufs || !count) + return ICE_DDP_PKG_ERR; + + /* If the first buffer's first section has its metadata bit set + * then there are no buffers to be downloaded, and the operation is + * considered a success. + */ + bh = (struct ice_buf_hdr *)bufs; + if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) + return ICE_DDP_PKG_SUCCESS; + + status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); + if (status) { + if (status == -EALREADY) + return ICE_DDP_PKG_ALREADY_LOADED; + return ice_map_aq_err_to_ddp_state(hw->adminq.sq_last_status); + } + + state = ice_dwnld_cfg_bufs_no_lock(hw, bufs, 0, count, true); + if (!state) + state = ice_post_dwnld_pkg_actions(hw); + + ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_download_pkg_without_sig_seg + * @hw: pointer to the hardware structure + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package without signature segment. + */ +static enum ice_ddp_state +ice_download_pkg_without_sig_seg(struct ice_hw *hw, struct ice_seg *ice_seg) +{ + struct ice_buf_table *ice_buf_tbl; + + ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n", + ice_seg->hdr.seg_format_ver.major, + ice_seg->hdr.seg_format_ver.minor, + ice_seg->hdr.seg_format_ver.update, + ice_seg->hdr.seg_format_ver.draft); + + ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", + le32_to_cpu(ice_seg->hdr.seg_type), + le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id); + + ice_buf_tbl = ice_find_buf_table(ice_seg); + + ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", + le32_to_cpu(ice_buf_tbl->buf_count)); + + return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, + le32_to_cpu(ice_buf_tbl->buf_count)); +} + +/** + * ice_download_pkg + * @hw: pointer to the hardware structure + * @pkg_hdr: pointer to package header + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package. + */ +static enum ice_ddp_state +ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr, + struct ice_seg *ice_seg) +{ + enum ice_ddp_state state; + + if (hw->pkg_has_signing_seg) + state = ice_download_pkg_with_sig_seg(hw, pkg_hdr); + else + state = ice_download_pkg_without_sig_seg(hw, ice_seg); + ice_post_pkg_dwnld_vlan_mode_cfg(hw); return state; @@ -2083,7 +2180,7 @@ enum ice_ddp_state ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) /* initialize package hints and then download package */ ice_init_pkg_hints(hw, seg); - state = ice_download_pkg(hw, pkg); + state = ice_download_pkg(hw, pkg, seg); if (state == ICE_DDP_PKG_ALREADY_LOADED) { ice_debug(hw, ICE_DBG_INIT, "package previously loaded - no work.\n"); -- cgit From ff31ba19d732efb9aca3633935d71085e68d5076 Mon Sep 17 00:00:00 2001 From: Anastasia Belova Date: Mon, 13 Nov 2023 17:52:32 +0300 Subject: cifs: spnego: add ';' in HOST_KEY_LEN "host=" should start with ';' (as in cifs_get_spnego_key) So its length should be 6. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
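A quick stand-alone check of the constant itself: the key written into the description is ";host=", so the length constant must cover the whole prefix, i.e. strlen(";host=") == 6; the old value of 5 only covered "host=" and missed the leading ';'. The snippet below is illustrative userspace C (the description string is simplified and is not the actual cifs upcall format):

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *desc = "ver=2;host=fileserver;sec=krb5";   /* simplified example string */
    const char *key = ";host=";
    const char *hit = strstr(desc, key);

    /* skipping the full 6-byte prefix lands exactly on the value */
    printf("len=%zu value=%s\n", strlen(key), hit + strlen(key));
    /* prints: len=6 value=fileserver;sec=krb5 */
    return 0;
}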
Reviewed-by: Paulo Alcantara (SUSE) Fixes: 7c9c3760b3a5 ("[CIFS] add constants for string lengths of keynames in SPNEGO upcall string") Signed-off-by: Anastasia Belova Co-developed-by: Ekaterina Esina Signed-off-by: Ekaterina Esina Signed-off-by: Steve French --- fs/smb/client/cifs_spnego.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 6f3285f1dfee..af7849e5974f 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* strlen of "host=" */ -#define HOST_KEY_LEN 5 +/* strlen of ";host=" */ +#define HOST_KEY_LEN 6 /* strlen of ";ip4=" or ";ip6=" */ #define IP_KEY_LEN 5 -- cgit From 181724fc72486dec2bec8803459be05b5162aaa8 Mon Sep 17 00:00:00 2001 From: Ekaterina Esina Date: Mon, 13 Nov 2023 19:42:41 +0300 Subject: cifs: fix check of rc in function generate_smb3signingkey Remove the extra check after the condition and add a check after generating the key for encryption. The check is needed to return a non-zero rc before it is overwritten while generating the key for decryption. Found by Linux Verification Center (linuxtesting.org) with SVACE. Reviewed-by: Paulo Alcantara (SUSE) Fixes: d70e9fa55884 ("cifs: try opening channels after mounting") Signed-off-by: Ekaterina Esina Co-developed-by: Anastasia Belova Signed-off-by: Anastasia Belova Signed-off-by: Steve French --- fs/smb/client/smb2transport.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 84ea67301303..5a3ca62d2f07 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -458,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses, ptriplet->encryption.context, ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, @@ -466,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; } - if (rc) - return rc; - #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); /* -- cgit From 48d584b7f90f48d2623fb01743b099efbf0d36c2 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 6 Nov 2023 10:34:01 +0800 Subject: bcachefs: make bch2_target_to_text_sb static bch2_target_to_text_sb() is not used outside disk_groups.c, so make it static. fs/bcachefs/disk_groups.c:583:6: warning: no previous prototype for ‘bch2_target_to_text_sb’.
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7144 Signed-off-by: Jiapeng Chong Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index d613695abf9f..1f334124055b 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -580,7 +580,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } } -void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); -- cgit From c4f1f80a0e8d829ce4e29ca52cb7f74b22f67454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 12:30:19 -0500 Subject: bcachefs: Use correct fgf_t type as function argument This quiets a sparse complaint. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io-pagecache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 8bd9bcdd27f7..ff664fd0d8ef 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -13,7 +13,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, - int fgp_flags, gfp_t gfp, + fgf_t fgp_flags, gfp_t gfp, folios *fs) { struct folio *f; diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index a2222ad586e9..27f712ae37a6 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -7,7 +7,7 @@ typedef DARRAY(struct folio *) folios; int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, int, gfp_t, folios *); + u64, fgf_t, gfp_t, folios *); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); /* -- cgit From 1b8bc556280d3f4970407480e6a5ff49efe5601b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 16:27:02 -0600 Subject: bcachefs: Use DECLARE_FLEX_ARRAY() helper and fix multiple -Warray-bounds warnings Transform zero-length array `s` into a proper flexible-array member in `struct snapshot_table` via the DECLARE_FLEX_ARRAY() helper; and fix tons of the following -Warray-bounds warnings: fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.c:135:70: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 86833445af20..2d2e66a4e468 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { -- cgit From 274c2f8fd27158d15524abe63c3df6fb96707dd3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 15:40:22 -0600 Subject: bcachefs: Fix multiple -Warray-bounds warnings Transform zero-length array `entries` into a proper flexible-array member in `struct journal_seq_blacklist_table`; and fix the following -Warray-bounds warnings: fs/bcachefs/journal_seq_blacklist.c:148:26: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:150:30: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:154:27: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:176:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:177:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:297:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:298:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:300:31: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] This results in no differences in binary output. This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cb8684959ee..403aa3389fcc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -617,7 +617,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { -- cgit From 03cc1e67a243cbb2c85d5fd84f369449f94d4269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Nov 2023 10:30:22 -0500 Subject: bcachefs: Fix null ptr deref in bch2_backpointer_get_node() bch2_btree_iter_peek_node() can return a NULL ptr (when the tree is shorter than the search depth); handle this with an early return. 
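The bug class is worth spelling out: IS_ERR() does not treat NULL as an error, so a pointer that can legitimately be NULL must be tested with IS_ERR_OR_NULL() before any dereference. Below is a stand-alone sketch using simplified userspace stand-ins for the kernel's ERR_PTR() helpers (not the real include/linux/err.h definitions):

#include <stdio.h>

/* simplified stand-ins, close in spirit to the kernel helpers */
#define MAX_ERRNO 4095
#define ERR_PTR(err) ((void *)(long)(err))
#define IS_ERR(p) ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define IS_ERR_OR_NULL(p) (!(p) || IS_ERR(p))

int main(void)
{
    void *not_found = NULL;            /* e.g. the tree is shorter than the search depth */
    void *failed = ERR_PTR(-12);       /* an encoded error such as -ENOMEM */

    /* IS_ERR() alone lets the NULL case fall through to a later dereference */
    printf("IS_ERR(NULL)=%d IS_ERR_OR_NULL(NULL)=%d IS_ERR(err)=%d\n",
           IS_ERR(not_found), IS_ERR_OR_NULL(not_found), IS_ERR(failed));
    /* prints: IS_ERR(NULL)=0 IS_ERR_OR_NULL(NULL)=1 IS_ERR(err)=1 */
    return 0;
}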
Signed-off-by: Kent Overstreet Reported-by: Dan Carpenter Fixes: https://lore.kernel.org/linux-bcachefs/5fc3c28b-c232-4ec7-b0ac-4ef220ddf976@moroto.mountain/T/ Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ef02c9bb0354..23c0834a97a4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.level - 1); - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); -- cgit From 4d6128dca6d940015fe2aa383ec1a0eeb9632f08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 11:59:05 -0500 Subject: bcachefs: Guard against insufficient devices to create stripes We can't create stripes if we don't have enough devices - this manifested as an integer underflow bug later. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 875f7c5a6fca..2a77de18c004 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; -- cgit From 1bd5bcc9f5eef968ed021d72b14a157be7abdb49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:44:14 -0500 Subject: bcachefs: Split out btree_key_cache_types.h More consistent organization. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache_types.h | 30 ++++++++++++++++++++++++++++++ fs/bcachefs/btree_types.h | 27 +-------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 fs/bcachefs/btree_key_cache_types.h diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 000000000000..0f967808d766 --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 941841a0c5bf..be5d6027e796 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include #include -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" @@ -312,31 +312,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker *shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 -- cgit From c65c13f0eac61218c9ee4635c05661c0b9760e58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 09:53:14 -0500 Subject: bcachefs: Run btree key cache shrinker less aggressively The btree key cache maintains lists of items that have been freed, but can't yet be reclaimed because a bch2_trans_relock() call might find them - we're waiting for SRCU readers to release. Previously, we wouldn't count these items against the number we're attempting to scan for, which would mean we'd evict more live key cache entries - doing quite a bit of potentially unecessary work. With recent work to make sure we don't hold SRCU locks for too long, it should be safe to count all the items on the freelists against number to scan - even if we can't reclaim them yet, we will be able to soon. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 23 +++++++++++++++++++---- fs/bcachefs/btree_key_cache_types.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b78f78a75b5..b3305a04d808 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -850,6 +858,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -859,13 +869,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -875,8 +887,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -982,6 +994,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 0f967808d766..290e4e57df5b 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -11,8 +11,12 @@ struct btree_key_cache { struct mutex 
lock; struct rhashtable table; bool table_init_done; + struct list_head freed_pcpu; + size_t nr_freed_pcpu; struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + struct shrinker *shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; -- cgit From 3b8c4507779691984e31e64e0b80abb03cc02d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 19:49:47 -0500 Subject: bcachefs: btree_trans->write_locked As prep work for the next patch to fix a key cache reclaim issue, we need to start tracking whether we're currently holding write locks - so that we can release and retake the before calling into memory reclaim. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 85 +++++++++++++++++++++++----------------- fs/bcachefs/btree_types.h | 1 + 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index decad7b66c59..02491f7bb831 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -732,37 +779,6 @@ revert_fs_usage: return ret; } -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -838,7 +854,7 @@ static inline 
int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (unlikely(ret)) return ret; - ret = trans_lock_write(trans); + ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -847,10 +863,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_trans_unlock_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index be5d6027e796..f3669fa68591 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -409,6 +409,7 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_begin_ip; -- cgit From 09b0283ee23a02094a43a9b93146d1060c58fc3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Nov 2023 15:28:44 -0500 Subject: bcachefs: Make sure to drop/retake btree locks before reclaim We really don't want to be invoking memory reclaim with btree locks held: even aside from (solvable, but tricky) recursion issues, it can cause painful to diagnose performance edge cases. This fixes a recently reported issue in btree_key_can_insert_cached(). Signed-off-by: Kent Overstreet Reported-by: Mateusz Guzik Fixes: https://lore.kernel.org/linux-bcachefs/CAGudoHEsb_hGRMeWeXh+UF6po0qQuuq_NKSEo+s1sEb6bDLjpA@mail.gmail.com/T/ --- fs/bcachefs/btree_trans_commit.c | 48 +++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 02491f7bb831..55a120eb8692 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -368,6 +368,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned u64s) { @@ -394,12 +433,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - 
bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); trans_for_each_update(trans, i) if (i->old_v == &ck->k->v) -- cgit From 701ff57eb3d7c86c9a53de959e0c48fa8ca446d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Nov 2023 18:38:35 -0400 Subject: bcachefs: Check for nonce offset inconsistency in data_update path We've rarely been seeing a nonce offset inconsistency that doesn't show up in tests: this adds some extra verification code to the data update path that prints out more relevant info when it occurs. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0771a6d880bf..5ed66202c226 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -239,6 +239,34 @@ restart_drop_extra_replicas: next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, -- cgit From 4b3812d90b2c93723adf4b6ce99240d301f7d5f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 Nov 2023 20:49:39 -0800 Subject: Revert "ptp: Fixes a null pointer dereference in ptp_ioctl" This reverts commit 8a4f030dbced6fc255cbe67b2d0a129947e18493. Richard says: The test itself is harmless, but keeping it will make people think, "oh this pointer can be invalid." In fact the core stack ensures that ioctl() can't be invoked after release(), otherwise Bad Stuff happens. Fixes: 8a4f030dbced ("ptp: Fixes a null pointer dereference in ptp_ioctl") Link: https://lore.kernel.org/all/ZVAf_qdRfDAQYUt-@hoboy.vegasvil.org/ Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index e95a6ed130ef..3f7a74788802 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,8 +176,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; - if (!tsevq) - return -EINVAL; switch (cmd) { -- cgit From 73bde5a3294853947252cd9092a3517c7cb0cd2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 17:48:59 +0000 Subject: ptp: annotate data-race around q->head and q->tail As I was working on a syzbot report, I found that KCSAN would probably complain that reading q->head or q->tail without barriers could lead to invalid results. 
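For context, queue_cnt() computes the number of queued timestamps from the producer and consumer indices, and the read side does so locklessly while writers update the indices under the queue spinlock. A minimal userspace analogue of that pattern, using C11 relaxed atomics where the kernel uses READ_ONCE()/WRITE_ONCE() (the queue size and all names below are assumptions for illustration, not the driver's definitions):

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_TS 128   /* stand-in for PTP_MAX_TIMESTAMPS; value assumed */

    struct ts_queue {
        _Atomic int head;   /* next slot the consumer will read */
        _Atomic int tail;   /* next slot the producer will fill */
    };

    /* Lockless count: whole-value loads, so no load tearing. */
    static int queue_cnt(struct ts_queue *q)
    {
        int cnt = atomic_load_explicit(&q->tail, memory_order_relaxed) -
                  atomic_load_explicit(&q->head, memory_order_relaxed);

        return cnt < 0 ? MAX_TS + cnt : cnt;
    }

    /* Producer side; in the driver this runs under the queue spinlock,
     * so the atomics only guard against store tearing, not ordering. */
    static void advance_tail(struct ts_queue *q)
    {
        int tail = atomic_load_explicit(&q->tail, memory_order_relaxed);

        atomic_store_explicit(&q->tail, (tail + 1) % MAX_TS, memory_order_relaxed);
    }

    int main(void)
    {
        struct ts_queue q = { 0, 0 };

        advance_tail(&q);
        advance_tail(&q);
        printf("queued events: %d\n", queue_cnt(&q));   /* 2 */
        return 0;
    }

The point of the annotations is only tearing avoidance; ordering on the writer side still comes from the spinlock.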
Add corresponding READ_ONCE() and WRITE_ONCE() to avoid load-store tearing. Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Signed-off-by: Eric Dumazet Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20231109174859.3995880-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 3 ++- drivers/ptp/ptp_clock.c | 5 +++-- drivers/ptp/ptp_private.h | 8 ++++++-- drivers/ptp/ptp_sysfs.c | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a74788802..7513018c9f9a 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -572,7 +572,8 @@ ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, for (i = 0; i < cnt; i++) { event[i] = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3134568af622..15b804ba4868 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -57,10 +57,11 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue, dst->t.sec = seconds; dst->t.nsec = remainder; + /* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */ if (!queue_free(queue)) - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); - queue->tail = (queue->tail + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->tail, (queue->tail + 1) % PTP_MAX_TIMESTAMPS); spin_unlock_irqrestore(&queue->lock, flags); } diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 35fde0a05746..45f9002a5dca 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -85,9 +85,13 @@ struct ptp_vclock { * that a writer might concurrently increment the tail does not * matter, since the queue remains nonempty nonetheless. */ -static inline int queue_cnt(struct timestamp_event_queue *q) +static inline int queue_cnt(const struct timestamp_event_queue *q) { - int cnt = q->tail - q->head; + /* + * Paired with WRITE_ONCE() in enqueue_external_timestamp(), + * ptp_read(), extts_fifo_show(). + */ + int cnt = READ_ONCE(q->tail) - READ_ONCE(q->head); return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt; } diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index 7d023d9d0acb..f7a499a1bd39 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -94,7 +94,8 @@ static ssize_t extts_fifo_show(struct device *dev, qcnt = queue_cnt(queue); if (qcnt) { event = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); -- cgit From 3cffa2ddc4d3fcf70cde361236f5a614f81a09b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 18:01:02 +0000 Subject: bonding: stop the device in bond_setup_by_slave() Commit 9eed321cde22 ("net: lapbether: only support ethernet devices") has been able to keep syzbot away from net/lapb, until today. In the following splat [1], the issue is that a lapbether device has been created on a bonding device without members. Then adding a non ARPHRD_ETHER member forced the bonding master to change its type. 
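Conceptually, the hazard is that the device's link-layer identity changes while stacked devices still hold state derived from the old type; quiescing the device first lets those dependents detach before the type flips. A rough userspace model of the "remember up state, close, reconfigure, reopen" sequence used below (all types and helpers here are illustrative stand-ins, not the kernel's net_device API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins only -- not the kernel's net_device API. */
    struct mock_dev {
        bool up;
        int  type;
    };

    static void mock_close(struct mock_dev *d)
    {
        if (d->up)
            printf("close: stacked devices get notified and detach\n");
        d->up = false;
    }

    static void mock_open(struct mock_dev *d)
    {
        d->up = true;
        printf("reopened with new type %d\n", d->type);
    }

    static void setup_by_slave(struct mock_dev *bond, const struct mock_dev *slave)
    {
        bool was_up = bond->up;

        mock_close(bond);            /* quiesce before changing identity */
        bond->type = slave->type;    /* nothing is stacked on it any more */
        if (was_up)
            mock_open(bond);
    }

    int main(void)
    {
        struct mock_dev bond  = { .up = true, .type = 1 };   /* Ethernet-like */
        struct mock_dev slave = { .up = true, .type = 512 }; /* non-Ethernet type */

        setup_by_slave(&bond, &slave);
        return 0;
    }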
The fix is to make sure we call dev_close() in bond_setup_by_slave() so that the potential linked lapbether devices (or any other devices having assumptions on the physical device) are removed. A similar bug has been addressed in commit 40baec225765 ("bonding: fix panic on non-ARPHRD_ETHER enslave failure") [1] skbuff: skb_under_panic: text:ffff800089508810 len:44 put:40 head:ffff0000c78e7c00 data:ffff0000c78e7bea tail:0x16 end:0x140 dev:bond0 kernel BUG at net/core/skbuff.c:192 ! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 6007 Comm: syz-executor383 Not tainted 6.6.0-rc3-syzkaller-gbf6547d8715b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : skb_panic net/core/skbuff.c:188 [inline] pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 lr : skb_panic net/core/skbuff.c:188 [inline] lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 sp : ffff800096a06aa0 x29: ffff800096a06ab0 x28: ffff800096a06ba0 x27: dfff800000000000 x26: ffff0000ce9b9b50 x25: 0000000000000016 x24: ffff0000c78e7bea x23: ffff0000c78e7c00 x22: 000000000000002c x21: 0000000000000140 x20: 0000000000000028 x19: ffff800089508810 x18: ffff800096a06100 x17: 0000000000000000 x16: ffff80008a629a3c x15: 0000000000000001 x14: 1fffe00036837a32 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000201 x10: 0000000000000000 x9 : cb50b496c519aa00 x8 : cb50b496c519aa00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff800096a063b8 x4 : ffff80008e280f80 x3 : ffff8000805ad11c x2 : 0000000000000001 x1 : 0000000100000201 x0 : 0000000000000086 Call trace: skb_panic net/core/skbuff.c:188 [inline] skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 skb_push+0xf0/0x108 net/core/skbuff.c:2446 ip6gre_header+0xbc/0x738 net/ipv6/ip6_gre.c:1384 dev_hard_header include/linux/netdevice.h:3136 [inline] lapbeth_data_transmit+0x1c4/0x298 drivers/net/wan/lapbether.c:257 lapb_data_transmit+0x8c/0xb0 net/lapb/lapb_iface.c:447 lapb_transmit_buffer+0x178/0x204 net/lapb/lapb_out.c:149 lapb_send_control+0x220/0x320 net/lapb/lapb_subr.c:251 __lapb_disconnect_request+0x9c/0x17c net/lapb/lapb_iface.c:326 lapb_device_event+0x288/0x4e0 net/lapb/lapb_iface.c:492 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 lapbeth_device_event+0x2e4/0x958 drivers/net/wan/lapbether.c:466 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 bond_enslave+0x2298/0x30cc drivers/net/bonding/bond_main.c:2332 bond_do_ioctl+0x268/0xc64 drivers/net/bonding/bond_main.c:4539 dev_ifsioc+0x754/0x9ac dev_ioctl+0x4d8/0xd34 net/core/dev_ioctl.c:786 sock_do_ioctl+0x1d4/0x2d0 net/socket.c:1217 sock_ioctl+0x4e8/0x834 net/socket.c:1322 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 
[inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __arm64_sys_ioctl+0x14c/0x1c8 fs/ioctl.c:857 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 el0_svc+0x58/0x16c arch/arm64/kernel/entry-common.c:678 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Code: aa1803e6 aa1903e7 a90023f5 94785b8b (d4210000) Fixes: 872254dd6b1f ("net/bonding: Enable bonding to enslave non ARPHRD_ETHER") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Jay Vosburgh Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20231109180102.4085183-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 51d47eda1c87..8e6cc0e133b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1500,6 +1500,10 @@ done: static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { + bool was_up = !!(bond_dev->flags & IFF_UP); + + dev_close(bond_dev); + bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; @@ -1514,6 +1518,8 @@ static void bond_setup_by_slave(struct net_device *bond_dev, bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } + if (was_up) + dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress -- cgit From 510e35fb931ffc3b100e5d5ae4595cd3beca9f1a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:12 +0100 Subject: net: ethernet: cortina: Fix max RX frame define Enumerator 3 is 1548 bytes according to the datasheet. Not 1542. 
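For illustration, the driver selects a hardware MAXLEN enumerator by walking a table of frame-length limits; with the corrected entry, a frame of up to 1548 bytes maps to enumerator 3 instead of spilling into the jumbo settings. A simplified standalone model of that selection (the enumerator values reflect the defines touched by this patch, but the table and helper below are illustrative, not the driver's gmac_maxlens code):

    #include <stddef.h>
    #include <stdio.h>

    struct maxlen_entry {
        int max_l3_len;   /* largest frame this setting covers */
        int val;          /* CONFIG0_MAXLEN_* enumerator */
    };

    static const struct maxlen_entry maxlens[] = {
        { 1518, 1 },
        { 1522, 2 },
        { 1536, 0 },
        { 1548, 3 },      /* previously mislisted as 1542 */
        { 9212, 4 },
        { 10236, 5 },
    };

    /* Pick the smallest enumerator whose limit covers the frame. */
    static int pick_maxlen(int frame_len)
    {
        for (size_t i = 0; i < sizeof(maxlens) / sizeof(maxlens[0]); i++)
            if (frame_len <= maxlens[i].max_l3_len)
                return maxlens[i].val;
        return -1;
    }

    int main(void)
    {
        printf("1544-byte frame -> enumerator %d\n", pick_maxlen(1544)); /* 3 */
        return 0;
    }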
Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-1-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 4 ++-- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 5423fe26b4ef..562b0917316a 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -432,8 +432,8 @@ static const struct gmac_max_framelen gmac_maxlens[] = { .val = CONFIG0_MAXLEN_1536, }, { - .max_l3_len = 1542, - .val = CONFIG0_MAXLEN_1542, + .max_l3_len = 1548, + .val = CONFIG0_MAXLEN_1548, }, { .max_l3_len = 9212, diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 9fdf77d5eb37..99efb1155743 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -787,7 +787,7 @@ union gmac_config0 { #define CONFIG0_MAXLEN_1536 0 #define CONFIG0_MAXLEN_1518 1 #define CONFIG0_MAXLEN_1522 2 -#define CONFIG0_MAXLEN_1542 3 +#define CONFIG0_MAXLEN_1548 3 #define CONFIG0_MAXLEN_9k 4 /* 9212 */ #define CONFIG0_MAXLEN_10k 5 /* 10236 */ #define CONFIG0_MAXLEN_1518__6 6 -- cgit From d4d0c5b4d279bfe3585fbd806efefd3e51c82afa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:13 +0100 Subject: net: ethernet: cortina: Handle large frames The Gemini ethernet controller provides hardware checksumming for frames up to 1514 bytes including ethernet headers but not FCS. If we start sending bigger frames (after first bumping up the MTU on both interfaces sending and receiving the frames), truncated packets start to appear on the target such as in this tcpdump resulting from ping -s 1474: 23:34:17.241983 14:d6:4d:a8:3c:4f (oui Unknown) > bc:ae:c5:6b:a8:3d (oui Unknown), ethertype IPv4 (0x0800), length 1514: truncated-ip - 2 bytes missing! (tos 0x0, ttl 64, id 32653, offset 0, flags [DF], proto ICMP (1), length 1502) OpenWrt.lan > Fecusia: ICMP echo request, id 1672, seq 50, length 1482 If we bypass the hardware checksumming and provide a software fallback, everything starts working fine up to the max TX MTU of 2047 bytes, for example ping -s2000 192.168.1.2: 00:44:29.587598 bc:ae:c5:6b:a8:3d (oui Unknown) > 14:d6:4d:a8:3c:4f (oui Unknown), ethertype IPv4 (0x0800), length 2042: (tos 0x0, ttl 64, id 51828, offset 0, flags [none], proto ICMP (1), length 2028) Fecusia > OpenWrt.lan: ICMP echo reply, id 1683, seq 4, length 2008 The bit enabling to bypass hardware checksum (or any of the "TSS" bits) are undocumented in the hardware reference manual. The entire hardware checksum unit appears undocumented. The conclusion that we need to use the "bypass" bit was found by trial-and-error. Since no hardware checksum will happen, we slot in a software checksum fallback. Check for the condition where we need to compute checksum on the skb with either hardware or software using == CHECKSUM_PARTIAL instead of != CHECKSUM_NONE which is an incomplete check according to . 
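As background, the software fallback boils down to the standard Internet checksum, i.e. a 16-bit one's-complement sum with carry folding. A minimal userspace version of that calculation, shown only to illustrate what skb_checksum_help() ends up arranging; it is not the kernel implementation:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One's-complement Internet checksum over a byte buffer. */
    static uint16_t inet_checksum(const uint8_t *data, size_t len)
    {
        uint32_t sum = 0;

        while (len > 1) {
            sum += (uint32_t)data[0] << 8 | data[1];
            data += 2;
            len -= 2;
        }
        if (len)                      /* odd trailing byte */
            sum += (uint32_t)data[0] << 8;

        while (sum >> 16)             /* fold carries back in */
            sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)~sum;
    }

    int main(void)
    {
        /* Words 0x0001 0xf203 0xf4f5 0xf6f7 sum to 0xddf2 -> checksum 0x220d. */
        const uint8_t buf[] = { 0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7 };

        printf("checksum: 0x%04x\n", inet_checksum(buf, sizeof(buf)));
        return 0;
    }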
On the D-Link DIR-685 router this fixes a bug on the conduit interface to the RTL8366RB DSA switch: as the switch needs to add space for its tag it increases the MTU on the conduit interface to 1504 and that means that when the router sends packages of 1500 bytes these get an extra 4 bytes of DSA tag and the transfer fails because of the erroneous hardware checksumming, affecting such basic functionality as the LuCI web interface. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-2-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 562b0917316a..e5e73dee13ac 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1145,6 +1145,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, dma_addr_t mapping; unsigned short mtu; void *buffer; + int ret; mtu = ETH_HLEN; mtu += netdev->mtu; @@ -1159,9 +1160,30 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, word3 |= mtu; } - if (skb->ip_summed != CHECKSUM_NONE) { + if (skb->len >= ETH_FRAME_LEN) { + /* Hardware offloaded checksumming isn't working on frames + * bigger than 1514 bytes. A hypothesis about this is that the + * checksum buffer is only 1518 bytes, so when the frames get + * bigger they get truncated, or the last few bytes get + * overwritten by the FCS. + * + * Just use software checksumming and bypass on bigger frames. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + ret = skb_checksum_help(skb); + if (ret) + return ret; + } + word1 |= TSS_BYPASS_BIT; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int tcp = 0; + /* We do not switch off the checksumming on non TCP/UDP + * frames: as is shown from tests, the checksumming engine + * is smart enough to see that a frame is not actually TCP + * or UDP and then just pass it through without any changes + * to the frame. + */ if (skb->protocol == htons(ETH_P_IP)) { word1 |= TSS_IP_CHKSUM_BIT; tcp = ip_hdr(skb)->protocol == IPPROTO_TCP; -- cgit From dc6c0bfbaa947dd7976e30e8c29b10c868b6fa42 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:14 +0100 Subject: net: ethernet: cortina: Fix MTU max setting The RX max frame size is over 10000 for the Gemini ethernet, but the TX max frame size is actually just 2047 (0x7ff after checking the datasheet). Reflect this in what we offer to Linux, cap the MTU at the TX max frame minus ethernet headers. We delete the code disabling the hardware checksum for large MTUs as netdev->mtu can no longer be larger than netdev->max_mtu meaning the if()-clause in gmac_fix_features() is never true. 
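For reference, with the corrected TX limit the advertised MTU ceiling is simply the 0x7ff hardware frame-size mask minus the VLAN-tagged Ethernet header. A quick sketch of the arithmetic (constants restated here for illustration):

    #include <stdio.h>

    #define MTU_SIZE_BIT_MASK 0x7ff          /* max TX frame: 2047 bytes */
    #define ETH_HLEN          14
    #define VLAN_HLEN         4
    #define VLAN_ETH_HLEN     (ETH_HLEN + VLAN_HLEN)
    #define ETH_MIN_MTU       68

    int main(void)
    {
        printf("min_mtu = %d\n", ETH_MIN_MTU);
        printf("max_mtu = %d\n", MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN);  /* 2029 */
        return 0;
    }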
Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-3-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 17 ++++------------- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index e5e73dee13ac..78287cfcbf63 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2000,15 +2000,6 @@ static int gmac_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static netdev_features_t gmac_fix_features(struct net_device *netdev, - netdev_features_t features) -{ - if (netdev->mtu + ETH_HLEN + VLAN_HLEN > MTU_SIZE_BIT_MASK) - features &= ~GMAC_OFFLOAD_FEATURES; - - return features; -} - static int gmac_set_features(struct net_device *netdev, netdev_features_t features) { @@ -2234,7 +2225,6 @@ static const struct net_device_ops gmac_351x_ops = { .ndo_set_mac_address = gmac_set_mac_address, .ndo_get_stats64 = gmac_get_stats64, .ndo_change_mtu = gmac_change_mtu, - .ndo_fix_features = gmac_fix_features, .ndo_set_features = gmac_set_features, }; @@ -2486,11 +2476,12 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) netdev->hw_features = GMAC_OFFLOAD_FEATURES; netdev->features |= GMAC_OFFLOAD_FEATURES | NETIF_F_GRO; - /* We can handle jumbo frames up to 10236 bytes so, let's accept - * payloads of 10236 bytes minus VLAN and ethernet header + /* We can receive jumbo frames up to 10236 bytes but only + * transmit 2047 bytes so, let's accept payloads of 2047 + * bytes minus VLAN and ethernet header */ netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = 10236 - VLAN_ETH_HLEN; + netdev->max_mtu = MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN; port->freeq_refill = 0; netif_napi_add(netdev, &port->napi, gmac_napi_poll); diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 99efb1155743..24bb989981f2 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -502,7 +502,7 @@ union gmac_txdesc_3 { #define SOF_BIT 0x80000000 #define EOF_BIT 0x40000000 #define EOFIE_BIT BIT(29) -#define MTU_SIZE_BIT_MASK 0x1fff +#define MTU_SIZE_BIT_MASK 0x7ff /* Max MTU 2047 bytes */ /* GMAC Tx Descriptor */ struct gmac_txdesc { -- cgit From 0ab0c45d8aaea5192328bfa6989673aceafc767c Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:33:59 +0800 Subject: r8169: add handling DASH when DASH is disabled For devices that support DASH, even DASH is disabled, there may still exist a default firmware that will influence device behavior. So driver needs to handle DASH for devices that support DASH, no matter the DASH status is. This patch also prepares for "fix network lost after resume on DASH systems". 
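The distinction this change introduces is between "the chip has DASH hardware" (a static property of the MAC version) and "DASH is currently enabled" (runtime firmware state): firmware start/stop must key off the former, while wake and power-down decisions key off the latter. A condensed model of that split (types and helpers below are illustrative, not the driver's code):

    #include <stdbool.h>
    #include <stdio.h>

    enum dash_type { DASH_NONE, DASH_DP, DASH_EP };

    struct nic {
        enum dash_type dash_type;   /* does this chip version have DASH? */
        bool dash_enabled;          /* has the firmware actually enabled it? */
    };

    /* Firmware handshake is needed whenever DASH hardware exists at all. */
    static bool needs_dash_firmware_handling(const struct nic *n)
    {
        return n->dash_type != DASH_NONE;
    }

    /* Power-down / wake behaviour only changes when DASH is really enabled. */
    static bool skip_power_down(const struct nic *n)
    {
        return n->dash_enabled;
    }

    int main(void)
    {
        struct nic n = { .dash_type = DASH_EP, .dash_enabled = false };

        printf("handle firmware: %d, skip power down: %d\n",
               needs_dash_firmware_handling(&n), skip_power_down(&n));
        return 0;
    }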
Fixes: ee7a1beb9759 ("r8169:call "rtl8168_driver_start" "rtl8168_driver_stop" only when hardware dash function is enabled") Cc: stable@vger.kernel.org Signed-off-by: ChunHao Lin Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20231109173400.4573-2-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 36 +++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0c76c162b8a9..cfcb40d90920 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -624,6 +624,7 @@ struct rtl8169_private { unsigned supports_gmii:1; unsigned aspm_manageable:1; + unsigned dash_enabled:1; dma_addr_t counters_phys_addr; struct rtl8169_counters *counters; struct rtl8169_tc_offsets tc_offset; @@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp) return r8168ep_ocp_read(tp, 0x128) & BIT(0); } -static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp) +static bool rtl_dash_is_enabled(struct rtl8169_private *tp) +{ + switch (tp->dash_type) { + case RTL_DASH_DP: + return r8168dp_check_dash(tp); + case RTL_DASH_EP: + return r8168ep_check_dash(tp); + default: + return false; + } +} + +static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: - return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE; + return RTL_DASH_DP; case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53: - return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE; + return RTL_DASH_EP; default: return RTL_DASH_NONE; } @@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) device_set_wakeup_enable(tp_to_dev(tp), wolopts); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, !wolopts); tp->dev->wol_enabled = wolopts ? 
1 : 0; } @@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp) static void rtl_prepare_power_down(struct rtl8169_private *tp) { - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return; if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@ -4869,7 +4882,7 @@ static int rtl8169_runtime_idle(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return -EBUSY; if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev)) @@ -4895,8 +4908,7 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - if (system_state == SYSTEM_POWER_OFF && - tp->dash_type == RTL_DASH_NONE) { + if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) { pci_wake_from_d3(pdev, tp->saved_wolopts); pci_set_power_state(pdev, PCI_D3hot); } @@ -5254,7 +5266,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1); tp->aspm_manageable = !rc; - tp->dash_type = rtl_check_dash(tp); + tp->dash_type = rtl_get_dash_type(tp); + tp->dash_enabled = rtl_dash_is_enabled(tp); tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; @@ -5325,7 +5338,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* configure chip for default features */ rtl8169_set_features(dev, dev->features); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, true); } else { rtl_set_d3_pll_down(tp, false); @@ -5365,7 +5378,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "ok" : "ko"); if (tp->dash_type != RTL_DASH_NONE) { - netdev_info(dev, "DASH enabled\n"); + netdev_info(dev, "DASH %s\n", + tp->dash_enabled ? "enabled" : "disabled"); rtl8168_driver_start(tp); } -- cgit From 868c3b95afef4883bfb66c9397482da6840b5baf Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:34:00 +0800 Subject: r8169: fix network lost after resume on DASH systems Device that support DASH may be reseted or powered off during suspend. So driver needs to handle DASH during system suspend and resume. Or DASH firmware will influence device behavior and causes network lost. Fixes: b646d90053f8 ("r8169: magic.") Cc: stable@vger.kernel.org Reviewed-by: Heiner Kallweit Signed-off-by: ChunHao Lin Link: https://lore.kernel.org/r/20231109173400.4573-3-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index cfcb40d90920..b9bb1d2f0237 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4661,10 +4661,16 @@ static void rtl8169_down(struct rtl8169_private *tp) rtl8169_cleanup(tp); rtl_disable_exit_l1(tp); rtl_prepare_power_down(tp); + + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_stop(tp); } static void rtl8169_up(struct rtl8169_private *tp) { + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_start(tp); + pci_set_master(tp->pci_dev); phy_init_hw(tp->phydev); phy_resume(tp->phydev); -- cgit From b28060db7172e6d8912d88b369123eb89e0d36b4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 11:12:49 +0200 Subject: ovl: fix misformatted comment Remove misleading /** prefix from a regular comment. 
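For anyone unfamiliar with the convention: a comment opening with /** is treated as kernel-doc and is expected to document the symbol that follows in kernel-doc form, while an ordinary note opens with /*. A tiny illustration (the declarations are made-up examples, not overlayfs code):

    /**
     * example_put - kernel-doc style, parsed by scripts/kernel-doc
     * @x: value to release
     *
     * Only use this opener when the block really follows kernel-doc format.
     */
    void example_put(int x);

    /*
     * Plain comment: free-form notes, never parsed as kernel-doc.
     */
    void example_note(int x);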
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311121628.byHp8tkv-lkp@intel.com/ Signed-off-by: Amir Goldstein --- fs/overlayfs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 50a201e9cd39..c3f020ca13a8 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -978,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, return 0; } -/** +/* * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. */ -- cgit From 37f32f52643869131ec01bb69bdf9f404f6109fb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 10:11:25 +0200 Subject: ovl: fix memory leak in ovl_parse_param() On failure to parse parameters in ovl_parse_param_lowerdir(), it is necessary to update ctx->nr with the correct nr before using ovl_reset_lowerdirs() to release l->name. Reported-and-tested-by: syzbot+26eedf3631650972f17c@syzkaller.appspotmail.com Fixes: c835110b588a ("ovl: remove unused code in lowerdir param parsing") Co-authored-by: Edward Adam Davis Signed-off-by: Amir Goldstein --- fs/overlayfs/params.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index ddab9ea267d1..3fe2dde1598f 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -430,7 +430,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; char *dup = NULL, *iter; - ssize_t nr_lower = 0, nr = 0, nr_data = 0; + ssize_t nr_lower, nr; bool data_layer = false; /* @@ -482,6 +482,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; l = ctx->lower; for (nr = 0; nr < nr_lower; nr++, l++) { + ctx->nr++; memset(l, 0, sizeof(*l)); err = ovl_mount_dir(iter, &l->path); @@ -498,10 +499,10 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_put; if (data_layer) - nr_data++; + ctx->nr_data++; /* Calling strchr() again would overrun. */ - if ((nr + 1) == nr_lower) + if (ctx->nr == nr_lower) break; err = -EINVAL; @@ -511,7 +512,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * This is a regular layer so we require that * there are no data layers. */ - if ((ctx->nr_data + nr_data) > 0) { + if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); goto out_put; } @@ -524,8 +525,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) data_layer = true; iter++; } - ctx->nr = nr_lower; - ctx->nr_data += nr_data; kfree(dup); return 0; -- cgit From 686464514fbebb6c8de4415238319e414c3500a4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:58:05 +0200 Subject: xen/events: reduce externally visible helper functions get_evtchn_to_irq() has only one external user while irq_from_evtchn() provides the same functionality and is exported for a wider user base. Modify the only external user of get_evtchn_to_irq() to use irq_from_evtchn() instead and make get_evtchn_to_irq() static. evtchn_from_irq() and irq_from_virq() have a single external user and can easily be combined to a new helper irq_evtchn_from_virq() allowing to drop irq_from_virq() and to make evtchn_from_irq() static. 
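The new helper follows a common C idiom: return the primary value and hand the secondary one back through an output parameter, so a single lookup replaces two exported accessors. A generic standalone sketch of the pattern (the tables and names are illustrative only, not the Xen event-channel code):

    #include <stdio.h>

    #define NR_SLOTS 4

    static int slot_to_irq[NR_SLOTS] = { 12, 13, -1, 15 };
    static int irq_to_event[32]      = { [12] = 100, [13] = 101, [15] = 103 };

    /* Return the irq, and report the bound event through *event. */
    static int irq_event_from_slot(unsigned int slot, int *event)
    {
        int irq = slot_to_irq[slot];

        *event = (irq >= 0) ? irq_to_event[irq] : -1;
        return irq;
    }

    int main(void)
    {
        int event;
        int irq = irq_event_from_slot(1, &event);

        printf("irq=%d event=%d\n", irq, event);  /* irq=13 event=101 */
        return 0;
    }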
Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_2l.c | 8 ++++---- drivers/xen/events/events_base.c | 13 +++++++++---- drivers/xen/events/events_internal.h | 1 - include/xen/events.h | 4 ++-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index b8f2f971c2f0..e3585330cf98 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -171,11 +171,11 @@ static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) int i; struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + evtchn_port_t evtchn; /* Timer interrupt has highest priority. */ - irq = irq_from_virq(cpu, VIRQ_TIMER); + irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn); if (irq != -1) { - evtchn_port_t evtchn = evtchn_from_irq(irq); word_idx = evtchn / BITS_PER_LONG; bit_idx = evtchn % BITS_PER_LONG; if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) @@ -328,9 +328,9 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", + printk(" %d: event %d -> irq %u%s%s%s\n", cpu_from_evtchn(i), i, - get_evtchn_to_irq(i), + irq_from_evtchn(i), sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) ? "" : " l2-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c5d86128eb73..a74ea28dcc3d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,7 +248,7 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -int get_evtchn_to_irq(evtchn_port_t evtchn) +static int get_evtchn_to_irq(evtchn_port_t evtchn) { if (evtchn >= xen_evtchn_max_channels()) return -1; @@ -415,7 +415,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) /* * Accessors for packed IRQ information. 
*/ -evtchn_port_t evtchn_from_irq(unsigned irq) +static evtchn_port_t evtchn_from_irq(unsigned int irq) { const struct irq_info *info = NULL; @@ -433,9 +433,14 @@ unsigned int irq_from_evtchn(evtchn_port_t evtchn) } EXPORT_SYMBOL_GPL(irq_from_evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq) +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn) { - return per_cpu(virq_to_irq, cpu)[virq]; + int irq = per_cpu(virq_to_irq, cpu)[virq]; + + *evtchn = evtchn_from_irq(irq); + + return irq; } static enum ipi_vector ipi_from_irq(unsigned irq) diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 4d3398eff9cd..19ae31695edc 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -33,7 +33,6 @@ struct evtchn_ops { extern const struct evtchn_ops *evtchn_ops; -int get_evtchn_to_irq(evtchn_port_t evtchn); void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl); unsigned int cpu_from_evtchn(evtchn_port_t evtchn); diff --git a/include/xen/events.h b/include/xen/events.h index a129cafa80ed..3b07409f8032 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -100,8 +100,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout); /* Determine the IRQ which is bound to an event channel */ unsigned int irq_from_evtchn(evtchn_port_t evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq); -evtchn_port_t evtchn_from_irq(unsigned irq); +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn); int xen_set_callback_via(uint64_t via); int xen_evtchn_do_upcall(void); -- cgit From 3bdb0ac350fe5e6301562143e4573971dd01ae0b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:24:46 +0200 Subject: xen/events: remove some simple helpers from events_base.c The helper functions type_from_irq() and cpu_from_irq() are just one line functions used only internally. Open code them where needed. At the same time modify and rename get_evtchn_to_irq() to return a struct irq_info instead of the IRQ number. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 97 ++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a74ea28dcc3d..a810e8904fbf 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,15 +248,6 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -static int get_evtchn_to_irq(evtchn_port_t evtchn) -{ - if (evtchn >= xen_evtchn_max_channels()) - return -1; - if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) - return -1; - return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); -} - /* Get info for IRQ */ static struct irq_info *info_for_irq(unsigned irq) { @@ -274,6 +265,19 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info) irq_set_chip_data(irq, info); } +static struct irq_info *evtchn_to_info(evtchn_port_t evtchn) +{ + int irq; + + if (evtchn >= xen_evtchn_max_channels()) + return NULL; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return NULL; + irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); + + return (irq < 0) ? 
NULL : info_for_irq(irq); +} + /* Per CPU channel accounting */ static void channels_on_cpu_dec(struct irq_info *info) { @@ -429,7 +433,9 @@ static evtchn_port_t evtchn_from_irq(unsigned int irq) unsigned int irq_from_evtchn(evtchn_port_t evtchn) { - return get_evtchn_to_irq(evtchn); + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? info->irq : -1; } EXPORT_SYMBOL_GPL(irq_from_evtchn); @@ -473,25 +479,11 @@ static unsigned pirq_from_irq(unsigned irq) return info->u.pirq.pirq; } -static enum xen_irq_type type_from_irq(unsigned irq) -{ - return info_for_irq(irq)->type; -} - -static unsigned cpu_from_irq(unsigned irq) -{ - return info_for_irq(irq)->cpu; -} - unsigned int cpu_from_evtchn(evtchn_port_t evtchn) { - int irq = get_evtchn_to_irq(evtchn); - unsigned ret = 0; - - if (irq != -1) - ret = cpu_from_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - return ret; + return info ? info->cpu : 0; } static void do_mask(struct irq_info *info, u8 reason) @@ -540,13 +532,12 @@ static bool pirq_needs_eoi_flag(unsigned irq) static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, bool force_affinity) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - BUG_ON(irq == -1); + BUG_ON(info == NULL); if (IS_ENABLED(CONFIG_SMP) && force_affinity) { - struct irq_data *data = irq_get_irq_data(irq); + struct irq_data *data = irq_get_irq_data(info->irq); irq_data_update_affinity(data, cpumask_of(cpu)); irq_data_update_effective_affinity(data, cpumask_of(cpu)); @@ -979,13 +970,13 @@ static void __unbind_from_irq(unsigned int irq) } if (VALID_EVTCHN(evtchn)) { - unsigned int cpu = cpu_from_irq(irq); + unsigned int cpu = info->cpu; struct xenbus_device *dev; if (!info->is_static) xen_evtchn_close(evtchn); - switch (type_from_irq(irq)) { + switch (info->type) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; break; @@ -1185,15 +1176,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, { int irq; int ret; + struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); + info = evtchn_to_info(evtchn); - if (irq == -1) { + if (!info) { irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; @@ -1216,9 +1208,9 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, */ bind_evtchn_to_cpu(evtchn, 0, false); } else { - struct irq_info *info = info_for_irq(irq); - if (!WARN_ON(!info || info->type != IRQT_EVTCHN)) + if (!WARN_ON(info->type != IRQT_EVTCHN)) info->refcnt++; + irq = info->irq; } out: @@ -1556,13 +1548,7 @@ EXPORT_SYMBOL_GPL(xen_set_irq_priority); int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info; - - if (irq == -1) - return -ENOENT; - - info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); if (!info) return -ENOENT; @@ -1578,7 +1564,6 @@ EXPORT_SYMBOL_GPL(evtchn_make_refcounted); int evtchn_get(evtchn_port_t evtchn) { - int irq; struct irq_info *info; int err = -ENOENT; @@ -1587,11 +1572,7 @@ int evtchn_get(evtchn_port_t evtchn) mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); - if (irq == -1) - goto done; - - info = info_for_irq(irq); + info = evtchn_to_info(evtchn); if (!info) goto done; @@ -1611,10 +1592,11 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(evtchn_port_t evtchn) { - int 
irq = get_evtchn_to_irq(evtchn); - if (WARN_ON(irq == -1)) + struct irq_info *info = evtchn_to_info(evtchn); + + if (WARN_ON(!info)) return; - unbind_from_irq(irq); + unbind_from_irq(info->irq); } EXPORT_SYMBOL_GPL(evtchn_put); @@ -1644,12 +1626,10 @@ struct evtchn_loop_ctrl { void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) { - int irq; - struct irq_info *info; + struct irq_info *info = evtchn_to_info(port); struct xenbus_device *dev; - irq = get_evtchn_to_irq(port); - if (irq == -1) + if (!info) return; /* @@ -1674,7 +1654,6 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) } } - info = info_for_irq(irq); if (xchg_acquire(&info->is_active, 1)) return; @@ -1688,7 +1667,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) info->eoi_time = get_jiffies_64() + event_eoi_delay; } - generic_handle_irq(irq); + generic_handle_irq(info->irq); } int xen_evtchn_do_upcall(void) @@ -1746,7 +1725,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(get_evtchn_to_irq(evtchn) != -1); + BUG_ON(evtchn_to_info(evtchn)); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); -- cgit From 4b7b492615cf3017190f55444f7016812b66611d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Nov 2023 13:49:38 +0000 Subject: af_unix: fix use-after-free in unix_stream_read_actor() syzbot reported the following crash [1] After releasing unix socket lock, u->oob_skb can be changed by another thread. We must temporarily increase skb refcount to make sure this other thread will not free the skb under us. [1] BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 Read of size 4 at addr ffff88801f3b9cc4 by task syz-executor107/5297 CPU: 1 PID: 5297 Comm: syz-executor107 Not tainted 6.6.0-syzkaller-15910-gb8e3a87a627b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 unix_stream_recv_urg net/unix/af_unix.c:2587 [inline] unix_stream_read_generic+0x19a5/0x2480 net/unix/af_unix.c:2666 unix_stream_recvmsg+0x189/0x1b0 net/unix/af_unix.c:2903 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0xe2/0x170 net/socket.c:1066 ____sys_recvmsg+0x21f/0x5c0 net/socket.c:2803 ___sys_recvmsg+0x115/0x1a0 net/socket.c:2845 __sys_recvmsg+0x114/0x1e0 net/socket.c:2875 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fc67492c559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fc6748ab228 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 000000000000001c RCX: 00007fc67492c559 RDX: 0000000040010083 RSI: 0000000020000140 RDI: 0000000000000004 RBP: 00007fc6749b6348 R08: 00007fc6748ab6c0 R09: 00007fc6748ab6c0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc6749b6340 R13: 00007fc6749b634c R14: 00007ffe9fac52a0 
R15: 00007ffe9fac5388 Allocated by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 __kasan_slab_alloc+0x81/0x90 mm/kasan/common.c:328 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x180/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] kmem_cache_free+0xf8/0x340 mm/slub.c:3831 kfree_skbmem+0xef/0x1b0 net/core/skbuff.c:1015 __kfree_skb net/core/skbuff.c:1073 [inline] consume_skb net/core/skbuff.c:1288 [inline] consume_skb+0xdf/0x170 net/core/skbuff.c:1282 queue_oob net/unix/af_unix.c:2178 [inline] unix_stream_sendmsg+0xd49/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b The buggy address belongs to the object at ffff88801f3b9c80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff88801f3b9c80, ffff88801f3b9d70) The buggy address belongs to the physical page: page:ffffea00007cee40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1f3b9 flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888142a60640 dead000000000122 0000000000000000 raw: 0000000000000000 00000000000c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 5299, tgid 5283 (syz-executor107), ts 103803840339, free_ts 103600093431 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x2cf/0x340 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1544 [inline] get_page_from_freelist+0xa25/0x36c0 mm/page_alloc.c:3312 __alloc_pages+0x1d0/0x4a0 mm/page_alloc.c:4568 alloc_pages_mpol+0x258/0x5f0 mm/mempolicy.c:2133 alloc_slab_page mm/slub.c:1870 [inline] allocate_slab+0x251/0x380 mm/slub.c:2017 new_slab mm/slub.c:2070 [inline] ___slab_alloc+0x8c7/0x1580 
mm/slub.c:3223 __slab_alloc.constprop.0+0x56/0xa0 mm/slub.c:3322 __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] kmem_cache_alloc_node+0x132/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1137 [inline] free_unref_page_prepare+0x4f8/0xa90 mm/page_alloc.c:2347 free_unref_page+0x33/0x3b0 mm/page_alloc.c:2487 __unfreeze_partials+0x21d/0x240 mm/slub.c:2655 qlink_free mm/kasan/quarantine.c:168 [inline] qlist_free_all+0x6a/0x170 mm/kasan/quarantine.c:187 kasan_quarantine_reduce+0x18e/0x1d0 mm/kasan/quarantine.c:294 __kasan_slab_alloc+0x65/0x90 mm/kasan/common.c:305 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x15d/0x380 mm/slub.c:3502 vm_area_dup+0x21/0x2f0 kernel/fork.c:500 __split_vma+0x17d/0x1070 mm/mmap.c:2365 split_vma mm/mmap.c:2437 [inline] vma_modify+0x25d/0x450 mm/mmap.c:2472 vma_modify_flags include/linux/mm.h:3271 [inline] mprotect_fixup+0x228/0xc80 mm/mprotect.c:635 do_mprotect_pkey+0x852/0xd60 mm/mprotect.c:809 __do_sys_mprotect mm/mprotect.c:830 [inline] __se_sys_mprotect mm/mprotect.c:827 [inline] __x64_sys_mprotect+0x78/0xb0 mm/mprotect.c:827 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff88801f3b9b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801f3b9c00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff88801f3b9c80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801f3b9d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff88801f3b9d80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 876c14ad014d ("af_unix: fix holding spinlock in oob handling") Reported-and-tested-by: syzbot+7a2d546fa43e49315ed3@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Rao Shoaib Reviewed-by: Rao shoaib Link: https://lore.kernel.org/r/20231113134938.168151-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 45506a95b25f..a357dc5f2404 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2581,15 +2581,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); - + else + skb_get(oob_skb); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); - if (!(state->flags & MSG_PEEK)) { + if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; - kfree_skb(oob_skb); - } + + consume_skb(oob_skb); mutex_unlock(&u->iolock); -- cgit From 67059b61597c004e23b074547b40604222bee3a0 Mon Sep 17 00:00:00 2001 From: 
Yang Li Date: Wed, 1 Nov 2023 09:33:51 +0800 Subject: netfilter: nft_set_rbtree: Remove unused variable nft_net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that uses nft_net has been removed, and the nft_pernet function is merely obtaining a reference to shared data through the net pointer. The content of the net pointer is not modified or changed, so both of them should be removed. silence the warning: net/netfilter/nft_set_rbtree.c:627:26: warning: variable ‘nft_net’ set but not used Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7103 Signed-off-by: Yang Li Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6f1186abd47b..baa3fea4fe65 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -624,14 +624,12 @@ static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; - struct nftables_pernet *nft_net; struct rb_node *node, *next; struct nft_trans_gc *gc; struct net *net; set = nft_set_container_of(priv); net = read_pnet(&set->net); - nft_net = nft_pernet(net); gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) -- cgit From a44af08e3d4d7566eeea98d7a29fe06e7b9de944 Mon Sep 17 00:00:00 2001 From: Linkui Xiao Date: Wed, 1 Nov 2023 11:20:18 +0800 Subject: netfilter: nf_conntrack_bridge: initialize err to 0 K2CI reported a problem: consume_skb(skb); return err; [nf_br_ip_fragment() error] uninitialized symbol 'err'. err is not initialized, because returning 0 is expected, initialize err to 0. Fixes: 3c171f496ef5 ("netfilter: bridge: add connection tracking system") Reported-by: k2ci Signed-off-by: Linkui Xiao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_conntrack_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index b5c406a6e765..abb090f94ed2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -37,7 +37,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; - int err; + int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && -- cgit From c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Nov 2023 09:42:51 +0300 Subject: netfilter: nf_tables: fix pointer math issue in nft_byteorder_eval() The problem is in nft_byteorder_eval() where we are iterating through a loop and writing to dst[0], dst[1], dst[2] and so on... On each iteration we are writing 8 bytes. But dst[] is an array of u32 so each element only has space for 4 bytes. That means that every iteration overwrites part of the previous element. I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter: nf_tables: prevent OOB access in nft_byteorder_eval") which is a related issue. I think that the reason we have not detected this bug in testing is that most of time we only write one element. 
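The effect is easiest to see with plain pointer arithmetic: indexing a u32 array advances 4 bytes per element, so writing 64-bit results at dst[0], dst[1], ... through that type makes each store overlap half of the previous one, while going through a u64-typed pointer gives the intended 8-byte stride. A standalone demonstration (illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t storage[4];                    /* properly sized, aligned backing */
        uint32_t *dst   = (uint32_t *)storage;  /* how the register area is typed */
        uint64_t *dst64 = storage;              /* the cast the fix introduces */

        /* dst[i] advances 4 bytes per step: 64-bit stores would overlap. */
        printf("u32 stride: %td bytes\n", (char *)&dst[1] - (char *)&dst[0]);

        /* dst64[i] advances 8 bytes per step: each store gets its own slot. */
        printf("u64 stride: %td bytes\n", (char *)&dst64[1] - (char *)&dst64[0]);

        return 0;
    }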
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nft_byteorder.c | 5 +++-- net/netfilter/nft_meta.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3bbd13ab1ecf..b157c5cafd14 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e596d1a842f7..f6e791a68101 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -38,13 +38,14 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], + nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; @@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr, for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f7da7c43333b..ba0d3683a45d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); -- cgit From a7d5a955bfa854ac6b0c53aaf933394b4e6139e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 20:34:56 +0100 Subject: netfilter: nf_tables: bogus ENOENT when destroying element which does not exist destroy element command bogusly reports ENOENT in case a set element does not exist. ENOENT errors are skipped, however, err is still set and propagated to userspace. 
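The transcript below shows the spurious failure. The control-flow issue reduces to a loop that deliberately tolerates ENOENT yet still returns the stale error once the loop ends; returning 0 after a completed walk, and bailing out only on real errors, is the corrected shape. A minimal standalone sketch of that logic (illustrative, not the nf_tables code):

    #include <errno.h>
    #include <stdio.h>

    static int delete_one(int key)
    {
        /* Pretend odd keys don't exist and even keys delete fine. */
        return (key & 1) ? -ENOENT : 0;
    }

    static int delete_all(const int *keys, int n, int skip_enoent)
    {
        for (int i = 0; i < n; i++) {
            int err = delete_one(keys[i]);

            if (err == -ENOENT && skip_enoent)
                continue;          /* tolerated: keep going */
            if (err < 0)
                return err;        /* real failure: report it */
        }
        return 0;                  /* never leak a tolerated error */
    }

    int main(void)
    {
        int keys[] = { 2, 3, 4 };

        printf("destroy result: %d\n", delete_all(keys, 3, 1));  /* 0 */
        return 0;
    }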
# nft destroy element ip raw BLACKLIST { 1.2.3.4 } Error: Could not process rule: No such file or directory destroy element ip raw BLACKLIST { 1.2.3.4 } ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Fixes: f80a612dd77c ("netfilter: nf_tables: add support to destroy operation") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a761ee6796f6..debea1c67701 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7263,10 +7263,11 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (err < 0) { NL_SET_BAD_ATTR(extack, attr); - break; + return err; } } - return err; + + return 0; } /* -- cgit From 28628fa952fefc7f2072ce6e8016968cc452b1ba Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 13 Nov 2023 21:13:23 +0100 Subject: netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to force ip_set_swap() to wait for all readers to finish accessing the old set pointers by calling synchronize_rcu(). The first version of the patch was written by Linkui Xiao . v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. v3: Florian Westphal pointed out that all netfilter hooks run with rcu_read_lock() held and em_ipset.c wraps the entire ip_set_test() in rcu read lock/unlock pair. So there's no need to extend the rcu read locked area in ipset itself. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.org/ Reported by: Linkui Xiao Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 35d2f9c9ada0..4c133e06be1d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -708,15 +710,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -1397,6 +1394,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } -- cgit From 8837ba3e58ea1e3d09ae36db80b1e80853aada95 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 21:13:31 +0100 Subject: netfilter: nf_tables: split async and sync catchall in two functions list_for_each_entry_safe() does not work for the async case which runs under RCU, therefore, split GC logic for catchall in two functions instead, one for each of the sync and async GC variants. The catchall sync GC variant never sees a _DEAD bit set on ever, thus, this handling is removed in such case, moreover, allocate GC sync batch via GFP_KERNEL. Fixes: 93995bf4af2c ("netfilter: nf_tables: remove catchall element in GC sync path") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 55 +++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index debea1c67701..c0a42989b982 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9680,16 +9680,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq, - bool sync) +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) { - struct nft_set_elem_catchall *catchall, *next; + struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; - struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9699,35 +9697,42 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - if (sync) - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - else - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); - + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; - elem_priv = catchall->elem; - if (sync) { - nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); - nft_setelem_catchall_destroy(catchall); - } - - nft_trans_gc_elem_add(gc, elem_priv); + nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } -struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, - unsigned int gc_seq) -{ - return nft_trans_gc_catchall(gc, gc_seq, false); -} - struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { - return nft_trans_gc_catchall(gc, 0, true); + struct nft_set_elem_catchall *catchall, *next; + const struct nft_set *set = gc->set; + struct 
nft_elem_priv *elem_priv; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return NULL; + + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + nft_trans_gc_elem_add(gc, elem_priv); + } + + return gc; } static void nf_tables_module_autoload_cleanup(struct net *net) -- cgit From b944aa9d86d5f782bfe5e51336434c960304839c Mon Sep 17 00:00:00 2001 From: Matus Malych Date: Tue, 14 Nov 2023 14:35:25 +0100 Subject: ALSA: hda/realtek: Enable Mute LED on HP 255 G10 HP 255 G10 has a mute LED that can be made to work using quirk ALC236_FIXUP_HP_MUTE_LED_COEFBIT2. Enable already existing quirk - at correct line to keep order Signed-off-by: Matus Malych Cc: Link: https://lore.kernel.org/r/20231114133524.11340-1-matus@malych.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cdd808e02b44..3c85b8247c11 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9868,6 +9868,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b2f, "HP 255 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8b42, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), -- cgit From 2e6ef8aaba6b709ce91164401fa1c12668510360 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 14 Nov 2023 07:57:53 -0600 Subject: Remove myself as maintainer of GFS2 I am retiring from Red Hat and will no longer be a maintainer of the gfs2 file system. Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..5c9f868e13b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8950,7 +8950,6 @@ S: Maintained F: scripts/get_maintainer.pl GFS2 FILE SYSTEM -M: Bob Peterson M: Andreas Gruenbacher L: gfs2@lists.linux.dev S: Supported -- cgit From 782ce431613cf08c3a00dca42ad925c3b1108d09 Mon Sep 17 00:00:00 2001 From: Konstantin Runov Date: Mon, 30 Oct 2023 12:45:08 +0300 Subject: gcc-plugins: latent_entropy: Fix typo (args -> argc) in plugin description Fix the typo in the plugin description comment. Clearly, "argc" should be used. 
Signed-off-by: Konstantin Runov Link: https://lore.kernel.org/r/20231030094508.245432-1-runebone1@gmail.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/latent_entropy_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index 39e86be60dd2..ff0b192be91f 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -17,7 +17,7 @@ * if (argc <= 1) * printf("%s: no command arguments :(\n", *argv); * else - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * after: @@ -47,7 +47,7 @@ * // perturb_local_entropy() * } else { * local_entropy ^= 3896280633962944730; - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * // latent_entropy_execute() 4. -- cgit From 29954d5b1e0d67a4cd61c30c2201030c97e94b1e Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:54:12 +0000 Subject: cifs: fix leak of iface for primary channel My last change in this area introduced a change which accounted for primary channel in the interface ref count. However, it did not reduce this ref count on deallocation of the primary channel. i.e. during umount. Fixing this leak here, by dropping this ref count for primary channel while freeing up the session. Fixes: fa1d0508bdd4 ("cifs: account for primary channel in the interface list") Cc: stable@vger.kernel.org Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 57c2a7df3457..f896f60c924b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2065,6 +2065,12 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) ses->chans[i].server = NULL; } + /* we now account for primary channel in iface->refcount */ + if (ses->chans[0].iface) { + kref_put(&ses->chans[0].iface->refcount, release_iface); + ses->chans[0].server = NULL; + } + sesInfoFree(ses); cifs_put_tcp_session(server, 0); } -- cgit From 5eef12c4e3230f2025dc46ad8c4a3bc19978e5d7 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:58:23 +0000 Subject: cifs: fix lock ordering while disabling multichannel The code to handle the case of server disabling multichannel was picking iface_lock with chan_lock held. This goes against the lock ordering rules, as iface_lock is a higher order lock (even if it isn't so obvious). This change fixes the lock ordering by doing the following in that order for each secondary channel: 1. store iface and server pointers in local variable 2. remove references to iface and server in channels 3. unlock chan_lock 4. lock iface_lock 5. dec ref count for iface 6. unlock iface_lock 7. dec ref count for server 8. lock chan_lock again Since this function can only be called in smb2_reconnect, and that cannot be called by two parallel processes, we should not have races due to dropping chan_lock between steps 3 and 8. 
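For illustration, the same take-refs/drop-lower-lock/take-higher-lock shape in a minimal userspace sketch (hypothetical names, not the cifs code; pthread mutexes stand in for the spinlocks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chan_lock = PTHREAD_MUTEX_INITIALIZER;  /* lower-order lock */
static pthread_mutex_t iface_lock = PTHREAD_MUTEX_INITIALIZER; /* higher-order lock */

struct chan { const char *iface; const char *server; };

/* called with chan_lock held; returns with chan_lock held again */
static void disable_channel(struct chan *c)
{
	const char *iface = c->iface;          /* 1. stash the pointers */
	const char *server = c->server;

	c->iface = NULL;                       /* 2. clear the channel refs */
	c->server = NULL;
	pthread_mutex_unlock(&chan_lock);      /* 3. drop the lower-order lock */

	pthread_mutex_lock(&iface_lock);       /* 4. higher-order lock is now safe */
	if (iface)
		printf("put iface ref %s\n", iface);   /* 5. */
	pthread_mutex_unlock(&iface_lock);     /* 6. */

	if (server)
		printf("put server ref %s\n", server); /* 7. outside both locks */

	pthread_mutex_lock(&chan_lock);        /* 8. re-take chan_lock for the caller */
}

int main(void)
{
	struct chan c = { "iface0", "server0" };

	pthread_mutex_lock(&chan_lock);
	disable_channel(&c);
	pthread_mutex_unlock(&chan_lock);
	return 0;
}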
Fixes: ee1d21794e55 ("cifs: handle when server stops supporting multichannel") Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/sess.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0bb2ac929061..8b2d7c1ca428 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -322,28 +322,32 @@ cifs_disable_secondary_channels(struct cifs_ses *ses) iface = ses->chans[i].iface; server = ses->chans[i].server; + /* + * remove these references first, since we need to unlock + * the chan_lock here, since iface_lock is a higher lock + */ + ses->chans[i].iface = NULL; + ses->chans[i].server = NULL; + spin_unlock(&ses->chan_lock); + if (iface) { spin_lock(&ses->iface_lock); kref_put(&iface->refcount, release_iface); - ses->chans[i].iface = NULL; iface->num_channels--; if (iface->weight_fulfilled) iface->weight_fulfilled--; spin_unlock(&ses->iface_lock); } - spin_unlock(&ses->chan_lock); - if (server && !server->terminate) { - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - } - spin_lock(&ses->chan_lock); - if (server) { - ses->chans[i].server = NULL; + if (!server->terminate) { + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + } cifs_put_tcp_session(server, false); } + spin_lock(&ses->chan_lock); } done: -- cgit From eab03c23c2a162085b13200d7942fc5a00b5ccc8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Tue, 7 Nov 2023 17:05:07 +0800 Subject: sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com --- kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 128 insertions(+), 23 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b..025d90925bf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3666,41 +3666,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* + * VRUNTIME + * ======== + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. 
+ * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. + * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + if (avruntime != se->vruntime) { + vlag = (s64)(avruntime - se->vruntime); + vlag = div_s64(vlag * old_weight, weight); + se->vruntime = avruntime - vlag; + } + + /* + * DEADLINE + * ======== + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + vslice = (s64)(se->deadline - avruntime); + vslice = div_s64(vslice * old_weight, weight); + se->deadline = avruntime + vslice; +} + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - unsigned long old_weight = se->load.weight; + bool curr = cfs_rq->curr == se; if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) + if (curr) update_curr(cfs_rq); else - avg_vruntime_sub(cfs_rq, se); + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - update_load_set(&se->load, weight); - if (!se->on_rq) { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. 
*/ - se->vlag = div_s64(se->vlag * old_weight, weight); + se->vlag = div_s64(se->vlag * se->load.weight, weight); } else { - s64 deadline = se->deadline - se->vruntime; - /* - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - */ - deadline = div_s64(deadline * old_weight, weight); - se->deadline = se->vruntime + deadline; - if (se != cfs_rq->curr) - min_deadline_cb_propagate(&se->run_node, NULL); + reweight_eevdf(cfs_rq, se, weight); } + update_load_set(&se->load, weight); + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (cfs_rq->curr != se) - avg_vruntime_add(cfs_rq, se); + if (!curr) { + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + __enqueue_entity(cfs_rq, se); + update_min_vruntime(cfs_rq); + } } } @@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -- cgit From 8b39d20eceeda6c4eb23df1497f9ed2fffdc8f69 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 26 Oct 2023 12:41:14 -0400 Subject: sched: psi: fix unprivileged polling against cgroups 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") breaks unprivileged psi polling on cgroups. Historically, we had a privilege check for polling in the open() of a pressure file in /proc, but were erroneously missing it for the open() of cgroup pressure files. When unprivileged polling was introduced in d82caa273565 ("sched/psi: Allow unprivileged polling of N*2s period"), it needed to filter privileges depending on the exact polling parameters, and as such moved the CAP_SYS_RESOURCE check from the proc open() callback to psi_trigger_create(). Both the proc files as well as cgroup files go through this during write(). This implicitly added the missing check for privileges required for HT polling for cgroups. When 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") followed right after to remove further restrictions on the RT polling window, it incorrectly assumed the cgroup privilege check was still missing and added it to the cgroup open(), mirroring what we used to do for proc files in the past. As a result, unprivileged poll requests that would be supported now get rejected when opening the cgroup pressure file for writing. Remove the cgroup open() check. psi_trigger_create() handles it. 
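The point is where the privilege decision belongs: at trigger creation, once the requested window is known, not at open(). A loose userspace toy of that placement (illustration only -- the real checks live in psi_trigger_create(), and only the N*2s rule referenced above is modelled):

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

static int trigger_create(unsigned long win_us, bool cap_sys_resource)
{
	if (cap_sys_resource)
		return 0;                     /* privileged: any window */
	if (win_us && win_us % 2000000 == 0)
		return 0;                     /* unprivileged: N * 2s windows */
	return -EPERM;
}

int main(void)
{
	printf("unprivileged 2s window:    %d\n", trigger_create(2000000, false));
	printf("unprivileged 150ms window: %d\n", trigger_create(150000, false));
	printf("privileged 150ms window:   %d\n", trigger_create(150000, true));
	return 0;
}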
Fixes: 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") Reported-by: Luca Boccassi Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Luca Boccassi Acked-by: Suren Baghdasaryan Cc: stable@vger.kernel.org # 6.5+ Link: https://lore.kernel.org/r/20231026164114.2488682-1-hannes@cmpxchg.org --- kernel/cgroup/cgroup.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9..4b9ff41ca603 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } -static int cgroup_pressure_open(struct kernfs_open_file *of) -{ - if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return 0; -} - static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), - .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), - .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), - .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), - .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, -- cgit From 6d7e4782bcf549221b4ccfffec2cf4d1a473f1a3 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 31 Oct 2023 14:38:22 +0100 Subject: sched/fair: Fix the decision for load balance should_we_balance is called for the decision to do load-balancing. When sched ticks invoke this function, only one CPU should return true. However, in the current code, two CPUs can return true. The following situation, where b means busy and i means idle, is an example, because CPU 0 and CPU 2 return true. [0, 1] [2, 3] b b i b This fix checks if there exists an idle CPU with busy sibling(s) after looking for a CPU on an idle core. If some idle CPUs with busy siblings are found, just the first one should do load-balancing. 
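As a toy model of the intended selection (ignoring sched domains and flags; illustration only, not kernel code), exactly one CPU ends up balancing:

#include <stdio.h>
#include <stdbool.h>

/* CPUs are paired as SMT siblings: (0,1), (2,3), ... */
static int pick_balancer(const bool *busy, int nr_cpus)
{
	int idle_smt = -1;

	for (int cpu = 0; cpu < nr_cpus; cpu += 2) {
		bool cpu_idle = !busy[cpu], sib_idle = !busy[cpu + 1];

		if (cpu_idle && sib_idle)
			return cpu;              /* first fully idle core wins */
		if (idle_smt == -1) {
			if (cpu_idle)
				idle_smt = cpu;      /* idle CPU with a busy sibling */
			else if (sib_idle)
				idle_smt = cpu + 1;
		}
	}
	return idle_smt != -1 ? idle_smt : 0;    /* else first CPU of the group */
}

int main(void)
{
	/* the changelog example: [0,1] = b b, [2,3] = i b */
	bool busy[4] = { true, true, false, true };

	printf("CPU %d balances\n", pick_balancer(busy, 4));  /* CPU 2, and only CPU 2 */
	return 0;
}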
Fixes: b1bfeab9b002 ("sched/fair: Consider the idle state of the whole core for load balance") Signed-off-by: Keisuke Nishimura Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Shrikanth Hegde Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231031133821.1570861-1-keisuke.nishimura@inria.fr --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 025d90925bf6..d7a3c63a2171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11184,12 +11184,16 @@ static int should_we_balance(struct lb_env *env) continue; } - /* Are we the first idle CPU? */ + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } - if (idle_smt == env->dst_cpu) - return true; + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; -- cgit From 09f12bf9f790052710bd6e48a1fc1bc4d9e17389 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Nov 2023 18:18:31 +0300 Subject: nouveau/gsp/r535: uninitialized variable in r535_gsp_acpi_mux_id() The if we hit the "continue" statement on the first iteration through the loop then "handle_mux" needs to be set to NULL so we continue looping. Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/1d864f6e-43e9-43d8-9d90-30e76c9c843b@moroto.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index e31f9641114b..afa8e7377a76 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1159,7 +1159,7 @@ static void r535_gsp_acpi_mux_id(acpi_handle handle, u32 id, MUX_METHOD_DATA_ELEMENT *mode, MUX_METHOD_DATA_ELEMENT *part) { - acpi_handle iter = NULL, handle_mux; + acpi_handle iter = NULL, handle_mux = NULL; acpi_status status; unsigned long long value; -- cgit From 42bd415bd8bd43721d423930b4695c565661e687 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Nov 2023 10:40:21 +0300 Subject: nouveau/gsp/r535: Fix a NULL vs error pointer bug The r535_gsp_cmdq_get() function returns error pointers but this code checks for NULL. Also we need to propagate the error pointer back to the callers in r535_gsp_rpc_get(). Returning NULL will lead to a NULL pointer dereference. 
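For reference, the NULL-vs-IS_ERR() idiom in a self-contained userspace rendering (simplified macro definitions, not the kernel's <linux/err.h>):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define ERR_CAST(ptr)	((void *)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *cmdq_get(int fail)
{
	static int slot;

	return fail ? ERR_PTR(-EAGAIN) : (void *)&slot;  /* never returns NULL */
}

static void *rpc_get(int fail)
{
	void *rpc = cmdq_get(fail);

	if (IS_ERR(rpc))              /* a NULL check here would miss the error */
		return ERR_CAST(rpc); /* propagate the encoded errno, not NULL */

	return rpc;                   /* ... caller fills in the message ... */
}

int main(void)
{
	void *bad = rpc_get(1);

	if (IS_ERR(bad))
		printf("propagated error: %ld\n", PTR_ERR(bad));  /* -11 (EAGAIN) */
	return 0;
}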
Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/f71996d9-d1cb-45ea-a4b2-2dfc21312d8c@kili.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index afa8e7377a76..dc44f5c7833f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -689,8 +689,8 @@ r535_gsp_rpc_get(struct nvkm_gsp *gsp, u32 fn, u32 argc) struct nvfw_gsp_rpc *rpc; rpc = r535_gsp_cmdq_get(gsp, ALIGN(sizeof(*rpc) + argc, sizeof(u64))); - if (!rpc) - return NULL; + if (IS_ERR(rpc)) + return ERR_CAST(rpc); rpc->header_version = 0x03000000; rpc->signature = ('C' << 24) | ('P' << 16) | ('R' << 8) | 'V'; -- cgit From a2e36cd56041e277d7d81d35638fd8d9731e21f5 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 7 Nov 2023 15:32:55 +1000 Subject: nouveau: use an rwlock for the event lock. This allows it to break the following circular locking dependency. Aug 10 07:01:29 dg1test kernel: ====================================================== Aug 10 07:01:29 dg1test kernel: WARNING: possible circular locking dependency detected Aug 10 07:01:29 dg1test kernel: 6.4.0-rc7+ #10 Not tainted Aug 10 07:01:29 dg1test kernel: ------------------------------------------------------ Aug 10 07:01:29 dg1test kernel: wireplumber/2236 is trying to acquire lock: Aug 10 07:01:29 dg1test kernel: ffff8fca5320da18 (&fctx->lock){-...}-{2:2}, at: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: but task is already holding lock: Aug 10 07:01:29 dg1test kernel: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: which lock already depends on the new lock. 
Aug 10 07:01:29 dg1test kernel: the existing dependency chain (in reverse order) is: Aug 10 07:01:29 dg1test kernel: -> #3 (&event->list_lock#2){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: -> #2 (&device->intr.lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_inth_allow+0x2c/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x181/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #1 (&event->refs_lock#4){....}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x37/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: 
do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #0 (&fctx->lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: other info that might help us debug this: Aug 10 07:01:29 dg1test kernel: Chain exists of: &fctx->lock --> &device->intr.lock --> &event->list_lock#2 Aug 10 07:01:29 dg1test kernel: Possible unsafe locking scenario: Aug 10 07:01:29 dg1test kernel: CPU0 CPU1 Aug 10 07:01:29 dg1test kernel: ---- ---- Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&device->intr.lock); Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&fctx->lock); Aug 10 07:01:29 dg1test kernel: *** DEADLOCK *** Aug 10 07:01:29 dg1test kernel: 2 locks held by wireplumber/2236: Aug 10 07:01:29 dg1test kernel: #0: ffff8fca53177bf8 (&device->intr.lock){-...}-{2:2}, at: nvkm_intr+0x29/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: #1: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: stack backtrace: Aug 10 07:01:29 dg1test kernel: CPU: 6 PID: 2236 Comm: wireplumber Not tainted 6.4.0-rc7+ #10 Aug 10 07:01:29 dg1test kernel: Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 Aug 10 07:01:29 dg1test kernel: Call Trace: Aug 10 07:01:29 dg1test kernel: Aug 10 07:01:29 dg1test kernel: dump_stack_lvl+0x5b/0x90 Aug 10 07:01:29 dg1test kernel: check_noncircular+0xe2/0x110 Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: ? nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: ? lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: ? 
nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: RIP: 0033:0x7fb66174d700 Aug 10 07:01:29 dg1test kernel: Code: c1 e2 05 29 ca 8d 0c 10 0f be 07 84 c0 75 eb 89 c8 c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa e9 d7 0f fc ff 0f 1f 80 00 00 00 00 0f 1e fa e9 c7 0f fc> Aug 10 07:01:29 dg1test kernel: RSP: 002b:00007ffdd3c48438 EFLAGS: 00000206 Aug 10 07:01:29 dg1test kernel: RAX: 000055bb758763c0 RBX: 000055bb758752c0 RCX: 00000000000028b0 Aug 10 07:01:29 dg1test kernel: RDX: 000055bb758752c0 RSI: 000055bb75887490 RDI: 000055bb75862950 Aug 10 07:01:29 dg1test kernel: RBP: 00007ffdd3c48490 R08: 000055bb75873b10 R09: 0000000000000001 Aug 10 07:01:29 dg1test kernel: R10: 0000000000000004 R11: 000055bb7587f000 R12: 000055bb75887490 Aug 10 07:01:29 dg1test kernel: R13: 000055bb757f6280 R14: 000055bb758875c0 R15: 000055bb757f6280 Aug 10 07:01:29 dg1test kernel: Signed-off-by: Dave Airlie Tested-by: Danilo Krummrich Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231107053255.2257079-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/include/nvkm/core/event.h | 4 ++-- drivers/gpu/drm/nouveau/nvkm/core/event.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h index 82b267c11147..460459af272d 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h @@ -14,7 +14,7 @@ struct nvkm_event { int index_nr; spinlock_t refs_lock; - spinlock_t list_lock; + rwlock_t list_lock; int *refs; struct list_head ntfy; @@ -38,7 +38,7 @@ nvkm_event_init(const struct nvkm_event_func *func, struct nvkm_subdev *subdev, int types_nr, int index_nr, struct nvkm_event *event) { spin_lock_init(&event->refs_lock); - spin_lock_init(&event->list_lock); + rwlock_init(&event->list_lock); return __nvkm_event_init(func, subdev, types_nr, index_nr, event); } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/event.c b/drivers/gpu/drm/nouveau/nvkm/core/event.c index a6c877135598..61fed7792e41 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/event.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/event.c @@ -81,17 +81,17 @@ nvkm_event_ntfy_state(struct nvkm_event_ntfy *ntfy) static void nvkm_event_ntfy_remove(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_del_init(&ntfy->head); - spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void nvkm_event_ntfy_insert(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_add_tail(&ntfy->head, &ntfy->event->ntfy); - 
spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void @@ -176,7 +176,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) return; nvkm_trace(event->subdev, "event: ntfy %08x on %d\n", bits, id); - spin_lock_irqsave(&event->list_lock, flags); + read_lock_irqsave(&event->list_lock, flags); list_for_each_entry_safe(ntfy, ntmp, &event->ntfy, head) { if (ntfy->id == id && ntfy->bits & bits) { @@ -185,7 +185,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) } } - spin_unlock_irqrestore(&event->list_lock, flags); + read_unlock_irqrestore(&event->list_lock, flags); } void -- cgit From 969d90ec212bae4b45bf9d21d7daa30aa6cf055e Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 14 Nov 2023 17:25:48 -0500 Subject: audit: don't WARN_ON_ONCE(!current->mm) in audit_exe_compare() eBPF can end up calling into the audit code from some odd places, and some of these places don't have @current set properly so we end up tripping the `WARN_ON_ONCE(!current->mm)` near the top of `audit_exe_compare()`. While the basic `!current->mm` check is good, the `WARN_ON_ONCE()` results in some scary console messages so let's drop that and just do the regular `!current->mm` check to avoid problems. Cc: Fixes: 47846d51348d ("audit: don't take task_lock() in audit_exe_compare() code path") Reported-by: Artem Savkov Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 91e82e34b51e..7a98cd176a12 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) if (tsk != current) return 0; - if (WARN_ON_ONCE(!current->mm)) + if (!current->mm) return 0; exe_file = get_mm_exe_file(current->mm); if (!exe_file) -- cgit From 77618db346455129424fadbbaec596a09feaf3bb Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 12 Oct 2023 12:55:34 -0700 Subject: zstd: Fix array-index-out-of-bounds UBSAN warning Zstd used an array of length 1 to mean a flexible array for C89 compatibility. Switch to a C99 flexible array to fix the UBSAN warning. Tested locally by booting the kernel and writing to and reading from a BtrFS filesystem with zstd compression enabled. I was unable to reproduce the issue before the fix, however it is a trivial change. Link: https://lkml.kernel.org/r/20231012213428.1390905-1-nickrterrell@gmail.com Reported-by: syzbot+1f2eb3e8cd123ffce499@syzkaller.appspotmail.com Reported-by: Eric Biggers Reported-by: Kees Cook Signed-off-by: Nick Terrell Reviewed-by: Kees Cook --- lib/zstd/common/fse_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c index a0d06095be83..8dcb8ca39767 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -312,7 +312,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size typedef struct { short ncount[FSE_MAX_SYMBOL_VALUE + 1]; - FSE_DTable dtable[1]; /* Dynamically sized */ + FSE_DTable dtable[]; /* Dynamically sized */ } FSE_DecompressWksp; -- cgit From c9bd1568d5462f4108417518ce1af7b924acfb6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Nov 2023 21:36:13 +0100 Subject: futex: Fix hardcoded flags Xi reported that commit 5694289ce183 ("futex: Flag conversion") broke glibc's robust futex tests. 
This was narrowed down to the change of FLAGS_SHARED from 0x01 to 0x10, at which point Florian noted that handle_futex_death() has a hardcoded flags argument of 1. Change this to: FLAGS_SIZE_32 | FLAGS_SHARED, matching how futex_to_flags() unconditionally sets FLAGS_SIZE_32 for all legacy futex ops. Reported-by: Xi Ruoyao Reported-by: Florian Weimer Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20231114201402.GA25315@noisy.programming.kicks-ass.net Fixes: 5694289ce183 ("futex: Flag conversion") Cc: --- kernel/futex/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 52695c59d041..dad981a865b8 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -700,7 +700,8 @@ retry: owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); return 0; } @@ -752,8 +753,10 @@ retry: * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + if (!pi && (uval & FUTEX_WAITERS)) { + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); + } return 0; } -- cgit From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 12:34:46 +0200 Subject: perf/core: Fix cpuctx refcounting Audit of the refcounting turned up that perf_pmu_migrate_context() fails to migrate the ctx refcount. Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org Cc: --- include/linux/perf_event.h | 13 ++++++++----- kernel/events/core.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index afb028c54f33..5547ba68e6e4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -843,11 +843,11 @@ struct perf_event { }; /* - * ,-----------------------[1:n]----------------------. - * V V - * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event - * ^ ^ | | - * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-' + * ,-----------------------[1:n]------------------------. + * V V + * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event + * | | + * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed @@ -865,6 +865,9 @@ struct perf_event { * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. + * + * perf_event holds a refcount on perf_event_context + * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; diff --git a/kernel/events/core.c b/kernel/events/core.c index 683dc086ef10..b704d83a28b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, void *task_ctx_data = NULL; if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. 
+ */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); @@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); @@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); + + /* + * Now that event->ctx is updated and visible, put the old ctx. + */ + put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, @@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; -- cgit From fa02de9e75889915b554eda1964a631fd019973b Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:49 +0200 Subject: net: stmmac: fix rx budget limit check The while loop condition verifies 'count < limit'. Neither value change before the 'count >= limit' check. As is this check is dead code. But code inspection reveals a code path that modifies 'count' and then goto 'drain_data' and back to 'read_again'. So there is a need to verify count value sanity after 'read_again'. Move 'read_again' up to fix the count limit check. Fixes: ec222003bd94 ("net: stmmac: Prepare to add Split Header support") Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/d9486296c3b6b12ab3a0515fcd47d56447a07bfc.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3e50fd53a617..f28838c8cdb3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5328,10 +5328,10 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) len = 0; } +read_again: if (count >= limit) break; -read_again: buf1_len = 0; buf2_len = 0; entry = next_entry; -- cgit From b6cb4541853c7ee512111b0e7ddf3cb66c99c137 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:50 +0200 Subject: net: stmmac: avoid rx queue overrun dma_rx_size can be set as low as 64. Rx budget might be higher than that. Make sure to not overrun allocated rx buffers when budget is larger. Leave one descriptor unused to avoid wrap around of 'dirty_rx' vs 'cur_rx'. 
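The clamp to dma_rx_size - 1 is the usual one-slot-free ring convention: with equal producer/consumer indices meaning "empty", a completely full ring would look identical, so one descriptor stays unused. Standalone illustration (not the driver's actual cur_rx/dirty_rx bookkeeping):

#include <stdio.h>

#define RING_SIZE 64	/* dma_rx_size may be configured this low */

/* usable slots on a ring where head == tail must always mean "empty" */
static unsigned int ring_space(unsigned int head, unsigned int tail)
{
	return (tail + RING_SIZE - head - 1) % RING_SIZE;  /* one slot reserved */
}

int main(void)
{
	printf("empty ring, usable slots: %u\n", ring_space(0, 0));   /* 63 */
	printf("after 63 fills:           %u\n", ring_space(63, 0));  /* 0: full */
	return 0;
}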
Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Fixes: 47dd7a540b8a ("net: add support for STMicroelectronics Ethernet controllers.") Link: https://lore.kernel.org/r/d95413e44c97d4692e72cec13a75f894abeb6998.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f28838c8cdb3..2afb2bd25977 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5293,6 +5293,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) dma_dir = page_pool_get_dma_dir(rx_q->page_pool); buf_sz = DIV_ROUND_UP(priv->dma_conf.dma_buf_sz, PAGE_SIZE) * PAGE_SIZE; + limit = min(priv->dma_conf.dma_rx_size - 1, (unsigned int)limit); if (netif_msg_rx_status(priv)) { void *rx_head; -- cgit From 907d1bdb8b2cc0357d03a1c34d2a08d9943760b1 Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:49 -0800 Subject: tg3: Move the [rt]x_dropped counters to tg3_napi This change moves [rt]x_dropped counters to tg3_napi so that they can be updated by a single writer, race-free. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20231113182350.37472-1-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 38 ++++++++++++++++++++++++++++++++----- drivers/net/ethernet/broadcom/tg3.h | 4 ++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1dee27349367..884e9eecc471 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6889,7 +6889,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->rx_dropped++; + tnapi->rx_dropped++; goto next_pkt; } @@ -8190,7 +8190,7 @@ dma_error: drop: dev_kfree_skb_any(skb); drop_nofree: - tp->tx_dropped++; + tnapi->tx_dropped++; return NETDEV_TX_OK; } @@ -9405,7 +9405,7 @@ static void __tg3_set_rx_mode(struct net_device *); /* tp->lock is held. */ static int tg3_halt(struct tg3 *tp, int kind, bool silent) { - int err; + int err, i; tg3_stop_fw(tp); @@ -9426,6 +9426,13 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent) /* And make sure the next sample is new data */ memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats)); + + for (i = 0; i < TG3_IRQ_MAX_VECS; ++i) { + struct tg3_napi *tnapi = &tp->napi[i]; + + tnapi->rx_dropped = 0; + tnapi->tx_dropped = 0; + } } return err; @@ -11975,6 +11982,9 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) { struct rtnl_link_stats64 *old_stats = &tp->net_stats_prev; struct tg3_hw_stats *hw_stats = tp->hw_stats; + unsigned long rx_dropped; + unsigned long tx_dropped; + int i; stats->rx_packets = old_stats->rx_packets + get_stat64(&hw_stats->rx_ucast_packets) + @@ -12021,8 +12031,26 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); - stats->rx_dropped = tp->rx_dropped; - stats->tx_dropped = tp->tx_dropped; + /* Aggregate per-queue counters. The per-queue counters are updated + * by a single writer, race-free. 
The result computed by this loop + * might not be 100% accurate (counters can be updated in the middle of + * the loop) but the next tg3_get_nstats() will recompute the current + * value so it is acceptable. + * + * Note that these counters wrap around at 4G on 32bit machines. + */ + rx_dropped = (unsigned long)(old_stats->rx_dropped); + tx_dropped = (unsigned long)(old_stats->tx_dropped); + + for (i = 0; i < tp->irq_cnt; i++) { + struct tg3_napi *tnapi = &tp->napi[i]; + + rx_dropped += tnapi->rx_dropped; + tx_dropped += tnapi->tx_dropped; + } + + stats->rx_dropped = rx_dropped; + stats->tx_dropped = tx_dropped; } static int tg3_get_regs_len(struct net_device *dev) diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index ae5c01bd1110..5016475e5005 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3018,6 +3018,7 @@ struct tg3_napi { u16 *rx_rcb_prod_idx; struct tg3_rx_prodring_set prodring; struct tg3_rx_buffer_desc *rx_rcb; + unsigned long rx_dropped; u32 tx_prod ____cacheline_aligned; u32 tx_cons; @@ -3026,6 +3027,7 @@ struct tg3_napi { u32 prodmbox; struct tg3_tx_buffer_desc *tx_ring; struct tg3_tx_ring_info *tx_buffers; + unsigned long tx_dropped; dma_addr_t status_mapping; dma_addr_t rx_rcb_mapping; @@ -3220,8 +3222,6 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - unsigned long rx_dropped; - unsigned long tx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats_prev; -- cgit From 17dd5efe5f36a96bd78012594fabe21efb01186b Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:50 -0800 Subject: tg3: Increment tx_dropped in tg3_tso_bug() tg3_tso_bug() drops a packet if it cannot be segmented for any reason. The number of discarded frames should be incremented accordingly. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20231113182350.37472-2-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 884e9eecc471..48b6191efa56 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7918,8 +7918,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, segs = skb_gso_segment(skb, tp->dev->features & ~(NETIF_F_TSO | NETIF_F_TSO6)); - if (IS_ERR(segs) || !segs) + if (IS_ERR(segs) || !segs) { + tnapi->tx_dropped++; goto tg3_tso_bug_end; + } skb_list_walk_safe(segs, seg, next) { skb_mark_not_on_list(seg); -- cgit From 09d4c14c6c5e6e781a3879fed7f8e116a18b8c65 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:56 -0800 Subject: pds_core: use correct index to mask irq Use the qcq's interrupt index, not the irq number, to mask the interrupt. Since the irq number can be out of range from the number of possible interrupts, we can end up accessing and potentially scribbling on out-of-range and/or unmapped memory, making the kernel angry. 
[ 3116.039364] BUG: unable to handle page fault for address: ffffbeea1c3edf84 [ 3116.047059] #PF: supervisor write access in kernel mode [ 3116.052895] #PF: error_code(0x0002) - not-present page [ 3116.058636] PGD 100000067 P4D 100000067 PUD 1001f2067 PMD 10f82e067 PTE 0 [ 3116.066221] Oops: 0002 [#1] SMP NOPTI [ 3116.092948] RIP: 0010:iowrite32+0x9/0x76 [ 3116.190452] Call Trace: [ 3116.193185] [ 3116.195430] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.200298] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.205166] ? pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.210618] ? __die_body.cold+0x8/0xa [ 3116.214806] ? page_fault_oops+0x16d/0x1ac [ 3116.219382] ? exc_page_fault+0xbe/0x13b [ 3116.223764] ? asm_exc_page_fault+0x22/0x27 [ 3116.228440] ? iowrite32+0x9/0x76 [ 3116.232143] pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.237627] __handle_irq_event_percpu+0x3a/0x184 [ 3116.243088] handle_irq_event+0x57/0xb0 [ 3116.247575] handle_edge_irq+0x87/0x225 [ 3116.252062] __common_interrupt+0x3e/0xbc [ 3116.256740] common_interrupt+0x7b/0x98 [ 3116.261216] [ 3116.263745] [ 3116.266268] asm_common_interrupt+0x22/0x27 Reported-by: Joao Martins Fixes: 01ba61b55b20 ("pds_core: Add adminq processing and commands") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index 045fe133f6ee..5beadabc2136 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -146,7 +146,7 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data) } queue_work(pdsc->wq, &qcq->work); - pds_core_intr_mask(&pdsc->intr_ctrl[irq], PDS_CORE_INTR_MASK_CLEAR); + pds_core_intr_mask(&pdsc->intr_ctrl[qcq->intx], PDS_CORE_INTR_MASK_CLEAR); return IRQ_HANDLED; } -- cgit From 7c02f6ae676a954216a192612040f9a0cde3adf7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:57 -0800 Subject: pds_core: fix up some format-truncation complaints Our friendly kernel test robot pointed out a couple of potential string truncation issues. None of which were we worried about, but can be relatively easily fixed to quiet the complaints. 
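A standalone userspace example of why catching the return value helps: snprintf() reports the length it would have needed, so using that result both detects truncation and quiets -Wformat-truncation (buffer name and strings below are made up for the illustration):

#include <stdio.h>

int main(void)
{
	char ver[16];
	int n;

	n = snprintf(ver, sizeof(ver), "%s %s", "pds_core", "6.7.0-rc1-long-release");
	if (n >= (int)sizeof(ver))
		fprintf(stderr, "release name truncated, needed %d bytes\n", n + 1);

	printf("stored: \"%s\"\n", ver);
	return 0;
}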
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310211736.66syyDpp-lkp@intel.com/ Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/core.h | 2 +- drivers/net/ethernet/amd/pds_core/dev.c | 8 ++++++-- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index f3a7deda9972..e35d3e7006bf 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -15,7 +15,7 @@ #define PDSC_DRV_DESCRIPTION "AMD/Pensando Core Driver" #define PDSC_WATCHDOG_SECS 5 -#define PDSC_QUEUE_NAME_MAX_SZ 32 +#define PDSC_QUEUE_NAME_MAX_SZ 16 #define PDSC_ADMINQ_MIN_LENGTH 16 /* must be a power of two */ #define PDSC_NOTIFYQ_LENGTH 64 /* must be a power of two */ #define PDSC_TEARDOWN_RECOVERY false diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index 7c1b965d61a9..31940b857e0e 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -261,10 +261,14 @@ static int pdsc_identify(struct pdsc *pdsc) struct pds_core_drv_identity drv = {}; size_t sz; int err; + int n; drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX); - snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), - "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + /* Catching the return quiets a Wformat-truncation complaint */ + n = snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), + "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + if (n > sizeof(drv.driver_ver_str)) + dev_dbg(pdsc->dev, "release name truncated, don't care\n"); /* Next let's get some info about the device * We use the devcmd_lock at this level in order to diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 57f88c8b37de..e9948ea5bbcd 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -104,7 +104,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, struct pds_core_fw_list_info fw_list; struct pdsc *pdsc = devlink_priv(dl); union pds_core_dev_comp comp; - char buf[16]; + char buf[32]; int listlen; int err; int i; -- cgit From 278a370c1766060d2144d6cf0b06c101e1043b6d Mon Sep 17 00:00:00 2001 From: Ziwei Xiao Date: Mon, 13 Nov 2023 16:41:44 -0800 Subject: gve: Fixes for napi_poll when budget is 0 Netpoll will explicilty pass the polling call with a budget of 0 to indicate it's clearing the Tx path only. For the gve_rx_poll and gve_xdp_poll, they were mistakenly taking the 0 budget as the indication to do all the work. Add check to avoid the rx path and xdp path being called when budget is 0. And also avoid napi_complete_done being called when budget is 0 for netpoll. 
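The budget-0 contract, as a hedged skeleton of a NAPI poll handler (the example_* helpers are hypothetical stand-ins for a driver's own cleaners, and this shape only builds in a kernel context):

#include <linux/netdevice.h>

/* hypothetical per-queue cleaners, standing in for the driver's own */
static int example_clean_tx(struct napi_struct *napi) { return 0; }
static int example_clean_rx(struct napi_struct *napi, int budget) { return 0; }

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done;

	example_clean_tx(napi);       /* Tx cleanup is always allowed */

	if (!budget)                  /* netpoll: Tx-only, no Rx, no completion */
		return 0;

	work_done = example_clean_rx(napi, budget);
	if (work_done < budget)
		napi_complete_done(napi, work_done);

	return work_done;
}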
Fixes: f5cedc84a30d ("gve: Add transmit and receive support") Signed-off-by: Ziwei Xiao Link: https://lore.kernel.org/r/20231114004144.2022268-1-ziweixiao@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 8 +++++++- drivers/net/ethernet/google/gve/gve_rx.c | 4 ---- drivers/net/ethernet/google/gve/gve_tx.c | 4 ---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 276f996f95dc..2d42e733837b 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -254,10 +254,13 @@ static int gve_napi_poll(struct napi_struct *napi, int budget) if (block->tx) { if (block->tx->q_num < priv->tx_cfg.num_queues) reschedule |= gve_tx_poll(block, budget); - else + else if (budget) reschedule |= gve_xdp_poll(block, budget); } + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll(block, budget); reschedule |= work_done == budget; @@ -298,6 +301,9 @@ static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) if (block->tx) reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll_dqo(block, budget); reschedule |= work_done == budget; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index e84a066aa1a4..73655347902d 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1007,10 +1007,6 @@ int gve_rx_poll(struct gve_notify_block *block, int budget) feat = block->napi.dev->features; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - if (budget > 0) work_done = gve_clean_rx_done(rx, budget, feat); diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 6957a865cff3..9f6ffc4a54f0 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -925,10 +925,6 @@ bool gve_xdp_poll(struct gve_notify_block *block, int budget) bool repoll; u32 to_do; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); -- cgit From 9fce92f050f448a0d1ddd9083ef967d9930f1e52 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:13 +0100 Subject: mptcp: deal with large GSO size After the blamed commit below, the TCP sockets (and the MPTCP subflows) can build egress packets larger than 64K. That exceeds the maximum DSS data size, the length being misrepresent on the wire and the stream being corrupted, as later observed on the receiver: WARNING: CPU: 0 PID: 9696 at net/mptcp/protocol.c:705 __mptcp_move_skbs_from_subflow+0x2604/0x26e0 CPU: 0 PID: 9696 Comm: syz-executor.7 Not tainted 6.6.0-rc5-gcd8bdf563d46 #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. RIP: 0010:__mptcp_move_skbs_from_subflow+0x2604/0x26e0 net/mptcp/protocol.c:705 RSP: 0018:ffffc90000006e80 EFLAGS: 00010246 RAX: ffffffff83e9f674 RBX: ffff88802f45d870 RCX: ffff888102ad0000 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. 
RDX: 0000000080000303 RSI: 0000000000013908 RDI: 0000000000003908 RBP: ffffc90000007110 R08: ffffffff83e9e078 R09: 1ffff1100e548c8a R10: dffffc0000000000 R11: ffffed100e548c8b R12: 0000000000013908 R13: dffffc0000000000 R14: 0000000000003908 R15: 000000000031cf29 FS: 00007f239c47e700(0000) GS:ffff88811b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f239c45cd78 CR3: 000000006a66c006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 PKRU: 55555554 Call Trace: mptcp_data_ready+0x263/0xac0 net/mptcp/protocol.c:819 subflow_data_ready+0x268/0x6d0 net/mptcp/subflow.c:1409 tcp_data_queue+0x21a1/0x7a60 net/ipv4/tcp_input.c:5151 tcp_rcv_established+0x950/0x1d90 net/ipv4/tcp_input.c:6098 tcp_v6_do_rcv+0x554/0x12f0 net/ipv6/tcp_ipv6.c:1483 tcp_v6_rcv+0x2e26/0x3810 net/ipv6/tcp_ipv6.c:1749 ip6_protocol_deliver_rcu+0xd6b/0x1ae0 net/ipv6/ip6_input.c:438 ip6_input+0x1c5/0x470 net/ipv6/ip6_input.c:483 ipv6_rcv+0xef/0x2c0 include/linux/netfilter.h:304 __netif_receive_skb+0x1ea/0x6a0 net/core/dev.c:5532 process_backlog+0x353/0x660 net/core/dev.c:5974 __napi_poll+0xc6/0x5a0 net/core/dev.c:6536 net_rx_action+0x6a0/0xfd0 net/core/dev.c:6603 __do_softirq+0x184/0x524 kernel/softirq.c:553 do_softirq+0xdd/0x130 kernel/softirq.c:454 Address the issue explicitly bounding the maximum GSO size to what MPTCP actually allows. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/450 Fixes: 7c4e983c4f3c ("net: allow gso_max_size to exceed 65536") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-1-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a0b8356cd8c5..66e947054945 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1230,6 +1230,8 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, mptcp_do_fallback(ssk); } +#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1256,6 +1258,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return -EAGAIN; /* compute send limit */ + if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) + ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; -- cgit From d109a7767273d1706b541c22b83a0323823dfde4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:14 +0100 Subject: mptcp: fix possible NULL pointer dereference on close After the blamed commit below, the MPTCP release callback can dereference the first subflow pointer via __mptcp_set_connected() and send buffer auto-tuning. Such pointer is always expected to be valid, except at socket destruction time, when the first subflow is deleted and the pointer zeroed. 
If the connect event is handled by the release callback while the msk socket is finally released, MPTCP hits the following splat: general protection fault, probably for non-canonical address 0xdffffc00000000f2: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000790-0x0000000000000797] CPU: 1 PID: 26719 Comm: syz-executor.2 Not tainted 6.6.0-syzkaller-10102-gff269e2cd5ad #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:mptcp_subflow_ctx net/mptcp/protocol.h:542 [inline] RIP: 0010:__mptcp_propagate_sndbuf net/mptcp/protocol.h:813 [inline] RIP: 0010:__mptcp_set_connected+0x57/0x3e0 net/mptcp/subflow.c:424 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8a62323c RDX: 00000000000000f2 RSI: ffffffff8a630116 RDI: 0000000000000790 RBP: ffff88803334b100 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000034 R12: ffff88803334b198 R13: ffff888054f0b018 R14: 0000000000000000 R15: ffff88803334b100 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbcb4f75198 CR3: 000000006afb5000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mptcp_release_cb+0xa2c/0xc40 net/mptcp/protocol.c:3405 release_sock+0xba/0x1f0 net/core/sock.c:3537 mptcp_close+0x32/0xf0 net/mptcp/protocol.c:3084 inet_release+0x132/0x270 net/ipv4/af_inet.c:433 inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:485 __sock_release+0xae/0x260 net/socket.c:659 sock_close+0x1c/0x20 net/socket.c:1419 __fput+0x270/0xbb0 fs/file_table.c:394 task_work_run+0x14d/0x240 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa92/0x2a20 kernel/exit.c:876 do_group_exit+0xd4/0x2a0 kernel/exit.c:1026 get_signal+0x23ba/0x2790 kernel/signal.c:2900 arch_do_signal_or_restart+0x90/0x7f0 arch/x86/kernel/signal.c:309 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x11f/0x240 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x60 kernel/entry/common.c:296 do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:88 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fb515e7cae9 Code: Unable to access opcode bytes at 0x7fb515e7cabf. RSP: 002b:00007fb516c560c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: 000000000000003c RBX: 00007fb515f9c120 RCX: 00007fb515e7cae9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000006 RBP: 00007fb515ec847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007fb515f9c120 R15: 00007ffc631eb968 To avoid sparkling unneeded conditionals, address the issue explicitly checking msk->first only in the critical place. 
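Reduced to its essentials, the shape of the fix looks like the sketch below (hypothetical names, not the mptcp code): the one deferred event that can race with final teardown re-checks the pointer it is about to dereference, instead of adding NULL checks to every helper reachable from the callback.

  static void example_release_cb(struct example_sock *es)
  {
          /* es->first may be cleared only while the socket is being destroyed */
          if (test_and_clear_bit(EXAMPLE_CONNECTED, &es->cb_flags) && es->first)
                  example_set_connected(es->first);

          if (test_and_clear_bit(EXAMPLE_ERROR_REPORT, &es->cb_flags))
                  example_error_report(es);       /* does not touch es->first */
  }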
Fixes: 8005184fd1ca ("mptcp: refactor sndbuf auto-tuning") Cc: stable@vger.kernel.org Reported-by: Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/454 Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iLZUA6S2a=K8GObnS62KK6Jt4B7PsAs7meMFooM8xaTgw@mail.gmail.com/ Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-2-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 66e947054945..bc81ea53a049 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3402,10 +3402,11 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { - /* be sure to set the current sk state before tacking actions - * depending on sk_state, that is processing MPTCP_ERROR_REPORT + /* be sure to set the current sk state before taking actions + * depending on sk_state (MPTCP_ERROR_REPORT) + * On sk release avoid actions depending on the first subflow */ - if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags) && msk->first) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); -- cgit From 8df220b29282e8b450ea57be62e1eccd4996837c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 14 Nov 2023 00:16:15 +0100 Subject: mptcp: add validity check for sending RM_ADDR This patch adds the validity check for sending RM_ADDRs for userspace PM in mptcp_pm_remove_addrs(), only send a RM_ADDR when the address is in the anno_list or conn_list. Fixes: 8b1c94da1e48 ("mptcp: only send RM_ADDR in nl_cmd_remove") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-3-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1529ec358815..bf4d96f6f99a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1515,8 +1515,9 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - remove_anno_list_by_saddr(msk, &entry->addr); - if (alist.nr < MPTCP_RM_IDS_MAX) + if ((remove_anno_list_by_saddr(msk, &entry->addr) || + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; } -- cgit From 7679d34f97b7a09fd565f5729f79fd61b7c55329 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:16 +0100 Subject: mptcp: fix setsockopt(IP_TOS) subflow locking The MPTCP implementation of the IP_TOS socket option uses the lockless variant of the TOS manipulation helper and does not hold such lock at the helper invocation time. Add the required locking. 
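The helper contract behind the fix: __ip_sock_set_tos() expects the owning socket's lock, and for a short per-subflow update the fast-lock variant is the cheapest way to provide it. A simplified sketch follows; the subflow list walk and struct are illustrative, while lock_sock_fast()/unlock_sock_fast() and __ip_sock_set_tos() are real kernel APIs.

  struct example_subflow {
          struct list_head node;
          struct sock *sk;
  };

  static void example_propagate_tos(struct sock *parent, struct list_head *subflows, int val)
  {
          struct example_subflow *sf;

          lock_sock(parent);
          list_for_each_entry(sf, subflows, node) {
                  struct sock *ssk = sf->sk;
                  /* returns true if the slow path (full lock_sock) was needed */
                  bool slow = lock_sock_fast(ssk);

                  __ip_sock_set_tos(ssk, val);
                  unlock_sock_fast(ssk, slow);
          }
          release_sock(parent);
  }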
Fixes: ffcacff87cd6 ("mptcp: Support for IP_TOS for MPTCP setsockopt()") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/457 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-4-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 77f5e8932abf..353680733700 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -738,8 +738,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, val = READ_ONCE(inet_sk(sk)->tos); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + slow = lock_sock_fast(ssk); __ip_sock_set_tos(ssk, val); + unlock_sock_fast(ssk, slow); } release_sock(sk); -- cgit From 7cefbe5e1dacc7236caa77e9d072423f21422fe2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:17 +0100 Subject: selftests: mptcp: fix fastclose with csum failure Running the mp_join selftest manually with the following command line: ./mptcp_join.sh -z -C leads to some failures: 002 fastclose server test # ... rtx [fail] got 1 MP_RST[s] TX expected 0 # ... rstrx [fail] got 1 MP_RST[s] RX expected 0 The problem is really in the wrong expectations for the RST checks implied by the csum validation. Note that the same check is repeated explicitly in the same test-case, with the correct expectation and pass successfully. Address the issue explicitly setting the correct expectation for the failing checks. Reported-by: Xiumei Mu Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 75a2438efdf3..3c94f2f194d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3240,7 +3240,7 @@ fastclose_tests() if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + chk_join_nr 0 0 0 0 0 0 1 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi -- cgit From 006ccc3090e2f30f5f97857f3946312692a5279e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Nov 2023 22:54:26 -0400 Subject: bcachefs: Kill journal pre-reservations This deletes the complicated and somewhat expensive journal pre-reservation machinery in favor of just using journal watermarks: when the journal is more than half full, we run journal reclaim more aggressively, and when the journal is more than 3/4s full we only allow journal reclaim to get new journal reservations. 
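The replacement policy can be stated as a pure function of journal fullness. A simplified sketch with illustrative thresholds matching the description above (the real code derives the watermark from fifo_free() on the pin FIFO and the clean/total bucket ratio, see journal_set_watermark() in the diff):

  enum example_watermark {
          WATERMARK_NORMAL,       /* plenty of space: anyone may reserve */
          WATERMARK_COPYGC,       /* > 1/2 full: run reclaim more aggressively */
          WATERMARK_RECLAIM,      /* > 3/4 full: only reclaim may take new reservations */
  };

  static enum example_watermark journal_watermark(unsigned int free, unsigned int size)
  {
          if (free < size / 4)
                  return WATERMARK_RECLAIM;
          if (free < size / 2)
                  return WATERMARK_COPYGC;
          return WATERMARK_NORMAL;
  }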
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 - fs/bcachefs/btree_key_cache.c | 14 ------ fs/bcachefs/btree_trans_commit.c | 36 +------------- fs/bcachefs/btree_types.h | 3 -- fs/bcachefs/btree_update_interior.c | 30 ------------ fs/bcachefs/btree_update_interior.h | 1 - fs/bcachefs/journal.c | 31 ------------ fs/bcachefs/journal.h | 98 ------------------------------------- fs/bcachefs/journal_reclaim.c | 42 ++++++---------- fs/bcachefs/journal_types.h | 26 ---------- fs/bcachefs/trace.h | 11 +---- 11 files changed, 19 insertions(+), 275 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c2adf3fbb0b3..6fa90bcd7016 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - kfree(trans->extra_journal_entries.data); if (trans->fs_usage_deltas) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b3305a04d808..37fbf22de8fc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -672,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -770,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -1006,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); list_del(&ck->list); kfree(ck->k); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 55a120eb8692..12907beda98c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -323,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { @@ -882,14 +871,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags } } - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -1052,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct 
btree_write_buffered_key *wb; - unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1112,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1134,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; - trans->journal_u64s += u64s; + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) @@ -1175,8 +1145,6 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f3669fa68591..6fbd4ef3df6b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -327,7 +327,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -441,11 +440,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 39c2db68123b..76f27bc9fa24 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -734,8 +732,6 @@ err: bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; - while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1129,27 +1120,6 @@ 
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4df21512d640..031076e75fa1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -55,7 +55,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5b5d69f2316b..23a9b7845d11 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 011711e99c8d..c85d01cf4948 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -395,104 +395,6 @@ out: return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); 
- if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a584aaaa2eb..e63c6eda86af 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j, bool low_on_space) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); + unsigned watermark = BCH_WATERMARK_stripe; - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -222,16 +226,10 @@ void 
bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j, clean * 4 <= total); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 42504e16acb6..a756b69582e3 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 893304a1f06e..7857671159b4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) 
__field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, -- cgit From 069749688ea4bbaeff0ca3b229b443ea96b03757 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 22:15:59 -0500 Subject: bcachefs: Fix iterator leak in may_delete_deleted_inode() may_delete_deleted_inode() was returning without exiting a btree iterator, eventually causing propagate_key_to_snaphot_leaves() to go into an infinite loop hitting btree_trans_too_many_iters(). Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index def77f2d8802..dab12c14d1ad 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, * unlinked inodes in the snapshot leaves: */ *need_another_pass = true; - return 0; + goto out; } ret = 1; -- cgit From b783fc4d1366658200bf759e1010655a9e2e145c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Sun, 12 Nov 2023 00:38:41 +0000 Subject: bcachefs: Fix potential sleeping during mount During mount, bcachefs mount option processing may sleep while allocating a string buffer. Fix this by reference counting in order to take the atomic path. Signed-off-by: Daniel J Blueman Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1f334124055b..4d0cb0ccff32 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) case TARGET_DEV: { struct bch_dev *ca; + out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) @@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } rcu_read_unlock(); + out->atomic--; break; } case TARGET_GROUP: -- cgit From 178c4873fd06c0361d260547ce70fcdc29b74809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 14:15:35 -0500 Subject: bcachefs: Fix error path in bch2_mount() This fixes a bug discovered by generic/388 where sb->s_fs_info was NULL while the superblock was still active - the error path was entirely fubar, and was trying to do something unclear and unecessary. 
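The underlying VFS rule, sketched for a hypothetical filesystem (not bcachefs code): once the superblock has been handed its private info, teardown is owned by .kill_sb, so a failing mount path should only drop its reference via deactivate_locked_super() and let kill_sb free everything exactly once.

  static void examplefs_kill_sb(struct super_block *sb)
  {
          struct examplefs_info *info = sb->s_fs_info;

          generic_shutdown_super(sb);     /* VFS-level teardown */
          examplefs_free(info);           /* the single place that frees s_fs_info */
  }

  /* err is a negative errno from a mount step that failed after sb setup */
  static struct dentry *examplefs_mount_fail(struct super_block *sb, int err)
  {
          /* do NOT clear or free sb->s_fs_info here: kill_sb owns that */
          deactivate_locked_super(sb);
          return ERR_PTR(err);
  }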
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d8d8abe68..8ef817304e4a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1922,10 +1922,7 @@ out: return dget(sb->s_root); err_put_super: - sb->s_fs_info = NULL; - c->vfs_sb = NULL; deactivate_locked_super(sb); - bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - if (c) - c->vfs_sb = NULL; generic_shutdown_super(sb); - if (c) - bch2_fs_free(c); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { -- cgit From f42fa17883e73d8509fff5925781d4157db82f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 15:47:02 -0500 Subject: bcachefs: Fix missing transaction commit In may_delete_deleted_inode(), there's a corner case when a snapshot was taken while we had an unlinked inode: we don't want to delete the inode in the internal (shared) snapshot node, since it might have been reattached in a descendent snapshot. Instead we propagate the key to any snapshot leaves it doesn't exist in, so that it can be deleted there if necessary, and then clear the unlinked flag in the internal node. But we forgot to commit after clearing the unlinked flag, causing us to go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index dab12c14d1ad..c7849b0753e7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1169,8 +1169,10 @@ again: */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, - &need_another_pass)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; -- cgit From 497c57a303590ea69ace23506e182c489e85694d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 17:02:08 -0500 Subject: bcachefs: Disable debug log statements The journal read path had some informational log statements preperatory for ZNS support - they're not of interest to users, so we can turn them off. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f4bc2cdbfdd7..786a09285509 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1079,6 +1079,12 @@ found: if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { @@ -1086,6 +1092,7 @@ found: bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } -- cgit From 7125063fc6dfb77138b3a100527f3d8f9203ff2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 23:52:57 -0500 Subject: bcachefs: Don't decrease BTREE_ITER_MAX when LOCKDEP=y Running with fewer max btree paths doesn't work anymore when replication is enabled - as we've added e.g. 
the freespace and bucket gens btrees, we naturally end up needing more btree paths. This is an issue with lockdep, we end up taking more locks than lockdep will track (the MAX_LOCKD_DEPTH constant). But bcachefs as merged does not yet support lockdep anyways, so we can leave that for later. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6fbd4ef3df6b..60453ba86c4b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -363,11 +363,7 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); -- cgit From db18ef1a02bc2cd924f86b2582302f2c2711b67c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:17:19 -0500 Subject: bcachefs: Fix bch2_check_nlinks() for snapshots When searching the link table for the matching inode, we were searching for a specific - incorrect - snapshot ID as well, causing us to fail to find the inode. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f3e9bd3d767..e0c5cd119acc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, -- cgit From 62d73dfc44d54c97e0df6b947f0bccf6c4b8030e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 21:46:52 -0500 Subject: bcachefs: Fix no_data_io mode checksum check In no_data_io mode, we expect data checksums to be wrong - don't want to spew the log with them. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f02b3f7d26a0..d704a8f829c8 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) * checksum: */ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return -EIO; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -- cgit From 61b85cb0d773115d9a4b20c3e67286844cf73f34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 18:52:22 -0500 Subject: bcachefs: six locks: Fix lost wakeup In percpu reader mode, trylock() for read had a lost wakeup: on failure to get the lock, we may have caused a writer to fail to get the lock, because we temporarily elevated the reader count. We need to check for waiters after decrementing the read count - not before. 
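The ordering bug generalizes to any optimistic reader count: the back-out decrement must be globally visible before the reader looks for blocked writers, otherwise reader and writer can each miss the other's update. A heavily simplified sketch (invented example_* names; only the barrier placement is the point):

  #define EXAMPLE_HELD_WRITE      1
  #define EXAMPLE_WAITING_WRITE   2

  struct example_rwlock {
          atomic_t state;                         /* EXAMPLE_* bits */
          unsigned int __percpu *readers;
  };

  static bool example_try_read_lock(struct example_rwlock *l)
  {
          bool ret;

          preempt_disable();
          this_cpu_inc(*l->readers);              /* optimistic: assume no writer */
          smp_mb();                               /* order the inc against reading state */
          ret = !(atomic_read(&l->state) & EXAMPLE_HELD_WRITE);
          this_cpu_sub(*l->readers, !ret);        /* back out on failure */
          preempt_enable();

          if (!ret) {
                  /*
                   * Re-read state only after the decrement is visible: a
                   * writer that blocked on our transient reader count must
                   * not be missed.  Checking a stale pre-decrement value is
                   * exactly the lost wakeup fixed above.
                   */
                  smp_mb();
                  if (atomic_read(&l->state) & EXAMPLE_WAITING_WRITE)
                          example_wake_writer(l);
          }
          return ret;
  }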
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b775cf0fb7cb..97790445e67a 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic_add(SIX_LOCK_HELD_write, &lock->state); -- cgit From 5dd9ad32d7758b1a76742f394acf0eb3ac8a636a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 10:29:02 +0200 Subject: xen/events: drop xen_allocate_irqs_dynamic() Instead of having a common function for allocating a single IRQ or a consecutive number of IRQs, split up the functionality into the callers of xen_allocate_irqs_dynamic(). This allows to handle any allocation error in xen_irq_init() gracefully instead of panicing the system. Let xen_irq_init() return the irq_info pointer or NULL in case of an allocation error. Additionally set the IRQ into irq_info already at allocation time, as otherwise the IRQ would be '0' (which is a valid IRQ number) until being set. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 74 ++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a810e8904fbf..75f8d1dcbbb7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -304,6 +304,13 @@ static void channels_on_cpu_inc(struct irq_info *info) info->is_accounted = 1; } +static void xen_irq_free_desc(unsigned int irq) +{ + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + static void delayed_free_irq(struct work_struct *work) { struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, @@ -315,9 +322,7 @@ static void delayed_free_irq(struct work_struct *work) kfree(info); - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq >= nr_legacy_irqs()) - irq_free_desc(irq); + xen_irq_free_desc(irq); } /* Constructors for packed IRQ information. */ @@ -332,7 +337,6 @@ static int xen_irq_info_common_setup(struct irq_info *info, BUG_ON(info->type != IRQT_UNBOUND && info->type != type); info->type = type; - info->irq = irq; info->evtchn = evtchn; info->cpu = cpu; info->mask_reason = EVT_MASK_REASON_EXPLICIT; @@ -733,47 +737,45 @@ void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) } EXPORT_SYMBOL_GPL(xen_irq_lateeoi); -static void xen_irq_init(unsigned irq) +static struct irq_info *xen_irq_init(unsigned int irq) { struct irq_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info == NULL) - panic("Unable to allocate metadata for IRQ%d\n", irq); + if (info) { + info->irq = irq; + info->type = IRQT_UNBOUND; + info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, delayed_free_irq); - info->type = IRQT_UNBOUND; - info->refcnt = -1; - INIT_RCU_WORK(&info->rwork, delayed_free_irq); + set_info_for_irq(irq, info); + /* + * Interrupt affinity setting can be immediate. No point + * in delaying it until an interrupt is handled. 
+ */ + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_info_for_irq(irq, info); - /* - * Interrupt affinity setting can be immediate. No point - * in delaying it until an interrupt is handled. - */ - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + INIT_LIST_HEAD(&info->eoi_list); + list_add_tail(&info->list, &xen_irq_list_head); + } - INIT_LIST_HEAD(&info->eoi_list); - list_add_tail(&info->list, &xen_irq_list_head); + return info; } -static int __must_check xen_allocate_irqs_dynamic(int nvec) +static inline int __must_check xen_allocate_irq_dynamic(void) { - int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + int irq = irq_alloc_desc_from(0, -1); if (irq >= 0) { - for (i = 0; i < nvec; i++) - xen_irq_init(irq + i); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } } return irq; } -static inline int __must_check xen_allocate_irq_dynamic(void) -{ - - return xen_allocate_irqs_dynamic(1); -} - static int __must_check xen_allocate_irq_gsi(unsigned gsi) { int irq; @@ -793,7 +795,10 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - xen_irq_init(irq); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } return irq; } @@ -963,6 +968,11 @@ static void __unbind_from_irq(unsigned int irq) evtchn_port_t evtchn = evtchn_from_irq(irq); struct irq_info *info = info_for_irq(irq); + if (!info) { + xen_irq_free_desc(irq); + return; + } + if (info->refcnt > 0) { info->refcnt--; if (info->refcnt != 0) @@ -1101,11 +1111,14 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irqs_dynamic(nvec); + irq = irq_alloc_descs(-1, 0, nvec, -1); if (irq < 0) goto out; for (i = 0; i < nvec; i++) { + if (!xen_irq_init(irq + i)) + goto error_irq; + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, @@ -1730,6 +1743,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); + info->irq = irq; (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); -- cgit From 3fcdaf3d7634338c3f5cbfa7451eb0b6b0024844 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 09:09:52 +0200 Subject: xen/events: modify internal [un]bind interfaces Modify the internal bind- and unbind-interfaces to take a struct irq_info parameter. When allocating a new IRQ pass the pointer from the allocating function further up. This will reduce the number of info_for_irq() calls and make the code more efficient. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 259 +++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 135 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 75f8d1dcbbb7..88f0c80d0f87 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -327,7 +327,6 @@ static void delayed_free_irq(struct work_struct *work) /* Constructors for packed IRQ information. 
*/ static int xen_irq_info_common_setup(struct irq_info *info, - unsigned irq, enum xen_irq_type type, evtchn_port_t evtchn, unsigned short cpu) @@ -342,23 +341,22 @@ static int xen_irq_info_common_setup(struct irq_info *info, info->mask_reason = EVT_MASK_REASON_EXPLICIT; raw_spin_lock_init(&info->lock); - ret = set_evtchn_to_irq(evtchn, irq); + ret = set_evtchn_to_irq(evtchn, info->irq); if (ret < 0) return ret; - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN); return xen_evtchn_port_setup(evtchn); } -static int xen_irq_info_evtchn_setup(unsigned irq, +static int xen_irq_info_evtchn_setup(struct irq_info *info, evtchn_port_t evtchn, struct xenbus_device *dev) { - struct irq_info *info = info_for_irq(irq); int ret; - ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0); info->u.interdomain = dev; if (dev) atomic_inc(&dev->event_channels); @@ -366,50 +364,37 @@ static int xen_irq_info_evtchn_setup(unsigned irq, return ret; } -static int xen_irq_info_ipi_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - enum ipi_vector ipi) +static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, enum ipi_vector ipi) { - struct irq_info *info = info_for_irq(irq); - info->u.ipi = ipi; - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_irq, cpu)[ipi] = info->irq; per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; - return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0); } -static int xen_irq_info_virq_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - unsigned virq) +static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, unsigned int virq) { - struct irq_info *info = info_for_irq(irq); - info->u.virq = virq; - per_cpu(virq_to_irq, cpu)[virq] = irq; + per_cpu(virq_to_irq, cpu)[virq] = info->irq; - return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0); } -static int xen_irq_info_pirq_setup(unsigned irq, - evtchn_port_t evtchn, - unsigned pirq, - unsigned gsi, - uint16_t domid, - unsigned char flags) +static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn, + unsigned int pirq, unsigned int gsi, + uint16_t domid, unsigned char flags) { - struct irq_info *info = info_for_irq(irq); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; - return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0); } static void xen_irq_info_cleanup(struct irq_info *info) @@ -453,20 +438,16 @@ int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, return irq; } -static enum ipi_vector ipi_from_irq(unsigned irq) +static enum ipi_vector ipi_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_IPI); return info->u.ipi; } -static unsigned virq_from_irq(unsigned irq) +static unsigned int virq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_VIRQ); @@ -533,13 +514,9 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, 
+static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu, bool force_affinity) { - struct irq_info *info = evtchn_to_info(evtchn); - - BUG_ON(info == NULL); - if (IS_ENABLED(CONFIG_SMP) && force_affinity) { struct irq_data *data = irq_get_irq_data(info->irq); @@ -547,7 +524,7 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, irq_data_update_effective_affinity(data, cpumask_of(cpu)); } - xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu); + xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu); channels_on_cpu_dec(info); info->cpu = cpu; @@ -762,23 +739,24 @@ static struct irq_info *xen_irq_init(unsigned int irq) return info; } -static inline int __must_check xen_allocate_irq_dynamic(void) +static struct irq_info *xen_allocate_irq_dynamic(void) { int irq = irq_alloc_desc_from(0, -1); + struct irq_info *info = NULL; if (irq >= 0) { - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } } - return irq; + return info; } -static int __must_check xen_allocate_irq_gsi(unsigned gsi) +static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) { int irq; + struct irq_info *info; /* * A PV guest has no concept of a GSI (since it has no ACPI @@ -795,18 +773,15 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } - return irq; + return info; } -static void xen_free_irq(unsigned irq) +static void xen_free_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - if (WARN_ON(!info)) return; @@ -897,7 +872,7 @@ static unsigned int __startup_pirq(unsigned int irq) goto err; info->evtchn = evtchn; - bind_evtchn_to_cpu(evtchn, 0, false); + bind_evtchn_to_cpu(info, 0, false); rc = xen_evtchn_port_setup(evtchn); if (rc) @@ -963,10 +938,9 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); -static void __unbind_from_irq(unsigned int irq) +static void __unbind_from_irq(struct irq_info *info, unsigned int irq) { - evtchn_port_t evtchn = evtchn_from_irq(irq); - struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn; if (!info) { xen_irq_free_desc(irq); @@ -979,6 +953,8 @@ static void __unbind_from_irq(unsigned int irq) return; } + evtchn = info->evtchn; + if (VALID_EVTCHN(evtchn)) { unsigned int cpu = info->cpu; struct xenbus_device *dev; @@ -988,11 +964,11 @@ static void __unbind_from_irq(unsigned int irq) switch (info->type) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; - per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1006,7 +982,7 @@ static void __unbind_from_irq(unsigned int irq) xen_irq_info_cleanup(info); } - xen_free_irq(irq); + xen_free_irq(info); } /* @@ -1022,24 +998,24 @@ static void __unbind_from_irq(unsigned int irq) int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { - int irq; + struct irq_info *info; struct physdev_irq irq_op; int ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_irq_from_gsi(gsi); - if (irq != -1) { + ret = xen_irq_from_gsi(gsi); + if (ret != -1) { pr_info("%s: returning irq %d for gsi %u\n", - __func__, irq, gsi); + 
__func__, ret, gsi); goto out; } - irq = xen_allocate_irq_gsi(gsi); - if (irq < 0) + info = xen_allocate_irq_gsi(gsi); + if (!info) goto out; - irq_op.irq = irq; + irq_op.irq = info->irq; irq_op.vector = 0; /* Only the privileged domain can do this. For non-priv, the pcifront @@ -1047,20 +1023,19 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - xen_free_irq(irq); - irq = -ENOSPC; + xen_free_irq(info); + ret = -ENOSPC; goto out; } - ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - pirq_query_unmask(irq); + pirq_query_unmask(info->irq); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. @@ -1077,16 +1052,18 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * is the right choice either way. */ if (shareable) - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_fasteoi_irq, name); else - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_edge_irq, name); + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } #ifdef CONFIG_PCI_MSI @@ -1108,6 +1085,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, int pirq, int nvec, const char *name, domid_t domid) { int i, irq, ret; + struct irq_info *info; mutex_lock(&irq_mapping_update_lock); @@ -1116,12 +1094,13 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, goto out; for (i = 0; i < nvec; i++) { - if (!xen_irq_init(irq + i)) + info = xen_irq_init(irq + i); + if (!info) goto error_irq; irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); - ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid, i == 0 ? 
0 : PIRQ_MSI_GROUP); if (ret < 0) goto error_irq; @@ -1133,9 +1112,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, out: mutex_unlock(&irq_mapping_update_lock); return irq; + error_irq: - while (nvec--) - __unbind_from_irq(irq + nvec); + while (nvec--) { + info = info_for_irq(irq + nvec); + __unbind_from_irq(info, irq + nvec); + } mutex_unlock(&irq_mapping_update_lock); return ret; } @@ -1171,7 +1153,7 @@ int xen_destroy_irq(int irq) } } - xen_free_irq(irq); + xen_free_irq(info); out: mutex_unlock(&irq_mapping_update_lock); @@ -1187,8 +1169,7 @@ EXPORT_SYMBOL_GPL(xen_pirq_from_irq); static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, struct xenbus_device *dev) { - int irq; - int ret; + int ret = -ENOMEM; struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) @@ -1199,17 +1180,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, info = evtchn_to_info(evtchn); if (!info) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, chip, + irq_set_chip_and_handler_name(info->irq, chip, handle_edge_irq, "event"); - ret = xen_irq_info_evtchn_setup(irq, evtchn, dev); + ret = xen_irq_info_evtchn_setup(info, evtchn, dev); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* @@ -1219,17 +1199,17 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, * affinity setting is not invoked on them so nothing would * bind the channel. */ - bind_evtchn_to_cpu(evtchn, 0, false); - } else { - if (!WARN_ON(info->type != IRQT_EVTCHN)) - info->refcnt++; - irq = info->irq; + bind_evtchn_to_cpu(info, 0, false); + } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { + info->refcnt++; } + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } int bind_evtchn_to_irq(evtchn_port_t evtchn) @@ -1248,18 +1228,19 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; - int ret, irq; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(ipi_to_irq, cpu)[ipi]; + ret = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "ipi"); bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1268,25 +1249,25 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* * Force the affinity mask to the target CPU so proc shows * the correct target. 
*/ - bind_evtchn_to_cpu(evtchn, cpu, true); + bind_evtchn_to_cpu(info, cpu, true); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_IPI); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, @@ -1354,22 +1335,23 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn = 0; - int irq, ret; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(virq_to_irq, cpu)[virq]; + ret = per_cpu(virq_to_irq, cpu)[virq]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; if (percpu) - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "virq"); else - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip, handle_edge_irq, "virq"); bind_virq.virq = virq; @@ -1384,10 +1366,9 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) BUG_ON(ret < 0); } - ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } @@ -1395,22 +1376,26 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) * Force the affinity mask for percpu interrupts so proc * shows the correct target. */ - bind_evtchn_to_cpu(evtchn, cpu, percpu); + bind_evtchn_to_cpu(info, cpu, percpu); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static void unbind_from_irq(unsigned int irq) { + struct irq_info *info; + mutex_lock(&irq_mapping_update_lock); - __unbind_from_irq(irq); + info = info_for_irq(irq); + __unbind_from_irq(info, irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1744,11 +1729,11 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) BUG_ON(info->type == IRQT_UNBOUND); info->irq = irq; - (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); + (void)xen_irq_info_evtchn_setup(info, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); - bind_evtchn_to_cpu(evtchn, info->cpu, false); + bind_evtchn_to_cpu(info, info->cpu, false); /* Unmask the event channel. */ enable_irq(irq); @@ -1782,7 +1767,7 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) * it, but don't do the xenlinux-level rebind in that case. 
*/ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu, false); + bind_evtchn_to_cpu(info, tcpu, false); do_unmask(info, EVT_MASK_REASON_TEMPORARY); @@ -1933,7 +1918,7 @@ static void restore_pirqs(void) if (rc) { pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - xen_free_irq(irq); + xen_free_irq(info); continue; } @@ -1947,13 +1932,15 @@ static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn; + struct irq_info *info; int virq, irq; for (virq = 0; virq < NR_VIRQS; virq++) { if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(virq_from_irq(irq) != virq); + BUG_ON(virq_from_irq(info) != virq); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1964,9 +1951,9 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + xen_irq_info_virq_setup(info, cpu, evtchn, virq); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } @@ -1974,13 +1961,15 @@ static void restore_cpu_ipis(unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; + struct irq_info *info; int ipi, irq; for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(ipi_from_irq(irq) != ipi); + BUG_ON(ipi_from_irq(info) != ipi); /* Get a new binding from Xen. */ bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1990,9 +1979,9 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } -- cgit From cee96422e863f0b0e9d3d0c2d617271ef2255858 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 11:25:32 +0200 Subject: xen/events: remove some info_for_irq() calls in pirq handling Instead of the IRQ number user the struct irq_info pointer as parameter in the internal pirq related functions. This allows to drop some calls of info_for_irq(). 
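To illustrate the shape of this refactoring outside of the kernel: the idea is to resolve the irq-number-to-descriptor mapping once at the entry point and hand the descriptor pointer to the internal helpers, instead of letting each helper repeat the lookup. The following is only a minimal user-space sketch with invented names (irq_desc, desc_for_irq), not the events_base.c code:

  #include <stdio.h>

  struct irq_desc {
          int pirq;
  };

  static struct irq_desc table[16];

  /* Stand-in for info_for_irq(): the lookup we want to stop repeating. */
  static struct irq_desc *desc_for_irq(int irq)
  {
          return (irq >= 0 && irq < 16) ? &table[irq] : NULL;
  }

  /* Old style: every internal helper performs its own lookup. */
  static int pirq_from_irq_number(int irq)
  {
          struct irq_desc *d = desc_for_irq(irq);

          return d ? d->pirq : -1;
  }

  /* New style: the caller resolves the descriptor once and passes it down. */
  static int pirq_from_desc(const struct irq_desc *d)
  {
          return d->pirq;
  }

  int main(void)
  {
          table[3].pirq = 42;

          struct irq_desc *d = desc_for_irq(3);   /* single lookup at the boundary */

          printf("%d %d\n", pirq_from_irq_number(3), d ? pirq_from_desc(d) : -1);
          return 0;
  }
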
Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 117 +++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 88f0c80d0f87..f5edb9e27e3c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -174,7 +174,7 @@ static int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif -static bool (*pirq_needs_eoi)(unsigned irq); +static bool (*pirq_needs_eoi)(struct irq_info *info); #define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) #define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) @@ -190,7 +190,6 @@ static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); -static void disable_dynirq(struct irq_data *data); static DEFINE_PER_CPU(unsigned int, irq_epoch); @@ -454,10 +453,8 @@ static unsigned int virq_from_irq(struct irq_info *info) return info->u.virq; } -static unsigned pirq_from_irq(unsigned irq) +static unsigned int pirq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_PIRQ); @@ -500,15 +497,14 @@ static void do_unmask(struct irq_info *info, u8 reason) } #ifdef CONFIG_X86 -static bool pirq_check_eoi_map(unsigned irq) +static bool pirq_check_eoi_map(struct irq_info *info) { - return test_bit(pirq_from_irq(irq), pirq_eoi_map); + return test_bit(pirq_from_irq(info), pirq_eoi_map); } #endif -static bool pirq_needs_eoi_flag(unsigned irq) +static bool pirq_needs_eoi_flag(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; @@ -802,14 +798,11 @@ static void event_handler_exit(struct irq_info *info) clear_evtchn(info->evtchn); } -static void pirq_query_unmask(int irq) +static void pirq_query_unmask(struct irq_info *info) { struct physdev_irq_status_query irq_status; - struct irq_info *info = info_for_irq(irq); - BUG_ON(info->type != IRQT_PIRQ); - - irq_status.irq = pirq_from_irq(irq); + irq_status.irq = pirq_from_irq(info); if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) irq_status.flags = 0; @@ -818,56 +811,76 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static void eoi_pirq(struct irq_data *data) +static void do_eoi_pirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? 
info->evtchn : 0; - struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + struct physdev_eoi eoi = { .irq = pirq_from_irq(info) }; int rc = 0; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(info->evtchn)) return; event_handler_exit(info); - if (pirq_needs_eoi(data->irq)) { + if (pirq_needs_eoi(info)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); WARN_ON(rc); } } +static void eoi_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + do_eoi_pirq(info); +} + +static void do_disable_dynirq(struct irq_info *info) +{ + if (VALID_EVTCHN(info->evtchn)) + do_mask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void disable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_disable_dynirq(info); +} + static void mask_ack_pirq(struct irq_data *data) { - disable_dynirq(data); - eoi_pirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_eoi_pirq(info); + } } -static unsigned int __startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(struct irq_info *info) { struct evtchn_bind_pirq bind_pirq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = info->evtchn; int rc; - BUG_ON(info->type != IRQT_PIRQ); - if (VALID_EVTCHN(evtchn)) goto out; - bind_pirq.pirq = pirq_from_irq(irq); + bind_pirq.pirq = pirq_from_irq(info); /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - pr_warn("Failed to obtain physical IRQ %d\n", irq); + pr_warn("Failed to obtain physical IRQ %d\n", info->irq); return 0; } evtchn = bind_pirq.port; - pirq_query_unmask(irq); + pirq_query_unmask(info); - rc = set_evtchn_to_irq(evtchn, irq); + rc = set_evtchn_to_irq(evtchn, info->irq); if (rc) goto err; @@ -881,26 +894,28 @@ static unsigned int __startup_pirq(unsigned int irq) out: do_unmask(info, EVT_MASK_REASON_EXPLICIT); - eoi_pirq(irq_get_irq_data(irq)); + do_eoi_pirq(info); return 0; err: - pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq, + rc); xen_evtchn_close(evtchn); return 0; } static unsigned int startup_pirq(struct irq_data *data) { - return __startup_pirq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + + return __startup_pirq(info); } static void shutdown_pirq(struct irq_data *data) { - unsigned int irq = data->irq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; BUG_ON(info->type != IRQT_PIRQ); @@ -1035,7 +1050,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - pirq_query_unmask(info->irq); + pirq_query_unmask(info); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. 
@@ -1162,7 +1177,9 @@ out: int xen_pirq_from_irq(unsigned irq) { - return pirq_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + + return pirq_from_irq(info); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); @@ -1824,28 +1841,30 @@ static void enable_dynirq(struct irq_data *data) do_unmask(info, EVT_MASK_REASON_EXPLICIT); } -static void disable_dynirq(struct irq_data *data) +static void do_ack_dynirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; + evtchn_port_t evtchn = info->evtchn; if (VALID_EVTCHN(evtchn)) - do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); } static void ack_dynirq(struct irq_data *data) { struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; - if (VALID_EVTCHN(evtchn)) - event_handler_exit(info); + if (info) + do_ack_dynirq(info); } static void mask_ack_dynirq(struct irq_data *data) { - disable_dynirq(data); - ack_dynirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_ack_dynirq(info); + } } static void lateeoi_ack_dynirq(struct irq_data *data) @@ -1924,7 +1943,7 @@ static void restore_pirqs(void) printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - __startup_pirq(irq); + __startup_pirq(info); } } -- cgit From 9e2e7efbbbff69d8340abb56d375dd79d1f5770f Mon Sep 17 00:00:00 2001 From: Johnathan Mantey Date: Mon, 13 Nov 2023 08:30:29 -0800 Subject: Revert ncsi: Propagate carrier gain/loss events to the NCSI controller This reverts commit 3780bb29311eccb7a1c9641032a112eed237f7e3. The cited commit introduced unwanted behavior. The intent for the commit was to be able to detect carrier loss/gain for just the NIC connected to the BMC. The unwanted effect is a carrier loss for auxiliary paths also causes the BMC to lose carrier. The BMC never regains carrier despite the secondary NIC regaining a link. This change, when merged, needs to be backported to stable kernels. 5.4-stable, 5.10-stable, 5.15-stable, 6.1-stable, 6.5-stable Fixes: 3780bb29311e ("ncsi: Propagate carrier gain/loss events to the NCSI controller") CC: stable@vger.kernel.org Signed-off-by: Johnathan Mantey Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f8854bff286c..62fb1031763d 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -89,11 +89,6 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, if ((had_link == has_link) || chained) return 0; - if (had_link) - netif_carrier_off(ndp->ndev.dev); - else - netif_carrier_on(ndp->ndev.dev); - if (!ndp->multi_package && !nc->package->multi_channel) { if (had_link) { ndp->flags |= NCSI_DEV_RESHUFFLE; -- cgit From efc0c8363bc6dab2cd540acc886e6097deee8bb9 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Mon, 13 Nov 2023 17:44:12 +0100 Subject: dt-bindings: net: ethernet-controller: Fix formatting error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When moving the *-internal-delay-ps properties to only apply for RGMII interface modes there where a typo in the text formatting. Signed-off-by: Niklas Söderlund Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/ethernet-controller.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 9f6a5ccbcefe..d14d123ad7a0 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -275,12 +275,12 @@ allOf: properties: rx-internal-delay-ps: description: - RGMII Receive Clock Delay defined in pico seconds.This is used for + RGMII Receive Clock Delay defined in pico seconds. This is used for controllers that have configurable RX internal delays. If this property is present then the MAC applies the RX delay. tx-internal-delay-ps: description: - RGMII Transmit Clock Delay defined in pico seconds.This is used for + RGMII Transmit Clock Delay defined in pico seconds. This is used for controllers that have configurable TX internal delays. If this property is present then the MAC applies the TX delay. -- cgit From 00a614fc3527ba8846e6d823013bcf724e9a68f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Oct 2023 17:26:23 +0200 Subject: accel/ivpu: avoid build failure with CONFIG_PM=n The usage count of struct dev_pm_info is an implementation detail that is only available if CONFIG_PM is enabled, so printing it in a debug message causes a build failure in configurations without PM: In file included from include/linux/device.h:15, from include/linux/pci.h:37, from drivers/accel/ivpu/ivpu_pm.c:8: drivers/accel/ivpu/ivpu_pm.c: In function 'ivpu_rpm_get_if_active': drivers/accel/ivpu/ivpu_pm.c:254:51: error: 'struct dev_pm_info' has no member named 'usage_count' 254 | atomic_read(&vdev->drm.dev->power.usage_count)); | ^ include/linux/dev_printk.h:129:48: note: in definition of macro 'dev_printk' 129 | _dev_printk(level, dev, fmt, ##__VA_ARGS__); \ | ^~~~~~~~~~~ drivers/accel/ivpu/ivpu_drv.h:75:17: note: in expansion of macro 'dev_dbg' 75 | dev_dbg((vdev)->drm.dev, "[%s] " fmt, #type, ##args); \ | ^~~~~~~ drivers/accel/ivpu/ivpu_pm.c:253:9: note: in expansion of macro 'ivpu_dbg' 253 | ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", | ^~~~~~~~ The print message does not seem essential, so the easiest workaround is to just remove it. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Signed-off-by: Arnd Bergmann Reviewed-by: Stanislaw Gruszka Signed-off-by: Stanislaw Gruszka Link: https://patchwork.freedesktop.org/patch/msgid/20231027152633.528490-1-arnd@kernel.org (cherry picked from commit 1470acbef122c7e2e588f6346ce459c26d0568a2) Signed-off-by: Jacek Lawrynowicz --- drivers/accel/ivpu/ivpu_pm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 0ace218783c8..e9b16cbc26f4 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -250,9 +250,6 @@ int ivpu_rpm_get_if_active(struct ivpu_device *vdev) { int ret; - ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", - atomic_read(&vdev->drm.dev->power.usage_count)); - ret = pm_runtime_get_if_active(vdev->drm.dev, false); drm_WARN_ON(&vdev->drm, ret < 0); -- cgit From 674e318089468ece99aef4796eaef7add57f36b2 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 14 Nov 2023 09:56:18 +0200 Subject: net: Fix undefined behavior in netdev name allocation Cited commit removed the strscpy() call and kept the snprintf() only. 
It is common to use 'dev->name' as the format string before a netdev is registered, this results in 'res' and 'name' pointers being equal. According to POSIX, if copying takes place between objects that overlap as a result of a call to sprintf() or snprintf(), the results are undefined. Add back the strscpy() and use 'buf' as an intermediate buffer. Fixes: 7ad17b04dc7b ("net: trust the bitmap in __dev_alloc_name()") Cc: Jakub Kicinski Reviewed-by: Vlad Buslov Signed-off-by: Gal Pressman Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0d548431f3fa..af53f6d838ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1119,7 +1119,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res) if (i == max_netdevices) return -ENFILE; - snprintf(res, IFNAMSIZ, name, i); + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); return i; } -- cgit From 9e88b493157a9901fa498f23cc3c9ab82b43ce83 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 15 Nov 2023 13:36:25 +0100 Subject: ALSA: hda: i915: Alays handle -EPROBE_DEFER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that even if the comment says that the driver can load fine, it's not really the case and no codecs are detected. Specifically for -EPROBE_DEFER, always fail the probe. This fixes a regression when HDA-intel is loaded before i915. Reported-by: Ville Syrjälä Closes: https://lore.kernel.org/r/ZVNUxZzCGcxQzqJX@intel.com Signed-off-by: Maarten Lankhorst Tested-by: Kai Vehmanen Fixes: e6d0c13e9f46 ("ALSA: hda: i915: Remove extra argument from snd_hdac_i915_init") Link: https://gitlab.freedesktop.org/drm/intel/-/issues/9671 Link: https://lore.kernel.org/r/20231115123625.74286-1-maarten.lankhorst@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e871afeeb383..e79508002bb1 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2141,6 +2141,9 @@ static int azx_probe(struct pci_dev *pci, if (chip->driver_caps & AZX_DCAPS_I915_COMPONENT) { err = snd_hdac_i915_init(azx_bus(chip)); if (err < 0) { + if (err == -EPROBE_DEFER) + goto out_free; + /* if the controller is bound only with HDMI/DP * (for HSW and BDW), we need to abort the probe; * for other chips, still continue probing as other -- cgit From a0d45c3f596be53c1bd8822a1984532d14fdcea9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Nov 2023 09:55:50 -0700 Subject: io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval A previous commit added a trylock for getting the SQPOLL thread info via fdinfo, but this introduced a regression where we often fail to get it if the thread is busy. For that case, we end up not printing the current CPU and PID info. Rather than rely on this lock, just print the pid we already stored in the io_sq_data struct, and ensure we update the current CPU every time we've slept or potentially rescheduled. The latter won't potentially be 100% accurate, but that wasn't the case before either as the task can get migrated at any time unless it has been pinned at creation time. 
We retain keeping the io_sq_data dereference inside the ctx->uring_lock, as it has always been, as destruction of the thread and data happen below that. We could make this RCU safe, but there's little point in doing that. With this, we always print the last valid information we had, rather than have spurious outputs with missing information. Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 9 ++------- io_uring/sqpoll.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f04a43044d91..976e9500f651 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -145,13 +145,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { struct io_sq_data *sq = ctx->sq_data; - if (mutex_trylock(&sq->lock)) { - if (sq->thread) { - sq_pid = task_pid_nr(sq->thread); - sq_cpu = task_cpu(sq->thread); - } - mutex_unlock(&sq->lock); - } + sq_pid = sq->task_pid; + sq_cpu = sq->sq_cpu; } seq_printf(m, "SqThread:\t%d\n", sq_pid); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index bd6c2c7959a5..65b5dbe3c850 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -214,6 +214,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) did_sig = get_signal(&ksig); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } @@ -229,10 +230,15 @@ static int io_sq_thread(void *data) snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); - if (sqd->sq_cpu != -1) + /* reset to our pid after we've set task_comm, for fdinfo */ + sqd->task_pid = current->pid; + + if (sqd->sq_cpu != -1) { set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else + } else { set_cpus_allowed_ptr(current, cpu_online_mask); + sqd->sq_cpu = raw_smp_processor_id(); + } mutex_lock(&sqd->lock); while (1) { @@ -261,6 +267,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } continue; } @@ -294,6 +301,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_andnot(IORING_SQ_NEED_WAKEUP, -- cgit From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 10 Nov 2023 17:39:28 -0800 Subject: bpf: Do not allocate percpu memory at init stage Kirill Shutemov reported significant percpu memory consumption increase after booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation"). The percpu memory consumption is increased from 111MB to 969MB. The number is from /proc/meminfo. I tried to reproduce the issue with my local VM which at most supports upto 255 cpus. With 252 cpus, without the above commit, the percpu memory consumption immediately after boot is 57MB while with the above commit the percpu memory consumption is 231MB. This is not good since so far percpu memory from bpf memory allocator is not widely used yet. Let us change pre-allocation in init stage to on-demand allocation when verifier detects there is a need of percpu memory for bpf program. With this change, percpu memory consumption after boot can be reduced signicantly. 
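The shape of the fix is a lazily-initialised global guarded by a mutex, with the flag re-checked after the lock is taken so only one caller performs the allocation. Below is only a user-space sketch of that pattern with invented names (a pthread mutex stands in for the kernel mutex; a real lock-free fast path would also want an atomic flag):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdlib.h>

  static void *global_pool;               /* stands in for the percpu allocator state */
  static bool global_pool_set;
  static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Stand-in for the expensive init that used to run unconditionally at boot. */
  static int pool_init(void **pool)
  {
          *pool = malloc(1 << 20);
          return *pool ? 0 : -1;
  }

  /* Called on first demand instead of from an initcall. */
  static int get_global_pool(void)
  {
          int err = 0;

          if (!global_pool_set) {
                  pthread_mutex_lock(&pool_lock);
                  if (!global_pool_set) {         /* re-check under the lock */
                          err = pool_init(&global_pool);
                          if (!err)
                                  global_pool_set = true;
                  }
                  pthread_mutex_unlock(&pool_lock);
          }
          return err;
  }

  int main(void)
  {
          return get_global_pool() ? EXIT_FAILURE : EXIT_SUCCESS;
  }
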
[1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/ Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation") Reported-and-tested-by: Kirill A. Shutemov Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 8 +++----- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35bff17396c0..6762dac3ef76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 08626b519ce2..cd3afe57ece3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -bool bpf_global_ma_set, bpf_global_percpu_ma_set; +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; /* No hurry in this branch * @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - bpf_global_percpu_ma_set = !ret; - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; + return ret; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2267d5ed14e..6da370a047fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "disasm.h" @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. 
@@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) - return -ENOMEM; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + } if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); -- cgit From 430143b0d3611f4a9c8434319e5e504244749e79 Mon Sep 17 00:00:00 2001 From: Brenton Simpson Date: Tue, 14 Nov 2023 23:38:59 +0000 Subject: drm: panel-orientation-quirks: Add quirk for Lenovo Legion Go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Legion Go has a 2560x1600 portrait screen, with the native "up" facing the right controller (90° CW from the rest of the device). Signed-off-by: Brenton Simpson Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231114233859.274189-1-appsforartists@google.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index d5c15292ae93..3d92f66e550c 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -336,6 +336,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "IdeaPad Duet 3 10IGL5"), }, .driver_data = (void *)&lcd1200x1920_rightside_up, + }, { /* Lenovo Legion Go 8APU1 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Legion Go 8APU1"), + }, + .driver_data = (void *)&lcd1600x2560_leftside_up, }, { /* Lenovo Yoga Book X90F / X90L */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), -- cgit From ae1aadb1eb8d3cbc52e42bee71d67bd4a71f9f07 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 16 Nov 2023 00:39:33 +1000 Subject: nouveau: don't fail driver load if no display hw present. If we get back ENODEV don't fail load. There are nvidia devices that don't have display blocks and the driver should work on those. 
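The pattern is simply to treat "hardware block not present" as a non-fatal probe result for an optional subsystem. A minimal stand-alone sketch of the idea (invented names, not the nouveau display code):

  #include <errno.h>
  #include <stdio.h>

  /* Pretend probe of an optional display block; -ENODEV means "no such hw". */
  static int display_probe(int has_display_hw)
  {
          return has_display_hw ? 0 : -ENODEV;
  }

  static int driver_load(int has_display_hw)
  {
          int ret = display_probe(has_display_hw);

          if (ret == -ENODEV) {
                  printf("no display hw, continuing without modesetting\n");
                  return 0;       /* optional block missing: not a load failure */
          }
          return ret;
  }

  int main(void)
  {
          return driver_load(0);
  }
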
Fixes: 15740541e8f0 ("drm/nouveau/devinit/tu102-: prepare for GSP-RM") Link: https://gitlab.freedesktop.org/drm/nouveau/-/issues/270 Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231115143933.261287-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_display.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index d8c92521226d..f28f9a857458 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -726,6 +726,11 @@ nouveau_display_create(struct drm_device *dev) if (nouveau_modeset != 2) { ret = nvif_disp_ctor(&drm->client.device, "kmsDisp", 0, &disp->disp); + /* no display hw */ + if (ret == -ENODEV) { + ret = 0; + goto disp_create_err; + } if (!ret && (disp->disp.outp_mask || drm->vbios.dcb.entries)) { nouveau_display_create_properties(dev); -- cgit From 61cbc08fdb04fd445458b0f4cba7e6929afdfaef Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:15 +0000 Subject: ALSA: hda/realtek: Add quirks for ASUS 2024 Zenbooks These ASUS Zenbook laptops use Realtek HDA codec combined with 2xCS35L41 Amplifiers using SPI or I2C with External Boost or Internal Boost. Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3c85b8247c11..a1e124370283 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9947,13 +9947,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x1a63, "ASUS UX3405MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JI", ALC285_FIXUP_ASUS_HEADSET_MIC), -- cgit From 5d639b60971f003d3a9b2b31f8ec73b0718b5d57 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:16 +0000 Subject: ALSA: hda/realtek: Add quirks for HP Laptops These HP laptops use Realtek HDA codec combined with 2 or 4 CS35L41 Amplifiers using SPI with Internal Boost. 
Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a1e124370283..5618b1d9bfd1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9902,6 +9902,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), -- cgit From 85c2ceaafbd306814a3a4740bf4d95ac26a8b36a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:07:40 +0300 Subject: mm/damon/sysfs: eliminate potential uninitialized variable warning The "err" variable is not initialized if damon_target_has_pid(ctx) is false and sys_target->regions->nr is zero. Link: https://lkml.kernel.org/r/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mountain Fixes: 0bcd216c4741 ("mm/damon/sysfs: update monitoring target regions for online input commit") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e27846708b5a..1dfa96d4de99 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); -- cgit From 019b277b680f5b95135c042c78dd79318d8f9e3c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:41 +0500 Subject: selftests: mm: skip whole test instead of failure Some architectures don't support userfaultfd. Skip running the whole test on them instead of registering the failure. 
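In the patch below, init_uffd() now reports failure to its caller instead of calling ksft_exit_fail_msg(), and main() turns that into an early, clean exit before the test plan is declared. As a rough stand-alone sketch of the same reporting idea (plain exit codes instead of the kselftest helpers; the KSFT_SKIP value of 4 is an assumption of this sketch):

  #include <stdio.h>
  #include <stdlib.h>

  #define EXIT_SKIP 4     /* kselftest-style "skip" exit code, assumed here */

  /* Probe the optional kernel feature; return an error instead of exiting. */
  static int init_feature(void)
  {
          /* e.g. a userfaultfd() probe would go here; pretend it is missing */
          return -1;
  }

  int main(void)
  {
          if (init_feature()) {
                  printf("feature not supported, skipping whole test\n");
                  return EXIT_SKIP;       /* skipped, not failed */
          }

          /* ... declare the plan and run the actual tests here ... */
          return EXIT_SUCCESS;
  }
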
Link: https://lkml.kernel.org/r/20231103182343.2874015-1-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/f8463381-2697-49e9-9460-9dc73452830d@arm.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0161fb49fc6e..f8685a2ea07e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -94,19 +94,19 @@ int init_uffd(void) uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd == -1) - ksft_exit_fail_msg("uffd syscall failed\n"); + return uffd; uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_HUGETLBFS_SHMEM; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - ksft_exit_fail_msg("UFFDIO_API\n"); + return -1; if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) || !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) || !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) || !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) - ksft_exit_fail_msg("UFFDIO_API error %llu\n", uffdio_api.api); + return -1; return 0; } @@ -1479,6 +1479,10 @@ int main(void) struct stat sbuf; ksft_print_header(); + + if (init_uffd()) + return ksft_exit_pass(); + ksft_set_plan(115); page_size = getpagesize(); @@ -1488,9 +1492,6 @@ int main(void) if (pagemap_fd < 0) return -EINVAL; - if (init_uffd()) - ksft_exit_fail_msg("uffd init failed\n"); - /* 1. Sanity testing */ sanity_tests_sd(); -- cgit From 9297e5360c3bd777f95d5146dbeda7fb9ba4273a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:42 +0500 Subject: selftests: mm: fix some build warnings Fix build warnings: pagemap_ioctl.c:1154:38: warning: format `%s' expects a matching `char *' argument [-Wformat=] pagemap_ioctl.c:1162:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1192:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1600:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1628:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] Link: https://lkml.kernel.org/r/20231103182343.2874015-2-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index f8685a2ea07e..befab43719ba 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1151,7 +1151,7 @@ int sanity_tests(void) /* 9. 
Memory mapped file */ fd = open(__FILE__, O_RDONLY); if (fd < 0) - ksft_exit_fail_msg("%s Memory mapped file\n"); + ksft_exit_fail_msg("%s Memory mapped file\n", __func__); ret = stat(__FILE__, &sbuf); if (ret < 0) @@ -1159,7 +1159,7 @@ int sanity_tests(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); tmp_buf = malloc(sbuf.st_size); memcpy(tmp_buf, fmem, sbuf.st_size); @@ -1189,7 +1189,7 @@ int sanity_tests(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); @@ -1596,7 +1596,7 @@ int main(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, sbuf.st_size); wp_addr_range(fmem, sbuf.st_size); @@ -1624,7 +1624,7 @@ int main(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); -- cgit From dd9b35efd719be242e227f9eebad1e50ea5c914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:33:59 -0700 Subject: selftests/mm: restore number of hugepages The test mm `hugetlb_fault_after_madv` selftest needs one and only one huge page to run, thus it sets `/proc/sys/vm/nr_hugepages` to 1. The problem is that further tests require the previous number of hugepages allocated in order to succeed. Save the number of huge pages before changing it, and restore it once the test finishes, so, further tests could run successfully. Link: https://lkml.kernel.org/r/20231103173400.1608403-1-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index cc16f6ca8533..00757445278e 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -223,9 +223,12 @@ CATEGORY="hugetlb" run_test ./hugepage-mremap CATEGORY="hugetlb" run_test ./hugepage-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise +nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page echo 1 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +# Restore the previous number of huge pages, since further tests rely on it +echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages if test_selected "hugetlb"; then echo "NOTE: These hugetlb tests provide minimal coverage. 
Use" -- cgit From edf14544324dd036183fafe372fe5709708bdddd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:34:00 -0700 Subject: selftests/mm: add hugetlb_fault_after_madv to .gitignore commit 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") added a new test case, but, it didn't add the binary name in tools/testing/selftests/mm/.gitignore. Add hugetlb_fault_after_madv to tools/testing/selftests/mm/.gitignore. Link: https://lkml.kernel.org/r/20231103173400.1608403-2-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index cc920c79ff1c..4ff10ea61461 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -45,3 +45,4 @@ mdwe_test gup_longterm mkdirty va_high_addr_switch +hugetlb_fault_after_madv -- cgit From a48d5bdc877b85201e42cef9c2fdf5378164c23a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 6 Nov 2023 10:19:18 -0800 Subject: mm: fix for negative counter: nr_file_hugepages While qualifiying the 6.4 release, the following warning was detected in messages: vmstat_refresh: nr_file_hugepages -15664 The warning is caused by the incorrect updating of the NR_FILE_THPS counter in the function split_huge_page_to_list. The if case is checking for folio_test_swapbacked, but the else case is missing the check for folio_test_pmd_mappable. The other functions that manipulate the counter like __filemap_add_folio and filemap_unaccount_folio have the corresponding check. I have a test case, which reproduces the problem. It can be found here: https://github.com/sroeschus/testcase/blob/main/vmstat_refresh/madv.c The test case reproduces on an XFS filesystem. Running the same test case on a BTRFS filesystem does not reproduce the problem. AFAIK version 6.1 until 6.6 are affected by this problem. 
[akpm@linux-foundation.org: whitespace fix] [shr@devkernel.io: test for folio_test_pmd_mappable()] Link: https://lkml.kernel.org/r/20231108171517.2436103-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20231106181918.1091043-1-shr@devkernel.io Signed-off-by: Stefan Roesch Co-debugged-by: Johannes Weiner Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f31f02472396..4f542444a91f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2769,13 +2769,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, - -nr); - } else { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, - -nr); - filemap_nr_thps_dec(mapping); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } } } -- cgit From b4936b544b08ed44949055b92bd25f77759ebafc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:06 +0000 Subject: mm/damon/sysfs: check error from damon_sysfs_update_target() Patch series "mm/damon/sysfs: fix unhandled return values". Some of DAMON sysfs interface code is not handling return values from some functions. As a result, confusing user input handling or NULL-dereference is possible. Check those properly. This patch (of 3): damon_sysfs_update_target() returns error code for failures, but its caller, damon_sysfs_set_targets() is ignoring that. The update function seems making no critical change in case of such failures, but the behavior will look like DAMON sysfs is silently ignoring or only partially accepting the user input. Fix it. Link: https://lkml.kernel.org/r/20231106233408.51159-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231106233408.51159-2-sj@kernel.org Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1dfa96d4de99..7472404456aa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); -- cgit From 84055688b6bc075c92a88e2d6c3ad26ab93919f9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:07 +0000 Subject: mm/damon/sysfs-schemes: handle tried regions sysfs directory allocation failure DAMOS tried regions sysfs directory allocation function (damon_sysfs_scheme_regions_alloc()) is not handling the memory allocation failure. In the case, the code will dereference NULL pointer. Handle the failure to avoid such invalid access. 
Link: https://lkml.kernel.org/r/20231106233408.51159-3-sj@kernel.org Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 45bd0fd4a8b1..7413cb35c5a9 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; -- cgit From ae636ae2bbfd9279f5681dbf320d1da817e52b68 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:08 +0000 Subject: mm/damon/sysfs-schemes: handle tried region directory allocation failure DAMON sysfs interface's before_damos_apply callback (damon_sysfs_before_damos_apply()), which creates the DAMOS tried regions for each DAMOS action applied region, is not handling the allocation failure for the sysfs directory data. As a result, NULL pointer derefeence is possible. Fix it by handling the case. Link: https://lkml.kernel.org/r/20231106233408.51159-4-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7413cb35c5a9..be667236b8e6 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1826,6 +1826,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, -- cgit From 24948e3b7b12e0031a6edb4f49bbb9fb2ad1e4e9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 Nov 2023 09:18:02 -0800 Subject: mm: kmem: drop __GFP_NOFAIL when allocating objcg vectors Objcg vectors attached to slab pages to store slab object ownership information are allocated using gfp flags for the original slab allocation. Depending on slab page order and the size of slab objects, objcg vector can take several pages. If the original allocation was done with the __GFP_NOFAIL flag, it triggered a warning in the page allocation code. Indeed, order > 1 pages should not been allocated with the __GFP_NOFAIL flag. Fix this by simply dropping the __GFP_NOFAIL flag when allocating the objcg vector. It effectively allows to skip the accounting of a single slab object under a heavy memory pressure. An alternative would be to implement the mechanism to fallback to order-0 allocations for accounting metadata, which is also not perfect because it will increase performance penalty and memory footprint of the kernel memory accounting under memory pressure. 
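The fix itself is a flag-masking pattern: the metadata allocation inherits the gfp flags of the original slab allocation, with the bits that are not valid for it cleared first. A sketch of the idea using invented flag values (not the real gfp bit encoding):

  #include <stdio.h>

  /* Invented flag values, only to show the masking pattern. */
  #define F_DMA           0x01u
  #define F_RECLAIMABLE   0x02u
  #define F_ACCOUNT       0x04u
  #define F_NOFAIL        0x08u

  /* Bits that must not leak from the slab allocation into its metadata. */
  #define META_CLEAR_MASK (F_DMA | F_RECLAIMABLE | F_ACCOUNT | F_NOFAIL)

  int main(void)
  {
          unsigned int slab_flags = F_ACCOUNT | F_NOFAIL;
          unsigned int meta_flags = slab_flags & ~META_CLEAR_MASK;

          printf("slab flags %#x -> metadata flags %#x\n", slab_flags, meta_flags);
          return 0;
  }
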
Link: https://lkml.kernel.org/r/ZUp8ZFGxwmCx4ZFr@P9FQF9L96D.corp.robot.car Signed-off-by: Roman Gushchin Reported-by: Christoph Lameter Closes: https://lkml.kernel.org/r/6b42243e-f197-600a-5d22-56bd728a5ad8@gentwo.org Acked-by: Shakeel Butt Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..1c1061df9cd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2936,7 +2936,8 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) * Moreover, it should not come from DMA buffer and is not readily * reclaimable. So those GFP bits should be masked off. */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) /* * mod_objcg_mlstate() may be called with irq enabled, so -- cgit From 13b2a4b22e98ff80b888a160a2acd92d81b05925 Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 10 Nov 2023 14:37:09 +0900 Subject: mm/damon/core.c: avoid unintentional filtering out of schemes The function '__damos_filter_out()' causes DAMON to always filter out schemes whose filter type is anon or memcg if its matching value is set to false. This commit addresses the issue by ensuring that '__damos_filter_out()' no longer applies to filters whose type is 'anon' or 'memcg'. Link: https://lkml.kernel.org/r/1699594629-3816-1-git-send-email-hyeongtak.ji@gmail.com Fixes: ab9bda001b681 ("mm/damon/core: introduce address range type damos filter") Signed-off-by: Hyeongtak Ji Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 630077d95dc6..6262d55904e7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -924,7 +924,7 @@ static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, matched = true; break; default: - break; + return false; } return matched == filter->matching; -- cgit From 5f74f820f6fc844b95f9e5e406e0a07d97510420 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 13 Nov 2023 11:12:57 +0100 Subject: parisc: fix mmap_base calculation when stack grows upwards Matoro reported various userspace crashes on the parisc platform with kernel 6.6 and bisected it to commit 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization"). That commit switched parisc to use the common infrastructure to calculate mmap_base, but missed that the mmap_base() function takes care for architectures where the stack grows downwards only. Fix the mmap_base() calculation to include the stack-grows-upwards case and thus fix the userspace crashes on parisc. 
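A stand-alone sketch of the two layouts (user-space C with invented constants, not the exact kernel formula): with a downward-growing stack the mmap area has to start a stack-sized gap below STACK_TOP, while with an upward-growing stack it can start directly below the reserved stack ceiling.

  #include <stdio.h>

  #define PAGE_SIZE               4096UL
  #define PAGE_ALIGN_DOWN(x)      ((x) & ~(PAGE_SIZE - 1))
  #define PAGE_ALIGN(x)           PAGE_ALIGN_DOWN((x) + PAGE_SIZE - 1)

  /* Invented example addresses. */
  #define STACK_TOP               0xf8000000UL
  #define STACK_CEILING           0xf0000000UL   /* plays the role of mmap_upper_limit() */

  /* Stack grows down: reserve the stack rlimit (plus guard gap) below STACK_TOP. */
  static unsigned long mmap_base_growsdown(unsigned long gap, unsigned long rnd)
  {
          return PAGE_ALIGN(STACK_TOP - gap - rnd);
  }

  /* Stack grows up: mmap_base sits directly below the reserved stack area. */
  static unsigned long mmap_base_growsup(unsigned long rnd)
  {
          return PAGE_ALIGN_DOWN(STACK_CEILING - rnd);
  }

  int main(void)
  {
          printf("grows-down base: %#lx\n", mmap_base_growsdown(8UL << 20, 0x3000));
          printf("grows-up   base: %#lx\n", mmap_base_growsup(0x3000));
          return 0;
  }
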
Link: https://lkml.kernel.org/r/ZVH2qeS1bG7/1J/l@p100 Fixes: 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization") Signed-off-by: Helge Deller Reported-by: matoro Tested-by: matoro Cc: [6.6+] Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 6 +++--- arch/parisc/include/asm/elf.h | 10 +--------- arch/parisc/include/asm/processor.h | 2 ++ arch/parisc/kernel/sys_parisc.c | 2 +- mm/util.c | 10 ++++++++++ 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index fd69dfa0cdab..a7c9c0e69e5a 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -140,11 +140,11 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN default 8 config ARCH_MMAP_RND_BITS_MAX - default 24 if 64BIT - default 17 + default 18 if 64BIT + default 13 config ARCH_MMAP_RND_COMPAT_BITS_MAX - default 17 + default 13 # unless you want to implement ACPI on PA-RISC ... ;-) config PM diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h index 140eaa97bf21..2d73d3c3cd37 100644 --- a/arch/parisc/include/asm/elf.h +++ b/arch/parisc/include/asm/elf.h @@ -349,15 +349,7 @@ struct pt_regs; /* forward declaration... */ #define ELF_HWCAP 0 -/* Masks for stack and mmap randomization */ -#define BRK_RND_MASK (is_32bit_task() ? 0x07ffUL : 0x3ffffUL) -#define MMAP_RND_MASK (is_32bit_task() ? 0x1fffUL : 0x3ffffUL) -#define STACK_RND_MASK MMAP_RND_MASK - -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *); -#define arch_randomize_brk arch_randomize_brk - +#define STACK_RND_MASK 0x7ff /* 8MB of VA */ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index c05d121cf5d0..982aca20f56f 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -47,6 +47,8 @@ #ifndef __ASSEMBLY__ +struct rlimit; +unsigned long mmap_upper_limit(struct rlimit *rlim_stack); unsigned long calc_max_stack_size(unsigned long stack_max); /* diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index ab896eff7a1d..98af719d5f85 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -77,7 +77,7 @@ unsigned long calc_max_stack_size(unsigned long stack_max) * indicating that "current" should be used instead of a passed-in * value from the exec bprm as done with arch_pick_mmap_layout(). */ -static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) +unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; diff --git a/mm/util.c b/mm/util.c index aa01f6ea5a75..744b4d7e3fae 100644 --- a/mm/util.c +++ b/mm/util.c @@ -414,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. 
+ */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; @@ -431,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -- cgit From afccb0804fc74ac2f6737af6a139632606cb461d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 14 Nov 2023 15:49:45 +0000 Subject: mm: more ptep_get() conversion Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch) call sites to use ptep_get() instead of doing a direct dereference of the pte. Full rationale can be found in that commit's log. Since then, three new call sites have snuck in, which directly dereference the pte, so let's fix those up. Unfortunately there is no reliable automated mechanism to catch these; I'm relying on a combination of Coccinelle (which throws up a lot of false positives) and some compiler magic to force a compiler error on dereference (While this approach finds dereferences, it also yields a non-booting kernel so can't be committed). Link: https://lkml.kernel.org/r/20231114154945.490401-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/ksm.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9710f43a89ac..32eedf3afd45 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3443,7 +3443,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(vmf->pte[count])) + if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; diff --git a/mm/ksm.c b/mm/ksm.c index 7efcc68ccc6e..6a831009b4cb 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -468,7 +468,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex page = pfn_swap_entry_to_page(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); + ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); pte_unmap_unlock(pte, ptl); return ret; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc..0b6ca553bebe 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -312,7 +312,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); -- cgit From df3aafe501853c92bc9e25b05dcb030fee072962 Mon Sep 17 00:00:00 2001 From: Itamar Gozlan Date: Tue, 14 Nov 2023 13:58:32 -0800 Subject: Revert "net/mlx5: DR, Supporting inline WQE when possible" This reverts commit 95c337cce0e11d06a715da73e6796ade9216637f. The revert is required due to the suspicion it cause some tests fail and will be moved to further investigation. 
Fixes: 95c337cce0e1 ("net/mlx5: DR, Supporting inline WQE when possible") Signed-off-by: Itamar Gozlan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 115 +++------------------ 1 file changed, 13 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 4e8527a724f5..6fa06ba2d346 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -52,7 +52,6 @@ struct dr_qp_init_attr { u32 cqn; u32 pdn; u32 max_send_wr; - u32 max_send_sge; struct mlx5_uars_page *uar; u8 isolate_vl_tc:1; }; @@ -247,37 +246,6 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) return err == CQ_POLL_ERR ? err : npolled; } -static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr) -{ - return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_flow_update_ctrl_seg) + - sizeof(struct mlx5_wqe_header_modify_argument_update_seg)); -} - -/* We calculate for specific RC QP with the required functionality */ -static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr) -{ - int update_arg_size; - int inl_size = 0; - int tot_size; - int size; - - update_arg_size = dr_qp_get_args_update_send_wqe_size(attr); - - size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg); - inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) + - DR_STE_SIZE, 16); - - size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg); - - size = max(size, update_arg_size); - - tot_size = max(size, inl_size); - - return ALIGN(tot_size, MLX5_SEND_WQE_BB); -} - static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { @@ -285,7 +253,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; struct mlx5_wq_param wqp; struct mlx5dr_qp *dr_qp; - int wqe_size; int inlen; void *qpc; void *in; @@ -365,15 +332,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, if (err) goto err_in; dr_qp->uar = attr->uar; - wqe_size = dr_qp_calc_rc_send_wqe(attr); - dr_qp->max_inline_data = min(wqe_size - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)), - (2 * MLX5_SEND_WQE_BB - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)))); return dr_qp; @@ -437,48 +395,8 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, MLX5_SEND_WQE_DS; } -static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp, - struct dr_data_seg *data_seg, void *wqe) -{ - int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg); - struct mlx5_wqe_inline_seg *seg; - int left_space; - int inl = 0; - void *addr; - int len; - int idx; - - seg = wqe; - wqe += sizeof(*seg); - addr = (void *)(unsigned long)(data_seg->addr); - len = data_seg->length; - inl += len; - left_space = MLX5_SEND_WQE_BB - inline_header_size; - - if (likely(len > left_space)) { - memcpy(wqe, addr, left_space); - len -= left_space; - addr += left_space; - idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1); - wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx); - } - - memcpy(wqe, addr, len); - - if 
(likely(inl)) { - seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); - return DIV_ROUND_UP(inl + sizeof(seg->byte_count), - MLX5_SEND_WQE_DS); - } else { - return 0; - } -} - static void -dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, - struct mlx5_wqe_ctrl_seg *wq_ctrl, +dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, u64 remote_addr, u32 rkey, struct dr_data_seg *data_seg, @@ -494,17 +412,15 @@ dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, wq_raddr->reserved = 0; wq_dseg = (void *)(wq_raddr + 1); - /* WQE ctrl segment + WQE remote addr segment */ - *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS; - if (data_seg->send_flags & IB_SEND_INLINE) { - *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg); - } else { - wq_dseg->byte_count = cpu_to_be32(data_seg->length); - wq_dseg->lkey = cpu_to_be32(data_seg->lkey); - wq_dseg->addr = cpu_to_be64(data_seg->addr); - *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS; /* WQE data segment */ - } + wq_dseg->byte_count = cpu_to_be32(data_seg->length); + wq_dseg->lkey = cpu_to_be32(data_seg->lkey); + wq_dseg->addr = cpu_to_be64(data_seg->addr); + + *size = (sizeof(*wq_ctrl) + /* WQE ctrl segment */ + sizeof(*wq_dseg) + /* WQE data segment */ + sizeof(*wq_raddr)) / /* WQE remote addr segment */ + MLX5_SEND_WQE_DS; } static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl, @@ -535,7 +451,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, switch (opcode) { case MLX5_OPCODE_RDMA_READ: case MLX5_OPCODE_RDMA_WRITE: - dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr, + dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr, rkey, data_seg, &size); break; case MLX5_OPCODE_FLOW_TBL_ACCESS: @@ -656,7 +572,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring, if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; + send_info->write.send_flags = 0; } static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, @@ -680,13 +596,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, } send_ring->pending_wqe++; - if (!send_info->write.lkey) - send_info->write.send_flags |= IB_SEND_INLINE; if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; - else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; send_ring->pending_wqe++; send_info->read.length = send_info->write.length; @@ -696,9 +608,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, send_info->read.lkey = send_ring->sync_mr->mkey; if (send_ring->pending_wqe % send_ring->signal_th == 0) - send_info->read.send_flags |= IB_SEND_SIGNALED; + send_info->read.send_flags = IB_SEND_SIGNALED; else - send_info->read.send_flags &= ~IB_SEND_SIGNALED; + send_info->read.send_flags = 0; } static void dr_fill_data_segs(struct mlx5dr_domain *dmn, @@ -1345,7 +1257,6 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) dmn->send_ring->cq->qp = dmn->send_ring->qp; dmn->info.max_send_wr = QUEUE_SIZE; - init_attr.max_send_sge = 1; dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, DR_STE_SIZE); -- cgit From 7d2f74d1d4385a5bcf90618537f16a45121c30ae Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 14 Nov 2023 13:58:33 -0800 Subject: net/mlx5: Free used cpus mask when an IRQ is released Each EQ table maintains a cpumask of the already used CPUs that are mapped to IRQs to ensure that each IRQ gets 
mapped to a unique CPU. However, on IRQ release, the said cpumask is not updated by clearing the CPU from the mask to allow future IRQ request, causing the following error when a SF is reloaded after it has utilized all CPUs for its IRQs: mlx5_irq_affinity_request:135:(pid 306010): Didn't find a matching IRQ. err = -28 Thus, when releasing an IRQ, clear its mapped CPU from the used CPUs mask, to prevent the case described above. While at it, move the used cpumask update to the EQ layer as it is more fitting and preserves symmetricity of the IRQ request/release API. Fixes: a1772de78d73 ("net/mlx5: Refactor completion IRQ request/release API") Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Reviewed-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 25 +++++++++---- .../net/ethernet/mellanox/mlx5/core/irq_affinity.c | 42 ---------------------- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea0405e0a43f..40a6cb052a2d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -885,11 +885,14 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_irq *irq; + int cpu; irq = xa_load(&table->comp_irqs, vecidx); if (!irq) return; + cpu = cpumask_first(mlx5_irq_get_affinity_mask(irq)); + cpumask_clear_cpu(cpu, &table->used_cpus); xa_erase(&table->comp_irqs, vecidx); mlx5_irq_affinity_irq_release(dev, irq); } @@ -897,16 +900,26 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + struct irq_affinity_desc af_desc = {}; struct mlx5_irq *irq; - irq = mlx5_irq_affinity_irq_request_auto(dev, &table->used_cpus, vecidx); - if (IS_ERR(irq)) { - /* In case SF irq pool does not exist, fallback to the PF irqs*/ - if (PTR_ERR(irq) == -ENOENT) - return comp_irq_request_pci(dev, vecidx); + /* In case SF irq pool does not exist, fallback to the PF irqs*/ + if (!mlx5_irq_pool_is_sf_pool(pool)) + return comp_irq_request_pci(dev, vecidx); + af_desc.is_managed = 1; + cpumask_copy(&af_desc.mask, cpu_online_mask); + cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus); + irq = mlx5_irq_affinity_request(pool, &af_desc); + if (IS_ERR(irq)) return PTR_ERR(irq); - } + + cpumask_or(&table->used_cpus, &table->used_cpus, mlx5_irq_get_affinity_mask(irq)); + mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); return xa_err(xa_store(&table->comp_irqs, vecidx, irq, GFP_KERNEL)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index 047d5fed5f89..612e666ec263 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -168,45 +168,3 @@ void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *i if (pool->irqs_per_cpu) cpu_put(pool, cpu); } - -/** - * 
mlx5_irq_affinity_irq_request_auto - request one IRQ for mlx5 device. - * @dev: mlx5 device that is requesting the IRQ. - * @used_cpus: cpumask of bounded cpus by the device - * @vecidx: vector index to request an IRQ for. - * - * Each IRQ is bounded to at most 1 CPU. - * This function is requesting an IRQ according to the default assignment. - * The default assignment policy is: - * - request the least loaded IRQ which is not bound to any - * CPU of the previous IRQs requested. - * - * On success, this function updates used_cpus mask and returns an irq pointer. - * In case of an error, an appropriate error pointer is returned. - */ -struct mlx5_irq *mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev, - struct cpumask *used_cpus, u16 vecidx) -{ - struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); - struct irq_affinity_desc af_desc = {}; - struct mlx5_irq *irq; - - if (!mlx5_irq_pool_is_sf_pool(pool)) - return ERR_PTR(-ENOENT); - - af_desc.is_managed = 1; - cpumask_copy(&af_desc.mask, cpu_online_mask); - cpumask_andnot(&af_desc.mask, &af_desc.mask, used_cpus); - irq = mlx5_irq_affinity_request(pool, &af_desc); - - if (IS_ERR(irq)) - return irq; - - cpumask_or(used_cpus, used_cpus, mlx5_irq_get_affinity_mask(irq)); - mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", - pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), - cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), - mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); - - return irq; -} -- cgit From ad4d82c3eacdd500a246af736e6e01d96484e35e Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Nov 2023 13:58:34 -0800 Subject: net/mlx5: DR, Allow old devices to use multi destination FTE The current check isn't aware of old devices that don't have the relevant FW capability. This patch allows multi destination FTE in old cards, as it was before this check. Fixes: f6f46e7173cb ("net/mlx5: DR, Add check for multi destination FTE") Signed-off-by: Erez Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-4-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 6ea88a581804..e3ec559369fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -57,7 +57,8 @@ static const char *dr_action_id_to_str(enum mlx5dr_action_type action_id) static bool mlx5dr_action_supp_fwd_fdb_multi_ft(struct mlx5_core_dev *dev) { - return (MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || + return (MLX5_CAP_GEN(dev, steering_format_version) < MLX5_STEERING_FORMAT_CONNECTX_6DX || + MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table)); } -- cgit From fd64fd13c49a53012ce9170449dcd9eb71c11284 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:35 -0800 Subject: net/mlx5: Decouple PHC .adjtime and .adjphase implementations When running a phase adjustment operation, the free running clock should not be modified at all. The phase control keyword is intended to trigger an internal servo on the device that will converge to the provided delta. A free running counter cannot implement phase adjustment. 
Fixes: 8e11a68e2e8a ("net/mlx5: Add adjphase function to support hardware-only offset control") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-5-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index aa29f09e8356..0c83ef174275 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -384,7 +384,12 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) { - return mlx5_ptp_adjtime(ptp, delta); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + return mlx5_ptp_adjtime_real_time(mdev, delta); } static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) -- cgit From 6f9b1a0731662648949a1c0587f6acb3b7f8acf1 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 14 Nov 2023 13:58:36 -0800 Subject: net/mlx5e: fix double free of encap_header When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_create_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This happens when cmd: MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT fail. This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. 
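The underlying rule generalizes beyond mlx5: when a callee frees its input buffer on failure, the caller must not have published that buffer into longer-lived state yet. A minimal userspace analogue of the corrected ordering, with invented names (consume_buf() stands in for mlx5_packet_reformat_alloc(); this is not driver code):

    #include <stdlib.h>

    struct entry {
        void *header;   /* freed again later by the entry teardown path */
    };

    /* stand-in for the callee: on failure it frees buf itself */
    static int consume_buf(void *buf, int fail)
    {
        if (fail) {
            free(buf);
            return -1;
        }
        return 0;
    }

    static int attach_header(struct entry *e, size_t len, int fail)
    {
        void *buf = malloc(len);

        if (!buf)
            return -1;
        if (consume_buf(buf, fail))
            return -1;      /* buf is gone, but e->header was never set */
        e->header = buf;    /* publish only after the callee succeeded */
        return 0;
    }

    int main(void)
    {
        struct entry e = { 0 };

        attach_header(&e, 64, 1);   /* failure path */
        free(e.header);             /* header is still NULL: no double free */
        return 0;
    }
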
Fixes: d589e785baf5e ("net/mlx5e: Allow concurrent creation of encap entries") Reported-by: Cruz Zhao Reported-by: Tianchen Ding Signed-off-by: Dust Li Reviewed-by: Wojciech Drewek Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 00a04fdd756f..8bca696b6658 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -300,9 +300,6 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv4_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -322,6 +319,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv4_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -568,9 +567,6 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv6_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -590,6 +586,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv6_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); -- cgit From 3a4aa3cb83563df942be49d145ee3b7ddf17d6bb Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 14 Nov 2023 13:58:37 -0800 Subject: net/mlx5e: fix double free of encap_header in update funcs Follow up to the previous patch to fix the same issue for mlx5e_tc_tun_update_header_ipv4{6} when mlx5_packet_reformat_alloc() fails. When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_update_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. 
Fixes: a54e20b4fcae ("net/mlx5e: Add basic TC tunnel set action for SRIOV offloads") Signed-off-by: Gavin Li Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-7-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 8bca696b6658..668da5c70e63 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -403,16 +403,12 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv4_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -426,6 +422,10 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv4_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -669,16 +669,12 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv6_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -692,6 +688,10 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv6_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); -- cgit From 0c101a23ca7eaf00eef1328eefb04b3a93401cc8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 13:58:38 -0800 Subject: net/mlx5e: Fix pedit endianness Referenced commit addressed endianness issue in mlx5 pedit implementation in ad hoc manner instead of systematically treating integer values according to their types which left pedit fields of sizes not equal to 4 and where the bytes being modified are not least significant ones broken on big endian machines since wrong bits will be consumed during parsing which leads to following example error when applying pedit to source and destination MAC addresses: [Wed Oct 18 12:52:42 2023] mlx5_core 0001:00:00.1 p1v3_r: attempt to offload an unsupported field (cmd 0) [Wed Oct 18 12:52:42 2023] mask: 00000000330c5b68: 00 00 00 00 ff ff 00 00 00 00 ff ff 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000017d22fd9: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000008186d717: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000029eb6149: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000007ed103e4: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
[Wed Oct 18 12:52:42 2023] mask: 00000000db8101a6: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 00000000ec3c08a9: 00 00 00 00 00 00 00 00 00 00 00 00 ............ Treat masks and values of pedit and filter match as network byte order, refactor pointers to them to void pointers instead of confusing u32 pointers and only cast to pointer-to-integer when reading a value from them. Treat pedit mlx5_fields->field_mask as host byte order according to its type u32, change the constants in fields array accordingly. Fixes: 82198d8bcdef ("net/mlx5e: Fix endianness when calculating pedit mask first bit") Signed-off-by: Vlad Buslov Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-8-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 60 +++++++++++++------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9a5a5c2c7da9..7ca9e5b86778 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3147,7 +3147,7 @@ static struct mlx5_fields fields[] = { OFFLOAD(DIPV6_31_0, 32, U32_MAX, ip6.daddr.s6_addr32[3], 0, dst_ipv4_dst_ipv6.ipv6_layout.ipv6[12]), OFFLOAD(IPV6_HOPLIMIT, 8, U8_MAX, ip6.hop_limit, 0, ttl_hoplimit), - OFFLOAD(IP_DSCP, 16, 0xc00f, ip6, 0, ip_dscp), + OFFLOAD(IP_DSCP, 16, 0x0fc0, ip6, 0, ip_dscp), OFFLOAD(TCP_SPORT, 16, U16_MAX, tcp.source, 0, tcp_sport), OFFLOAD(TCP_DPORT, 16, U16_MAX, tcp.dest, 0, tcp_dport), @@ -3158,21 +3158,31 @@ static struct mlx5_fields fields[] = { OFFLOAD(UDP_DPORT, 16, U16_MAX, udp.dest, 0, udp_dport), }; -static unsigned long mask_to_le(unsigned long mask, int size) +static u32 mask_field_get(void *mask, struct mlx5_fields *f) { - __be32 mask_be32; - __be16 mask_be16; - - if (size == 32) { - mask_be32 = (__force __be32)(mask); - mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); - } else if (size == 16) { - mask_be32 = (__force __be32)(mask); - mask_be16 = *(__be16 *)&mask_be32; - mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); + switch (f->field_bsize) { + case 32: + return be32_to_cpu(*(__be32 *)mask) & f->field_mask; + case 16: + return be16_to_cpu(*(__be16 *)mask) & (u16)f->field_mask; + default: + return *(u8 *)mask & (u8)f->field_mask; } +} - return mask; +static void mask_field_clear(void *mask, struct mlx5_fields *f) +{ + switch (f->field_bsize) { + case 32: + *(__be32 *)mask &= ~cpu_to_be32(f->field_mask); + break; + case 16: + *(__be16 *)mask &= ~cpu_to_be16((u16)f->field_mask); + break; + default: + *(u8 *)mask &= ~(u8)f->field_mask; + break; + } } static int offload_pedit_fields(struct mlx5e_priv *priv, @@ -3184,11 +3194,12 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; struct pedit_headers_action *hdrs = parse_attr->hdrs; void *headers_c, *headers_v, *action, *vals_p; - u32 *s_masks_p, *a_masks_p, s_mask, a_mask; struct mlx5e_tc_mod_hdr_acts *mod_acts; - unsigned long mask, field_mask; + void *s_masks_p, *a_masks_p; int i, first, last, next_z; struct mlx5_fields *f; + unsigned long mask; + u32 s_mask, a_mask; u8 cmd; mod_acts = &parse_attr->mod_hdr_acts; @@ -3204,15 +3215,11 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, bool skip; f = &fields[i]; - /* avoid seeing bits set from previous 
iterations */ - s_mask = 0; - a_mask = 0; - s_masks_p = (void *)set_masks + f->offset; a_masks_p = (void *)add_masks + f->offset; - s_mask = *s_masks_p & f->field_mask; - a_mask = *a_masks_p & f->field_mask; + s_mask = mask_field_get(s_masks_p, f); + a_mask = mask_field_get(a_masks_p, f); if (!s_mask && !a_mask) /* nothing to offload here */ continue; @@ -3239,22 +3246,20 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, match_mask, f->field_bsize)) skip = true; /* clear to denote we consumed this field */ - *s_masks_p &= ~f->field_mask; + mask_field_clear(s_masks_p, f); } else { cmd = MLX5_ACTION_TYPE_ADD; mask = a_mask; vals_p = (void *)add_vals + f->offset; /* add 0 is no change */ - if ((*(u32 *)vals_p & f->field_mask) == 0) + if (!mask_field_get(vals_p, f)) skip = true; /* clear to denote we consumed this field */ - *a_masks_p &= ~f->field_mask; + mask_field_clear(a_masks_p, f); } if (skip) continue; - mask = mask_to_le(mask, f->field_bsize); - first = find_first_bit(&mask, f->field_bsize); next_z = find_next_zero_bit(&mask, f->field_bsize, first); last = find_last_bit(&mask, f->field_bsize); @@ -3281,10 +3286,9 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, MLX5_SET(set_action_in, action, field, f->field); if (cmd == MLX5_ACTION_TYPE_SET) { + unsigned long field_mask = f->field_mask; int start; - field_mask = mask_to_le(f->field_mask, f->field_bsize); - /* if field is bit sized it can start not from first bit */ start = find_first_bit(&field_mask, f->field_bsize); -- cgit From bdf788cf224f61c20a01c58c00685d394d57887f Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 14 Nov 2023 13:58:39 -0800 Subject: net/mlx5e: Don't modify the peer sent-to-vport rules for IPSec offload As IPSec packet offload in switchdev mode is not supported with LAG, it's unnecessary to modify those sent-to-vport rules to the peer eswitch. Fixes: c6c2bf5db4ea ("net/mlx5e: Support IPsec packet offload for TX in switchdev mode") Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-9-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index b296ac52a439..88236e75fd90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -984,7 +984,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - if (rep->vport == MLX5_VPORT_UPLINK && on_esw->offloads.ft_ipsec_tx_pol) { + if (rep->vport == MLX5_VPORT_UPLINK && + on_esw == from_esw && on_esw->offloads.ft_ipsec_tx_pol) { dest.ft = on_esw->offloads.ft_ipsec_tx_pol; flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; -- cgit From 64f14d16eef1f939000f2617b50c7c996b5117d4 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:40 -0800 Subject: net/mlx5e: Avoid referencing skb after free-ing in drop path of mlx5e_sq_xmit_wqe When SQ is a port timestamping SQ for PTP, do not access tx flags of skb after free-ing the skb. Free the skb only after all references that depend on it have been handled in the dropped WQE path. 
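The ordering rule here is the generic one: finish every access that still dereferences an object before handing it back to the allocator. A small userspace sketch of the corrected ordering, with invented names (drop_pkt() mirrors the shape of the drop path, not the actual mlx5e code):

    #include <stdbool.h>
    #include <stdlib.h>

    struct pkt {
        bool wants_tstamp;
        unsigned int metadata_id;
    };

    static void recycle_metadata(unsigned int id)
    {
        (void)id;   /* the real driver returns the id to a free list */
    }

    static void drop_pkt(struct pkt *p)
    {
        /* handle everything that still reads *p ... */
        if (p->wants_tstamp)
            recycle_metadata(p->metadata_id);

        /* ... and only then release it */
        free(p);
    }

    int main(void)
    {
        struct pkt *p = calloc(1, sizeof(*p));

        if (!p)
            return 1;
        p->wants_tstamp = true;
        p->metadata_id = 7;
        drop_pkt(p);
        return 0;
    }
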
Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-10-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index d41435c22ce5..19f2c25b05a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -494,10 +494,10 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - dev_kfree_skb_any(skb); if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, be32_to_cpu(eseg->flow_table_metadata)); + dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } -- cgit From 7e3f3ba97e6cc6fce5bf62df2ca06c8e59040167 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:41 -0800 Subject: net/mlx5e: Track xmit submission to PTP WQ after populating metadata map Ensure the skb is available in metadata mapping to skbs before tracking the metadata index for detecting undelivered CQEs. If the metadata index is put in the tracking list before putting the skb in the map, the metadata index might be used for detecting undelivered CQEs before the relevant skb is available in the map, which can lead to a null-ptr-deref. Log: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 0 PID: 1243 Comm: kworker/0:2 Not tainted 6.6.0-rc4+ #108 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: events mlx5e_rx_dim_work [mlx5_core] RIP: 0010:mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] Code: 8c 24 38 cc ff ff 4c 8d 3c c1 4c 89 f9 48 c1 e9 03 42 80 3c 31 00 0f 85 97 0f 00 00 4d 8b 3f 49 8d 7f 28 48 89 f9 48 c1 e9 03 <42> 80 3c 31 00 0f 85 8b 0f 00 00 49 8b 47 28 48 85 c0 0f 84 05 07 RSP: 0018:ffff8884d3c09c88 EFLAGS: 00010206 RAX: 0000000000000069 RBX: ffff8881160349d8 RCX: 0000000000000005 RDX: ffffed10218f48cf RSI: 0000000000000004 RDI: 0000000000000028 RBP: ffff888122707700 R08: 0000000000000001 R09: ffffed109a781383 R10: 0000000000000003 R11: 0000000000000003 R12: ffff88810c7a7a40 R13: ffff888122707700 R14: dffffc0000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8884d3c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f878dd6e0 CR3: 000000014d108002 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? die_addr+0x3c/0xa0 ? exc_general_protection+0x144/0x210 ? asm_exc_general_protection+0x22/0x30 ? mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] ? mlx5e_ptp_napi_poll+0x8f6/0x2290 [mlx5_core] __napi_poll.constprop.0+0xa4/0x580 net_rx_action+0x460/0xb80 ? _raw_spin_unlock_irqrestore+0x32/0x60 ? __napi_poll.constprop.0+0x580/0x580 ? 
tasklet_action_common.isra.0+0x2ef/0x760 __do_softirq+0x26c/0x827 irq_exit_rcu+0xc2/0x100 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:__kmem_cache_alloc_node+0xb/0x330 Code: 41 5d 41 5e 41 5f c3 8b 44 24 14 8b 4c 24 10 09 c8 eb d5 e8 b7 43 ca 01 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 57 <41> 56 41 89 d6 41 55 41 89 f5 41 54 49 89 fc 53 48 83 e4 f0 48 83 RSP: 0018:ffff88812c4079c0 EFLAGS: 00000246 RAX: 1ffffffff083c7fe RBX: ffff888100042dc0 RCX: 0000000000000218 RDX: 00000000ffffffff RSI: 0000000000000dc0 RDI: ffff888100042dc0 RBP: ffff88812c4079c8 R08: ffffffffa0289f96 R09: ffffed1025880ea9 R10: ffff888138839f80 R11: 0000000000000002 R12: 0000000000000dc0 R13: 0000000000000100 R14: 000000000000008c R15: ffff8881271fc450 ? cmd_exec+0x796/0x2200 [mlx5_core] kmalloc_trace+0x26/0xc0 cmd_exec+0x796/0x2200 [mlx5_core] mlx5_cmd_do+0x22/0xc0 [mlx5_core] mlx5_cmd_exec+0x17/0x30 [mlx5_core] mlx5_core_modify_cq_moderation+0x139/0x1b0 [mlx5_core] ? mlx5_add_cq_to_tasklet+0x280/0x280 [mlx5_core] ? lockdep_set_lock_cmp_fn+0x190/0x190 ? process_one_work+0x659/0x1220 mlx5e_rx_dim_work+0x9d/0x100 [mlx5_core] process_one_work+0x730/0x1220 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? max_active_store+0xf0/0xf0 ? assign_work+0x168/0x240 worker_thread+0x70f/0x12d0 ? __kthread_parkme+0xd1/0x1d0 ? process_one_work+0x1220/0x1220 kthread+0x2d9/0x3b0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x2d/0x70 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_ib ib_uverbs ib_core zram zsmalloc mlx5_core fuse ---[ end trace 0000000000000000 ]--- Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-11-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 19f2c25b05a0..f0b506e562df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -399,9 +399,9 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); mlx5e_skb_cb_hwtstamp_init(skb); - mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); + mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); if (!netif_tx_queue_stopped(sq->txq) && mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) { netif_tx_stop_queue(sq->txq); -- cgit From 92214be5979c0961a471b7eaaaeacab41bdf456c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:42 -0800 Subject: net/mlx5e: Update doorbell for port timestamping CQ before the software counter Previously, mlx5e_ptp_poll_ts_cq would update the device doorbell with the incremented consumer index after the relevant software counters in the kernel were updated. In the mlx5e_sq_xmit_wqe context, this would lead to either overrunning the device CQ or exceeding the expected software buffer size in the device CQ if the device CQ size was greater than the software buffer size. 
Update the relevant software counter only after updating the device CQ consumer index in the port timestamping napi_poll context. Log: mlx5_core 0000:08:00.0: cq_err_event_notifier:517:(pid 0): CQ error on CQN 0x487, syndrome 0x1 mlx5_core 0000:08:00.0 eth2: mlx5e_cq_error_event: cqn=0x000487 event=0x04 Fixes: 1880bc4e4a96 ("net/mlx5e: Add TX port timestamp support") Signed-off-by: Rahul Rameshbabu Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-12-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index bb11e644d24f..af3928eddafd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -177,6 +177,8 @@ static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq, static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, struct mlx5_cqe64 *cqe, + u8 *md_buff, + u8 *md_buff_sz, int budget) { struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list; @@ -211,19 +213,24 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); out: napi_consume_skb(skb, budget); - mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id); + md_buff[*md_buff_sz++] = metadata_id; if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); } -static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) +static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_ptpsq *ptpsq = container_of(cq, struct mlx5e_ptpsq, ts_cq); - struct mlx5_cqwq *cqwq = &cq->wq; + int budget = min(napi_budget, MLX5E_TX_CQ_POLL_BUDGET); + u8 metadata_buff[MLX5E_TX_CQ_POLL_BUDGET]; + u8 metadata_buff_sz = 0; + struct mlx5_cqwq *cqwq; struct mlx5_cqe64 *cqe; int work_done = 0; + cqwq = &cq->wq; + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))) return false; @@ -234,7 +241,8 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) do { mlx5_cqwq_pop(cqwq); - mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, budget); + mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, + metadata_buff, &metadata_buff_sz, napi_budget); } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); mlx5_cqwq_update_db_record(cqwq); @@ -242,6 +250,10 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) /* ensure cq space is freed before enabling more cqes */ wmb(); + while (metadata_buff_sz > 0) + mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, + metadata_buff[--metadata_buff_sz]); + mlx5e_txqsq_wake(&ptpsq->txqsq); return work_done == budget; -- cgit From 3338bebfc26a1e2cebbba82a1cf12c0159608e73 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:43 -0800 Subject: net/mlx5: Increase size of irq name buffer Without increased buffer size, will trigger -Wformat-truncation with W=1 for the snprintf operation writing to the buffer. 
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc': drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=] 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 653648216730..4dcf995cb1a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -28,7 +28,7 @@ struct mlx5_irq { struct atomic_notifier_head nh; cpumask_var_t mask; - char name[MLX5_MAX_IRQ_NAME]; + char name[MLX5_MAX_IRQ_FORMATTED_NAME]; struct mlx5_irq_pool *pool; int refcount; struct msi_map map; @@ -292,8 +292,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, else irq_sf_set_name(pool, name, i); ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); - snprintf(irq->name, MLX5_MAX_IRQ_NAME, - "%s@pci:%s", name, pci_name(dev->pdev)); + snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME, + MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev)); err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name, &irq->nh); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h index d3a77a0ab848..c4d377f8df30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h @@ -7,6 +7,9 @@ #include #define MLX5_MAX_IRQ_NAME (32) +#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s") +#define MLX5_MAX_IRQ_FORMATTED_NAME \ + (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR)) /* max irq_index is 2047, so four chars */ #define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_EQ_REFS_PER_IRQ (2) -- cgit From dce94142842e119b982c27c1b62bd20890c7fd21 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 14 Nov 2023 13:58:44 -0800 Subject: net/mlx5e: Reduce the size of icosq_str icosq_str size is unnecessarily too long, and it causes a build warning -Wformat-truncation with W=1. Looking closely, It doesn't need to be 255B, hence this patch reduces the size to 32B which should be more than enough to host the string: "ICOSQ: 0x%x, ". While here, add a missing space in the formatted string. 
This fixes the following build warning: $ KCFLAGS='-Wall -Werror' $ make O=/tmp/kbuild/linux W=1 -s -j12 drivers/net/ethernet/mellanox/mlx5/core/ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c: In function 'mlx5e_reporter_rx_timeout': drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:718:56: error: ', CQ: 0x' directive output may be truncated writing 8 bytes into a region of size between 0 and 255 [-Werror=format-truncation=] 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ^~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:717:9: note: 'snprintf' output between 43 and 322 bytes into a destination of size 288 717 | snprintf(err_str, sizeof(err_str), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 719 | rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 521f31af004a ("net/mlx5e: Allow RQ outside of channel context") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-14-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index fea8c0a5fe89..4358798d6ce1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -492,11 +492,11 @@ static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) { - char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; struct mlx5e_icosq *icosq = rq->icosq; struct mlx5e_priv *priv = rq->priv; struct mlx5e_err_ctx err_ctx = {}; + char icosq_str[32] = {}; err_ctx.ctx = rq; err_ctx.recover = mlx5e_rx_reporter_timeout_recover; @@ -505,7 +505,7 @@ void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) if (icosq) snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); snprintf(err_str, sizeof(err_str), - "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", + "RX timeout on channel: %d, %s RQ: 0x%x, CQ: 0x%x", rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); -- cgit From 41e63c2baa11dc2aa71df5dd27a5bd87d11b6bbb Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:45 -0800 Subject: net/mlx5e: Check return value of snprintf writing to fw_version buffer Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. Without check, will trigger -Wformat-truncation with W=1. 
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c: In function 'mlx5e_ethtool_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:49:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 49 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:48:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 48 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 50 | fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | mdev->board_id); | ~~~~~~~~~~~~~~~ Fixes: 84e11edb71de ("net/mlx5e: Show board id in ethtool driver information") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 215261a69255..792a0ea544cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -43,12 +43,17 @@ void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, struct ethtool_drvinfo *drvinfo) { struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), - mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); + strscpy(drvinfo->bus_info, dev_name(mdev->device), sizeof(drvinfo->bus_info)); } -- cgit From 1b2bd0c0264febcd8d47209079a6671c38e6558b Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:46 -0800 Subject: net/mlx5e: Check return value of snprintf writing to fw_version buffer for representors Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. This logic for representors is identical to normal flow with ethtool. Without check, will trigger -Wformat-truncation with W=1. 
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c: In function 'mlx5e_rep_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:78:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 78 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:77:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 77 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 79 | fw_rev_maj(mdev), fw_rev_min(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 80 | fw_rev_sub(mdev), mdev->board_id); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: cf83c8fdcd47 ("net/mlx5e: Add missing ethtool driver info for representors") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-16-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 693e55b010d9..3ab682bbcf86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -71,13 +71,17 @@ static void mlx5e_rep_get_drvinfo(struct net_device *dev, { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, mlx5e_rep_driver_name, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), - fw_rev_sub(mdev), mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); } static const struct counter_desc sw_rep_stats_desc[] = { -- cgit From 7cd5af0e937a197295f3aa3721031f0fbae49cff Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 13 Nov 2023 12:53:28 -0500 Subject: net: sched: do not offload flows with a helper in act_ct There is no hardware supporting ct helper offload. However, prior to this patch, a flower filter with a helper in the ct action can be successfully set into the HW, for example (eth1 is a bnxt NIC): # tc qdisc add dev eth1 ingress_block 22 ingress # tc filter add block 22 proto ip flower skip_sw ip_proto tcp \ dst_port 21 ct_state -trk action ct helper ipv4-tcp-ftp # tc filter show dev eth1 ingress filter block 22 protocol ip pref 49152 flower chain 0 handle 0x1 eth_type ipv4 ip_proto tcp dst_port 21 ct_state -trk skip_sw in_hw in_hw_count 1 <---- action order 1: ct zone 0 helper ipv4-tcp-ftp pipe index 2 ref 1 bind 1 used_hw_stats delayed This might cause the flower filter not to work as expected in the HW. This patch avoids this problem by simply returning -EOPNOTSUPP in tcf_ct_offload_act_setup() to not allow to offload flows with a helper in act_ct. 
Fixes: a21b06e73191 ("net: sched: add helper support in act_ct") Signed-off-by: Xin Long Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/f8685ec7702c4a448a1371a8b34b43217b583b9d.1699898008.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 9 +++++++++ net/sched/act_ct.c | 3 +++ 2 files changed, 12 insertions(+) diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8a6dbfb23336..77f87c622a2e 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -58,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -65,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0db0ecf1d110..b3f4a503ee2b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1549,6 +1549,9 @@ static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data, if (bind) { struct flow_action_entry *entry = entry_data; + if (tcf_ct_helper(act)) + return -EOPNOTSUPP; + entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); -- cgit From 7e1caeace0418381f36b3aa8403dfd82fc57fc53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 18:59:15 +0100 Subject: macvlan: Don't propagate promisc change to lower dev in passthru Macvlan device in passthru mode sets its lower device promiscuous mode according to its MACVLAN_FLAG_NOPROMISC flag instead of synchronizing it to its own promiscuity setting. However, macvlan_change_rx_flags() function doesn't check the mode before propagating such changes to the lower device which can cause net_device->promiscuity counter overflow as illustrated by reproduction example [0] and resulting dmesg log [1]. Fix the issue by first verifying the mode in macvlan_change_rx_flags() function before propagating promiscuous mode change to the lower device. [0]: ip link add macvlan1 link enp8s0f0 type macvlan mode passthru ip link set macvlan1 promisc on ip l set dev macvlan1 up ip link set macvlan1 promisc off ip l set dev macvlan1 down ip l set dev macvlan1 up [1]: [ 5156.281724] macvlan1: entered promiscuous mode [ 5156.285467] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.287639] macvlan1: left promiscuous mode [ 5156.288339] mlx5_core 0000:08:00.0 enp8s0f0: left promiscuous mode [ 5156.290907] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.317197] mlx5_core 0000:08:00.0 enp8s0f0: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken. 
Fixes: efdbd2b30caa ("macvlan: Propagate promiscuity setting to lower devices.") Reviewed-by: Gal Pressman Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231114175915.1649154-1-vladbu@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 02bd201bc7e5..c8da94af4161 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -780,7 +780,7 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change) if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) + if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); -- cgit From 1c4a7587d1bbee0fd53b63af60e4244a62775f57 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 1 Nov 2023 02:46:27 +0900 Subject: modpost: fix section mismatch message for RELA The section mismatch check prints a bogus symbol name on some architectures. [test code] #include int __initdata foo; int get_foo(void) { return foo; } If you compile it with GCC for riscv or loongarch, modpost will show an incorrect symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> done (section: .init.data) To get the correct symbol address, the st_value must be added. This issue has never been noticed since commit 93684d3b8062 ("kbuild: include symbol names in section mismatch warnings") presumably because st_value becomes zero on most architectures when the referenced symbol is looked up. It is not true for riscv or loongarch, at least. With this fix, modpost will show the correct symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> foo (section: .init.data) Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/mod/modpost.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 973b5e5ae2dd..cb6406f485a9 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1383,13 +1383,15 @@ static void section_rela(struct module *mod, struct elf_info *elf, const Elf_Rela *rela; for (rela = start; rela < stop; rela++) { + Elf_Sym *tsym; Elf_Addr taddr, r_offset; unsigned int r_type, r_sym; r_offset = TO_NATIVE(rela->r_offset); get_rel_type_and_sym(elf, rela->r_info, &r_type, &r_sym); - taddr = TO_NATIVE(rela->r_addend); + tsym = elf->symtab_start + r_sym; + taddr = tsym->st_value + TO_NATIVE(rela->r_addend); switch (elf->hdr->e_machine) { case EM_RISCV: @@ -1404,7 +1406,7 @@ static void section_rela(struct module *mod, struct elf_info *elf, break; } - check_section_mismatch(mod, elf, elf->symtab_start + r_sym, + check_section_mismatch(mod, elf, tsym, fsecndx, fromsec, r_offset, taddr); } } -- cgit From ba276ce5865b5a22ee96c4c5664bfefd9c1bb593 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 19:11:04 -0500 Subject: bcachefs: Fix missing locking for dentry->d_parent access Reported-by: Al Viro Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a39ff0c296ec..79d982674c18 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; 
s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switched + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; } -- cgit From 9e0be3f50c0e8517d0238b62409c20bcb8cd8785 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 10 Nov 2023 13:07:22 +0100 Subject: linux/export: clean up the IA-64 KSYM_FUNC macro With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), there is no need to keep the IA-64 definition of the KSYM_FUNC macro. Clean up the IA-64 definition of the KSYM_FUNC macro. Signed-off-by: Lukas Bulwahn Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- include/linux/export-internal.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 45fca09b2319..69501e0ec239 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -50,9 +50,7 @@ " .previous" "\n" \ ) -#ifdef CONFIG_IA64 -#define KSYM_FUNC(name) @fptr(name) -#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT) +#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT) #define KSYM_FUNC(name) P%name #else #define KSYM_FUNC(name) name -- cgit From 76020731d4ee897411ce4a73916ed805ea15d946 Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Fri, 10 Nov 2023 17:28:01 -0700 Subject: kbuild: Move the single quotes for image name Add quotes where UIMAGE_NAME is used, rather than where it is defined. This allows the UIMAGE_NAME variable to be set by the user. Signed-off-by: Simon Glass Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 68d0134bdbf9..1a965fe68e01 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -487,14 +487,14 @@ UIMAGE_OPTS-y ?= UIMAGE_TYPE ?= kernel UIMAGE_LOADADDR ?= arch_must_set_this UIMAGE_ENTRYADDR ?= $(UIMAGE_LOADADDR) -UIMAGE_NAME ?= 'Linux-$(KERNELRELEASE)' +UIMAGE_NAME ?= Linux-$(KERNELRELEASE) quiet_cmd_uimage = UIMAGE $@ cmd_uimage = $(BASH) $(MKIMAGE) -A $(UIMAGE_ARCH) -O linux \ -C $(UIMAGE_COMPRESSION) $(UIMAGE_OPTS-y) \ -T $(UIMAGE_TYPE) \ -a $(UIMAGE_LOADADDR) -e $(UIMAGE_ENTRYADDR) \ - -n $(UIMAGE_NAME) -d $< $@ + -n '$(UIMAGE_NAME)' -d $< $@ # XZ # --------------------------------------------------------------------------- -- cgit From ae1eff0349f2e908fc083630e8441ea6dc434dc0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 15 Nov 2023 13:16:53 +0900 Subject: kconfig: fix memory leak from range properties Currently, sym_validate_range() duplicates the range string using xstrdup(), which is overwritten by a subsequent sym_calc_value() call. It results in a memory leak. Instead, only the pointer should be copied. Below is a test case, with a summary from Valgrind. 
[Test Kconfig] config FOO int "foo" range 10 20 [Test .config] CONFIG_FOO=0 [Before] LEAK SUMMARY: definitely lost: 3 bytes in 1 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,465 bytes in 21 blocks suppressed: 0 bytes in 0 blocks [After] LEAK SUMMARY: definitely lost: 0 bytes in 0 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,462 bytes in 20 blocks suppressed: 0 bytes in 0 blocks Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 0572330bf8a7..a76925b46ce6 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -122,9 +122,9 @@ static long long sym_get_range_val(struct symbol *sym, int base) static void sym_validate_range(struct symbol *sym) { struct property *prop; + struct symbol *range_sym; int base; long long val, val2; - char str[64]; switch (sym->type) { case S_INT: @@ -140,17 +140,15 @@ static void sym_validate_range(struct symbol *sym) if (!prop) return; val = strtoll(sym->curr.val, NULL, base); - val2 = sym_get_range_val(prop->expr->left.sym, base); + range_sym = prop->expr->left.sym; + val2 = sym_get_range_val(range_sym, base); if (val >= val2) { - val2 = sym_get_range_val(prop->expr->right.sym, base); + range_sym = prop->expr->right.sym; + val2 = sym_get_range_val(range_sym, base); if (val <= val2) return; } - if (sym->type == S_INT) - sprintf(str, "%lld", val2); - else - sprintf(str, "0x%llx", val2); - sym->curr.val = xstrdup(str); + sym->curr.val = range_sym->curr.val; } static void sym_set_changed(struct symbol *sym) -- cgit From 1ffa8602e39b89469dc703ebab7a7e44c33da0f7 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 13 Sep 2023 16:18:44 -0400 Subject: drm/amd/display: Guard against invalid RPTR/WPTR being set [WHY] HW can return invalid values on register read, guard against these being set and causing us to access memory out of range and page fault. [HOW] Guard at sync_inbox1 and guard at pushing commands. 
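As a rough illustration of the guard being added (a simplified sketch, not the DMUB code; the struct and field names are stand-ins): hardware-returned ring pointers are range-checked against the ring capacity before they are cached or used.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ring {
        uint32_t capacity;   /* ring size in bytes */
        uint32_t rptr;
        uint32_t wrpt;
};

/* Reject out-of-range pointers instead of trusting the register read;
 * a hung device can return garbage such as 0xFFFFFFFF.
 */
static bool ring_sync(struct ring *rb, uint32_t hw_rptr, uint32_t hw_wptr)
{
        if (hw_rptr > rb->capacity || hw_wptr > rb->capacity)
                return false;            /* report a HW failure to the caller */

        rb->rptr = hw_rptr;
        rb->wrpt = hw_wptr;
        return true;
}

int main(void)
{
        struct ring rb = { .capacity = 4096 };

        printf("%d\n", ring_sync(&rb, 0x100, 0x200));      /* 1: accepted */
        printf("%d\n", ring_sync(&rb, 0xFFFFFFFF, 0x200)); /* 0: rejected */
        return 0;
}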
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Hansen Dsouza Acked-by: Alex Hung Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index e43e8d4bfe37..5d36f3e5dc2b 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -707,9 +707,16 @@ enum dmub_status dmub_srv_sync_inbox1(struct dmub_srv *dmub) return DMUB_STATUS_INVALID; if (dmub->hw_funcs.get_inbox1_rptr && dmub->hw_funcs.get_inbox1_wptr) { - dmub->inbox1_rb.rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); - dmub->inbox1_rb.wrpt = dmub->hw_funcs.get_inbox1_wptr(dmub); - dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + uint32_t rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); + uint32_t wptr = dmub->hw_funcs.get_inbox1_wptr(dmub); + + if (rptr > dmub->inbox1_rb.capacity || wptr > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } else { + dmub->inbox1_rb.rptr = rptr; + dmub->inbox1_rb.wrpt = wptr; + dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + } } return DMUB_STATUS_OK; @@ -743,6 +750,11 @@ enum dmub_status dmub_srv_cmd_queue(struct dmub_srv *dmub, if (!dmub->hw_init) return DMUB_STATUS_INVALID; + if (dmub->inbox1_rb.rptr > dmub->inbox1_rb.capacity || + dmub->inbox1_rb.wrpt > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } + if (dmub_rb_push_front(&dmub->inbox1_rb, cmd)) return DMUB_STATUS_OK; -- cgit From 0288603040c38ccfeb5342f34a52673366d90038 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Wed, 4 Oct 2023 14:24:15 -0400 Subject: drm/amdgpu: Do not program VF copy regs in mmhub v1.8 under SRIOV (v2) MC_VM_AGP_* registers should not be programmed by guest driver. v2: move early return outside of loop Signed-off-by: Victor Lu Reviewed-by: Samir Dhume Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index ea142611be1c..9b0146732e13 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -130,6 +130,9 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) uint64_t value; int i; + if (amdgpu_sriov_vf(adev)) + return; + inst_mask = adev->aid_mask; for_each_inst(i, inst_mask) { /* Program the AGP BAR */ @@ -139,9 +142,6 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) WREG32_SOC15(MMHUB, i, regMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); - if (amdgpu_sriov_vf(adev)) - return; - /* Program the system aperture low logical page number. */ WREG32_SOC15(MMHUB, i, regMC_VM_SYSTEM_APERTURE_LOW_ADDR, min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18); -- cgit From bdb72185d310fc8049c7ea95221d640e9e7165e5 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Mon, 13 Nov 2023 18:05:34 +0800 Subject: drm/amdgpu: finalizing mem_partitions at the end of GMC v9 sw_fini The valid num_mem_partitions is required during ttm pool fini, thus move the cleanup at the end of the function. 
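A simplified sketch of the ordering rule at play (illustrative only; the struct and helper names are stand-ins, not amdgpu code): state that a later teardown step still reads must not be freed until after that step has run.

#include <stdio.h>
#include <stdlib.h>

struct dev {
        int num_mem_partitions;
        int *mem_partitions;
};

/* stand-in for the TTM pool teardown that still consults the table */
static void pool_fini(struct dev *d)
{
        printf("tearing down %d partitions\n", d->num_mem_partitions);
}

static void sw_fini(struct dev *d)
{
        pool_fini(d);                    /* last user of mem_partitions */

        /* only now is it safe to drop the partition table */
        d->num_mem_partitions = 0;
        free(d->mem_partitions);
        d->mem_partitions = NULL;
}

int main(void)
{
        struct dev d = { .num_mem_partitions = 2 };

        d.mem_partitions = calloc(2, sizeof(*d.mem_partitions));
        sw_fini(&d);
        return 0;
}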
Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bde25eb4ed8e..c1f2f166f064 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -2170,8 +2170,6 @@ static int gmc_v9_0_sw_fini(void *handle) if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) amdgpu_gmc_sysfs_fini(adev); - adev->gmc.num_mem_partitions = 0; - kfree(adev->gmc.mem_partitions); amdgpu_gmc_ras_fini(adev); amdgpu_gem_force_release(adev); @@ -2185,6 +2183,9 @@ static int gmc_v9_0_sw_fini(void *handle) amdgpu_bo_free_kernel(&adev->gmc.pdb0_bo, NULL, &adev->gmc.ptr_pdb0); amdgpu_bo_fini(adev); + adev->gmc.num_mem_partitions = 0; + kfree(adev->gmc.mem_partitions); + return 0; } -- cgit From 8a0173cd90984835645022bf1997abd1bcd81aae Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sun, 12 Nov 2023 09:51:19 +0530 Subject: drm/amdgpu: Address member 'ring' not described in 'amdgpu_ vce, uvd_entity_init()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following: drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c:237: warning: Function parameter or member 'ring' not described in 'amdgpu_vce_entity_init' drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c:405: warning: Function parameter or member 'ring' not described in 'amdgpu_uvd_entity_init' Cc: Christian König Cc: Alex Deucher Cc: "Pan, Xinhui" Signed-off-by: Srinivasan Shanmugam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 65949cc7abb9..07d930339b07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -398,6 +398,7 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) * amdgpu_uvd_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 0954447f689d..59acf424a078 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -230,6 +230,7 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) * amdgpu_vce_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ -- cgit From a58555359a9f870543aaddef277c3396159895ce Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 23 Oct 2023 13:57:32 -0400 Subject: drm/amd/display: Fix DSC not Enabled on Direct MST Sink [WHY & HOW] For the scenario when a dsc capable MST sink device is directly connected, it needs to use max dsc compression as the link bw constraint. 
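A rough sketch of the bandwidth check this implements (illustrative only, with made-up helper names; not the DM code): the end-to-end budget is the bottleneck of the upstream and downstream links, and the requirement compared against it is either the maximum-DSC-compressed rate or the native rate.

#include <stdbool.h>
#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/* All rates in kbps. dsc_possible means common DSC parameters exist and
 * either the branch can pass DSC through or the sink itself decodes DSC.
 */
static bool mode_fits(unsigned int upstream_kbps, unsigned int downstream_kbps,
                      unsigned int native_kbps, unsigned int max_compressed_kbps,
                      bool dsc_possible)
{
        unsigned int end_to_end = min_u(upstream_kbps, downstream_kbps);
        unsigned int required = dsc_possible ? max_compressed_kbps : native_kbps;

        return required <= end_to_end;
}

int main(void)
{
        /* fast upstream link vs. a slower downstream branch link (example numbers) */
        printf("%d\n", mode_fits(25920000, 8100000, 12000000, 4000000, true));  /* 1 */
        printf("%d\n", mode_fits(25920000, 8100000, 12000000, 4000000, false)); /* 0 */
        return 0;
}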
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Roman Li Acked-by: Alex Hung Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 29 +++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index d3b13d362eda..11da0eebee6c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1604,31 +1604,31 @@ enum dc_status dm_dp_mst_is_port_support_mode( unsigned int upper_link_bw_in_kbps = 0, down_link_bw_in_kbps = 0; unsigned int max_compressed_bw_in_kbps = 0; struct dc_dsc_bw_range bw_range = {0}; - struct drm_dp_mst_topology_mgr *mst_mgr; + uint16_t full_pbn = aconnector->mst_output_port->full_pbn; /* - * check if the mode could be supported if DSC pass-through is supported - * AND check if there enough bandwidth available to support the mode - * with DSC enabled. + * Consider the case with the depth of the mst topology tree is equal or less than 2 + * A. When dsc bitstream can be transmitted along the entire path + * 1. dsc is possible between source and branch/leaf device (common dsc params is possible), AND + * 2. dsc passthrough supported at MST branch, or + * 3. dsc decoding supported at leaf MST device + * Use maximum dsc compression as bw constraint + * B. When dsc bitstream cannot be transmitted along the entire path + * Use native bw as bw constraint */ if (is_dsc_common_config_possible(stream, &bw_range) && - aconnector->mst_output_port->passthrough_aux) { - mst_mgr = aconnector->mst_output_port->mgr; - mutex_lock(&mst_mgr->lock); - + (aconnector->mst_output_port->passthrough_aux || + aconnector->dsc_aux == &aconnector->mst_output_port->aux)) { cur_link_settings = stream->link->verified_link_cap; upper_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, - &cur_link_settings - ); - down_link_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn); + &cur_link_settings); + down_link_bw_in_kbps = kbps_from_pbn(full_pbn); /* pick the bottleneck */ end_to_end_bw_in_kbps = min(upper_link_bw_in_kbps, down_link_bw_in_kbps); - mutex_unlock(&mst_mgr->lock); - /* * use the maximum dsc compression bandwidth as the required * bandwidth for the mode @@ -1643,8 +1643,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( /* check if mode could be supported within full_pbn */ bpp = convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3; pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, bpp, false); - - if (pbn > aconnector->mst_output_port->full_pbn) + if (pbn > full_pbn) return DC_FAIL_BANDWIDTH_VALIDATE; } -- cgit From 50d51374b498457c4dea26779d32ccfed12ddaff Mon Sep 17 00:00:00 2001 From: YuanShang Date: Tue, 31 Oct 2023 10:32:37 +0800 Subject: drm/amdgpu: correct chunk_ptr to a pointer to chunk. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable "chunk_ptr" should be a pointer pointing to a struct drm_amdgpu_cs_chunk instead of to a pointer of that. 
Signed-off-by: YuanShang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index df3ecfa9e13f..e50be6500030 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -207,7 +207,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } for (i = 0; i < p->nchunks; i++) { - struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL; + struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL; struct drm_amdgpu_cs_chunk user_chunk; uint32_t __user *cdata; -- cgit From 786c355797b3942725829d02ce9e2e6a9eba11fe Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 31 Oct 2023 03:14:02 +0800 Subject: drm/amd/pm: Update metric table for smu v13_0_6 Update pmfw metric table to include pcie instantaneous bandwidth & pcie error counters Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h index dab35d878a90..fef2d290f3f2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h @@ -123,7 +123,7 @@ typedef enum { VOLTAGE_GUARDBAND_COUNT } GFX_GUARDBAND_e; -#define SMU_METRICS_TABLE_VERSION 0x8 +#define SMU_METRICS_TABLE_VERSION 0x9 typedef struct __attribute__((packed, aligned(4))) { uint32_t AccumulationCounter; @@ -211,6 +211,14 @@ typedef struct __attribute__((packed, aligned(4))) { //XGMI Data tranfser size uint64_t XgmiReadDataSizeAcc[8];//in KByte uint64_t XgmiWriteDataSizeAcc[8];//in KByte + + //PCIE BW Data and error count + uint32_t PcieBandwidth[4]; + uint32_t PCIeL0ToRecoveryCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayAAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayARolloverCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKSentCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKReceivedCountAcc; // The Pcie counter itself is accumulated } MetricsTable_t; #define SMU_VF_METRICS_TABLE_VERSION 0x3 -- cgit From e4d0be18243ca006258b5c7c148796c0b43505c4 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 14 Nov 2023 16:17:17 +0800 Subject: drm/amd/pm: Fill pcie error counters for gpu v1_4 Fill PCIE error counters & instantaneous bandwidth in gpu metrics v1_4 for smu v_13_0_6 Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 891605d4975f..84fac499cf0c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2095,6 +2095,14 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table smu_v13_0_6_get_current_pcie_link_speed(smu); gpu_metrics->pcie_bandwidth_acc = SMUQ10_ROUND(metrics->PcieBandwidthAcc[0]); + gpu_metrics->pcie_bandwidth_inst = + SMUQ10_ROUND(metrics->PcieBandwidth[0]); + gpu_metrics->pcie_l0_to_recov_count_acc = + 
metrics->PCIeL0ToRecoveryCountAcc; + gpu_metrics->pcie_replay_count_acc = + metrics->PCIenReplayAAcc; + gpu_metrics->pcie_replay_rover_count_acc = + metrics->PCIenReplayARolloverCountAcc; } gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); -- cgit From 9725a4f9eb495bfa6c7f5ccdb49440ff06dba0a1 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Date: Tue, 31 Oct 2023 16:03:21 -0400 Subject: drm/amd/display: Add null checks for 8K60 lightup [WHY & HOW] Add some null checks to fix an issue where 8k60 tiled display fails to light up. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Muhammad Ahmed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 7b9bf5cb4529..d8f434738212 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3178,7 +3178,7 @@ static bool update_planes_and_stream_state(struct dc *dc, struct pipe_ctx *otg_master = resource_get_otg_master_for_stream(&context->res_ctx, context->streams[i]); - if (otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) + if (otg_master && otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) resource_build_test_pattern_params(&context->res_ctx, otg_master); } } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 1d48278cba96..a1f1d1003992 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -5190,6 +5190,9 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm_legacy( sec_next = sec_pipe->next_odm_pipe; sec_prev = sec_pipe->prev_odm_pipe; + if (pri_pipe == NULL) + return false; + *sec_pipe = *pri_pipe; sec_pipe->top_pipe = sec_top; -- cgit From b71f4ade1b8900d30c661d6c27f87c35214c398c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 8 Nov 2023 13:31:57 -0600 Subject: drm/amd/display: fix a NULL pointer dereference in amdgpu_dm_i2c_xfer() When ddc_service_construct() is called, it explicitly checks both the link type and whether there is something on the link which will dictate whether the pin is marked as hw_supported. If the pin isn't set or the link is not set (such as from unloading/reloading amdgpu in an IGT test) then fail the amdgpu_dm_i2c_xfer() call. 
Cc: stable@vger.kernel.org Fixes: 22676bc500c2 ("drm/amd/display: Fix dmub soft hang for PSR 1") Link: https://github.com/fwupd/fwupd/issues/6327 Signed-off-by: Mario Limonciello Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6f99f6754c11..5ec7acf65ee1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7481,6 +7481,9 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap, int i; int result = -EIO; + if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported) + return result; + cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL); if (!cmd.payloads) -- cgit From 270b301beca58e427a0fda7523a71a9562e644bb Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Tue, 14 Nov 2023 17:27:51 +0200 Subject: drm/amd/display: fix NULL dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will fix a minor issue where a debug message is referencing an struct that has just being checked whether is null or not. This has been noticed by using coccinelle, in the following output: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c:540:25-29: ERROR: aconnector is NULL but dereferenced. Fixes: 5d72e247e58c ("drm/amd/display: switch DC over to the new DRM logging macros") Signed-off-by: José Pekkarinen Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index ed784cf27d39..c7a29bb737e2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -536,11 +536,8 @@ bool dm_helpers_dp_read_dpcd( struct amdgpu_dm_connector *aconnector = link->priv; - if (!aconnector) { - drm_dbg_dp(aconnector->base.dev, - "Failed to find connector for link!\n"); + if (!aconnector) return false; - } return drm_dp_dpcd_read(&aconnector->dm_dp_aux.aux, address, data, size) == size; -- cgit From 435f5b369657cffee4b04db1f5805b48599f4dbe Mon Sep 17 00:00:00 2001 From: Tianci Yin Date: Wed, 1 Nov 2023 09:47:13 +0800 Subject: drm/amd/display: Enable fast plane updates on DCN3.2 and above [WHY] When cursor moves across screen boarder, lag cursor observed, since subvp settings need to sync up with vblank that causes cursor updates being delayed. [HOW] Enable fast plane updates on DCN3.2 to fix it. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Aurabindo Pillai Acked-by: Alex Hung Signed-off-by: Tianci Yin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5ec7acf65ee1..9581510d3740 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9606,14 +9606,14 @@ static bool should_reset_plane(struct drm_atomic_state *state, struct drm_plane *other; struct drm_plane_state *old_other_state, *new_other_state; struct drm_crtc_state *new_crtc_state; + struct amdgpu_device *adev = drm_to_adev(plane->dev); int i; /* - * TODO: Remove this hack once the checks below are sufficient - * enough to determine when we need to reset all the planes on - * the stream. + * TODO: Remove this hack for all asics once it proves that the + * fast updates works fine on DCN3.2+. */ - if (state->allow_modeset) + if (adev->ip_versions[DCE_HWIP][0] < IP_VERSION(3, 2, 0) && state->allow_modeset) return true; /* Exit early if we know that we're adding or removing the plane. */ -- cgit From 923bbfe6c888812db1088d684bd30c24036226d2 Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Wed, 25 Oct 2023 10:53:35 +0800 Subject: drm/amd/display: Clear dpcd_sink_ext_caps if not set [WHY] Some eDP panels' ext caps don't set initial values and the value of dpcd_addr (0x317) is random. It means that sometimes the eDP can be OLED, miniLED and etc, and cause incorrect backlight control interface. [HOW] Add remove_sink_ext_caps to remove sink ext caps (HDR, OLED and etc) Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Anthony Koo Acked-by: Alex Hung Signed-off-by: Paul Hsieh Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_types.h | 1 + drivers/gpu/drm/amd/display/dc/link/link_detection.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index cea666ea66c6..fcb825e4f1bb 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -177,6 +177,7 @@ struct dc_panel_patch { unsigned int disable_fams; unsigned int skip_avmute; unsigned int mst_start_top_delay; + unsigned int remove_sink_ext_caps; }; struct dc_edid_caps { diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index d6f0f857c05a..f2fe523f914f 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -1088,6 +1088,9 @@ static bool detect_link_and_local_sink(struct dc_link *link, if (sink->edid_caps.panel_patch.skip_scdc_overwrite) link->ctx->dc->debug.hdmi20_disable = true; + if (sink->edid_caps.panel_patch.remove_sink_ext_caps) + link->dpcd_sink_ext_caps.raw = 0; + if (dc_is_hdmi_signal(link->connector_signal)) read_scdc_caps(link->ddc, link->local_sink); -- cgit From 07ee43faeb7eb088e49a7549fcabcae94c443d3b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 13 Nov 2023 14:34:55 +0800 Subject: drm/amdgpu: fix ras err_data null pointer issue in amdgpu_ras.c fix ras err_data null pointer issue in amdgpu_ras.c Fixes: 8cc0f5669eb6 ("drm/amdgpu: Support multiple error query modes") Signed-off-by: Yang Wang 
Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 84e5987b14e0..a3dc68e98910 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1188,7 +1188,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } if (block_obj->hw_ops->query_ras_error_count) - block_obj->hw_ops->query_ras_error_count(adev, &err_data); + block_obj->hw_ops->query_ras_error_count(adev, err_data); if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || (info->head.block == AMDGPU_RAS_BLOCK__GFX) || -- cgit From 0f216364625cb453b4f933deacfa92df7f2a2fc9 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 10 Nov 2023 13:15:39 +0530 Subject: drm/amd/pm: Don't send unload message for reset No need to notify about unload during reset. Also remove the FW version check. Signed-off-by: Lijo Lazar Reviewed-by: Yang Wang Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 84fac499cf0c..0e5a77c3c2e2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1454,7 +1454,7 @@ static int smu_v13_0_6_register_irq_handler(struct smu_context *smu) static int smu_v13_0_6_notify_unload(struct smu_context *smu) { - if (smu->smc_fw_version <= 0x553500) + if (amdgpu_in_reset(smu->adev)) return 0; dev_dbg(smu->adev->dev, "Notify PMFW about driver unload"); -- cgit From 5e8a0d3598b47ee5a57708072bdef08816264538 Mon Sep 17 00:00:00 2001 From: Duncan Ma Date: Wed, 25 Oct 2023 19:07:21 -0400 Subject: drm/amd/display: Negate IPS allow and commit bits [WHY] On s0i3, IPS mask isn't saved and restored. It is reset to zero on exit. If it is cleared unexpectedly, driver will proceed operations while DCN is in IPS2 and cause a hang. [HOW] Negate the bit logic. Default value of zero indicates it is still in IPS2. Driver must poll for the bit to assert. 
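To illustrate why the inverted polarity helps (a sketch with an assumed mask value, not the DMUB interface): after s0i3 the scratch register reads back as zero, and with the new polarity zero still means "not yet committed", so the driver simply keeps polling until hardware asserts the bit.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IPS2_COMMIT_MASK (1u << 1)   /* assumed bit position for this sketch */

static uint32_t idle_state;          /* stands in for the scratch register */

static uint32_t read_idle_state(void)
{
        return idle_state;
}

static bool wait_for_ips2_exit(unsigned int max_polls)
{
        for (unsigned int i = 0; i < max_polls; i++) {
                /* 0 after an s0i3 reset still reads as "not committed",
                 * so a cleared register can never be mistaken for "done".
                 */
                if (read_idle_state() & IPS2_COMMIT_MASK)
                        return true;

                if (i == 3)          /* simulate firmware finishing on a later poll */
                        idle_state |= IPS2_COMMIT_MASK;
        }
        return false;
}

int main(void)
{
        printf("exited IPS2: %d\n", wait_for_ips2_exit(10));
        return 0;
}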
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Duncan Ma Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 18 +++++++++--------- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 0fa4fcd00de2..507a7cf56711 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -820,22 +820,22 @@ static void dcn35_set_idle_state(struct clk_mgr *clk_mgr_base, bool allow_idle) if (dc->config.disable_ips == DMUB_IPS_ENABLE || dc->config.disable_ips == DMUB_IPS_DISABLE_DYNAMIC) { - val |= DMUB_IPS1_ALLOW_MASK; - val |= DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val = val & ~DMUB_IPS1_ALLOW_MASK; val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { - val |= DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val |= DMUB_IPS1_ALLOW_MASK; val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val = val & ~DMUB_IPS2_ALLOW_MASK; } if (!allow_idle) { - val = val & ~DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; + val |= DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; } dcn35_smu_write_ips_scratch(clk_mgr, val); diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index d8f434738212..76b47f178127 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4934,8 +4934,8 @@ bool dc_dmub_is_ips_idle_state(struct dc *dc) if (dc->hwss.get_idle_state) idle_state = dc->hwss.get_idle_state(dc); - if ((idle_state & DMUB_IPS1_ALLOW_MASK) || - (idle_state & DMUB_IPS2_ALLOW_MASK)) + if (!(idle_state & DMUB_IPS1_ALLOW_MASK) || + !(idle_state & DMUB_IPS2_ALLOW_MASK)) return true; return false; diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index e4c007203318..0e07699c1e83 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -1202,11 +1202,11 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) allow_state = dc->hwss.get_idle_state(dc); dc->hwss.set_idle_state(dc, false); - if (allow_state & DMUB_IPS2_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS2_ALLOW_MASK)) { // Wait for evaluation time udelay(dc->debug.ips2_eval_delay_us); commit_state = dc->hwss.get_idle_state(dc); - if (commit_state & DMUB_IPS2_COMMIT_MASK) { + if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) { // Tell PMFW to exit low power state dc->clk_mgr->funcs->exit_low_power_state(dc->clk_mgr); @@ -1216,7 +1216,7 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) + if (commit_state & 
DMUB_IPS2_COMMIT_MASK) break; udelay(1); @@ -1235,10 +1235,10 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) } dc_dmub_srv_notify_idle(dc, false); - if (allow_state & DMUB_IPS1_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS1_ALLOW_MASK)) { for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS1_COMMIT_MASK)) + if (commit_state & DMUB_IPS1_COMMIT_MASK) break; udelay(1); -- cgit From 9ddea8c9775d9379d71e6ac1519c552461b90b07 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Tue, 31 Oct 2023 11:02:49 +0800 Subject: drm/amdgpu: add and populate the port num into xgmi topology info The port num info is firstly introduced with 20.00.01.13 xgmi ta and make them as part of topology info. Signed-off-by: Shiwu Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 32b701cc0376..a21045d018f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1473,6 +1473,11 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links : node_num_links; } + /* popluate the connected port num info if supported and available */ + if (ta_port_num_support && topology->nodes[i].num_links) { + memcpy(topology->nodes[i].port_num, link_extend_info_output->nodes[i].port_num, + sizeof(struct xgmi_connected_port_num) * TA_XGMI__MAX_PORT_NUM); + } /* reflect the topology information for bi-directionality */ if (requires_reflection && topology->nodes[i].num_hops) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5d36ad3f48c7..c4d9cbde55b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -150,6 +150,7 @@ struct psp_xgmi_node_info { uint8_t is_sharing_enabled; enum ta_xgmi_assigned_sdma_engine sdma_engine; uint8_t num_links; + struct xgmi_connected_port_num port_num[TA_XGMI__MAX_PORT_NUM]; }; struct psp_xgmi_topology_info { -- cgit From 5911d02cac70d7fb52009fbd37423e63f8f6f9bc Mon Sep 17 00:00:00 2001 From: Lewis Huang Date: Thu, 19 Oct 2023 17:22:21 +0800 Subject: drm/amd/display: Change the DMCUB mailbox memory location from FB to inbox [WHY] Flush command sent to DMCUB spends more time for execution on a dGPU than on an APU. This causes cursor lag when using high refresh rate mouses. [HOW] 1. Change the DMCUB mailbox memory location from FB to inbox. 2. Only change windows memory to inbox. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Lewis Huang Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++----- drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 22 ++++++++++------ drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 32 +++++++++++++++++------ 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 9581510d3740..ee97814ebd99 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2079,7 +2079,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) struct dmub_srv_create_params create_params; struct dmub_srv_region_params region_params; struct dmub_srv_region_info region_info; - struct dmub_srv_fb_params fb_params; + struct dmub_srv_memory_params memory_params; struct dmub_srv_fb_info *fb_info; struct dmub_srv *dmub_srv; const struct dmcub_firmware_header_v1_0 *hdr; @@ -2182,6 +2182,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) adev->dm.dmub_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes) + PSP_HEADER_BYTES; + region_params.is_mailbox_in_inbox = false; status = dmub_srv_calc_region_info(dmub_srv, ®ion_params, ®ion_info); @@ -2205,10 +2206,10 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return r; /* Rebase the regions on the framebuffer address. */ - memset(&fb_params, 0, sizeof(fb_params)); - fb_params.cpu_addr = adev->dm.dmub_bo_cpu_addr; - fb_params.gpu_addr = adev->dm.dmub_bo_gpu_addr; - fb_params.region_info = ®ion_info; + memset(&memory_params, 0, sizeof(memory_params)); + memory_params.cpu_fb_addr = adev->dm.dmub_bo_cpu_addr; + memory_params.gpu_fb_addr = adev->dm.dmub_bo_gpu_addr; + memory_params.region_info = ®ion_info; adev->dm.dmub_fb_info = kzalloc(sizeof(*adev->dm.dmub_fb_info), GFP_KERNEL); @@ -2220,7 +2221,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return -ENOMEM; } - status = dmub_srv_calc_fb_info(dmub_srv, &fb_params, fb_info); + status = dmub_srv_calc_mem_info(dmub_srv, &memory_params, fb_info); if (status != DMUB_STATUS_OK) { DRM_ERROR("Error calculating DMUB FB info: %d\n", status); return -EINVAL; diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h index 9665ada0f894..df63aa8f01e9 100644 --- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h @@ -195,6 +195,7 @@ struct dmub_srv_region_params { uint32_t vbios_size; const uint8_t *fw_inst_const; const uint8_t *fw_bss_data; + bool is_mailbox_in_inbox; }; /** @@ -214,20 +215,25 @@ struct dmub_srv_region_params { */ struct dmub_srv_region_info { uint32_t fb_size; + uint32_t inbox_size; uint8_t num_regions; struct dmub_region regions[DMUB_WINDOW_TOTAL]; }; /** - * struct dmub_srv_fb_params - parameters used for driver fb setup + * struct dmub_srv_memory_params - parameters used for driver fb setup * @region_info: region info calculated by dmub service - * @cpu_addr: base cpu address for the framebuffer - * @gpu_addr: base gpu virtual address for the framebuffer + * @cpu_fb_addr: base cpu address for the framebuffer + * @cpu_inbox_addr: base cpu address for the gart + * @gpu_fb_addr: base gpu virtual address for the framebuffer + * @gpu_inbox_addr: base gpu virtual address for the gart */ -struct dmub_srv_fb_params { +struct 
dmub_srv_memory_params { const struct dmub_srv_region_info *region_info; - void *cpu_addr; - uint64_t gpu_addr; + void *cpu_fb_addr; + void *cpu_inbox_addr; + uint64_t gpu_fb_addr; + uint64_t gpu_inbox_addr; }; /** @@ -563,8 +569,8 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, * DMUB_STATUS_OK - success * DMUB_STATUS_INVALID - unspecified error */ -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out); /** diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 5d36f3e5dc2b..22fc4ba96def 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -434,7 +434,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, uint32_t fw_state_size = DMUB_FW_STATE_SIZE; uint32_t trace_buffer_size = DMUB_TRACE_BUFFER_SIZE; uint32_t scratch_mem_size = DMUB_SCRATCH_MEM_SIZE; - + uint32_t previous_top = 0; if (!dmub->sw_init) return DMUB_STATUS_INVALID; @@ -459,8 +459,15 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, bios->base = dmub_align(stack->top, 256); bios->top = bios->base + params->vbios_size; - mail->base = dmub_align(bios->top, 256); - mail->top = mail->base + DMUB_MAILBOX_SIZE; + if (params->is_mailbox_in_inbox) { + mail->base = 0; + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = bios->top; + } else { + mail->base = dmub_align(bios->top, 256); + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = mail->top; + } fw_info = dmub_get_fw_meta_info(params); @@ -479,7 +486,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, dmub->fw_version = fw_info->fw_version; } - trace_buff->base = dmub_align(mail->top, 256); + trace_buff->base = dmub_align(previous_top, 256); trace_buff->top = trace_buff->base + dmub_align(trace_buffer_size, 64); fw_state->base = dmub_align(trace_buff->top, 256); @@ -490,11 +497,14 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, out->fb_size = dmub_align(scratch_mem->top, 4096); + if (params->is_mailbox_in_inbox) + out->inbox_size = dmub_align(mail->top, 4096); + return DMUB_STATUS_OK; } -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out) { uint8_t *cpu_base; @@ -509,8 +519,8 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, if (params->region_info->num_regions != DMUB_NUM_WINDOWS) return DMUB_STATUS_INVALID; - cpu_base = (uint8_t *)params->cpu_addr; - gpu_base = params->gpu_addr; + cpu_base = (uint8_t *)params->cpu_fb_addr; + gpu_base = params->gpu_fb_addr; for (i = 0; i < DMUB_NUM_WINDOWS; ++i) { const struct dmub_region *reg = @@ -518,6 +528,12 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, out->fb[i].cpu_addr = cpu_base + reg->base; out->fb[i].gpu_addr = gpu_base + reg->base; + + if (i == DMUB_WINDOW_4_MAILBOX && params->cpu_inbox_addr != 0) { + out->fb[i].cpu_addr = (uint8_t *)params->cpu_inbox_addr + reg->base; + out->fb[i].gpu_addr = params->gpu_inbox_addr + reg->base; + } + out->fb[i].size = reg->top - reg->base; } -- cgit From 0ee057e66c4b782809a0a9265cdac5542e646706 Mon Sep 17 00:00:00 2001 From: Nicholas Susanto Date: Wed, 1 Nov 2023 15:30:10 -0400 Subject: drm/amd/display: Fix encoder disable logic 
[WHY] DENTIST hangs when OTG is off and encoder is on. We were not disabling the encoder properly when switching from extended mode to external monitor only. [HOW] Disable the encoder using an existing enable/disable fifo helper instead of enc35_stream_encoder_enable. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Nicholas Susanto Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c index 001f9eb66920..62a8f0b56006 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c @@ -261,12 +261,6 @@ static void enc35_stream_encoder_enable( /* invalid mode ! */ ASSERT_CRITICAL(false); } - - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); - } else { - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } } @@ -436,6 +430,8 @@ static void enc35_disable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_ENABLE, 0); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } static void enc35_enable_fifo(struct stream_encoder *enc) @@ -443,6 +439,8 @@ static void enc35_enable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); enc35_reset_fifo(enc, true); enc35_reset_fifo(enc, false); -- cgit From 564ca1b53ece166b5915c2ac90f3e9313100f4ea Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Nov 2023 11:36:56 -0500 Subject: drm/amdgpu/gmc11: fix logic typo in AGP check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Should be && rather than ||. Fixes: b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f675..ba4c82f5e617 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,7 +640,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || + if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) amdgpu_gmc_agp_location(adev, mc); -- cgit From 6ba5b613837c5d997ad8297b22fc46cd0be58d76 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:31:00 -0500 Subject: drm/amdgpu: add a module parameter to control the AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a module parameter to control the AGP aperture. 
The AGP aperture is an aperture in the GPU's internal address space which provides direct non-paged access to the platform address space. This access is non-snooped so only uncached memory can be accessed. Add a knob so that we can toggle this for debugging. Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index afec09930efa..9d92ca157677 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -248,6 +248,7 @@ extern int amdgpu_umsch_mm; extern int amdgpu_seamless; extern int amdgpu_user_partt_mode; +extern int amdgpu_agp; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 3095a3a864af..8f24cabe2155 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE; int amdgpu_umsch_mm; int amdgpu_seamless = -1; /* auto */ uint amdgpu_debug_mask; +int amdgpu_agp = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444); MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default"); module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444); +/** + * DOC: agp (int) + * Enable the AGP aperture. This provides an aperture in the GPU's internal + * address space for direct access to system memory. Note that these accesses + * are non-snooped, so they are only used for access to uncached memory. + */ +MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)"); +module_param_named(agp, amdgpu_agp, int, 0444); + /* These devices are not supported by amdgpu. 
* They are supported by the mach64, r128, radeon drivers */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 0ec7b061d7c2..23483bffa1c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ba4c82f5e617..e1078b53e942 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -641,7 +641,8 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) + (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && + (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index c1f2f166f064..1638c3177799 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- cgit From 0db062eac3e0846c6f120867a79df83b4c3db46f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:34:19 -0500 Subject: drm/amdgpu/gmc11: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. 
v2: leave it configurable via module parameter Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e1078b53e942..23d7b548d13f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -642,7 +642,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && - (amdgpu_agp != 0)) + (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- cgit From 61fc93695bbfde218d5f9f0b8051ce36eb649669 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:38:54 -0500 Subject: drm/amdgpu/gmc10: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 23483bffa1c7..a5a05c16c10d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- cgit From e8c2d3e25b844ad8f7c8b269a7cfd65285329264 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:40:00 -0500 Subject: drm/amdgpu/gmc9: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. 
v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 1638c3177799..2ac5820e9c92 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- cgit From 6fc45b6ed921dc00dfb264dc08c7d67ee63d2656 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:21:14 +0100 Subject: dm-delay: fix a race between delay_presuspend and delay_bio In delay_presuspend, we set the atomic variable may_delay and then stop the timer and flush pending bios. The intention here is to prevent the delay target from re-arming the timer again. However, this test is racy. Suppose that one thread goes to delay_bio, sees that dc->may_delay is one and proceeds; now, another thread executes delay_presuspend, it sets dc->may_delay to zero, deletes the timer and flushes pending bios. Then, the first thread continues and adds the bio to delayed->list despite the fact that dc->may_delay is false. Fix this bug by changing may_delay's type from atomic_t to bool and only access it while holding the delayed_bios_lock mutex. Note that we don't have to grab the mutex in delay_resume because there are no bios in flight at this point. 
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index efd510984e25..2d6b900e4353 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -33,7 +33,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; - atomic_t may_delay; + bool may_delay; struct delay_class read; struct delay_class write; @@ -236,7 +236,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; dc->argc = argc; ret = delay_class_ctr(ti, &dc->read, argv); @@ -312,7 +312,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) struct dm_delay_info *delayed; unsigned long expires = 0; - if (!c->delay || !atomic_read(&dc->may_delay)) + if (!c->delay) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); @@ -321,6 +321,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); + if (unlikely(!dc->may_delay)) { + mutex_unlock(&delayed_bios_lock); + return DM_MAPIO_REMAPPED; + } c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); mutex_unlock(&delayed_bios_lock); @@ -337,7 +341,9 @@ static void delay_presuspend(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 0); + mutex_lock(&delayed_bios_lock); + dc->may_delay = false; + mutex_unlock(&delayed_bios_lock); if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, true); @@ -351,7 +357,7 @@ static void delay_resume(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; } static int delay_map(struct dm_target *ti, struct bio *bio) -- cgit From 38cfff568169ff9f99784948f79f62ca1af5a187 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:22:47 +0100 Subject: dm-delay: fix bugs introduced by kthread mode This commit fixes the following bugs introduced by commit 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq"): * the function flush_worker_fn has no exit path - on unload, this function will just loop and consume 100% CPU without any progress * the wake-up mechanism in flush_worker_fn is racy - a wake up will be missed if the process adds entries to the delayed_bios list just before set_current_state(TASK_INTERRUPTIBLE) * flush_delayed_bios_fast submits a bio while holding a global mutex; this may deadlock if we have multiple stacked dm-delay devices and the underlying device attempts to acquire the mutex too * if the target constructor fails, it will call delay_dtr. delay_dtr would attempt to free dc->timer_lock without it being initialized by the constructor. * if the target constructor's kthread allocation fails, delay_dtr would crash trying to dereference dc->worker because it is non-NULL due to ERR_PTR. 
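The racy wake-up listed above is the classic lost-wakeup problem: if the task state is published only after the emptiness check, a producer can queue a bio and call wake_up_process() in the window before schedule(). The conventional kthread loop closes that window by ordering the state change before the re-check; roughly, as a generic sketch with hypothetical do_pending_work()/more_work_queued() helpers (not the dm-delay code itself, which achieves the same ordering by doing the list check under delayed_bios_lock):

        while (!kthread_should_stop()) {
                do_pending_work();

                set_current_state(TASK_INTERRUPTIBLE);
                if (more_work_queued() || kthread_should_stop())
                        __set_current_state(TASK_RUNNING);      /* new work arrived, keep going */
                else
                        schedule();     /* a wake_up_process() in the gap makes this return promptly */
        }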
Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 61 +++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2d6b900e4353..00c09d9fffeb 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -73,9 +73,23 @@ static inline bool delay_is_fast(struct delay_c *dc) return !!dc->worker; } +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + dm_submit_bio_remap(bio, NULL); + bio = n; + } +} + static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -83,47 +97,42 @@ static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - dm_submit_bio_remap(bio, NULL); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; } } mutex_unlock(&delayed_bios_lock); + + flush_bios(bio_list_get(&flush_bio_list)); } static int flush_worker_fn(void *data) { struct delay_c *dc = data; - while (1) { + while (!kthread_should_stop()) { flush_delayed_bios_fast(dc, false); + mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); + mutex_unlock(&delayed_bios_lock); schedule(); - } else + } else { + mutex_unlock(&delayed_bios_lock); cond_resched(); + } } return 0; } -static void flush_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - dm_submit_bio_remap(bio, NULL); - bio = n; - } -} - -static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; unsigned long next_expires = 0; unsigned long start_timer = 0; - struct bio_list flush_bios = { }; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -131,7 +140,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, bio); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; continue; } @@ -147,7 +156,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) if (start_timer) queue_timeout(dc, next_expires); - return bio_list_get(&flush_bios); + flush_bios(bio_list_get(&flush_bio_list)); } static void flush_expired_bios(struct work_struct *work) @@ -158,7 +167,7 @@ static void flush_expired_bios(struct work_struct *work) if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, false); else - flush_bios(flush_delayed_bios(dc, false)); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -177,8 +186,7 @@ static void delay_dtr(struct dm_target *ti) if (dc->worker) kthread_stop(dc->worker); - if (!delay_is_fast(dc)) - mutex_destroy(&dc->timer_lock); + mutex_destroy(&dc->timer_lock); kfree(dc); } @@ -236,6 +244,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); dc->may_delay = true; dc->argc = argc; @@ -282,12 +291,12 @@ out: "dm-delay-flush-worker"); if (IS_ERR(dc->worker)) { ret = PTR_ERR(dc->worker); + dc->worker = NULL; goto bad; } } else { timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - mutex_init(&dc->timer_lock); dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { ret = -EINVAL; @@ -345,11 +354,11 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) + if (delay_is_fast(dc)) { flush_delayed_bios_fast(dc, true); - else { + } else { del_timer_sync(&dc->delay_timer); - flush_bios(flush_delayed_bios(dc, true)); + flush_delayed_bios(dc, true); } } -- cgit From ccadc8a21ef13eb80006ecff6a7466810e4f0dd6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:24:04 +0100 Subject: dm-delay: avoid duplicate logic This is small refactoring of dm-delay - we avoid duplicate logic in flush_delayed_bios and flush_delayed_bios_fast and join these two functions into one. We also add cond_resched() to flush_delayed_bios because the list may have unbounded number of entries. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 65 +++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 00c09d9fffeb..5eabdb06c649 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -85,24 +85,40 @@ static void flush_bios(struct bio *bio) } } -static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; struct bio_list flush_bio_list; + unsigned long next_expires = 0; + bool start_timer = false; bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + cond_resched(); if (flush_all || time_after_eq(jiffies, delayed->expires)) { struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); bio_list_add(&flush_bio_list, bio); delayed->class->ops--; + continue; + } + + if (!delay_is_fast(dc)) { + if (!start_timer) { + start_timer = true; + next_expires = delayed->expires; + } else { + next_expires = min(next_expires, delayed->expires); + } } } mutex_unlock(&delayed_bios_lock); + if (start_timer) + queue_timeout(dc, next_expires); + flush_bios(bio_list_get(&flush_bio_list)); } @@ -111,7 +127,7 @@ static int flush_worker_fn(void *data) struct delay_c *dc = data; while (!kthread_should_stop()) { - flush_delayed_bios_fast(dc, false); + flush_delayed_bios(dc, false); mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); @@ -126,48 +142,12 @@ static int flush_worker_fn(void *data) return 0; } -static void flush_delayed_bios(struct delay_c *dc, bool flush_all) -{ - struct dm_delay_info *delayed, *next; - unsigned long next_expires = 0; - unsigned long start_timer = 0; - struct bio_list flush_bio_list; - bio_list_init(&flush_bio_list); - - mutex_lock(&delayed_bios_lock); - list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { - if (flush_all || time_after_eq(jiffies, delayed->expires)) { - struct bio *bio = 
dm_bio_from_per_bio_data(delayed, - sizeof(struct dm_delay_info)); - list_del(&delayed->list); - bio_list_add(&flush_bio_list, bio); - delayed->class->ops--; - continue; - } - - if (!start_timer) { - start_timer = 1; - next_expires = delayed->expires; - } else - next_expires = min(next_expires, delayed->expires); - } - mutex_unlock(&delayed_bios_lock); - - if (start_timer) - queue_timeout(dc, next_expires); - - flush_bios(bio_list_get(&flush_bio_list)); -} - static void flush_expired_bios(struct work_struct *work) { struct delay_c *dc; dc = container_of(work, struct delay_c, flush_expired_bios); - if (delay_is_fast(dc)) - flush_delayed_bios_fast(dc, false); - else - flush_delayed_bios(dc, false); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -354,12 +334,9 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) { - flush_delayed_bios_fast(dc, true); - } else { + if (!delay_is_fast(dc)) del_timer_sync(&dc->delay_timer); - flush_delayed_bios(dc, true); - } + flush_delayed_bios(dc, true); } static void delay_resume(struct dm_target *ti) -- cgit From 2a695062a5a42aead8c539a344168d4806b3fda2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:36:34 +0100 Subject: dm-bufio: fix no-sleep mode dm-bufio has a no-sleep mode. When activated (with the DM_BUFIO_CLIENT_NO_SLEEP flag), the bufio client is read-only and we could call dm_bufio_get from tasklets. This is used by dm-verity. Unfortunately, commit 450e8dee51aa ("dm bufio: improve concurrent IO performance") broke this and the kernel would warn that cache_get() was calling down_read() from no-sleeping context. The bug can be reproduced by using "veritysetup open" with the "--use-tasklets" flag. This commit fixes dm-bufio, so that the tasklet mode works again, by expanding use of the 'no_sleep_enabled' static_key to conditionally use either a rw_semaphore or rwlock_t (which are colocated in the buffer_tree structure using a union). Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.4 Fixes: 450e8dee51aa ("dm bufio: improve concurrent IO performance") Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 87 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 62eb27639c9b..f03d7dba270c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -254,7 +254,7 @@ enum evict_result { typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); -static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) +static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep) { unsigned long tested = 0; struct list_head *h = lru->cursor; @@ -295,7 +295,8 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con h = h->next; - cond_resched(); + if (!no_sleep) + cond_resched(); } return NULL; @@ -382,7 +383,10 @@ struct dm_buffer { */ struct buffer_tree { - struct rw_semaphore lock; + union { + struct rw_semaphore lock; + rwlock_t spinlock; + } u; struct rb_root root; } ____cacheline_aligned_in_smp; @@ -393,9 +397,12 @@ struct dm_buffer_cache { * on the locks. 
*/ unsigned int num_locks; + bool no_sleep; struct buffer_tree trees[]; }; +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + static inline unsigned int cache_index(sector_t block, unsigned int num_locks) { return dm_hash_locks_index(block, num_locks); @@ -403,22 +410,34 @@ static inline unsigned int cache_index(sector_t block, unsigned int num_locks) static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) { - down_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) { - down_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } /* @@ -442,18 +461,32 @@ static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool static void __lh_lock(struct lock_history *lh, unsigned int index) { - if (lh->write) - down_write(&lh->cache->trees[index].lock); - else - down_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_read(&lh->cache->trees[index].u.lock); + } } static void __lh_unlock(struct lock_history *lh, unsigned int index) { - if (lh->write) - up_write(&lh->cache->trees[index].lock); - else - up_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_read(&lh->cache->trees[index].u.lock); + } } /* @@ -502,14 +535,18 @@ static struct dm_buffer *list_to_buffer(struct list_head *l) return le_to_buffer(le); } -static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) +static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep) { unsigned int i; bc->num_locks = num_locks; + bc->no_sleep = no_sleep; for (i = 0; i < bc->num_locks; i++) { - init_rwsem(&bc->trees[i].lock); + if (no_sleep) + rwlock_init(&bc->trees[i].u.spinlock); + else + 
init_rwsem(&bc->trees[i].u.lock); bc->trees[i].root = RB_ROOT; } @@ -648,7 +685,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode struct lru_entry *le; struct dm_buffer *b; - le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep); if (!le) return NULL; @@ -702,7 +739,7 @@ static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_ struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; while (true) { - le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep); if (!le) break; @@ -915,10 +952,11 @@ static void cache_remove_range(struct dm_buffer_cache *bc, { unsigned int i; + BUG_ON(bc->no_sleep); for (i = 0; i < bc->num_locks; i++) { - down_write(&bc->trees[i].lock); + down_write(&bc->trees[i].u.lock); __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); - up_write(&bc->trees[i].lock); + up_write(&bc->trees[i].u.lock); } } @@ -979,8 +1017,6 @@ struct dm_bufio_client { struct dm_buffer_cache cache; /* must be last member */ }; -static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); - /*----------------------------------------------------------------*/ #define dm_bufio_in_request() (!!current->bio_list) @@ -1871,7 +1907,8 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, if (need_submit) submit_io(b, REQ_OP_READ, read_endio); - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + if (nf != NF_GET) /* we already tested this condition above */ + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { int error = blk_status_to_errno(b->read_error); @@ -2421,7 +2458,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - cache_init(&c->cache, num_locks); + cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0); c->bdev = bdev; c->block_size = block_size; -- cgit From 28f07f2ab4b3a2714f1fefcc58ada4bcc195f806 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:37:25 +0100 Subject: dm-verity: don't use blocking calls from tasklets The commit 5721d4e5a9cd enhanced dm-verity, so that it can verify blocks from tasklets rather than from workqueues. This reportedly improves performance significantly. However, dm-verity was using the flag CRYPTO_TFM_REQ_MAY_SLEEP from tasklets which resulted in warnings about sleeping function being called from non-sleeping context. BUG: sleeping function called from invalid context at crypto/internal.h:206 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 14, name: ksoftirqd/0 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 CPU: 0 PID: 14 Comm: ksoftirqd/0 Tainted: G W 6.7.0-rc1 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x32/0x50 __might_resched+0x110/0x160 crypto_hash_walk_done+0x54/0xb0 shash_ahash_update+0x51/0x60 verity_hash_update.isra.0+0x4a/0x130 [dm_verity] verity_verify_io+0x165/0x550 [dm_verity] ? free_unref_page+0xdf/0x170 ? psi_group_change+0x113/0x390 verity_tasklet+0xd/0x70 [dm_verity] tasklet_action_common.isra.0+0xb3/0xc0 __do_softirq+0xaf/0x1ec ? smpboot_thread_fn+0x1d/0x200 ? sort_range+0x20/0x20 run_ksoftirqd+0x15/0x30 smpboot_thread_fn+0xed/0x200 kthread+0xdc/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x28/0x40 ? 
kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 This commit fixes dm-verity so that it doesn't use the flags CRYPTO_TFM_REQ_MAY_SLEEP and CRYPTO_TFM_REQ_MAY_BACKLOG from tasklets. The crypto API would do GFP_ATOMIC allocation instead, it could return -ENOMEM and we catch -ENOMEM in verity_tasklet and requeue the request to the workqueue. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.0+ Fixes: 5721d4e5a9cd ("dm verity: Add optional "try_verify_in_tasklet" feature") Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 4 ++-- drivers/md/dm-verity-target.c | 23 ++++++++++++----------- drivers/md/dm-verity.h | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ef9f018da60..2099c755119e 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -185,7 +185,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, { if (unlikely(verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)))) + verity_io_real_digest(v, io), true))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -386,7 +386,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, verity_io_hash_req(v, io), fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), true); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 26adcfea0302..e115fcfe723c 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -135,20 +135,21 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, * Wrapper for crypto_ahash_init, which handles verity salting. */ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait) + struct crypto_wait *wait, bool may_sleep) { int r; ahash_request_set_tfm(req, v->tfm); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, (void *)wait); + ahash_request_set_callback(req, + may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, + crypto_req_done, (void *)wait); crypto_init_wait(wait); r = crypto_wait_req(crypto_ahash_init(req), wait); if (unlikely(r < 0)) { - DMERR("crypto_ahash_init failed: %d", r); + if (r != -ENOMEM) + DMERR("crypto_ahash_init failed: %d", r); return r; } @@ -179,12 +180,12 @@ out: } int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest) + const u8 *data, size_t len, u8 *digest, bool may_sleep) { int r; struct crypto_wait wait; - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, may_sleep); if (unlikely(r < 0)) goto out; @@ -322,7 +323,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), !io->in_tasklet); if (unlikely(r < 0)) goto release_ret_r; @@ -556,7 +557,7 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, !io->in_tasklet); if (unlikely(r < 0)) return r; @@ -652,7 +653,7 @@ static void verity_tasklet(unsigned long data) io->in_tasklet = true; err = verity_verify_io(io); - if (err == -EAGAIN) { + if (err == -EAGAIN || err == -ENOMEM) { /* fallback to retrying with work-queue */ INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -1033,7 +1034,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest); + v->zero_digest, true); out: kfree(req); diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 2f555b420367..f96f4e281ee4 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,7 +128,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len)); extern int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest); + const u8 *data, size_t len, u8 *digest, bool may_sleep); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); -- cgit From 13648e04a9b831b3dfa5cf3887dfa6cf8fe5fe69 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:38:33 +0100 Subject: dm-crypt: start allocating with MAX_ORDER Commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") changed the meaning of MAX_ORDER from exclusive to inclusive. So, we can allocate compound pages with up to 1 << MAX_ORDER pages. Reflect this change in dm-crypt and start trying to allocate compound pages with MAX_ORDER. 
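In other words, with the inclusive definition the largest single request is order == MAX_ORDER (1 << MAX_ORDER pages), so an allocation loop can start there and step down on failure. A simplified sketch of that shape (not the actual crypt_alloc_buffer() logic, which additionally adjusts gfp flags and falls back to a mempool):

        unsigned int order = MAX_ORDER;
        struct page *pages;

        for (;;) {
                pages = alloc_pages(GFP_NOWAIT | __GFP_HIGHMEM | __GFP_NORETRY |
                                    __GFP_NOWARN | __GFP_COMP, order);
                if (pages || !order)
                        break;          /* success, or nothing left to shrink */
                order--;                /* retry with a smaller contiguous allocation */
        }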
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6de107aff331..2ae8560b6a14 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned int remaining_size; - unsigned int order = MAX_ORDER - 1; + unsigned int order = MAX_ORDER; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) -- cgit From bc1b5acb40201a0746d68a7d7cfc141899937f4f Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 10 Nov 2023 19:21:04 +0100 Subject: nfsd: fix file memleak on client_opens_release seq_release should be called to free the allocated seq_file Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Mahmoud Adam Reviewed-by: Jeff Layton Fixes: 78599c42ae3c ("nfsd4: add file to display list of client's opens") Reviewed-by: NeilBrown Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65fd5510323a..3709e58f0a4a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file) /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); - return 0; + return seq_release(inode, file); } static const struct file_operations client_states_fops = { -- cgit From 49cecd8628a9855cd993792a0377559ea32d5e7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:39 -0500 Subject: NFSD: Update nfsd_cache_append() to use xdr_stream When inserting a DRC-cached response into the reply buffer, ensure that the reply buffer's xdr_stream is updated properly. Otherwise the server will send a garbage response. Cc: stable@vger.kernel.org # v6.3+ Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80621a709510..abb453be71ca 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -640,24 +640,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, return; } -/* - * Copy cached reply to current reply buffer. Should always fit. - * FIXME as reply is in a page, we should just attach the page, and - * keep a refcount.... 
- */ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) { - struct kvec *vec = &rqstp->rq_res.head[0]; - - if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", - data->iov_len); - return 0; - } - memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); - vec->iov_len += data->iov_len; - return 1; + __be32 *p; + + p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len); + if (unlikely(!p)) + return false; + memcpy(p, data->iov_base, data->iov_len); + xdr_commit_encode(&rqstp->rq_res_stream); + return true; } /* -- cgit From 1caf5f61dd8430ae5a0b4538afe4953ce7517cbb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:33 -0500 Subject: NFSD: Fix "start of NFS reply" pointer passed to nfsd_cache_update() The "statp + 1" pointer that is passed to nfsd_cache_update() is supposed to point to the start of the egress NFS Reply header. In fact, it does point there for AUTH_SYS and RPCSEC_GSS_KRB5 requests. But both krb5i and krb5p add fields between the RPC header's accept_stat field and the start of the NFS Reply header. In those cases, "statp + 1" points at the extra fields instead of the Reply. The result is that nfsd_cache_update() caches what looks to the client like garbage. A connection break can occur for a number of reasons, but the most common reason when using krb5i/p is a GSS sequence number window underrun. When an underrun is detected, the server is obliged to drop the RPC and the connection to force a retransmit with a fresh GSS sequence number. The client presents the same XID, it hits in the server's DRC, and the server returns the garbage cache entry. The "statp + 1" argument has been used since the oldest changeset in the kernel history repo, so it has been in nfsd_dispatch() literally since before history began. The problem arose only when the server-side GSS implementation was added twenty years ago. Reviewed-by: Jeff Layton Tested-by: Jeff Layton --- fs/nfsd/nfssvc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d6122bb2d167..b4e4e04f9931 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants @@ -1010,6 +1011,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) goto out_dropit; } + nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; @@ -1023,7 +1025,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; -- cgit From bf51c52a1f3c238d72c64e14d5e7702d3a245b82 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:45 -0500 Subject: NFSD: Fix checksum mismatches in the duplicate reply cache nfsd_cache_csum() currently assumes that the server's RPC layer has been advancing rq_arg.head[0].iov_base as it decodes an incoming request, because that's the way it used to work. On entry, it expects that buf->head[0].iov_base points to the start of the NFS header, and excludes the already-decoded RPC header. 
These days however, head[0].iov_base now points to the start of the RPC header during all processing. It no longer points at the NFS Call header when execution arrives at nfsd_cache_csum(). In a retransmitted RPC the XID and the NFS header are supposed to be the same as the original message, but the contents of the retransmitted RPC header can be different. For example, for krb5, the GSS sequence number will be different between the two. Thus if the RPC header is always included in the DRC checksum computation, the checksum of the retransmitted message might not match the checksum of the original message, even though the NFS part of these messages is identical. The result is that, even if a matching XID is found in the DRC, the checksum mismatch causes the server to execute the retransmitted RPC transaction again. Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 4 ++-- fs/nfsd/nfscache.c | 64 ++++++++++++++++++++++++++++++++++++------------------ fs/nfsd/nfssvc.c | 10 ++++++++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 929248c6ca84..4cbe0434cbb8 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *rqstp, - struct nfsd_cacherep **cacherep); +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep); void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index abb453be71ca..6cd36af2f97e 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -368,33 +368,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) return freed; } -/* - * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes +/** + * nfsd_cache_csum - Checksum incoming NFS Call arguments + * @buf: buffer containing a whole RPC Call message + * @start: starting byte of the NFS Call header + * @remaining: size of the NFS Call header, in bytes + * + * Compute a weak checksum of the leading bytes of an NFS procedure + * call header to help verify that a retransmitted Call matches an + * entry in the duplicate reply cache. + * + * To avoid assumptions about how the RPC message is laid out in + * @buf and what else it might contain (eg, a GSS MIC suffix), the + * caller passes us the exact location and length of the NFS Call + * header. + * + * Returns a 32-bit checksum value, as defined in RFC 793. 
*/ -static __wsum -nfsd_cache_csum(struct svc_rqst *rqstp) +static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start, + unsigned int remaining) { + unsigned int base, len; + struct xdr_buf subbuf; + __wsum csum = 0; + void *p; int idx; - unsigned int base; - __wsum csum; - struct xdr_buf *buf = &rqstp->rq_arg; - const unsigned char *p = buf->head[0].iov_base; - size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, - RC_CSUMLEN); - size_t len = min(buf->head[0].iov_len, csum_len); + + if (remaining > RC_CSUMLEN) + remaining = RC_CSUMLEN; + if (xdr_buf_subsegment(buf, &subbuf, start, remaining)) + return csum; /* rq_arg.head first */ - csum = csum_partial(p, len, 0); - csum_len -= len; + if (subbuf.head[0].iov_len) { + len = min_t(unsigned int, subbuf.head[0].iov_len, remaining); + csum = csum_partial(subbuf.head[0].iov_base, len, csum); + remaining -= len; + } /* Continue into page array */ - idx = buf->page_base / PAGE_SIZE; - base = buf->page_base & ~PAGE_MASK; - while (csum_len) { - p = page_address(buf->pages[idx]) + base; - len = min_t(size_t, PAGE_SIZE - base, csum_len); + idx = subbuf.page_base / PAGE_SIZE; + base = subbuf.page_base & ~PAGE_MASK; + while (remaining) { + p = page_address(subbuf.pages[idx]) + base; + len = min_t(unsigned int, PAGE_SIZE - base, remaining); csum = csum_partial(p, len, csum); - csum_len -= len; + remaining -= len; base = 0; ++idx; } @@ -465,6 +484,8 @@ out: /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @start: starting byte in @rqstp->rq_arg of the NFS Call header + * @len: size of the NFS Call header, in bytes * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none @@ -478,7 +499,8 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; struct nfsd_cacherep *rp, *found; @@ -494,7 +516,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) goto out; } - csum = nfsd_cache_csum(rqstp); + csum = nfsd_cache_csum(&rqstp->rq_arg, start, len); /* * Since the common case is a cache miss followed by an insert, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b4e4e04f9931..fe61d9bbcc1f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + unsigned int start, len; __be32 *nfs_reply; /* @@ -989,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ rqstp->rq_cachetype = proc->pc_cachetype; + /* + * ->pc_decode advances the argument stream past the NFS + * Call header, so grab the header's starting location and + * size now for the call to nfsd_cache_lookup(). 
+ */ + start = xdr_stream_pos(&rqstp->rq_arg_stream); + len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -1002,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; - switch (nfsd_cache_lookup(rqstp, &rp)) { + switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: -- cgit From 721d28f3dfb3e40c45ce45fbeeff47b72c230bc9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Nov 2023 11:13:40 -0800 Subject: parisc: Replace strlcpy() with strscpy() strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated[1]. Additionally, it returns the size of the source string, not the resulting size of the destination string. In an effort to remove strlcpy() completely[2], replace strlcpy() here with strscpy(). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1] Link: https://github.com/KSPP/linux/issues/89 [2] Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Azeem Shaikh Cc: linux-parisc@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Helge Deller --- arch/parisc/kernel/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 29e2750f86a4..e95a977ba5f3 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -383,7 +383,7 @@ show_cpuinfo (struct seq_file *m, void *v) char cpu_name[60], *p; /* strip PA path from CPU name to not confuse lscpu */ - strlcpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); + strscpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); p = strrchr(cpu_name, '['); if (p) *(--p) = 0; -- cgit From 6ad6e15a9c46b8f0932cd99724f26f3db4db1cdf Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 17 Nov 2023 16:43:52 +0100 Subject: parisc/power: Fix power soft-off when running on qemu Firmware returns the physical address of the power switch, so need to use gsc_writel() instead of direct memory access. Fixes: d0c219472980 ("parisc/power: Add power soft-off when running on qemu") Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- drivers/parisc/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 539d8920c202..bb0d92461b08 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -176,7 +176,7 @@ static struct notifier_block parisc_panic_block = { static int qemu_power_off(struct sys_off_data *data) { /* this turns the system off via SeaBIOS */ - *(int *)data->cb_data = 0; + gsc_writel(0, (unsigned long) data->cb_data); pdc_soft_power_button(1); return NOTIFY_DONE; } -- cgit From 793838138c157d4c49f4fb744b170747e3dabf58 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 18 Nov 2023 19:33:35 +0100 Subject: prctl: Disable prctl(PR_SET_MDWE) on parisc systemd-254 tries to use prctl(PR_SET_MDWE) for it's MemoryDenyWriteExecute functionality, but fails on parisc which still needs executable stacks in certain combinations of gcc/glibc/kernel. Disable prctl(PR_SET_MDWE) by returning -EINVAL for now on parisc, until userspace has catched up. 
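For context, the systemd feature amounts to a call of the following shape; after this change that call fails with EINVAL on parisc rather than being honoured. A minimal userspace sketch (assuming UAPI headers recent enough to provide PR_SET_MDWE and PR_MDWE_REFUSE_EXEC_GAIN; not systemd's actual code):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
        /* refuse future attempts to make any mapping both writable and executable */
        if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L) < 0)
                fprintf(stderr, "PR_SET_MDWE: %s\n", strerror(errno));
        return 0;
}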
Signed-off-by: Helge Deller Co-developed-by: Linus Torvalds Reported-by: Sam James Closes: https://github.com/systemd/systemd/issues/29775 Tested-by: Sam James Link: https://lore.kernel.org/all/875y2jro9a.fsf@gentoo.org/ Cc: # v6.3+ --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index 420d9cb9cc8e..e219fcfa112d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + /* PARISC cannot allow mdwe as it needs writable stacks */ + if (IS_ENABLED(CONFIG_PARISC)) + return -EINVAL; + current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ -- cgit From 98b1cc82c4affc16f5598d4fa14b1858671b2263 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Nov 2023 15:02:14 -0800 Subject: Linux 6.7-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ede0bd241056..724c79bebe72 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit From 8771127e25d6c20d458ad27cf32f7fcfc1755e05 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Sat, 18 Nov 2023 00:19:17 +0100 Subject: USB: serial: option: don't claim interface 4 for ZTE MF290 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface 4 is used by for QMI interface in stock firmware of MF28D, the router which uses MF290 modem. Free the interface up, to rebind it to qmi_wwan driver. The proper configuration is: Interface mapping is: 0: QCDM, 1: (unknown), 2: AT (PCUI), 2: AT (Modem), 4: QMI T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=0189 Rev= 0.00 S: Manufacturer=ZTE, Incorporated S: Product=ZTE LTE Technologies MSM C:* #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) 
MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms Cc: Bjørn Mork Signed-off-by: Lech Perczak Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 45dcfaadaf98..ff9049db6e65 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1546,7 +1546,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, - { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff), + .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0191, 0xff, 0xff, 0xff), /* ZTE EuFi890 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0196, 0xff, 0xff, 0xff) }, -- cgit From a1092619dd28ac0fcf23016160a2fdccd98ef935 Mon Sep 17 00:00:00 2001 From: Puliang Lu Date: Thu, 26 Oct 2023 20:35:06 +0800 Subject: USB: serial: option: fix FM101R-GL defines Modify the definition of the two Fibocom FM101R-GL PID macros, which had their PIDs switched. The correct PIDs are: - VID:PID 413C:8213, FM101R-GL ESIM are laptop M.2 cards (with MBIM interfaces for Linux) - VID:PID 413C:8215, FM101R-GL are laptop M.2 cards (with MBIM interface for Linux) 0x8213: mbim, tty 0x8215: mbim, tty Signed-off-by: Puliang Lu Fixes: 52480e1f1a25 ("USB: serial: option: add Fibocom to DELL custom modem FM101R-GL") Link: https://lore.kernel.org/lkml/TYZPR02MB508845BAD7936A62A105CE5D89DFA@TYZPR02MB5088.apcprd02.prod.outlook.com/ Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ff9049db6e65..9c76095ebfe1 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -203,8 +203,8 @@ static void option_instat_callback(struct urb *urb); #define DELL_PRODUCT_5829E_ESIM 0x81e4 #define DELL_PRODUCT_5829E 0x81e6 -#define DELL_PRODUCT_FM101R 0x8213 -#define DELL_PRODUCT_FM101R_ESIM 0x8215 +#define DELL_PRODUCT_FM101R_ESIM 0x8213 +#define DELL_PRODUCT_FM101R 0x8215 #define KYOCERA_VENDOR_ID 0x0c88 #define KYOCERA_PRODUCT_KPC650 0x17da -- cgit From e389fe8b68137344562fb6e4d53d8a89ef6212dd Mon Sep 17 00:00:00 2001 From: Victor Fragoso Date: Tue, 21 Nov 2023 21:05:56 +0000 Subject: USB: serial: option: add Fibocom L7xx modules Add support for Fibocom L716-EU module series. L716-EU is a Fibocom module based on ZTE's V3E/V3T chipset. Device creates multiple interfaces when connected to PC as follows: - Network Interface: ECM or RNDIS (set by FW or AT Command) - ttyUSB0: AT port - ttyUSB1: Modem port - ttyUSB2: AT2 port - ttyUSB3: Trace port for log information - ADB: ADB port for debugging. 
("Driver=usbfs" when ADB server enabled) Here are the outputs of lsusb and usb-devices: $ ls /dev/ttyUSB* /dev/ttyUSB0 /dev/ttyUSB1 /dev/ttyUSB2 /dev/ttyUSB3 usb-devices: L716-EU (ECM mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 51 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=06 Prot=00 Driver=cdc_ether E: Ad=87(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms L716-EU (RNDIS mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 49 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=ff Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Victor Fragoso Reviewed-by: Lars Melin Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9c76095ebfe1..06b9b04c022a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2250,6 +2250,7 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1782, 0x4d10) }, /* Fibocom L610 (AT mode) */ { USB_DEVICE_INTERFACE_CLASS(0x1782, 0x4d11, 0xff) }, /* Fibocom L610 (ECM/RNDIS mode) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0001, 0xff, 0xff, 0xff) }, /* Fibocom L716-EU (ECM/RNDIS mode) */ { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ -- cgit From da90e45d5afc4da2de7cd3ea7943d0f1baa47cc2 Mon Sep 17 00:00:00 2001 From: Asuna Yang Date: Wed, 22 Nov 2023 22:18:03 +0800 Subject: USB: serial: option: add Luat Air72*U series products Update the USB serial option driver support for Luat Air72*U series products. ID 1782:4e00 Spreadtrum Communications Inc. UNISOC-8910 T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1782 ProdID=4e00 Rev=00.00 S: Manufacturer=UNISOC S: Product=UNISOC-8910 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=400mA I: If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=4096ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms If#= 2: AT If#= 3: PPP + AT If#= 4: Debug Co-developed-by: Yangyu Chen Signed-off-by: Yangyu Chen Signed-off-by: Asuna Yang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 06b9b04c022a..4dffcfefd62d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -609,6 +609,8 @@ static void option_instat_callback(struct urb *urb); #define UNISOC_VENDOR_ID 0x1782 /* TOZED LT70-C based on UNISOC SL8563 uses UNISOC's vendor ID */ #define TOZED_PRODUCT_LT70C 0x4055 +/* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ +#define LUAT_PRODUCT_AIR720U 0x4e00 /* Device flags */ @@ -2273,6 +2275,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); -- cgit
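All of the option table changes above share the same shape; for clarity, here is a hypothetical entry showing how the pieces fit together (the VID/PID below are placeholders, and RSVD(n) marks interface n as reserved so the option driver leaves it for another driver such as qmi_wwan, as in the MF290 change):

        { USB_DEVICE_AND_INTERFACE_INFO(0x1234, 0x5678, 0xff, 0xff, 0xff),  /* hypothetical modem */
          .driver_info = RSVD(4) },                                         /* leave interface 4 alone */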