Diffstat (limited to 'tools')
366 files changed, 16965 insertions, 3075 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h
index 7b32b99023a2..5fd7caea4419 100644
--- a/tools/arch/arm64/include/asm/cputype.h
+++ b/tools/arch/arm64/include/asm/cputype.h
@@ -86,9 +86,14 @@
 #define ARM_CPU_PART_CORTEX_X2 0xD48
 #define ARM_CPU_PART_NEOVERSE_N2 0xD49
 #define ARM_CPU_PART_CORTEX_A78C 0xD4B
+#define ARM_CPU_PART_CORTEX_X1C 0xD4C
+#define ARM_CPU_PART_CORTEX_X3 0xD4E
 #define ARM_CPU_PART_NEOVERSE_V2 0xD4F
+#define ARM_CPU_PART_CORTEX_A720 0xD81
 #define ARM_CPU_PART_CORTEX_X4 0xD82
 #define ARM_CPU_PART_NEOVERSE_V3 0xD84
+#define ARM_CPU_PART_CORTEX_X925 0xD85
+#define ARM_CPU_PART_CORTEX_A725 0xD87

 #define APM_CPU_PART_XGENE 0x000
 #define APM_CPU_VAR_POTENZA 0x00
@@ -162,9 +167,14 @@
 #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2)
 #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2)
 #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C)
+#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C)
+#define MIDR_CORTEX_X3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X3)
 #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2)
+#define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720)
 #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4)
 #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
+#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
+#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
 #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 1691297a766a..eaeda001784e 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -645,6 +645,9 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
 #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
 #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
+#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6)
+#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7)
+#define KVM_REG_PPC_HASHPKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc8)

 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..dd4682857c12 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -18,170 +18,170 @@
 /*
  * Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name. If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature
+ * bit is not displayed in /proc/cpuinfo at all.
  *
  * When adding new features here that depend on other features,
  * please update the table in kernel/cpu/cpuid-deps.c as well.
*/ /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */ #define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */ #define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 
0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ -#define 
X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */ +#define 
X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */ #define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" 
TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache 
performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -189,93 +189,93 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ -#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ -#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ -#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ -#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ -#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. 
*/ -#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ -#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ -#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ -#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */ +#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */ +#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. 
*/ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ -#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ -#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ -#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ -#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ +#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* 
Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */ +#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* 
"clwb" CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -283,181 +283,183 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ -#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ -#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ -#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ -#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ -#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ -#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ -#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ -#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ -#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ -#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ -#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ -#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ -#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ -#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ -#define 
X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ -#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ -#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ -#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ -#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ -#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ -#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ -#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ -#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */ +#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */ +#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ -#define X86_FEATURE_AVX_VNNI 
(12*32+ 4) /* AVX VNNI instructions */ -#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ -#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ -#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */ -#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ -#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ -#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ -#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ -#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ -#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ -#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ +#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ +#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ -#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ -#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ -#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ -#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ -#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ -#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. 
*/ -#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ -#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ -#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ -#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ +#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. 
Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ +#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ -#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ -#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ -#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for 
VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ -#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ -#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ -#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ -#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */ +#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov 
*/ -#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ -#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ -#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ -#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ -#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ -#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ -#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */ +#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */ +#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */ +#define 
X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ -#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ -#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ -#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ -#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ -#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ -#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ -#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ -#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ +#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ +#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ -#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ -#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ -#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -465,59 +467,60 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ -#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ -#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ -#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ -#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ /* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */ #define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ #define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. 
*/ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ -#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ -#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ -#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ -#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ -#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ -#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ -#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ -#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ -#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ -#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ -#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ -#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ -#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ -#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */ 
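For illustration only (not part of the patch): a minimal user-space C sketch of how the X86_BUG(x) numbers used in the surrounding hunk decompose into a 32-bit capability word index and bit offset, using the same formula as the header's X86_BUG() macro. NCAPINTS is assumed to be 22 here, i.e. a header defining capability words 0..21 as seen in this file.

#include <stdio.h>

#define NCAPINTS	22			/* assumed value, for illustration */
#define X86_BUG(x)	(NCAPINTS * 32 + (x))	/* same formula as the header */

int main(void)
{
	/* Bug bit numbers as used in the surrounding hunk (word 0 and "BUG word 2"). */
	unsigned int bits[] = { X86_BUG(9), X86_BUG(21), X86_BUG(1*32 + 0) };

	for (unsigned int i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
		printf("feature bit %u -> word %u, bit %u\n",
		       bits[i], bits[i] / 32, bits[i] % 32);
	return 0;
}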
+#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ -#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ -#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ -#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ -#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e022e6eb766c..82c6a4d350e0 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -566,6 +566,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* KeyID partitioning between MKTME and TDX */ #define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 @@ -660,6 +666,8 @@ #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 +#define MSR_SVSM_CAA 0xc001f000 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -781,6 +789,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K7_HWCR_CPB_DIS_BIT 25 +#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 @@ -1164,6 +1174,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529..bf57a824f722 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE 
(1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -697,6 +698,11 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) @@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index 80e1df482337..1814b413fd57 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -115,6 +115,7 @@ #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index e0c25b75327e..d751eb8585d0 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -1,451 +1,1053 @@ -# The basic row format is: -# LEAF, SUBLEAF, register_name, bits, short_name, long_description - -# Leaf 00H - 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs - -# Leaf 01H - 1, 0, EAX, 3:0, stepping, Stepping ID - 1, 0, EAX, 7:4, model, Model - 1, 0, EAX, 11:8, family, Family ID - 1, 0, EAX, 13:12, processor, Processor Type - 1, 0, EAX, 19:16, model_ext, Extended Model ID - 1, 0, EAX, 27:20, family_ext, Extended Family ID - - 1, 0, EBX, 7:0, brand, Brand Index - 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes - 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package - 1, 0, EBX, 31:24, apic_id, Initial APIC ID - - 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) - 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported - 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout - 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported - 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL - 1, 0, ECX, 5, vmx, Virtual Machine Extensions supported - 1, 0, ECX, 6, smx, Safer Mode Extension supported - 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep 
Technology - 1, 0, ECX, 8, tm2, Thermal Monitor 2 - 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) - 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) - 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported - 1, 0, ECX, 12, fma, FMA extensions using YMM state supported - 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported - 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported - 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present - 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present - 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported - 1, 0, ECX, 19, sse4_1, SSE4.1 feature present - 1, 0, ECX, 20, sse4_2, SSE4.2 feature present - 1, 0, ECX, 21, x2apic, x2APIC supported - 1, 0, ECX, 22, movbe, MOVBE instruction supported - 1, 0, ECX, 23, popcnt, POPCNT instruction supported - 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value - 1, 0, ECX, 25, aesni, AESNI instruction supported - 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) - 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 - 1, 0, ECX, 28, avx, AVX instruction supported - 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported - 1, 0, ECX, 30, rdrand, RDRAND instruction supported - - 1, 0, EDX, 0, fpu, x87 FPU on chip - 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement - 1, 0, EDX, 2, de, Debugging Extensions - 1, 0, EDX, 3, pse, Page Size Extensions - 1, 0, EDX, 4, tsc, Time Stamp Counter - 1, 0, EDX, 5, msr, RDMSR and WRMSR Support - 1, 0, EDX, 6, pae, Physical Address Extensions - 1, 0, EDX, 7, mce, Machine Check Exception - 1, 0, EDX, 8, cx8, CMPXCHG8B instr - 1, 0, EDX, 9, apic, APIC on Chip - 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs - 1, 0, EDX, 12, mtrr, Memory Type Range Registers - 1, 0, EDX, 13, pge, Page Global Bit - 1, 0, EDX, 14, mca, Machine Check Architecture - 1, 0, EDX, 15, cmov, Conditional Move Instrs - 1, 0, EDX, 16, pat, Page Attribute Table - 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension - 1, 0, EDX, 18, psn, Processor Serial Number - 1, 0, EDX, 19, clflush, CLFLUSH instr -# 1, 0, EDX, 20, - 1, 0, EDX, 21, ds, Debug Store - 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities - 1, 0, EDX, 23, mmx, Intel MMX Technology - 1, 0, EDX, 24, fxsr, XSAVE and FXRSTOR Instrs - 1, 0, EDX, 25, sse, SSE - 1, 0, EDX, 26, sse2, SSE2 - 1, 0, EDX, 27, ss, Self Snoop - 1, 0, EDX, 28, hit, Max APIC IDs - 1, 0, EDX, 29, tm, Thermal Monitor -# 1, 0, EDX, 30, - 1, 0, EDX, 31, pbe, Pending Break Enable - -# Leaf 02H -# cache and TLB descriptor info - -# Leaf 03H -# Precessor Serial Number, introduced on Pentium III, not valid for -# latest models - -# Leaf 04H -# thread/core and cache topology - 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified - 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) - 4, 0, EAX, 8, cache_self_init, Cache Self Initialization - 4, 0, EAX, 9, fully_associate, Fully Associative cache -# 4, 0, EAX, 13:10, resvd, resvd - 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache - 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package - - 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes - 4, 0, EBX, 21:12, cache_partition, Physical Line 
partitions - 4, 0, EBX, 31:22, cache_ways, Ways of associativity - 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 - 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not ganranteed to act upon lower level caches of non-originating threads sharing this cache - 4, 0, EDX, 1, c_incl, Whether cache is inclusive of lower cache level - 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing - -# Leaf 05H -# MONITOR/MWAIT - 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes - 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes - 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported - 5, 0, ECX, 1, mwait_irq_break, Largest monitor line size in bytes - 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT - 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT - 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT - 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT - 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT - 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT - 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT - 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT - -# Leaf 06H -# Thermal & Power Management - - 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported - 6, 0, EAX, 1, turbo, Intel Turbo Boost - 6, 0, EAX, 2, arat, Always running APIC timer -# 6, 0, EAX, 3, resv, Reserved - 6, 0, EAX, 4, pln, Power limit notifications supported - 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported - 6, 0, EAX, 6, ptm, Package thermal management supported - 6, 0, EAX, 7, hwp, HWP base register - 6, 0, EAX, 8, hwp_notify, HWP notification - 6, 0, EAX, 9, hwp_act_window, HWP activity window - 6, 0, EAX, 10, hwp_energy, HWP energy performance preference - 6, 0, EAX, 11, hwp_pkg_req, HWP package level request -# 6, 0, EAX, 12, resv, Reserved - 6, 0, EAX, 13, hdc, HDC base registers supported - 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 - 6, 0, EAX, 15, hwp_cap, Highest Performance change supported - 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported - 6, 0, EAX, 17, hwp_flex, Flexible HWP is supported - 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported -# 6, 0, EAX, 19, resv, Reserved - 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported - - 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor - 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF - 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported - -# Leaf 07H -# ECX == 0 -# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 -# XXX: Do we really need to enumerate each and every AVX512 sub features - - 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported - 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported - 7, 0, EBX, 2, sgx, Software Guard Extensions - 7, 0, EBX, 3, bmi1, BMI1 - 7, 0, EBX, 4, hle, Hardware Lock Elision - 7, 0, EBX, 5, avx2, AVX2 -# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions - 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention - 7, 0, EBX, 8, bmi2, BMI2 - 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB - 7, 0, EBX, 10, invpcid, INVPCID instruction - 7, 0, EBX, 11, rtm, Restricted Transactional Memory - 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring 
capability - 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS - 7, 0, EBX, 14, mpx, Memory Protection Extensions - 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability - 7, 0, EBX, 16, avx512f, AVX512 Foundation instr - 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr - 7, 0, EBX, 18, rdseed, RDSEED instr - 7, 0, EBX, 19, adx, ADX instr - 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention - 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add -# 7, 0, EBX, 22, resvd, resvd - 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr - 7, 0, EBX, 24, clwb, CLWB instr - 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr - 7, 0, EBX, 26, avx512pf, Prefetch - 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr - 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr - 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr - 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr - 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL) - 7, 0, ECX, 0, prefetchwt1, X - 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions - 7, 0, ECX, 2, umip, User-mode Instruction Prevention - - 7, 0, ECX, 3, pku, Protection Keys for User-mode pages - 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys -# 7, 0, ECX, 16:5, resvd, resvd - 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode - 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX -# 7, 0, ECX, 29:23, resvd, resvd - 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration -# 7, 0, ECX, 31, resvd, resvd - -# Leaf 08H -# - - -# Leaf 09H -# Direct Cache Access (DCA) information - 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP +# SPDX-License-Identifier: CC0-1.0 +# Generator: x86-cpuid-db v1.0 -# Leaf 0AH -# Architectural Performance Monitoring # -# Do we really need to print out the PMU related stuff? -# Does normal user really care about it? +# Auto-generated file. 
+# Please submit all updates and bugfixes to https://x86-cpuid.org # - 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version - 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Numer of general-purose PMU counters per logical CPU - 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit wideth of PMU counter - 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events - - 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available - 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available - 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available - 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available - 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available - 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available - 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available - - 0xA, 0, ECX, 4:0, pmu_fixed_cnt_num, Performance Monitoring Unit version - 0xA, 0, ECX, 12:5, pmu_fixed_cnt_bits, Numer of PMU counters per logical CPU - -# Leaf 0BH -# Extended Topology Enumeration Leaf -# - - 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type - 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type - 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core - 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID the current logical processor - - -# Leaf 0DH -# Processor Extended State - 0xD, 0, EAX, 0, x87, X87 state - 0xD, 0, EAX, 1, sse, SSE state - 0xD, 0, EAX, 2, avx, AVX state - 0xD, 0, EAX, 4:3, mpx, MPX state - 0xD, 0, EAX, 7:5, avx512, AVX-512 state - 0xD, 0, EAX, 9, pkru, PKRU state - - 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area - - 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available - 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported - 0xD, 1, EAX, 2, xgetbv, XGETBV supported - 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported - - 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 1, ECX, 8, pt, PT state - 0xD, 1, ECX, 11, cet_usr, CET user state - 0xD, 1, ECX, 12, cet_supv, CET supervisor state - 0xD, 1, ECX, 13, hdc, HDC state - 0xD, 1, ECX, 16, hwp, HWP state - -# Leaf 0FH -# Intel RDT Monitoring - - 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types - 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported - - 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this types - 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported - 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported - 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported +# The basic row format is: +# LEAF, SUBLEAVES, reg, bits, short_name , long_description + +# Leaf 0H +# Maximum standard leaf number + CPU vendor string + + 0, 0, eax, 31:0, max_std_leaf , Highest cpuid standard leaf supported + 0, 0, ebx, 31:0, cpu_vendorid_0 , CPU vendor ID string bytes 0 - 3 + 0, 0, ecx, 31:0, cpu_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0, 0, edx, 31:0, cpu_vendorid_1 , CPU vendor ID string bytes 4 - 7 + +# Leaf 1H +# CPU FMS (Family/Model/Stepping) + standard feature flags + + 1, 0, eax, 3:0, stepping , Stepping ID + 1, 0, eax, 7:4, base_model , Base CPU model ID 
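To make the row format described above concrete, a small self-contained C sketch (illustrative only; not part of the patch and not how kcpuid itself parses the file) that splits one row of the new CSV layout into its six fields. Only the first five commas delimit fields, since long_description may itself contain commas.

#include <stdio.h>
#include <string.h>

/* Strip leading/trailing blanks in place and return the trimmed start. */
static char *trim(char *s)
{
	char *end;

	while (*s == ' ' || *s == '\t')
		s++;
	end = s + strlen(s);
	while (end > s && (end[-1] == ' ' || end[-1] == '\t'))
		*--end = '\0';
	return s;
}

int main(void)
{
	/* One row in the new format (spacing compressed):
	 * LEAF, SUBLEAVES, reg, bits, short_name, long_description */
	char row[] = "1, 0, ecx, 5, vmx, Virtual Machine Extensions";
	char *fields[6];
	char *p = row;
	int i;

	/* The first five fields end at a comma; the remainder is the description. */
	for (i = 0; i < 5; i++) {
		fields[i] = p;
		p = strchr(p, ',');
		if (!p)
			return 1;	/* malformed row */
		*p++ = '\0';
	}
	fields[5] = p;

	printf("leaf=%s subleaves=%s reg=%s bits=%s name=%s desc='%s'\n",
	       trim(fields[0]), trim(fields[1]), trim(fields[2]),
	       trim(fields[3]), trim(fields[4]), trim(fields[5]));
	return 0;
}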
+ 1, 0, eax, 11:8, base_family_id , Base CPU family ID + 1, 0, eax, 13:12, cpu_type , CPU type + 1, 0, eax, 19:16, ext_model , Extended CPU model ID + 1, 0, eax, 27:20, ext_family , Extended CPU family ID + 1, 0, ebx, 7:0, brand_id , Brand index + 1, 0, ebx, 15:8, clflush_size , CLFLUSH instruction cache line size + 1, 0, ebx, 23:16, n_logical_cpu , Logical CPU (HW threads) count + 1, 0, ebx, 31:24, local_apic_id , Initial local APIC physical ID + 1, 0, ecx, 0, pni , Streaming SIMD Extensions 3 (SSE3) + 1, 0, ecx, 1, pclmulqdq , PCLMULQDQ instruction support + 1, 0, ecx, 2, dtes64 , 64-bit DS save area + 1, 0, ecx, 3, monitor , MONITOR/MWAIT support + 1, 0, ecx, 4, ds_cpl , CPL Qualified Debug Store + 1, 0, ecx, 5, vmx , Virtual Machine Extensions + 1, 0, ecx, 6, smx , Safer Mode Extensions + 1, 0, ecx, 7, est , Enhanced Intel SpeedStep + 1, 0, ecx, 8, tm2 , Thermal Monitor 2 + 1, 0, ecx, 9, ssse3 , Supplemental SSE3 + 1, 0, ecx, 10, cid , L1 Context ID + 1, 0, ecx, 11, sdbg , Sillicon Debug + 1, 0, ecx, 12, fma , FMA extensions using YMM state + 1, 0, ecx, 13, cx16 , CMPXCHG16B instruction support + 1, 0, ecx, 14, xtpr , xTPR Update Control + 1, 0, ecx, 15, pdcm , Perfmon and Debug Capability + 1, 0, ecx, 17, pcid , Process-context identifiers + 1, 0, ecx, 18, dca , Direct Cache Access + 1, 0, ecx, 19, sse4_1 , SSE4.1 + 1, 0, ecx, 20, sse4_2 , SSE4.2 + 1, 0, ecx, 21, x2apic , X2APIC support + 1, 0, ecx, 22, movbe , MOVBE instruction support + 1, 0, ecx, 23, popcnt , POPCNT instruction support + 1, 0, ecx, 24, tsc_deadline_timer , APIC timer one-shot operation + 1, 0, ecx, 25, aes , AES instructions + 1, 0, ecx, 26, xsave , XSAVE (and related instructions) support + 1, 0, ecx, 27, osxsave , XSAVE (and related instructions) are enabled by OS + 1, 0, ecx, 28, avx , AVX instructions support + 1, 0, ecx, 29, f16c , Half-precision floating-point conversion support + 1, 0, ecx, 30, rdrand , RDRAND instruction support + 1, 0, ecx, 31, guest_status , System is running as guest; (para-)virtualized system + 1, 0, edx, 0, fpu , Floating-Point Unit on-chip (x87) + 1, 0, edx, 1, vme , Virtual-8086 Mode Extensions + 1, 0, edx, 2, de , Debugging Extensions + 1, 0, edx, 3, pse , Page Size Extension + 1, 0, edx, 4, tsc , Time Stamp Counter + 1, 0, edx, 5, msr , Model-Specific Registers (RDMSR and WRMSR support) + 1, 0, edx, 6, pae , Physical Address Extensions + 1, 0, edx, 7, mce , Machine Check Exception + 1, 0, edx, 8, cx8 , CMPXCHG8B instruction + 1, 0, edx, 9, apic , APIC on-chip + 1, 0, edx, 11, sep , SYSENTER, SYSEXIT, and associated MSRs + 1, 0, edx, 12, mtrr , Memory Type Range Registers + 1, 0, edx, 13, pge , Page Global Extensions + 1, 0, edx, 14, mca , Machine Check Architecture + 1, 0, edx, 15, cmov , Conditional Move Instruction + 1, 0, edx, 16, pat , Page Attribute Table + 1, 0, edx, 17, pse36 , Page Size Extension (36-bit) + 1, 0, edx, 18, pn , Processor Serial Number + 1, 0, edx, 19, clflush , CLFLUSH instruction + 1, 0, edx, 21, dts , Debug Store + 1, 0, edx, 22, acpi , Thermal monitor and clock control + 1, 0, edx, 23, mmx , MMX instructions + 1, 0, edx, 24, fxsr , FXSAVE and FXRSTOR instructions + 1, 0, edx, 25, sse , SSE instructions + 1, 0, edx, 26, sse2 , SSE2 instructions + 1, 0, edx, 27, ss , Self Snoop + 1, 0, edx, 28, ht , Hyper-threading + 1, 0, edx, 29, tm , Thermal Monitor + 1, 0, edx, 30, ia64 , Legacy IA-64 (Itanium) support bit, now resreved + 1, 0, edx, 31, pbe , Pending Break Enable + +# Leaf 2H +# Intel cache and TLB information one-byte descriptors + + 2, 0, eax, 7:0, 
iteration_count , Number of times this CPUD leaf must be queried + 2, 0, eax, 15:8, desc1 , Descriptor #1 + 2, 0, eax, 23:16, desc2 , Descriptor #2 + 2, 0, eax, 30:24, desc3 , Descriptor #3 + 2, 0, eax, 31, eax_invalid , Descriptors 1-3 are invalid if set + 2, 0, ebx, 7:0, desc4 , Descriptor #4 + 2, 0, ebx, 15:8, desc5 , Descriptor #5 + 2, 0, ebx, 23:16, desc6 , Descriptor #6 + 2, 0, ebx, 30:24, desc7 , Descriptor #7 + 2, 0, ebx, 31, ebx_invalid , Descriptors 4-7 are invalid if set + 2, 0, ecx, 7:0, desc8 , Descriptor #8 + 2, 0, ecx, 15:8, desc9 , Descriptor #9 + 2, 0, ecx, 23:16, desc10 , Descriptor #10 + 2, 0, ecx, 30:24, desc11 , Descriptor #11 + 2, 0, ecx, 31, ecx_invalid , Descriptors 8-11 are invalid if set + 2, 0, edx, 7:0, desc12 , Descriptor #12 + 2, 0, edx, 15:8, desc13 , Descriptor #13 + 2, 0, edx, 23:16, desc14 , Descriptor #14 + 2, 0, edx, 30:24, desc15 , Descriptor #15 + 2, 0, edx, 31, edx_invalid , Descriptors 12-15 are invalid if set + +# Leaf 4H +# Intel deterministic cache parameters + + 4, 31:0, eax, 4:0, cache_type , Cache type field + 4, 31:0, eax, 7:5, cache_level , Cache level (1-based) + 4, 31:0, eax, 8, cache_self_init , Self-initialializing cache level + 4, 31:0, eax, 9, fully_associative , Fully-associative cache + 4, 31:0, eax, 25:14, num_threads_sharing , Number logical CPUs sharing this cache + 4, 31:0, eax, 31:26, num_cores_on_die , Number of cores in the physical package + 4, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) + 4, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) + 4, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) + 4, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) + 4, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches + 4, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + 4, 31:0, edx, 2, complex_indexing , Not a direct-mapped cache (complex function) + +# Leaf 5H +# MONITOR/MWAIT instructions enumeration + + 5, 0, eax, 15:0, min_mon_size , Smallest monitor-line size, in bytes + 5, 0, ebx, 15:0, max_mon_size , Largest monitor-line size, in bytes + 5, 0, ecx, 0, mwait_ext , Enumeration of MONITOR/MWAIT extensions is supported + 5, 0, ecx, 1, mwait_irq_break , Interrupts as a break-event for MWAIT is supported + 5, 0, edx, 3:0, n_c0_substates , Number of C0 sub C-states supported using MWAIT + 5, 0, edx, 7:4, n_c1_substates , Number of C1 sub C-states supported using MWAIT + 5, 0, edx, 11:8, n_c2_substates , Number of C2 sub C-states supported using MWAIT + 5, 0, edx, 15:12, n_c3_substates , Number of C3 sub C-states supported using MWAIT + 5, 0, edx, 19:16, n_c4_substates , Number of C4 sub C-states supported using MWAIT + 5, 0, edx, 23:20, n_c5_substates , Number of C5 sub C-states supported using MWAIT + 5, 0, edx, 27:24, n_c6_substates , Number of C6 sub C-states supported using MWAIT + 5, 0, edx, 31:28, n_c7_substates , Number of C7 sub C-states supported using MWAIT + +# Leaf 6H +# Thermal and Power Management enumeration + + 6, 0, eax, 0, dtherm , Digital temprature sensor + 6, 0, eax, 1, turbo_boost , Intel Turbo Boost + 6, 0, eax, 2, arat , Always-Running APIC Timer (not affected by p-state) + 6, 0, eax, 4, pln , Power Limit Notification (PLN) event + 6, 0, eax, 5, ecmd , Clock modulation duty cycle extension + 6, 0, eax, 6, pts , Package thermal management + 6, 0, eax, 7, hwp , HWP (Hardware P-states) base registers are supported + 6, 0, eax, 8, hwp_notify , HWP notification 
(IA32_HWP_INTERRUPT MSR) + 6, 0, eax, 9, hwp_act_window , HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported + 6, 0, eax, 10, hwp_epp , HWP Energy Performance Preference + 6, 0, eax, 11, hwp_pkg_req , HWP Package Level Request + 6, 0, eax, 13, hdc_base_regs , HDC base registers are supported + 6, 0, eax, 14, turbo_boost_3_0 , Intel Turbo Boost Max 3.0 + 6, 0, eax, 15, hwp_capabilities , HWP Highest Performance change + 6, 0, eax, 16, hwp_peci_override , HWP PECI override + 6, 0, eax, 17, hwp_flexible , Flexible HWP + 6, 0, eax, 18, hwp_fast , IA32_HWP_REQUEST MSR fast access mode + 6, 0, eax, 19, hfi , HW_FEEDBACK MSRs supported + 6, 0, eax, 20, hwp_ignore_idle , Ignoring idle logical CPU HWP req is supported + 6, 0, eax, 23, thread_director , Intel thread director support + 6, 0, eax, 24, therm_interrupt_bit25 , IA32_THERM_INTERRUPT MSR bit 25 is supported + 6, 0, ebx, 3:0, n_therm_thresholds , Digital thermometer thresholds + 6, 0, ecx, 0, aperfmperf , MPERF/APERF MSRs (effective frequency interface) + 6, 0, ecx, 3, epb , IA32_ENERGY_PERF_BIAS MSR support + 6, 0, ecx, 15:8, thrd_director_nclasses , Number of classes, Intel thread director + 6, 0, edx, 0, perfcap_reporting , Performance capability reporting + 6, 0, edx, 1, encap_reporting , Energy efficiency capability reporting + 6, 0, edx, 11:8, feedback_sz , HW feedback interface struct size, in 4K pages + 6, 0, edx, 31:16, this_lcpu_hwfdbk_idx , This logical CPU index @ HW feedback struct, 0-based + +# Leaf 7H +# Extended CPU features enumeration + + 7, 0, eax, 31:0, leaf7_n_subleaves , Number of cpuid 0x7 subleaves + 7, 0, ebx, 0, fsgsbase , FSBASE/GSBASE read/write support + 7, 0, ebx, 1, tsc_adjust , IA32_TSC_ADJUST MSR supported + 7, 0, ebx, 2, sgx , Intel SGX (Software Guard Extensions) + 7, 0, ebx, 3, bmi1 , Bit manipulation extensions group 1 + 7, 0, ebx, 4, hle , Hardware Lock Elision + 7, 0, ebx, 5, avx2 , AVX2 instruction set + 7, 0, ebx, 6, fdp_excptn_only , FPU Data Pointer updated only on x87 exceptions + 7, 0, ebx, 7, smep , Supervisor Mode Execution Protection + 7, 0, ebx, 8, bmi2 , Bit manipulation extensions group 2 + 7, 0, ebx, 9, erms , Enhanced REP MOVSB/STOSB + 7, 0, ebx, 10, invpcid , INVPCID instruction (Invalidate Processor Context ID) + 7, 0, ebx, 11, rtm , Intel restricted transactional memory + 7, 0, ebx, 12, cqm , Intel RDT-CMT / AMD Platform-QoS cache monitoring + 7, 0, ebx, 13, zero_fcs_fds , Deprecated FPU CS/DS (stored as zero) + 7, 0, ebx, 14, mpx , Intel memory protection extensions + 7, 0, ebx, 15, rdt_a , Intel RDT / AMD Platform-QoS Enforcemeent + 7, 0, ebx, 16, avx512f , AVX-512 foundation instructions + 7, 0, ebx, 17, avx512dq , AVX-512 double/quadword instructions + 7, 0, ebx, 18, rdseed , RDSEED instruction + 7, 0, ebx, 19, adx , ADCX/ADOX instructions + 7, 0, ebx, 20, smap , Supervisor mode access prevention + 7, 0, ebx, 21, avx512ifma , AVX-512 integer fused multiply add + 7, 0, ebx, 23, clflushopt , CLFLUSHOPT instruction + 7, 0, ebx, 24, clwb , CLWB instruction + 7, 0, ebx, 25, intel_pt , Intel processor trace + 7, 0, ebx, 26, avx512pf , AVX-512 prefetch instructions + 7, 0, ebx, 27, avx512er , AVX-512 exponent/reciprocal instrs + 7, 0, ebx, 28, avx512cd , AVX-512 conflict detection instrs + 7, 0, ebx, 29, sha_ni , SHA/SHA256 instructions + 7, 0, ebx, 30, avx512bw , AVX-512 BW (byte/word granular) instructions + 7, 0, ebx, 31, avx512vl , AVX-512 VL (128/256 vector length) extensions + 7, 0, ecx, 0, prefetchwt1 , PREFETCHWT1 (Intel Xeon Phi only) + 7, 0, ecx, 1, avx512vbmi , 
AVX-512 Vector byte manipulation instrs + 7, 0, ecx, 2, umip , User mode instruction protection + 7, 0, ecx, 3, pku , Protection keys for user-space + 7, 0, ecx, 4, ospke , OS protection keys enable + 7, 0, ecx, 5, waitpkg , WAITPKG instructions + 7, 0, ecx, 6, avx512_vbmi2 , AVX-512 vector byte manipulation instrs group 2 + 7, 0, ecx, 7, cet_ss , CET shadow stack features + 7, 0, ecx, 8, gfni , Galois field new instructions + 7, 0, ecx, 9, vaes , Vector AES instrs + 7, 0, ecx, 10, vpclmulqdq , VPCLMULQDQ 256-bit instruction support + 7, 0, ecx, 11, avx512_vnni , Vector neural network instructions + 7, 0, ecx, 12, avx512_bitalg , AVX-512 bit count/shiffle + 7, 0, ecx, 13, tme , Intel total memory encryption + 7, 0, ecx, 14, avx512_vpopcntdq , AVX-512: POPCNT for vectors of DW/QW + 7, 0, ecx, 16, la57 , 57-bit linear addreses (five-level paging) + 7, 0, ecx, 21:17, mawau_val_lm , BNDLDX/BNDSTX MAWAU value in 64-bit mode + 7, 0, ecx, 22, rdpid , RDPID instruction + 7, 0, ecx, 23, key_locker , Intel key locker support + 7, 0, ecx, 24, bus_lock_detect , OS bus-lock detection + 7, 0, ecx, 25, cldemote , CLDEMOTE instruction + 7, 0, ecx, 27, movdiri , MOVDIRI instruction + 7, 0, ecx, 28, movdir64b , MOVDIR64B instruction + 7, 0, ecx, 29, enqcmd , Enqueue stores supported (ENQCMD{,S}) + 7, 0, ecx, 30, sgx_lc , Intel SGX launch configuration + 7, 0, ecx, 31, pks , Protection keys for supervisor-mode pages + 7, 0, edx, 1, sgx_keys , Intel SGX attestation services + 7, 0, edx, 2, avx512_4vnniw , AVX-512 neural network instructions + 7, 0, edx, 3, avx512_4fmaps , AVX-512 multiply accumulation single precision + 7, 0, edx, 4, fsrm , Fast short REP MOV + 7, 0, edx, 5, uintr , CPU supports user interrupts + 7, 0, edx, 8, avx512_vp2intersect , VP2INTERSECT{D,Q} instructions + 7, 0, edx, 9, srdbs_ctrl , SRBDS mitigation MSR available + 7, 0, edx, 10, md_clear , VERW MD_CLEAR microcode support + 7, 0, edx, 11, rtm_always_abort , XBEGIN (RTM transaction) always aborts + 7, 0, edx, 13, tsx_force_abort , MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported + 7, 0, edx, 14, serialize , SERIALIZE instruction + 7, 0, edx, 15, hybrid_cpu , The CPU is identified as a 'hybrid part' + 7, 0, edx, 16, tsxldtrk , TSX suspend/resume load address tracking + 7, 0, edx, 18, pconfig , PCONFIG instruction + 7, 0, edx, 19, arch_lbr , Intel architectural LBRs + 7, 0, edx, 20, ibt , CET indirect branch tracking + 7, 0, edx, 22, amx_bf16 , AMX-BF16: tile bfloat16 support + 7, 0, edx, 23, avx512_fp16 , AVX-512 FP16 instructions + 7, 0, edx, 24, amx_tile , AMX-TILE: tile architecture support + 7, 0, edx, 25, amx_int8 , AMX-INT8: tile 8-bit integer support + 7, 0, edx, 26, spec_ctrl , Speculation Control (IBRS/IBPB: indirect branch restrictions) + 7, 0, edx, 27, intel_stibp , Single thread indirect branch predictors + 7, 0, edx, 28, flush_l1d , FLUSH L1D cache: IA32_FLUSH_CMD MSR + 7, 0, edx, 29, arch_capabilities , Intel IA32_ARCH_CAPABILITIES MSR + 7, 0, edx, 30, core_capabilities , IA32_CORE_CAPABILITIES MSR + 7, 0, edx, 31, spec_ctrl_ssbd , Speculative store bypass disable + 7, 1, eax, 4, avx_vnni , AVX-VNNI instructions + 7, 1, eax, 5, avx512_bf16 , AVX-512 bFloat16 instructions + 7, 1, eax, 6, lass , Linear address space separation + 7, 1, eax, 7, cmpccxadd , CMPccXADD instructions + 7, 1, eax, 8, arch_perfmon_ext , ArchPerfmonExt: CPUID leaf 0x23 is supported + 7, 1, eax, 10, fzrm , Fast zero-length REP MOVSB + 7, 1, eax, 11, fsrs , Fast short REP STOSB + 7, 1, eax, 12, fsrc , Fast Short REP CMPSB/SCASB + 7, 1, eax, 17, fred , 
FRED: Flexible return and event delivery transitions + 7, 1, eax, 18, lkgs , LKGS: Load 'kernel' (userspace) GS + 7, 1, eax, 19, wrmsrns , WRMSRNS instr (WRMSR-non-serializing) + 7, 1, eax, 21, amx_fp16 , AMX-FP16: FP16 tile operations + 7, 1, eax, 22, hreset , History reset support + 7, 1, eax, 23, avx_ifma , Integer fused multiply add + 7, 1, eax, 26, lam , Linear address masking + 7, 1, eax, 27, rd_wr_msrlist , RDMSRLIST/WRMSRLIST instructions + 7, 1, ebx, 0, intel_ppin , Protected processor inventory number (PPIN{,_CTL} MSRs) + 7, 1, edx, 4, avx_vnni_int8 , AVX-VNNI-INT8 instructions + 7, 1, edx, 5, avx_ne_convert , AVX-NE-CONVERT instructions + 7, 1, edx, 8, amx_complex , AMX-COMPLEX instructions (starting from Granite Rapids) + 7, 1, edx, 14, prefetchit_0_1 , PREFETCHIT0/1 instructions + 7, 1, edx, 18, cet_sss , CET supervisor shadow stacks safe to use + 7, 2, edx, 0, intel_psfd , Intel predictive store forward disable + 7, 2, edx, 1, ipred_ctrl , MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S} + 7, 2, edx, 2, rrsba_ctrl , MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S} + 7, 2, edx, 3, ddp_ctrl , MSR bit IA32_SPEC_CTRL.DDPD_U + 7, 2, edx, 4, bhi_ctrl , MSR bit IA32_SPEC_CTRL.BHI_DIS_S + 7, 2, edx, 5, mcdt_no , MCDT mitigation not needed + 7, 2, edx, 6, uclock_disable , UC-lock disable is supported + +# Leaf 9H +# Intel DCA (Direct Cache Access) enumeration + + 9, 0, eax, 0, dca_enabled_in_bios , DCA is enabled in BIOS + +# Leaf AH +# Intel PMU (Performance Monitoring Unit) enumeration + + 0xa, 0, eax, 7:0, pmu_version , Performance monitoring unit version ID + 0xa, 0, eax, 15:8, pmu_n_gcounters , Number of general PMU counters per logical CPU + 0xa, 0, eax, 23:16, pmu_gcounters_nbits , Bitwidth of PMU general counters + 0xa, 0, eax, 31:24, pmu_cpuid_ebx_bits , Length of cpuid leaf 0xa EBX bit vector + 0xa, 0, ebx, 0, no_core_cycle_evt , Core cycle event not available + 0xa, 0, ebx, 1, no_insn_retired_evt , Instruction retired event not available + 0xa, 0, ebx, 2, no_refcycle_evt , Reference cycles event not available + 0xa, 0, ebx, 3, no_llc_ref_evt , LLC-reference event not available + 0xa, 0, ebx, 4, no_llc_miss_evt , LLC-misses event not available + 0xa, 0, ebx, 5, no_br_insn_ret_evt , Branch instruction retired event not available + 0xa, 0, ebx, 6, no_br_mispredict_evt , Branch mispredict retired event not available + 0xa, 0, ebx, 7, no_td_slots_evt , Topdown slots event not available + 0xa, 0, ecx, 31:0, pmu_fcounters_bitmap , Fixed-function PMU counters support bitmap + 0xa, 0, edx, 4:0, pmu_n_fcounters , Number of fixed PMU counters + 0xa, 0, edx, 12:5, pmu_fcounters_nbits , Bitwidth of PMU fixed counters + 0xa, 0, edx, 15, anythread_depr , AnyThread deprecation + +# Leaf BH +# CPUs v1 extended topology enumeration + + 0xb, 1:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0xb, 1:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0xb, 1:0, ecx, 7:0, domain_nr , This domain level (subleaf ID) + 0xb, 1:0, ecx, 15:8, domain_type , This domain type + 0xb, 1:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU + +# Leaf DH +# Processor extended state enumeration + + 0xd, 0, eax, 0, xcr0_x87 , XCR0.X87 (bit 0) supported + 0xd, 0, eax, 1, xcr0_sse , XCR0.SEE (bit 1) supported + 0xd, 0, eax, 2, xcr0_avx , XCR0.AVX (bit 2) supported + 0xd, 0, eax, 3, xcr0_mpx_bndregs , XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0xd, 0, eax, 4, xcr0_mpx_bndcsr , XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 
0xd, 0, eax, 5, xcr0_avx512_opmask , XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0xd, 0, eax, 6, xcr0_avx512_zmm_hi256 , XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0xd, 0, eax, 7, xcr0_avx512_hi16_zmm , XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0xd, 0, eax, 9, xcr0_pkru , XCR0.PKRU (bit 9) supported (XSAVE PKRU reg) + 0xd, 0, eax, 11, xcr0_cet_u , AMD XCR0.CET_U (bit 11) supported (CET supervisor state) + 0xd, 0, eax, 12, xcr0_cet_s , AMD XCR0.CET_S (bit 12) support (CET user state) + 0xd, 0, eax, 17, xcr0_tileconfig , XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0xd, 0, eax, 18, xcr0_tiledata , XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0xd, 0, ebx, 31:0, xsave_sz_xcr0_enabled , XSAVE/XRSTR area byte size, for XCR0 enabled features + 0xd, 0, ecx, 31:0, xsave_sz_max , XSAVE/XRSTR area max byte size, all CPU features + 0xd, 0, edx, 30, xcr0_lwp , AMD XCR0.LWP (bit 62) supported (Light-weight Profiling) + 0xd, 1, eax, 0, xsaveopt , XSAVEOPT instruction + 0xd, 1, eax, 1, xsavec , XSAVEC instruction + 0xd, 1, eax, 2, xgetbv1 , XGETBV instruction with ECX = 1 + 0xd, 1, eax, 3, xsaves , XSAVES/XRSTORS instructions (and XSS MSR) + 0xd, 1, eax, 4, xfd , Extended feature disable support + 0xd, 1, ebx, 31:0, xsave_sz_xcr0_xmms_enabled, XSAVE area size, all XCR0 and XMMS features enabled + 0xd, 1, ecx, 8, xss_pt , PT state, supported + 0xd, 1, ecx, 10, xss_pasid , PASID state, supported + 0xd, 1, ecx, 11, xss_cet_u , CET user state, supported + 0xd, 1, ecx, 12, xss_cet_p , CET supervisor state, supported + 0xd, 1, ecx, 13, xss_hdc , HDC state, supported + 0xd, 1, ecx, 14, xss_uintr , UINTR state, supported + 0xd, 1, ecx, 15, xss_lbr , LBR state, supported + 0xd, 1, ecx, 16, xss_hwp , HWP state, supported + 0xd, 63:2, eax, 31:0, xsave_sz , Size of save area for subleaf-N feature, in bytes + 0xd, 63:2, ebx, 31:0, xsave_offset , Offset of save area for subleaf-N feature, in bytes + 0xd, 63:2, ecx, 0, is_xss_bit , Subleaf N describes an XSS bit, otherwise XCR0 bit + 0xd, 63:2, ecx, 1, compacted_xsave_64byte_aligned, When compacted, subleaf-N feature xsave area is 64-byte aligned + +# Leaf FH +# Intel RDT / AMD PQoS resource monitoring + + 0xf, 0, ebx, 31:0, core_rmid_max , RMID max, within this core, all types (0-based) + 0xf, 0, edx, 1, cqm_llc , LLC QoS-monitoring supported + 0xf, 1, eax, 7:0, l3c_qm_bitwidth , L3 QoS-monitoring counter bitwidth (24-based) + 0xf, 1, eax, 8, l3c_qm_overflow_bit , QM_CTR MSR bit 61 is an overflow bit + 0xf, 1, ebx, 31:0, l3c_qm_conver_factor , QM_CTR MSR conversion factor to bytes + 0xf, 1, ecx, 31:0, l3c_qm_rmid_max , L3 QoS-monitoring max RMID + 0xf, 1, edx, 0, cqm_occup_llc , L3 QoS occupancy monitoring supported + 0xf, 1, edx, 1, cqm_mbm_total , L3 QoS total bandwidth monitoring supported + 0xf, 1, edx, 2, cqm_mbm_local , L3 QoS local bandwidth monitoring supported # Leaf 10H -# Intel RDT Allocation - - 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported - 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported - 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported - +# Intel RDT / AMD PQoS allocation enumeration + + 0x10, 0, ebx, 1, cat_l3 , L3 Cache Allocation Technology supported + 0x10, 0, ebx, 2, cat_l2 , L2 Cache Allocation Technology supported + 0x10, 0, ebx, 3, mba , Memory Bandwidth Allocation supported + 0x10, 2:1, eax, 4:0, cat_cbm_len , L3/L2_CAT capacity bitmask length, minus-one notation + 0x10, 2:1, ebx, 31:0, cat_units_bitmap , 
L3/L2_CAT bitmap of allocation units + 0x10, 2:1, ecx, 1, l3_cat_cos_infreq_updates, L3_CAT COS updates should be infrequent + 0x10, 2:1, ecx, 2, cdp_l3 , L3/L2_CAT CDP (Code and Data Prioritization) + 0x10, 2:1, ecx, 3, cat_sparse_1s , L3/L2_CAT non-contiguous 1s value supported + 0x10, 2:1, edx, 15:0, cat_cos_max , L3/L2_CAT max COS (Class of Service) supported + 0x10, 3, eax, 11:0, mba_max_delay , Max MBA throttling value; minus-one notation + 0x10, 3, ecx, 0, per_thread_mba , Per-thread MBA controls are supported + 0x10, 3, ecx, 2, mba_delay_linear , Delay values are linear + 0x10, 3, edx, 15:0, mba_cos_max , MBA max Class of Service supported # Leaf 12H -# SGX Capability -# -# Some detailed SGX features not added yet - - 0x12, 0, EAX, 0, sgx1, L3 Cache Allocation supported - 0x12, 1, EAX, 0, sgx2, L3 Cache Allocation supported - +# Intel Software Guard Extensions (SGX) enumeration + + 0x12, 0, eax, 0, sgx1 , SGX1 leaf functions supported + 0x12, 0, eax, 1, sgx2 , SGX2 leaf functions supported + 0x12, 0, eax, 5, enclv_leaves , ENCLV leaves (E{INC,DEC}VIRTCHILD, ESETCONTEXT) supported + 0x12, 0, eax, 6, encls_leaves , ENCLS leaves (ENCLS ETRACKC, ERDINFO, ELDBC, ELDUC) supported + 0x12, 0, eax, 7, enclu_everifyreport2 , ENCLU leaf EVERIFYREPORT2 supported + 0x12, 0, eax, 10, encls_eupdatesvn , ENCLS leaf EUPDATESVN supported + 0x12, 0, eax, 11, sgx_edeccssa , ENCLU leaf EDECCSSA supported + 0x12, 0, ebx, 0, miscselect_exinfo , SSA.MISC frame: reporting #PF and #GP exceptions inside enclave supported + 0x12, 0, ebx, 1, miscselect_cpinfo , SSA.MISC frame: reporting #CP exceptions inside enclave supported + 0x12, 0, edx, 7:0, max_enclave_sz_not64 , Maximum enclave size in non-64-bit mode (log2) + 0x12, 0, edx, 15:8, max_enclave_sz_64 , Maximum enclave size in 64-bit mode (log2) + 0x12, 1, eax, 0, secs_attr_init , ATTRIBUTES.INIT supported (enclave initialized by EINIT) + 0x12, 1, eax, 1, secs_attr_debug , ATTRIBUTES.DEBUG supported (enclave permits debugger read/write) + 0x12, 1, eax, 2, secs_attr_mode64bit , ATTRIBUTES.MODE64BIT supported (enclave runs in 64-bit mode) + 0x12, 1, eax, 4, secs_attr_provisionkey , ATTRIBUTES.PROVISIONKEY supported (provisioning key available) + 0x12, 1, eax, 5, secs_attr_einittoken_key, ATTRIBUTES.EINITTOKEN_KEY supported (EINIT token key available) + 0x12, 1, eax, 6, secs_attr_cet , ATTRIBUTES.CET supported (enable CET attributes) + 0x12, 1, eax, 7, secs_attr_kss , ATTRIBUTES.KSS supported (Key Separation and Sharing enabled) + 0x12, 1, eax, 10, secs_attr_aexnotify , ATTRIBUTES.AEXNOTIFY supported (enclave threads may get AEX notifications + 0x12, 1, ecx, 0, xfrm_x87 , Enclave XFRM.X87 (bit 0) supported + 0x12, 1, ecx, 1, xfrm_sse , Enclave XFRM.SEE (bit 1) supported + 0x12, 1, ecx, 2, xfrm_avx , Enclave XFRM.AVX (bit 2) supported + 0x12, 1, ecx, 3, xfrm_mpx_bndregs , Enclave XFRM.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0x12, 1, ecx, 4, xfrm_mpx_bndcsr , Enclave XFRM.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0x12, 1, ecx, 5, xfrm_avx512_opmask , Enclave XFRM.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0x12, 1, ecx, 6, xfrm_avx512_zmm_hi256 , Enclave XFRM.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0x12, 1, ecx, 7, xfrm_avx512_hi16_zmm , Enclave XFRM.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0x12, 1, ecx, 9, xfrm_pkru , Enclave XFRM.PKRU (bit 9) supported (XSAVE PKRU reg) + 0x12, 1, ecx, 17, xfrm_tileconfig , Enclave XFRM.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0x12, 1, ecx, 18, 
xfrm_tiledata , Enclave XFRM.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0x12, 31:2, eax, 3:0, subleaf_type , Subleaf type (dictates output layout) + 0x12, 31:2, eax, 31:12, epc_sec_base_addr_0 , EPC section base addr, bits[12:31] + 0x12, 31:2, ebx, 19:0, epc_sec_base_addr_1 , EPC section base addr, bits[32:51] + 0x12, 31:2, ecx, 3:0, epc_sec_type , EPC section type / property encoding + 0x12, 31:2, ecx, 31:12, epc_sec_size_0 , EPC section size, bits[12:31] + 0x12, 31:2, edx, 19:0, epc_sec_size_1 , EPC section size, bits[32:51] # Leaf 14H -# Intel Processor Tracer -# +# Intel Processor Trace enumeration + + 0x14, 0, eax, 31:0, pt_max_subleaf , Max cpuid 0x14 subleaf + 0x14, 0, ebx, 0, cr3_filtering , IA32_RTIT_CR3_MATCH is accessible + 0x14, 0, ebx, 1, psb_cyc , Configurable PSB and cycle-accurate mode + 0x14, 0, ebx, 2, ip_filtering , IP/TraceStop filtering; Warm-reset PT MSRs preservation + 0x14, 0, ebx, 3, mtc_timing , MTC timing packet; COFI-based packets suppression + 0x14, 0, ebx, 4, ptwrite , PTWRITE support + 0x14, 0, ebx, 5, power_event_trace , Power Event Trace support + 0x14, 0, ebx, 6, psb_pmi_preserve , PSB and PMI preservation support + 0x14, 0, ebx, 7, event_trace , Event Trace packet generation through IA32_RTIT_CTL.EventEn + 0x14, 0, ebx, 8, tnt_disable , TNT packet generation disable through IA32_RTIT_CTL.DisTNT + 0x14, 0, ecx, 0, topa_output , ToPA output scheme support + 0x14, 0, ecx, 1, topa_multiple_entries , ToPA tables can hold multiple entries + 0x14, 0, ecx, 2, single_range_output , Single-range output scheme supported + 0x14, 0, ecx, 3, trance_transport_output, Trace Transport subsystem output support + 0x14, 0, ecx, 31, ip_payloads_lip , IP payloads have LIP values (CS base included) + 0x14, 1, eax, 2:0, num_address_ranges , Filtering number of configurable Address Ranges + 0x14, 1, eax, 31:16, mtc_periods_bmp , Bitmap of supported MTC period encodings + 0x14, 1, ebx, 15:0, cycle_thresholds_bmp , Bitmap of supported Cycle Threshold encodings + 0x14, 1, ebx, 31:16, psb_periods_bmp , Bitmap of supported Configurable PSB frequency encodings # Leaf 15H -# Time Stamp Counter and Nominal Core Crystal Clock Information +# Intel TSC (Time Stamp Counter) enumeration - 0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/”core crystal clock” ratio - 0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/”core crystal clock” ratio - 0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz + 0x15, 0, eax, 31:0, tsc_denominator , Denominator of the TSC/'core crystal clock' ratio + 0x15, 0, ebx, 31:0, tsc_numerator , Numerator of the TSC/'core crystal clock' ratio + 0x15, 0, ecx, 31:0, cpu_crystal_hz , Core crystal clock nominal frequency, in Hz # Leaf 16H -# Processor Frequency Information +# Intel processor fequency enumeration - 0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz - 0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz - 0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz + 0x16, 0, eax, 15:0, cpu_base_mhz , Processor base frequency, in MHz + 0x16, 0, ebx, 15:0, cpu_max_mhz , Processor max frequency, in MHz + 0x16, 0, ecx, 15:0, bus_mhz , Bus reference frequency, in MHz # Leaf 17H -# System-On-Chip Vendor Attribute - - 0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf - 0x17, 0, EBX, 15:0, soc_vid, SOC Vendor ID - 0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme - 0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by 
vendor - 0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID +# Intel SoC vendor attributes enumeration + + 0x17, 0, eax, 31:0, soc_max_subleaf , Max cpuid leaf 0x17 subleaf + 0x17, 0, ebx, 15:0, soc_vendor_id , SoC vendor ID + 0x17, 0, ebx, 16, is_vendor_scheme , Assigned by industry enumeration scheme (not Intel) + 0x17, 0, ecx, 31:0, soc_proj_id , SoC project ID, assigned by vendor + 0x17, 0, edx, 31:0, soc_stepping_id , SoC project stepping ID, assigned by vendor + 0x17, 3:1, eax, 31:0, vendor_brand_a , Vendor Brand ID string, bytes subleaf_nr * (0 -> 3) + 0x17, 3:1, ebx, 31:0, vendor_brand_b , Vendor Brand ID string, bytes subleaf_nr * (4 -> 7) + 0x17, 3:1, ecx, 31:0, vendor_brand_c , Vendor Brand ID string, bytes subleaf_nr * (8 -> 11) + 0x17, 3:1, edx, 31:0, vendor_brand_d , Vendor Brand ID string, bytes subleaf_nr * (12 -> 15) # Leaf 18H -# Deterministic Address Translation Parameters - +# Intel deterministic address translation (TLB) parameters + + 0x18, 31:0, eax, 31:0, tlb_max_subleaf , Max cpuid 0x18 subleaf + 0x18, 31:0, ebx, 0, tlb_4k_page , TLB 4KB-page entries supported + 0x18, 31:0, ebx, 1, tlb_2m_page , TLB 2MB-page entries supported + 0x18, 31:0, ebx, 2, tlb_4m_page , TLB 4MB-page entries supported + 0x18, 31:0, ebx, 3, tlb_1g_page , TLB 1GB-page entries supported + 0x18, 31:0, ebx, 10:8, hard_partitioning , (Hard/Soft) partitioning between logical CPUs sharing this struct + 0x18, 31:0, ebx, 31:16, n_way_associative , Ways of associativity + 0x18, 31:0, ecx, 31:0, n_sets , Number of sets + 0x18, 31:0, edx, 4:0, tlb_type , Translation cache type (TLB type) + 0x18, 31:0, edx, 7:5, tlb_cache_level , Translation cache level (1-based) + 0x18, 31:0, edx, 8, is_fully_associative , Fully-associative structure + 0x18, 31:0, edx, 25:14, tlb_max_addressible_ids, Max num of addressable IDs for logical CPUs sharing this TLB - 1 # Leaf 19H -# Key Locker Leaf +# Intel Key Locker enumeration + 0x19, 0, eax, 0, kl_cpl0_only , CPL0-only key Locker restriction supported + 0x19, 0, eax, 1, kl_no_encrypt , No-encrypt key locker restriction supported + 0x19, 0, eax, 2, kl_no_decrypt , No-decrypt key locker restriction supported + 0x19, 0, ebx, 0, aes_keylocker , AES key locker instructions supported + 0x19, 0, ebx, 2, aes_keylocker_wide , AES wide key locker instructions supported + 0x19, 0, ebx, 4, kl_msr_iwkey , Key locker MSRs and IWKEY backups supported + 0x19, 0, ecx, 0, loadiwkey_no_backup , LOADIWKEY NoBackup parameter supported + 0x19, 0, ecx, 1, iwkey_rand , IWKEY randomization (KeySource encoding 1) supported # Leaf 1AH -# Hybrid Information - - 0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core - +# Intel hybrid CPUs identification (e.g.
Atom, Core) + + 0x1a, 0, eax, 23:0, core_native_model , This core's native model ID + 0x1a, 0, eax, 31:24, core_type , This core's type + +# Leaf 1BH +# Intel PCONFIG (Platform configuration) enumeration + + 0x1b, 31:0, eax, 11:0, pconfig_subleaf_type , CPUID 0x1b subleaf type + 0x1b, 31:0, ebx, 31:0, pconfig_target_id_x , A supported PCONFIG target ID + 0x1b, 31:0, ecx, 31:0, pconfig_target_id_y , A supported PCONFIG target ID + 0x1b, 31:0, edx, 31:0, pconfig_target_id_z , A supported PCONFIG target ID + +# Leaf 1CH +# Intel LBR (Last Branch Record) enumeration + + 0x1c, 0, eax, 0, lbr_depth_8 , Max stack depth (number of LBR entries) = 8 + 0x1c, 0, eax, 1, lbr_depth_16 , Max stack depth (number of LBR entries) = 16 + 0x1c, 0, eax, 2, lbr_depth_24 , Max stack depth (number of LBR entries) = 24 + 0x1c, 0, eax, 3, lbr_depth_32 , Max stack depth (number of LBR entries) = 32 + 0x1c, 0, eax, 4, lbr_depth_40 , Max stack depth (number of LBR entries) = 40 + 0x1c, 0, eax, 5, lbr_depth_48 , Max stack depth (number of LBR entries) = 48 + 0x1c, 0, eax, 6, lbr_depth_56 , Max stack depth (number of LBR entries) = 56 + 0x1c, 0, eax, 7, lbr_depth_64 , Max stack depth (number of LBR entries) = 64 + 0x1c, 0, eax, 30, lbr_deep_c_reset , LBRs maybe cleared on MWAIT C-state > C1 + 0x1c, 0, eax, 31, lbr_ip_is_lip , LBR IP contain Last IP, otherwise effective IP + 0x1c, 0, ebx, 0, lbr_cpl , CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported + 0x1c, 0, ebx, 1, lbr_branch_filter , Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported + 0x1c, 0, ebx, 2, lbr_call_stack , Call-stack mode (IA32_LBR_CTL[3] = 1) supported + 0x1c, 0, ecx, 0, lbr_mispredict , Branch misprediction bit supported (IA32_LBR_x_INFO[63]) + 0x1c, 0, ecx, 1, lbr_timed_lbr , Timed LBRs (CPU cycles since last LBR entry) supported + 0x1c, 0, ecx, 2, lbr_branch_type , Branch type field (IA32_LBR_INFO_x[59:56]) supported + 0x1c, 0, ecx, 19:16, lbr_events_gpc_bmp , LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters + +# Leaf 1DH +# Intel AMX (Advanced Matrix Extensions) tile information + + 0x1d, 0, eax, 31:0, amx_max_palette , Highest palette ID / subleaf ID + 0x1d, 1, eax, 15:0, amx_palette_size , AMX palette total tiles size, in bytes + 0x1d, 1, eax, 31:16, amx_tile_size , AMX single tile's size, in bytes + 0x1d, 1, ebx, 15:0, amx_tile_row_size , AMX tile single row's size, in bytes + 0x1d, 1, ebx, 31:16, amx_palette_nr_tiles , AMX palette number of tiles + 0x1d, 1, ecx, 15:0, amx_tile_nr_rows , AMX tile max number of rows + +# Leaf 1EH +# Intel AMX, TMUL (Tile-matrix MULtiply) accelerator unit enumeration + + 0x1e, 0, ebx, 7:0, tmul_maxk , TMUL unit maximum height, K (rows or columns) + 0x1e, 0, ebx, 23:8, tmul_maxn , TMUL unit maxiumum SIMD dimension, N (column bytes) # Leaf 1FH -# V2 Extended Topology - A preferred superset to leaf 0BH - - -# According to SDM -# 40000000H - 4FFFFFFFH is invalid range +# Intel extended topology enumeration v2 + + 0x1f, 5:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0x1f, 5:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0x1f, 5:0, ecx, 7:0, domain_level , This domain level (subleaf ID) + 0x1f, 5:0, ecx, 15:8, domain_type , This domain type + 0x1f, 5:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU + +# Leaf 20H +# Intel HRESET (History Reset) enumeration + + 0x20, 0, eax, 31:0, hreset_nr_subleaves , CPUID 0x20 max subleaf + 1 + 0x20, 0, ebx, 0, hreset_thread_director , HRESET of Intel 
thread director is supported + +# Leaf 21H +# Intel TD (Trust Domain) guest execution environment enumeration + + 0x21, 0, ebx, 31:0, tdx_vendorid_0 , TDX vendor ID string bytes 0 - 3 + 0x21, 0, ecx, 31:0, tdx_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0x21, 0, edx, 31:0, tdx_vendorid_1 , CPU vendor ID string bytes 4 - 7 + +# Leaf 23H +# Intel Architectural Performance Monitoring Extended (ArchPerfmonExt) + + 0x23, 0, eax, 1, subleaf_1_counters , Subleaf 1, PMU counters bitmaps, is valid + 0x23, 0, eax, 3, subleaf_3_events , Subleaf 3, PMU events bitmaps, is valid + 0x23, 0, ebx, 0, unitmask2 , IA32_PERFEVTSELx MSRs UnitMask2 is supported + 0x23, 0, ebx, 1, zbit , IA32_PERFEVTSELx MSRs Z-bit is supported + 0x23, 1, eax, 31:0, pmu_gp_counters_bitmap , General-purpose PMU counters bitmap + 0x23, 1, ebx, 31:0, pmu_f_counters_bitmap , Fixed PMU counters bitmap + 0x23, 3, eax, 0, core_cycles_evt , Core cycles event supported + 0x23, 3, eax, 1, insn_retired_evt , Instructions retired event supported + 0x23, 3, eax, 2, ref_cycles_evt , Reference cycles event supported + 0x23, 3, eax, 3, llc_refs_evt , Last-level cache references event supported + 0x23, 3, eax, 4, llc_misses_evt , Last-level cache misses event supported + 0x23, 3, eax, 5, br_insn_ret_evt , Branch instruction retired event supported + 0x23, 3, eax, 6, br_mispr_evt , Branch mispredict retired event supported + 0x23, 3, eax, 7, td_slots_evt , Topdown slots event supported + 0x23, 3, eax, 8, td_backend_bound_evt , Topdown backend bound event supported + 0x23, 3, eax, 9, td_bad_spec_evt , Topdown bad speculation event supported + 0x23, 3, eax, 10, td_frontend_bound_evt , Topdown frontend bound event supported + 0x23, 3, eax, 11, td_retiring_evt , Topdown retiring event support + +# Leaf 40000000H +# Maximum hypervisor standard leaf + hypervisor vendor string + +0x40000000, 0, eax, 31:0, max_hyp_leaf , Maximum hypervisor standard leaf number +0x40000000, 0, ebx, 31:0, hypervisor_id_0 , Hypervisor ID string bytes 0 - 3 +0x40000000, 0, ecx, 31:0, hypervisor_id_1 , Hypervisor ID string bytes 4 - 7 +0x40000000, 0, edx, 31:0, hypervisor_id_2 , Hypervisor ID string bytes 8 - 11 + +# Leaf 80000000H +# Maximum extended leaf number + CPU vendor string (AMD) + +0x80000000, 0, eax, 31:0, max_ext_leaf , Maximum extended cpuid leaf supported +0x80000000, 0, ebx, 31:0, cpu_vendorid_0 , Vendor ID string bytes 0 - 3 +0x80000000, 0, ecx, 31:0, cpu_vendorid_2 , Vendor ID string bytes 8 - 11 +0x80000000, 0, edx, 31:0, cpu_vendorid_1 , Vendor ID string bytes 4 - 7 # Leaf 80000001H -# Extended Processor Signature and Feature Bits - -0x80000001, 0, EAX, 27:20, extfamily, Extended family -0x80000001, 0, EAX, 19:16, extmodel, Extended model -0x80000001, 0, EAX, 11:8, basefamily, Description of Family -0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product -0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model - -0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type - -0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode -0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode -0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA -0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space -0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8 -0x80000001, 0, ECX, 5, lzcnt, LZCNT -0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support -0x80000001, 0, ECX, 7, 
misalignsse, Misaligned SSE Mode -0x80000001, 0, ECX, 8, prefetchw, PREFETCHW -0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support -0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling -0x80000001, 0, ECX, 11, xop, Extended operation support -0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support -0x80000001, 0, ECX, 13, wdt, Watchdog timer support -0x80000001, 0, ECX, 15, lwp, Lightweight profiling support -0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support -0x80000001, 0, ECX, 17, tce, Translation cache extension -0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId -0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR -0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR -0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK -0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported -0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions -0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported -0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints - -0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip -0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements -0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE -0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages) -0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD -0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions -0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE) -0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE -0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction -0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled -0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported -0x80000001, 0, EDX, 12, mtrr, Memory-type range registers -0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE -0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP -0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV -0x80000001, 0, EDX, 16, pat, Page attribute table -0x80000001, 0, EDX, 17, pse36, Page-size extensions -0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available -0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions -0x80000001, 0, EDX, 23, mmx, MMX instructions -0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions -0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations -0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported -0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available -0x80000001, 0, EDX, 29, lm, 64b Architecture supported -0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions -0x80000001, 0, EDX, 31, threednow, 3DNow! 
instructions - -# Leaf 80000002H/80000003H/80000004H -# Processor Brand String +# Extended CPU feature identifiers + +0x80000001, 0, eax, 3:0, e_stepping_id , Stepping ID +0x80000001, 0, eax, 7:4, e_base_model , Base processor model +0x80000001, 0, eax, 11:8, e_base_family , Base processor family +0x80000001, 0, eax, 19:16, e_ext_model , Extended processor model +0x80000001, 0, eax, 27:20, e_ext_family , Extended processor family +0x80000001, 0, ebx, 15:0, brand_id , Brand ID +0x80000001, 0, ebx, 31:28, pkg_type , Package type +0x80000001, 0, ecx, 0, lahf_lm , LAHF and SAHF in 64-bit mode +0x80000001, 0, ecx, 1, cmp_legacy , Multi-processing legacy mode (No HT) +0x80000001, 0, ecx, 2, svm , Secure Virtual Machine +0x80000001, 0, ecx, 3, extapic , Extended APIC space +0x80000001, 0, ecx, 4, cr8_legacy , LOCK MOV CR0 means MOV CR8 +0x80000001, 0, ecx, 5, abm , LZCNT advanced bit manipulation +0x80000001, 0, ecx, 6, sse4a , SSE4A support +0x80000001, 0, ecx, 7, misalignsse , Misaligned SSE mode +0x80000001, 0, ecx, 8, 3dnowprefetch , 3DNow PREFETCH/PREFETCHW support +0x80000001, 0, ecx, 9, osvw , OS visible workaround +0x80000001, 0, ecx, 10, ibs , Instruction based sampling +0x80000001, 0, ecx, 11, xop , XOP: extended operation (AVX instructions) +0x80000001, 0, ecx, 12, skinit , SKINIT/STGI support +0x80000001, 0, ecx, 13, wdt , Watchdog timer support +0x80000001, 0, ecx, 15, lwp , Lightweight profiling +0x80000001, 0, ecx, 16, fma4 , 4-operand FMA instruction +0x80000001, 0, ecx, 17, tce , Translation cache extension +0x80000001, 0, ecx, 19, nodeid_msr , NodeId MSR (0xc001100c) +0x80000001, 0, ecx, 21, tbm , Trailing bit manipulations +0x80000001, 0, ecx, 22, topoext , Topology Extensions (cpuid leaf 0x8000001d) +0x80000001, 0, ecx, 23, perfctr_core , Core performance counter extensions +0x80000001, 0, ecx, 24, perfctr_nb , NB/DF performance counter extensions +0x80000001, 0, ecx, 26, bpext , Data access breakpoint extension +0x80000001, 0, ecx, 27, ptsc , Performance time-stamp counter +0x80000001, 0, ecx, 28, perfctr_llc , LLC (L3) performance counter extensions +0x80000001, 0, ecx, 29, mwaitx , MWAITX/MONITORX support +0x80000001, 0, ecx, 30, addr_mask_ext , Breakpoint address mask extension (to bit 31) +0x80000001, 0, edx, 0, e_fpu , Floating-Point Unit on-chip (x87) +0x80000001, 0, edx, 1, e_vme , Virtual-8086 Mode Extensions +0x80000001, 0, edx, 2, e_de , Debugging Extensions +0x80000001, 0, edx, 3, e_pse , Page Size Extension +0x80000001, 0, edx, 4, e_tsc , Time Stamp Counter +0x80000001, 0, edx, 5, e_msr , Model-Specific Registers (RDMSR and WRMSR support) +0x80000001, 0, edx, 6, pae , Physical Address Extensions +0x80000001, 0, edx, 7, mce , Machine Check Exception +0x80000001, 0, edx, 8, cx8 , CMPXCHG8B instruction +0x80000001, 0, edx, 9, apic , APIC on-chip +0x80000001, 0, edx, 11, syscall , SYSCALL and SYSRET instructions +0x80000001, 0, edx, 12, mtrr , Memory Type Range Registers +0x80000001, 0, edx, 13, pge , Page Global Extensions +0x80000001, 0, edx, 14, mca , Machine Check Architecture +0x80000001, 0, edx, 15, cmov , Conditional Move Instruction +0x80000001, 0, edx, 16, pat , Page Attribute Table +0x80000001, 0, edx, 17, pse36 , Page Size Extension (36-bit) +0x80000001, 0, edx, 19, mp , Out-of-spec AMD Multiprocessing bit +0x80000001, 0, edx, 20, nx , No-execute page protection +0x80000001, 0, edx, 22, mmxext , AMD MMX extensions +0x80000001, 0, edx, 24, e_fxsr , FXSAVE and FXRSTOR instructions +0x80000001, 0, edx, 25, fxsr_opt , FXSAVE and FXRSTOR optimizations 
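# As an illustrative sketch only (not part of this table or of the kcpuid tool), a
# flag row above such as lahf_lm (leaf 0x80000001, ECX bit 0) maps onto a
# userspace CPUID query like the following, using GCC/clang's <cpuid.h> helper:
#
#	#include <cpuid.h>
#
#	static int has_lahf_lm(void)
#	{
#		unsigned int eax, ebx, ecx, edx;
#
#		if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
#			return 0;
#		return !!(ecx & (1u << 0));	/* ECX bit 0: lahf_lm */
#	}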
+0x80000001, 0, edx, 26, pdpe1gb , 1-GB large page support +0x80000001, 0, edx, 27, rdtscp , RDTSCP instruction +0x80000001, 0, edx, 29, lm , Long mode (x86-64, 64-bit support) +0x80000001, 0, edx, 30, 3dnowext , AMD 3DNow extensions +0x80000001, 0, edx, 31, 3dnow , 3DNow instructions + +# Leaf 80000002H +# CPU brand ID string, bytes 0 - 15 + +0x80000002, 0, eax, 31:0, cpu_brandid_0 , CPU brand ID string, bytes 0 - 3 +0x80000002, 0, ebx, 31:0, cpu_brandid_1 , CPU brand ID string, bytes 4 - 7 +0x80000002, 0, ecx, 31:0, cpu_brandid_2 , CPU brand ID string, bytes 8 - 11 +0x80000002, 0, edx, 31:0, cpu_brandid_3 , CPU brand ID string, bytes 12 - 15 + +# Leaf 80000003H +# CPU brand ID string, bytes 16 - 31 + +0x80000003, 0, eax, 31:0, cpu_brandid_4 , CPU brand ID string bytes, 16 - 19 +0x80000003, 0, ebx, 31:0, cpu_brandid_5 , CPU brand ID string bytes, 20 - 23 +0x80000003, 0, ecx, 31:0, cpu_brandid_6 , CPU brand ID string bytes, 24 - 27 +0x80000003, 0, edx, 31:0, cpu_brandid_7 , CPU brand ID string bytes, 28 - 31 + +# Leaf 80000004H +# CPU brand ID string, bytes 32 - 47 + +0x80000004, 0, eax, 31:0, cpu_brandid_8 , CPU brand ID string, bytes 32 - 35 +0x80000004, 0, ebx, 31:0, cpu_brandid_9 , CPU brand ID string, bytes 36 - 39 +0x80000004, 0, ecx, 31:0, cpu_brandid_10 , CPU brand ID string, bytes 40 - 43 +0x80000004, 0, edx, 31:0, cpu_brandid_11 , CPU brand ID string, bytes 44 - 47 # Leaf 80000005H -# Reserved +# AMD L1 cache and L1 TLB enumeration + +0x80000005, 0, eax, 7:0, l1_itlb_2m_4m_nentries , L1 ITLB #entires, 2M and 4M pages +0x80000005, 0, eax, 15:8, l1_itlb_2m_4m_assoc , L1 ITLB associativity, 2M and 4M pages +0x80000005, 0, eax, 23:16, l1_dtlb_2m_4m_nentries , L1 DTLB #entires, 2M and 4M pages +0x80000005, 0, eax, 31:24, l1_dtlb_2m_4m_assoc , L1 DTLB associativity, 2M and 4M pages +0x80000005, 0, ebx, 7:0, l1_itlb_4k_nentries , L1 ITLB #entries, 4K pages +0x80000005, 0, ebx, 15:8, l1_itlb_4k_assoc , L1 ITLB associativity, 4K pages +0x80000005, 0, ebx, 23:16, l1_dtlb_4k_nentries , L1 DTLB #entries, 4K pages +0x80000005, 0, ebx, 31:24, l1_dtlb_4k_assoc , L1 DTLB associativity, 4K pages +0x80000005, 0, ecx, 7:0, l1_dcache_line_size , L1 dcache line size, in bytes +0x80000005, 0, ecx, 15:8, l1_dcache_nlines , L1 dcache lines per tag +0x80000005, 0, ecx, 23:16, l1_dcache_assoc , L1 dcache associativity +0x80000005, 0, ecx, 31:24, l1_dcache_size_kb , L1 dcache size, in KB +0x80000005, 0, edx, 7:0, l1_icache_line_size , L1 icache line size, in bytes +0x80000005, 0, edx, 15:8, l1_icache_nlines , L1 icache lines per tag +0x80000005, 0, edx, 23:16, l1_icache_assoc , L1 icache associativity +0x80000005, 0, edx, 31:24, l1_icache_size_kb , L1 icache size, in KB # Leaf 80000006H -# Extended L2 Cache Features - -0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes -0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity -0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units - +# (Mostly AMD) L2 TLB, L2 cache, and L3 cache enumeration + +0x80000006, 0, eax, 11:0, l2_itlb_2m_4m_nentries , L2 iTLB #entries, 2M and 4M pages +0x80000006, 0, eax, 15:12, l2_itlb_2m_4m_assoc , L2 iTLB associativity, 2M and 4M pages +0x80000006, 0, eax, 27:16, l2_dtlb_2m_4m_nentries , L2 dTLB #entries, 2M and 4M pages +0x80000006, 0, eax, 31:28, l2_dtlb_2m_4m_assoc , L2 dTLB associativity, 2M and 4M pages +0x80000006, 0, ebx, 11:0, l2_itlb_4k_nentries , L2 iTLB #entries, 4K pages +0x80000006, 0, ebx, 15:12, l2_itlb_4k_assoc , L2 iTLB associativity, 4K pages +0x80000006, 0, ebx, 27:16, l2_dtlb_4k_nentries , L2 dTLB 
#entries, 4K pages +0x80000006, 0, ebx, 31:28, l2_dtlb_4k_assoc , L2 dTLB associativity, 4K pages +0x80000006, 0, ecx, 7:0, l2_line_size , L2 cache line size, in bytes +0x80000006, 0, ecx, 11:8, l2_nlines , L2 cache number of lines per tag +0x80000006, 0, ecx, 15:12, l2_assoc , L2 cache associativity +0x80000006, 0, ecx, 31:16, l2_size_kb , L2 cache size, in KB +0x80000006, 0, edx, 7:0, l3_line_size , L3 cache line size, in bytes +0x80000006, 0, edx, 11:8, l3_nlines , L3 cache number of lines per tag +0x80000006, 0, edx, 15:12, l3_assoc , L3 cache associativity +0x80000006, 0, edx, 31:18, l3_size_range , L3 cache size range # Leaf 80000007H - -0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available - +# CPU power management (mostly AMD) and AMD RAS enumeration + +0x80000007, 0, ebx, 0, overflow_recov , MCA overflow conditions not fatal +0x80000007, 0, ebx, 1, succor , Software containment of UnCORRectable errors +0x80000007, 0, ebx, 2, hw_assert , Hardware assert MSRs +0x80000007, 0, ebx, 3, smca , Scalable MCA (MCAX MSRs) +0x80000007, 0, ecx, 31:0, cpu_pwr_sample_ratio , CPU power sample time ratio +0x80000007, 0, edx, 0, digital_temp , Digital temprature sensor +0x80000007, 0, edx, 1, powernow_freq_id , PowerNOW! frequency scaling +0x80000007, 0, edx, 2, powernow_volt_id , PowerNOW! voltage scaling +0x80000007, 0, edx, 3, thermal_trip , THERMTRIP (Thermal Trip) +0x80000007, 0, edx, 4, hw_thermal_control , Hardware thermal control +0x80000007, 0, edx, 5, sw_thermal_control , Software thermal control +0x80000007, 0, edx, 6, 100mhz_steps , 100 MHz multiplier control +0x80000007, 0, edx, 7, hw_pstate , Hardware P-state control +0x80000007, 0, edx, 8, constant_tsc , TSC ticks at constant rate across all P and C states +0x80000007, 0, edx, 9, cpb , Core performance boost +0x80000007, 0, edx, 10, eff_freq_ro , Read-only effective frequency interface +0x80000007, 0, edx, 11, proc_feedback , Processor feedback interface (deprecated) +0x80000007, 0, edx, 12, acc_power , Processor power reporting interface +0x80000007, 0, edx, 13, connected_standby , CPU Connected Standby support +0x80000007, 0, edx, 14, rapl , Runtime Average Power Limit interface # Leaf 80000008H - -0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits -0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits -0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD - -# 0x8000001E -# EAX: Extended APIC ID -0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID -# EBX: Core Identifiers -0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID -0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1 -# ECX: Node Identifiers -0x8000001E, 0, ECX, 7:0, node_id, Node ID -0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved } - -# 8000001F: AMD Secure Encryption -0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption -0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization -0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR -0x8000001F, 0, EAX, 3, seves, SEV Encrypted State -0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption -0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled -0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest -0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest +# CPU capacity parameters and 
extended feature flags (mostly AMD) + +0x80000008, 0, eax, 7:0, phys_addr_bits , Max physical address bits +0x80000008, 0, eax, 15:8, virt_addr_bits , Max virtual address bits +0x80000008, 0, eax, 23:16, guest_phys_addr_bits , Max nested-paging guest physical address bits +0x80000008, 0, ebx, 0, clzero , CLZERO supported +0x80000008, 0, ebx, 1, irperf , Instruction retired counter MSR +0x80000008, 0, ebx, 2, xsaveerptr , XSAVE/XRSTOR always saves/restores FPU error pointers +0x80000008, 0, ebx, 3, invlpgb , INVLPGB broadcasts a TLB invalidate to all threads +0x80000008, 0, ebx, 4, rdpru , RDPRU (Read Processor Register at User level) supported +0x80000008, 0, ebx, 6, mba , Memory Bandwidth Allocation (AMD bit) +0x80000008, 0, ebx, 8, mcommit , MCOMMIT (Memory commit) supported +0x80000008, 0, ebx, 9, wbnoinvd , WBNOINVD supported +0x80000008, 0, ebx, 12, amd_ibpb , Indirect Branch Prediction Barrier +0x80000008, 0, ebx, 13, wbinvd_int , Interruptible WBINVD/WBNOINVD +0x80000008, 0, ebx, 14, amd_ibrs , Indirect Branch Restricted Speculation +0x80000008, 0, ebx, 15, amd_stibp , Single Thread Indirect Branch Prediction mode +0x80000008, 0, ebx, 16, ibrs_always_on , IBRS always-on preferred +0x80000008, 0, ebx, 17, amd_stibp_always_on , STIBP always-on preferred +0x80000008, 0, ebx, 18, ibrs_fast , IBRS is preferred over software solution +0x80000008, 0, ebx, 19, ibrs_same_mode , IBRS provides same mode protection +0x80000008, 0, ebx, 20, no_efer_lmsle , EFER[LMSLE] bit (Long-Mode Segment Limit Enable) unsupported +0x80000008, 0, ebx, 21, tlb_flush_nested , INVLPGB RAX[5] bit can be set (nested translations) +0x80000008, 0, ebx, 23, amd_ppin , Protected Processor Inventory Number +0x80000008, 0, ebx, 24, amd_ssbd , Speculative Store Bypass Disable +0x80000008, 0, ebx, 25, virt_ssbd , virtualized SSBD (Speculative Store Bypass Disable) +0x80000008, 0, ebx, 26, amd_ssb_no , SSBD not needed (fixed in HW) +0x80000008, 0, ebx, 27, cppc , Collaborative Processor Performance Control +0x80000008, 0, ebx, 28, amd_psfd , Predictive Store Forward Disable +0x80000008, 0, ebx, 29, btc_no , CPU not affected by Branch Type Confusion +0x80000008, 0, ebx, 30, ibpb_ret , IBPB clears RSB/RAS too +0x80000008, 0, ebx, 31, brs , Branch Sampling supported +0x80000008, 0, ecx, 7:0, cpu_nthreads , Number of physical threads - 1 +0x80000008, 0, ecx, 15:12, apicid_coreid_len , Number of thread core ID bits (shift) in APIC ID +0x80000008, 0, ecx, 17:16, perf_tsc_len , Performance time-stamp counter size +0x80000008, 0, edx, 15:0, invlpgb_max_pages , INVLPGB maximum page count +0x80000008, 0, edx, 31:16, rdpru_max_reg_id , RDPRU max register ID (ECX input) + +# Leaf 8000000AH +# AMD SVM (Secure Virtual Machine) enumeration + +0x8000000a, 0, eax, 7:0, svm_version , SVM revision number +0x8000000a, 0, ebx, 31:0, svm_nasid , Number of address space identifiers (ASID) +0x8000000a, 0, edx, 0, npt , Nested paging +0x8000000a, 0, edx, 1, lbrv , LBR virtualization +0x8000000a, 0, edx, 2, svm_lock , SVM lock +0x8000000a, 0, edx, 3, nrip_save , NRIP save support on #VMEXIT +0x8000000a, 0, edx, 4, tsc_scale , MSR based TSC rate control +0x8000000a, 0, edx, 5, vmcb_clean , VMCB clean bits support +0x8000000a, 0, edx, 6, flushbyasid , Flush by ASID + Extended VMCB TLB_Control +0x8000000a, 0, edx, 7, decodeassists , Decode Assists support +0x8000000a, 0, edx, 10, pausefilter , Pause intercept filter +0x8000000a, 0, edx, 12, pfthreshold , Pause filter threshold +0x8000000a, 0, edx, 13, avic , Advanced virtual interrupt controller 
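# A rough usage sketch (not taken from the kcpuid tool): the SVM fields of this
# leaf are only meaningful when CPUID 0x80000001 ECX bit 2 (svm) is set; they
# can then be read along these lines:
#
#	unsigned int eax, ebx, ecx, edx;
#
#	if (__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
#		unsigned int svm_rev  = eax & 0xff;	/* svm_version   */
#		unsigned int nr_asids = ebx;		/* svm_nasid     */
#		int npt = !!(edx & (1u << 0));		/* nested paging */
#	}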
+0x8000000a, 0, edx, 15, v_vmsave_vmload , Virtual VMSAVE/VMLOAD (nested virt) +0x8000000a, 0, edx, 16, vgif , Virtualize the Global Interrupt Flag +0x8000000a, 0, edx, 17, gmet , Guest mode execution trap +0x8000000a, 0, edx, 18, x2avic , Virtual x2APIC +0x8000000a, 0, edx, 19, sss_check , Supervisor Shadow Stack restrictions +0x8000000a, 0, edx, 20, v_spec_ctrl , Virtual SPEC_CTRL +0x8000000a, 0, edx, 21, ro_gpt , Read-Only guest page table support +0x8000000a, 0, edx, 23, h_mce_override , Host MCE override +0x8000000a, 0, edx, 24, tlbsync_int , TLBSYNC intercept + INVLPGB/TLBSYNC in VMCB +0x8000000a, 0, edx, 25, vnmi , NMI virtualization +0x8000000a, 0, edx, 26, ibs_virt , IBS Virtualization +0x8000000a, 0, edx, 27, ext_lvt_off_chg , Extended LVT offset fault change +0x8000000a, 0, edx, 28, svme_addr_chk , Guest SVME addr check + +# Leaf 80000019H +# AMD TLB 1G-pages enumeration + +0x80000019, 0, eax, 11:0, l1_itlb_1g_nentries , L1 iTLB #entries, 1G pages +0x80000019, 0, eax, 15:12, l1_itlb_1g_assoc , L1 iTLB associativity, 1G pages +0x80000019, 0, eax, 27:16, l1_dtlb_1g_nentries , L1 dTLB #entries, 1G pages +0x80000019, 0, eax, 31:28, l1_dtlb_1g_assoc , L1 dTLB associativity, 1G pages +0x80000019, 0, ebx, 11:0, l2_itlb_1g_nentries , L2 iTLB #entries, 1G pages +0x80000019, 0, ebx, 15:12, l2_itlb_1g_assoc , L2 iTLB associativity, 1G pages +0x80000019, 0, ebx, 27:16, l2_dtlb_1g_nentries , L2 dTLB #entries, 1G pages +0x80000019, 0, ebx, 31:28, l2_dtlb_1g_assoc , L2 dTLB associativity, 1G pages + +# Leaf 8000001AH +# AMD instruction optimizations enumeration + +0x8000001a, 0, eax, 0, fp_128 , Internal FP/SIMD exec data path is 128-bits wide +0x8000001a, 0, eax, 1, movu_preferred , SSE: MOVU* better than MOVL*/MOVH* +0x8000001a, 0, eax, 2, fp_256 , internal FP/SSE exec data path is 256-bits wide + +# Leaf 8000001BH +# AMD IBS (Instruction-Based Sampling) enumeration + +0x8000001b, 0, eax, 0, ibs_flags_valid , IBS feature flags valid +0x8000001b, 0, eax, 1, ibs_fetch_sampling , IBS fetch sampling supported +0x8000001b, 0, eax, 2, ibs_op_sampling , IBS execution sampling supported +0x8000001b, 0, eax, 3, ibs_rdwr_op_counter , IBS read/write of op counter supported +0x8000001b, 0, eax, 4, ibs_op_count , IBS OP counting mode supported +0x8000001b, 0, eax, 5, ibs_branch_target , IBS branch target address reporting supported +0x8000001b, 0, eax, 6, ibs_op_counters_ext , IBS IbsOpCurCnt/IbsOpMaxCnt extend by 7 bits +0x8000001b, 0, eax, 7, ibs_rip_invalid_chk , IBS invalid RIP indication supported +0x8000001b, 0, eax, 8, ibs_op_branch_fuse , IBS fused branch micro-op indication supported +0x8000001b, 0, eax, 9, ibs_fetch_ctl_ext , IBS Fetch Control Extended MSR (0xc001103c) supported +0x8000001b, 0, eax, 10, ibs_op_data_4 , IBS op data 4 MSR supported +0x8000001b, 0, eax, 11, ibs_l3_miss_filter , IBS L3-miss filtering supported (Zen4+) + +# Leaf 8000001CH +# AMD LWP (Lightweight Profiling) + +0x8000001c, 0, eax, 0, os_lwp_avail , LWP is available to application programs (supported by OS) +0x8000001c, 0, eax, 1, os_lpwval , LWPVAL instruction (EventId=1) is supported by OS +0x8000001c, 0, eax, 2, os_lwp_ire , Instructions Retired Event (EventId=2) is supported by OS +0x8000001c, 0, eax, 3, os_lwp_bre , Branch Retired Event (EventId=3) is supported by OS +0x8000001c, 0, eax, 4, os_lwp_dme , DCache Miss Event (EventId=4) is supported by OS +0x8000001c, 0, eax, 5, os_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is supported by OS +0x8000001c, 0, eax, 6, os_lwp_rnh , CPU Reference clocks Not Halted 
event (EventId=6) is supported by OS +0x8000001c, 0, eax, 29, os_lwp_cont , LWP sampling in continuous mode is supported by OS +0x8000001c, 0, eax, 30, os_lwp_ptsc , Performance Time Stamp Counter in event records is supported by OS +0x8000001c, 0, eax, 31, os_lwp_int , Interrupt on threshold overflow is supported by OS +0x8000001c, 0, ebx, 7:0, lwp_lwpcb_sz , LWP Control Block size, in quadwords +0x8000001c, 0, ebx, 15:8, lwp_event_sz , LWP event record size, in bytes +0x8000001c, 0, ebx, 23:16, lwp_max_events , LWP max supported EventId value (EventID 255 not included) +0x8000001c, 0, ebx, 31:24, lwp_event_offset , LWP events area offset in the LWP Control Block +0x8000001c, 0, ecx, 4:0, lwp_latency_max , Num of bits in cache latency counters (10 to 31) +0x8000001c, 0, ecx, 5, lwp_data_adddr , Cache miss events report the data address of the reference +0x8000001c, 0, ecx, 8:6, lwp_latency_rnd , Amount by which cache latency is rounded +0x8000001c, 0, ecx, 15:9, lwp_version , LWP implementation version +0x8000001c, 0, ecx, 23:16, lwp_buf_min_sz , LWP event ring buffer min size, in units of 32 event records +0x8000001c, 0, ecx, 28, lwp_branch_predict , Branches Retired events can be filtered +0x8000001c, 0, ecx, 29, lwp_ip_filtering , IP filtering (IPI, IPF, BaseIP, and LimitIP @ LWPCP) supported +0x8000001c, 0, ecx, 30, lwp_cache_levels , Cache-related events can be filtered by cache level +0x8000001c, 0, ecx, 31, lwp_cache_latency , Cache-related events can be filtered by latency +0x8000001c, 0, edx, 0, hw_lwp_avail , LWP is available in Hardware +0x8000001c, 0, edx, 1, hw_lpwval , LWPVAL instruction (EventId=1) is available in HW +0x8000001c, 0, edx, 2, hw_lwp_ire , Instructions Retired Event (EventId=2) is available in HW +0x8000001c, 0, edx, 3, hw_lwp_bre , Branch Retired Event (EventId=3) is available in HW +0x8000001c, 0, edx, 4, hw_lwp_dme , DCache Miss Event (EventId=4) is available in HW +0x8000001c, 0, edx, 5, hw_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is available in HW +0x8000001c, 0, edx, 6, hw_lwp_rnh , CPU Reference clocks Not Halted event (EventId=6) is available in HW +0x8000001c, 0, edx, 29, hw_lwp_cont , LWP sampling in continuous mode is available in HW +0x8000001c, 0, edx, 30, hw_lwp_ptsc , Performance Time Stamp Counter in event records is available in HW +0x8000001c, 0, edx, 31, hw_lwp_int , Interrupt on threshold overflow is available in HW + +# Leaf 8000001DH +# AMD deterministic cache parameters + +0x8000001d, 31:0, eax, 4:0, cache_type , Cache type field +0x8000001d, 31:0, eax, 7:5, cache_level , Cache level (1-based) +0x8000001d, 31:0, eax, 8, cache_self_init , Self-initializing cache level +0x8000001d, 31:0, eax, 9, fully_associative , Fully-associative cache +0x8000001d, 31:0, eax, 25:14, num_threads_sharing , Number of logical CPUs sharing cache +0x8000001d, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) +0x8000001d, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) +0x8000001d, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) +0x8000001d, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) +0x8000001d, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches +0x8000001d, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + +# Leaf 8000001EH +# AMD CPU topology enumeration + +0x8000001e, 0, eax, 31:0, ext_apic_id , Extended APIC ID +0x8000001e, 0, ebx, 7:0, core_id , Unique per-socket logical core unit ID +0x8000001e, 0, 
ebx, 15:8, core_nthreads , #Threads per core (zero-based) +0x8000001e, 0, ecx, 7:0, node_id , Node (die) ID of invoking logical CPU +0x8000001e, 0, ecx, 10:8, nnodes_per_socket , #nodes in invoking logical CPU's package/socket + +# Leaf 8000001FH +# AMD encrypted memory capabilities enumeration (SME/SEV) + +0x8000001f, 0, eax, 0, sme , Secure Memory Encryption supported +0x8000001f, 0, eax, 1, sev , Secure Encrypted Virtualization supported +0x8000001f, 0, eax, 2, vm_page_flush , VM Page Flush MSR (0xc001011e) available +0x8000001f, 0, eax, 3, sev_es , SEV Encrypted State supported +0x8000001f, 0, eax, 4, sev_nested_paging , SEV secure nested paging supported +0x8000001f, 0, eax, 5, vm_permission_levels , VMPL supported +0x8000001f, 0, eax, 6, rpmquery , RPMQUERY instruction supported +0x8000001f, 0, eax, 7, vmpl_sss , VMPL supervisor shadow stack supported +0x8000001f, 0, eax, 8, secure_tsc , Secure TSC supported +0x8000001f, 0, eax, 9, v_tsc_aux , Hardware virtualizes TSC_AUX +0x8000001f, 0, eax, 10, sme_coherent , HW enforces cache coherency across encryption domains +0x8000001f, 0, eax, 11, req_64bit_hypervisor , SEV guest mandates 64-bit hypervisor +0x8000001f, 0, eax, 12, restricted_injection , Restricted Injection supported +0x8000001f, 0, eax, 13, alternate_injection , Alternate Injection supported +0x8000001f, 0, eax, 14, debug_swap , SEV-ES: full debug state swap is supported +0x8000001f, 0, eax, 15, disallow_host_ibs , SEV-ES: Disallowing IBS use by the host is supported +0x8000001f, 0, eax, 16, virt_transparent_enc , Virtual Transparent Encryption +0x8000001f, 0, eax, 17, vmgexit_parameter , VmgexitParameter is supported in SEV_FEATURES +0x8000001f, 0, eax, 18, virt_tom_msr , Virtual TOM MSR is supported +0x8000001f, 0, eax, 19, virt_ibs , IBS state virtualization is supported for SEV-ES guests +0x8000001f, 0, eax, 24, vmsa_reg_protection , VMSA register protection is supported +0x8000001f, 0, eax, 25, smt_protection , SMT protection is supported +0x8000001f, 0, eax, 28, svsm_page_msr , SVSM communication page MSR (0xc001f000) is supported +0x8000001f, 0, eax, 29, nested_virt_snp_msr , VIRT_RMPUPDATE/VIRT_PSMASH MSRs are supported +0x8000001f, 0, ebx, 5:0, pte_cbit_pos , PTE bit number used to enable memory encryption +0x8000001f, 0, ebx, 11:6, phys_addr_reduction_nbits, Reduction of phys address space when encryption is enabled, in bits +0x8000001f, 0, ebx, 15:12, vmpl_count , Number of VM permission levels (VMPL) supported +0x8000001f, 0, ecx, 31:0, enc_guests_max , Max supported number of simultaneous encrypted guests +0x8000001f, 0, edx, 31:0, min_sev_asid_no_sev_es , Minimum ASID for SEV-enabled, SEV-ES-disabled guest + +# Leaf 80000020H +# AMD Platform QoS extended feature IDs + +0x80000020, 0, ebx, 1, mba , Memory Bandwidth Allocation support +0x80000020, 0, ebx, 2, smba , Slow Memory Bandwidth Allocation support +0x80000020, 0, ebx, 3, bmec , Bandwidth Monitoring Event Configuration support +0x80000020, 0, ebx, 4, l3rr , L3 Range Reservation support +0x80000020, 1, eax, 31:0, mba_limit_len , MBA enforcement limit size +0x80000020, 1, edx, 31:0, mba_cos_max , MBA max Class of Service number (zero-based) +0x80000020, 2, eax, 31:0, smba_limit_len , SMBA enforcement limit size +0x80000020, 2, edx, 31:0, smba_cos_max , SMBA max Class of Service number (zero-based) +0x80000020, 3, ebx, 7:0, bmec_num_events , BMEC number of bandwidth events available +0x80000020, 3, ecx, 0, bmec_local_reads , Local NUMA reads can be tracked +0x80000020, 3, ecx, 1, bmec_remote_reads , Remote
NUMA reads can be tracked +0x80000020, 3, ecx, 2, bmec_local_nontemp_wr , Local NUMA non-temporal writes can be tracked +0x80000020, 3, ecx, 3, bmec_remote_nontemp_wr , Remote NUMA non-temporal writes can be tracked +0x80000020, 3, ecx, 4, bmec_local_slow_mem_rd , Local NUMA slow-memory reads can be tracked +0x80000020, 3, ecx, 5, bmec_remote_slow_mem_rd, Remote NUMA slow-memory reads can be tracked +0x80000020, 3, ecx, 6, bmec_all_dirty_victims , Dirty QoS victims to all types of memory can be tracked + +# Leaf 80000021H +# AMD extended features enumeration 2 + +0x80000021, 0, eax, 0, no_nested_data_bp , No nested data breakpoints +0x80000021, 0, eax, 1, fsgs_non_serializing , WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing +0x80000021, 0, eax, 2, lfence_rdtsc , LFENCE always serializing / synchronizes RDTSC +0x80000021, 0, eax, 3, smm_page_cfg_lock , SMM paging configuration lock is supported +0x80000021, 0, eax, 6, null_sel_clr_base , Null selector clears base +0x80000021, 0, eax, 7, upper_addr_ignore , EFER MSR Upper Address Ignore Enable bit supported +0x80000021, 0, eax, 8, autoibrs , EFER MSR Automatic IBRS enable bit supported +0x80000021, 0, eax, 9, no_smm_ctl_msr , SMM_CTL MSR (0xc0010116) is not present +0x80000021, 0, eax, 10, fsrs_supported , Fast Short Rep Stosb (FSRS) is supported +0x80000021, 0, eax, 11, fsrc_supported , Fast Short Repe Cmpsb (FSRC) is supported +0x80000021, 0, eax, 13, prefetch_ctl_msr , Prefetch control MSR is supported +0x80000021, 0, eax, 17, user_cpuid_disable , #GP when executing CPUID at CPL > 0 is supported +0x80000021, 0, eax, 18, epsf_supported , Enhanced Predictive Store Forwarding (EPSF) is supported +0x80000021, 0, ebx, 11:0, microcode_patch_size , Size of microcode patch, in 16-byte units + +# Leaf 80000022H +# AMD Performance Monitoring v2 enumeration + +0x80000022, 0, eax, 0, perfmon_v2 , Performance monitoring v2 supported +0x80000022, 0, eax, 1, lbr_v2 , Last Branch Record v2 extensions (LBR Stack) +0x80000022, 0, eax, 2, lbr_pmc_freeze , Freezing core performance counters / LBR Stack supported +0x80000022, 0, ebx, 3:0, n_pmc_core , Number of core perfomance counters +0x80000022, 0, ebx, 9:4, lbr_v2_stack_size , Number of available LBR stack entries +0x80000022, 0, ebx, 15:10, n_pmc_northbridge , Number of available northbridge (data fabric) performance counters +0x80000022, 0, ebx, 21:16, n_pmc_umc , Number of available UMC performance counters +0x80000022, 0, ecx, 31:0, active_umc_bitmask , Active UMCs bitmask + +# Leaf 80000023H +# AMD Secure Multi-key Encryption enumeration + +0x80000023, 0, eax, 0, mem_hmk_mode , MEM-HMK encryption mode is supported +0x80000023, 0, ebx, 15:0, mem_hmk_avail_keys , MEM-HMK mode: total num of available encryption keys + +# Leaf 80000026H +# AMD extended topology enumeration v2 + +0x80000026, 3:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) +0x80000026, 3:0, eax, 29, core_has_pwreff_ranking, This core has a power efficiency ranking +0x80000026, 3:0, eax, 30, domain_has_hybrid_cores, This domain level has hybrid (E, P) cores +0x80000026, 3:0, eax, 31, domain_core_count_asymm, The 'Core' domain has asymmetric cores count +0x80000026, 3:0, ebx, 15:0, domain_lcpus_count , Number of logical CPUs at this domain instance +0x80000026, 3:0, ebx, 23:16, core_pwreff_ranking , This core's static power efficiency ranking +0x80000026, 3:0, ebx, 27:24, core_native_model_id , This core's native model ID +0x80000026, 3:0, ebx, 31:28, core_type , This core's type +0x80000026, 3:0, 
ecx, 7:0, domain_level , This domain level (subleaf ID) +0x80000026, 3:0, ecx, 15:8, domain_type , This domain type +0x80000026, 3:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 24b7d017ec2c..1b25c0a95d3f 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -7,7 +7,8 @@ #include <string.h> #include <getopt.h> -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define min(a, b) (((a) < (b)) ? (a) : (b)) typedef unsigned int u32; typedef unsigned long long u64; @@ -76,7 +77,6 @@ struct cpuid_range { */ struct cpuid_range *leafs_basic, *leafs_ext; -static int num_leafs; static bool is_amd; static bool show_details; static bool show_raw; @@ -98,27 +98,17 @@ static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) static inline bool has_subleafs(u32 f) { - if (f == 0x7 || f == 0xd) - return true; - - if (is_amd) { - if (f == 0x8000001d) + u32 with_subleaves[] = { + 0x4, 0x7, 0xb, 0xd, 0xf, 0x10, 0x12, + 0x14, 0x17, 0x18, 0x1b, 0x1d, 0x1f, 0x23, + 0x8000001d, 0x80000020, 0x80000026, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(with_subleaves); i++) + if (f == with_subleaves[i]) return true; - return false; - } - switch (f) { - case 0x4: - case 0xb: - case 0xf: - case 0x10: - case 0x14: - case 0x18: - case 0x1f: - return true; - default: - return false; - } + return false; } static void leaf_print_raw(struct subleaf *leaf) @@ -204,15 +194,12 @@ static void raw_dump_range(struct cpuid_range *range) } } -#define MAX_SUBLEAF_NUM 32 +#define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { - u32 max_func, idx_func; - int subleaf; + u32 max_func, idx_func, subleaf, max_subleaf; + u32 eax, ebx, ecx, edx, f = input_eax; struct cpuid_range *range; - u32 eax, ebx, ecx, edx; - u32 f = input_eax; - int max_subleaf; bool allzero; eax = input_eax; @@ -246,7 +233,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; if (!has_subleafs(f)) continue; @@ -257,11 +243,18 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * Some can provide the exact number of subleafs, * others have to be tried (0xf) */ - if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) - max_subleaf = (eax & 0xff) + 1; - + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18 || f == 0x1d) + max_subleaf = min((eax & 0xff) + 1, max_subleaf); if (f == 0xb) max_subleaf = 2; + if (f == 0x1f) + max_subleaf = 6; + if (f == 0x23) + max_subleaf = 4; + if (f == 0x80000020) + max_subleaf = 4; + if (f == 0x80000026) + max_subleaf = 5; for (subleaf = 1; subleaf < max_subleaf; subleaf++) { eax = f; @@ -272,7 +265,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; } } @@ -313,6 +305,8 @@ static int parse_line(char *line) struct bits_desc *bdesc; int reg_index; char *start, *end; + u32 subleaf_start, subleaf_end; + unsigned bit_start, bit_end; /* Skip comments and NULL line */ if (line[0] == '#' || line[0] == '\n') @@ -351,13 +345,25 @@ static int parse_line(char *line) return 0; /* subleaf */ - sub = strtoul(tokens[1], NULL, 0); - if ((int)sub > func->nr) - return -1; + buf = tokens[1]; + end = strtok(buf, ":"); + start = strtok(NULL, ":"); + subleaf_end = strtoul(end, NULL, 0); + + /* A subleaf range is given? 
*/ + if (start) { + subleaf_start = strtoul(start, NULL, 0); + subleaf_end = min(subleaf_end, (u32)(func->nr - 1)); + if (subleaf_start > subleaf_end) + return 0; + } else { + subleaf_start = subleaf_end; + if (subleaf_start > (u32)(func->nr - 1)) + return 0; + } - leaf = &func->leafs[sub]; + /* register */ buf = tokens[2]; - if (strcasestr(buf, "EAX")) reg_index = R_EAX; else if (strcasestr(buf, "EBX")) @@ -369,23 +375,23 @@ static int parse_line(char *line) else goto err_exit; - reg = &leaf->info[reg_index]; - bdesc = ®->descs[reg->nr++]; - /* bit flag or bits field */ buf = tokens[3]; - end = strtok(buf, ":"); - bdesc->end = strtoul(end, NULL, 0); - bdesc->start = bdesc->end; - - /* start != NULL means it is bit fields */ start = strtok(NULL, ":"); - if (start) - bdesc->start = strtoul(start, NULL, 0); - - strcpy(bdesc->simp, tokens[4]); - strcpy(bdesc->detail, tokens[5]); + bit_end = strtoul(end, NULL, 0); + bit_start = (start) ? strtoul(start, NULL, 0) : bit_end; + + for (sub = subleaf_start; sub <= subleaf_end; sub++) { + leaf = &func->leafs[sub]; + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + bdesc->end = bit_end; + bdesc->start = bit_start; + strcpy(bdesc->simp, strtok(tokens[4], " \t")); + strcpy(bdesc->detail, tokens[5]); + } return 0; err_exit: @@ -452,8 +458,9 @@ static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg) if (start == end) { /* single bit flag */ if (value & (1 << start)) - printf("\t%-20s %s%s\n", + printf("\t%-20s %s%s%s\n", bdesc->simp, + show_flags_only ? "" : "\t\t\t", show_details ? "-" : "", show_details ? bdesc->detail : "" ); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 40ea743d139f..2ff949ea82fa 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2489,7 +2489,7 @@ static int do_help(int argc, char **argv) " cgroup/connect_unix | cgroup/getpeername4 | cgroup/getpeername6 |\n" " cgroup/getpeername_unix | cgroup/getsockname4 | cgroup/getsockname6 |\n" " cgroup/getsockname_unix | cgroup/sendmsg4 | cgroup/sendmsg6 |\n" - " cgroup/sendmsg°unix | cgroup/recvmsg4 | cgroup/recvmsg6 | cgroup/recvmsg_unix |\n" + " cgroup/sendmsg_unix | cgroup/recvmsg4 | cgroup/recvmsg6 | cgroup/recvmsg_unix |\n" " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" " struct_ops | fentry | fexit | freplace | sk_lookup }\n" " ATTACH_TYPE := { sk_msg_verdict | sk_skb_verdict | sk_skb_stream_verdict |\n" diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 936ef95c3d32..d54aaa0619df 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -704,7 +704,7 @@ static int sets_patch(struct object *obj) * Make sure id is at the beginning of the pairs * struct, otherwise the below qsort would not work. */ - BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); /* diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 1e2ab148d5db..e1900abd44f6 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -149,6 +149,24 @@ FEATURE_DISPLAY ?= \ # FEATURE_GROUP_MEMBERS-libbfd = libbfd-liberty libbfd-liberty-z +# +# Declare list of feature dependency packages that provide pkg-config files. 
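# As a rough illustration (assuming pkg-config and libtraceevent are both
# installed; the package named here is only an example), the template that
# follows expands for each listed package to something like:
#
#   FEATURE_CHECK_CFLAGS-libtraceevent  := $(shell pkg-config --cflags libtraceevent 2>/dev/null)
#   FEATURE_CHECK_LDFLAGS-libtraceevent := $(shell pkg-config --libs libtraceevent 2>/dev/null)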
+# +FEATURE_PKG_CONFIG ?= \ + libtraceevent \ + libtracefs + +feature_pkg_config = $(eval $(feature_pkg_config_code)) +define feature_pkg_config_code + FEATURE_CHECK_CFLAGS-$(1) := $(shell $(PKG_CONFIG) --cflags $(1) 2>/dev/null) + FEATURE_CHECK_LDFLAGS-$(1) := $(shell $(PKG_CONFIG) --libs $(1) 2>/dev/null) +endef + +# Set FEATURE_CHECK_(C|LD)FLAGS-$(package) for packages using pkg-config. +ifneq ($(PKG_CONFIG),) + $(foreach package,$(FEATURE_PKG_CONFIG),$(call feature_pkg_config,$(package))) +endif + # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features. # If in the future we need per-feature checks/flags for features not # mentioned in this list we need to refactor this ;-). diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 489cbed7e82a..12796808f07a 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -82,7 +82,30 @@ FILES= \ FILES := $(addprefix $(OUTPUT),$(FILES)) -PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packages installed with Multiarch. Use it for cross +# compilation if it exists. +ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for a modified system root, are required for cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. + ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + endif + endif +endif all: $(FILES) @@ -147,7 +170,17 @@ $(OUTPUT)test-libopencsd.bin: DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) -DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 + DWARFLIBS += -lelf -lz -llzma -lbz2 -lzstd + + LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) + LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) + LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION))) + + # Elfutils merged libebl.a into libdw.a starting from version 0.177. + # Link libebl.a only if libdw is older than this version.
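# For example, a pkg-config report of "0.177" splits into LIBDW_VERSION_1 = 0
# and LIBDW_VERSION_2 = 177, so the test below runs "test 177 -lt 177", which
# fails, and -lebl is not added; a report of "0.176" would still pull it in.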
+ ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0) + DWARFLIBS += -lebl + endif endif $(OUTPUT)test-dwarf.bin: @@ -178,27 +211,27 @@ $(OUTPUT)test-numa_num_possible_cpus.bin: $(BUILD) -lnuma $(OUTPUT)test-libunwind.bin: - $(BUILD) -lelf + $(BUILD) -lelf -llzma $(OUTPUT)test-libunwind-debug-frame.bin: - $(BUILD) -lelf + $(BUILD) -lelf -llzma $(OUTPUT)test-libunwind-x86.bin: - $(BUILD) -lelf -lunwind-x86 + $(BUILD) -lelf -llzma -lunwind-x86 $(OUTPUT)test-libunwind-x86_64.bin: - $(BUILD) -lelf -lunwind-x86_64 + $(BUILD) -lelf -llzma -lunwind-x86_64 $(OUTPUT)test-libunwind-arm.bin: - $(BUILD) -lelf -lunwind-arm + $(BUILD) -lelf -llzma -lunwind-arm $(OUTPUT)test-libunwind-aarch64.bin: - $(BUILD) -lelf -lunwind-aarch64 + $(BUILD) -lelf -llzma -lunwind-aarch64 $(OUTPUT)test-libunwind-debug-frame-arm.bin: - $(BUILD) -lelf -lunwind-arm + $(BUILD) -lelf -llzma -lunwind-arm $(OUTPUT)test-libunwind-debug-frame-aarch64.bin: - $(BUILD) -lelf -lunwind-aarch64 + $(BUILD) -lelf -llzma -lunwind-aarch64 $(OUTPUT)test-libaudit.bin: $(BUILD) -laudit diff --git a/tools/crypto/ccp/dbc.c b/tools/crypto/ccp/dbc.c index a807df0f0597..80248d3d3a5a 100644 --- a/tools/crypto/ccp/dbc.c +++ b/tools/crypto/ccp/dbc.c @@ -57,7 +57,6 @@ int process_param(int fd, int msg_index, __u8 *signature, int *data) .msg_index = msg_index, .param = *data, }; - int ret; assert(signature); assert(data); diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index d29c9c49e251..ed565eb52275 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -78,7 +78,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN) clean: rm -f $(ALL_PROGRAMS) rm -f $(OUTPUT)include/linux/gpio.h - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ diff --git a/tools/gpio/gpio-hammer.c b/tools/gpio/gpio-hammer.c index 54fdf59dd320..ba0866eb3581 100644 --- a/tools/gpio/gpio-hammer.c +++ b/tools/gpio/gpio-hammer.c @@ -54,7 +54,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "Hammer lines ["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d", lines[i]); + fprintf(stdout, "%u", lines[i]); if (i != (num_lines - 1)) fprintf(stdout, ", "); } @@ -89,7 +89,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d: %d", lines[i], + fprintf(stdout, "%u: %d", lines[i], gpiotools_test_bit(values.bits, i)); if (i != (num_lines - 1)) fprintf(stdout, ", "); diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 2e60e2c212cd..34ffcec264ab 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -52,7 +52,7 @@ $(OUTPUT)hv_fcopy_uio_daemon: $(HV_FCOPY_UIO_DAEMON_IN) clean: rm -f $(ALL_PROGRAMS) - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) 
-name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ diff --git a/tools/hv/lsvmbus b/tools/hv/lsvmbus index 099f2c44dbed..f83698f14da2 100644..100755 --- a/tools/hv/lsvmbus +++ b/tools/hv/lsvmbus @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 import os diff --git a/tools/include/asm/rwonce.h b/tools/include/asm/rwonce.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/tools/include/asm/rwonce.h diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 210c13b1b857..2a7f260ef9dc 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -19,7 +19,7 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); -void bitmap_clear(unsigned long *map, unsigned int start, int len); +void __bitmap_clear(unsigned long *map, unsigned int start, int len); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); @@ -150,4 +150,19 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } +static inline void bitmap_clear(unsigned long *map, unsigned int start, + unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __clear_bit(start, map); + else if (small_const_nbits(start + nbits)) + *map &= ~GENMASK(start + nbits - 1, start); + else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && + IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && + __builtin_constant_p(nbits & BITMAP_MEM_MASK) && + IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) + memset((char *)map + start / 8, 0, nbits / 8); + else + __bitmap_clear(map, start, nbits); +} #endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index e69c26abe1ea..a1f55fb24bb3 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -35,6 +35,7 @@ all_files := \ stackprotector.h \ std.h \ stdarg.h \ + stdbool.h \ stdint.h \ stdlib.h \ string.h \ diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index b23ac1f04035..06fdef7b291a 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -142,13 +142,13 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "mov x0, sp\n" /* save stack pointer to x0, as arg1 of _start_c */ "and sp, x0, -16\n" /* sp must be 16-byte aligned in the callee */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_AARCH64_H */ diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index cae4afa7c1c7..6180ff99ab43 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -185,15 +185,15 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( - "mov %r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ - 
"and ip, %r0, #-8\n" /* sp must be 8-byte aligned in the callee */ + "mov r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ + "and ip, r0, #-8\n" /* sp must be 8-byte aligned in the callee */ "mov sp, ip\n" "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_ARM_H */ diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 28c26a00a762..ff5afc35bbd8 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -162,7 +162,7 @@ * 2) The deepest stack frame should be set to zero * */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -174,7 +174,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_I386_H */ diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h index 3f8ef8f86c0f..fb519545959e 100644 --- a/tools/include/nolibc/arch-loongarch.h +++ b/tools/include/nolibc/arch-loongarch.h @@ -149,14 +149,14 @@ #endif /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "move $a0, $sp\n" /* save stack pointer to $a0, as arg1 of _start_c */ LONG_BSTRINS " $sp, $zero, 3, 0\n" /* $sp must be 16-byte aligned */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_LOONGARCH_H */ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 62cc50ef3288..1791a8ce58da 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,7 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector __start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( ".set push\n" @@ -194,11 +194,13 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "li $t0, -8\n" "and $sp, $sp, $t0\n" /* $sp must be 8-byte aligned */ "addiu $sp, $sp, -16\n" /* the callee expects to save a0..a3 there */ - "jal _start_c\n" /* transfer to c runtime */ + "lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */ + "ori $t9, %lo(_start_c)\n" + "jalr $t9\n" /* transfer to c runtime */ " nop\n" /* delayed slot */ ".set pop\n" ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_MIPS_H */ diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index ac212e6185b2..ee2fdb8d601d 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -172,7 +172,7 @@ _ret; \ }) -#ifndef __powerpc64__ +#if !defined(__powerpc64__) && !defined(__clang__) /* FIXME: For 32-bit PowerPC, with newer gcc compilers (e.g. 
gcc 13.1.0), * "omit-frame-pointer" fails with __attribute__((no_stack_protector)) but * works with __attribute__((__optimize__("-fno-stack-protector"))) @@ -184,7 +184,7 @@ #endif /* !__powerpc64__ */ /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { #ifdef __powerpc64__ #if _CALL_ELF == 2 @@ -215,7 +215,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "bl _start_c\n" /* transfer to c runtime */ ); #endif - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_POWERPC_H */ diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 1927c643c739..8827bf936212 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -140,7 +140,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( ".option push\n" @@ -151,7 +151,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "andi sp, a0, -16\n" /* sp must be 16-byte aligned */ "call _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_RISCV_H */ diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 5d60fd43f883..2ec13d8b9a2d 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -139,7 +139,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ @@ -147,7 +147,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } struct s390_mmap_arg_struct { diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 68609f421934..1e40620a2b33 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -161,7 +161,7 @@ * 2) The deepest stack frame should be zero (the %rbp). 
* */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -170,7 +170,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #define NOLIBC_ARCH_HAS_MEMMOVE @@ -193,10 +193,10 @@ __asm__ ( "movq %rdi, %rdx\n\t" "subq %rsi, %rdx\n\t" "cmpq %rcx, %rdx\n\t" - "jb .Lbackward_copy\n\t" + "jb 1f\n\t" "rep movsb\n\t" "retq\n" -".Lbackward_copy:" +"1:" /* backward copy */ "leaq -1(%rdi, %rcx, 1), %rdi\n\t" "leaq -1(%rsi, %rcx, 1), %rsi\n\t" "std\n\t" diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index beddc3665d69..9bc6a706a332 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -6,20 +6,30 @@ #ifndef _NOLIBC_COMPILER_H #define _NOLIBC_COMPILER_H +#if defined(__has_attribute) +# define __nolibc_has_attribute(attr) __has_attribute(attr) +#else +# define __nolibc_has_attribute(attr) 0 +#endif + +#if __nolibc_has_attribute(naked) +# define __nolibc_entrypoint __attribute__((naked)) +# define __nolibc_entrypoint_epilogue() +#else +# define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) +# define __nolibc_entrypoint_epilogue() __builtin_unreachable() +#endif /* __nolibc_has_attribute(naked) */ + #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) #define _NOLIBC_STACKPROTECTOR #endif /* defined(__SSP__) ... */ -#if defined(__has_attribute) -# if __has_attribute(no_stack_protector) -# define __no_stack_protector __attribute__((no_stack_protector)) -# else -# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -# endif +#if __nolibc_has_attribute(no_stack_protector) +# define __no_stack_protector __attribute__((no_stack_protector)) #else # define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -#endif /* defined(__has_attribute) */ +#endif /* __nolibc_has_attribute(no_stack_protector) */ #endif /* _NOLIBC_COMPILER_H */ diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index 43b551468c2a..bbcd5fd09806 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -13,23 +13,24 @@ const unsigned long *_auxv __attribute__((weak)); static void __stack_chk_init(void); static void exit(int); -extern void (*const __preinit_array_start[])(void) __attribute__((weak)); -extern void (*const __preinit_array_end[])(void) __attribute__((weak)); +extern void (*const __preinit_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __preinit_array_end[])(int, char **, char**) __attribute__((weak)); -extern void (*const __init_array_start[])(void) __attribute__((weak)); -extern void (*const __init_array_end[])(void) __attribute__((weak)); +extern void (*const __init_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __init_array_end[])(int, char **, char**) __attribute__((weak)); extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); -__attribute__((weak)) +__attribute__((weak,used)) void _start_c(long *sp) { long argc; char **argv; char **envp; int exitcode; - 
void (* const *func)(void); + void (* const *ctor_func)(int, char **, char **); + void (* const *dtor_func)(void); const unsigned long *auxv; /* silence potential warning: conflicting types for 'main' */ int _nolibc_main(int, char **, char **) __asm__ ("main"); @@ -66,16 +67,16 @@ void _start_c(long *sp) ; _auxv = auxv; - for (func = __preinit_array_start; func < __preinit_array_end; func++) - (*func)(); - for (func = __init_array_start; func < __init_array_end; func++) - (*func)(); + for (ctor_func = __preinit_array_start; ctor_func < __preinit_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); + for (ctor_func = __init_array_start; ctor_func < __init_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); /* go to application */ exitcode = _nolibc_main(argc, argv, envp); - for (func = __fini_array_end; func > __fini_array_start;) - (*--func)(); + for (dtor_func = __fini_array_end; dtor_func > __fini_array_start;) + (*--dtor_func)(); exit(exitcode); } diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 989e707263a4..92436b1e4441 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -74,7 +74,8 @@ * -I../nolibc -o hello hello.c -lgcc * * The available standard (but limited) include files are: - * ctype.h, errno.h, signal.h, stdarg.h, stdio.h, stdlib.h, string.h, time.h + * ctype.h, errno.h, signal.h, stdarg.h, stdbool.h stdio.h, stdlib.h, + * string.h, time.h * * In addition, the following ones are expected to be provided by the compiler: * float.h, stddef.h diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 13f1d0e60387..1d0d5259ec41 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,7 +18,7 @@ * triggering stack protector errors themselves */ -__attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) +__attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { pid_t pid; @@ -34,7 +34,7 @@ void __stack_chk_fail_local(void) __stack_chk_fail(); } -__attribute__((weak,section(".data.nolibc_stack_chk"))) +__attribute__((weak,used,section(".data.nolibc_stack_chk"))) uintptr_t __stack_chk_guard; static __no_stack_protector void __stack_chk_init(void) diff --git a/tools/include/nolibc/stdbool.h b/tools/include/nolibc/stdbool.h new file mode 100644 index 000000000000..60feece22f17 --- /dev/null +++ b/tools/include/nolibc/stdbool.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Boolean types support for NOLIBC + * Copyright (C) 2024 Thomas Weißschuh <linux@weissschuh.net> + */ + +#ifndef _NOLIBC_STDBOOL_H +#define _NOLIBC_STDBOOL_H + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* _NOLIBC_STDBOOL_H */ diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index f9ab28421e6d..9ec9c24f38c0 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -7,6 +7,7 @@ #ifndef _NOLIBC_STRING_H #define _NOLIBC_STRING_H +#include "arch.h" #include "std.h" static void *malloc(size_t len); diff --git a/tools/include/uapi/README b/tools/include/uapi/README new file mode 100644 index 000000000000..7147b1b2cb28 --- /dev/null +++ b/tools/include/uapi/README @@ -0,0 +1,73 @@ +Why we want a copy of kernel headers in tools? +============================================== + +There used to be no copies, with tools/ code using kernel headers +directly. 
From time to time tools/perf/ broke due to legitimate kernel
+hacking. At some point Linus complained about such direct usage. Then we
+adopted the current model.
+
+The way these headers are used in perf is not restricted to just
+including them to compile something.
+
+They are sometimes used in scripts that convert defines into string
+tables, etc., so a change may break one of these scripts, or new MSRs
+may use a different #define pattern, etc.
+
+E.g.:
+
+  $ ls -1 tools/perf/trace/beauty/*.sh | head -5
+  tools/perf/trace/beauty/arch_errno_names.sh
+  tools/perf/trace/beauty/drm_ioctl.sh
+  tools/perf/trace/beauty/fadvise.sh
+  tools/perf/trace/beauty/fsconfig.sh
+  tools/perf/trace/beauty/fsmount.sh
+  $
+  $ tools/perf/trace/beauty/fadvise.sh
+  static const char *fadvise_advices[] = {
+        [0] = "NORMAL",
+        [1] = "RANDOM",
+        [2] = "SEQUENTIAL",
+        [3] = "WILLNEED",
+        [4] = "DONTNEED",
+        [5] = "NOREUSE",
+  };
+  $
+
+The tools/perf/check-headers.sh script, part of the tools/ build
+process, points out changes in the original files.
+
+So it's important not to touch the copies in tools/ when making changes
+to the original kernel headers; the copies will be updated later, when
+check-headers.sh informs the perf tools hackers about the change.
+
+Another explanation from Ingo Molnar:
+It's better than all the alternatives we tried so far:
+
+  - Symbolic links and direct #includes: this was the original approach but
+    was pushed back on from the kernel side, when tooling modified the
+    headers and broke them accidentally for kernel builds.
+
+  - Duplicate self-defined ABI headers like glibc: double the maintenance
+    burden, double the chance for mistakes, plus there's no tech-driven
+    notification mechanism to look at new kernel side changes.
+
+What we are doing now is a third option:
+
+  - A software-enforced copy-on-write mechanism of kernel headers to
+    tooling, driven by non-fatal warnings on the tooling side build when
+    kernel headers get modified:
+
+      Warning: Kernel ABI header differences:
+        diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h
+        diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h
+        diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h
+        ...
+
+  The tooling policy is to always pick up the kernel side headers as-is,
+  and integrate them into the tooling build. The warnings above serve as a
+  notification to tooling maintainers that there are changes on the kernel
+  side.
+
+We've been using this for many years now, and it might seem hacky, but
+it works surprisingly well.
+ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index a00d53d02723..5bf6148cac2b 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -737,7 +737,7 @@ __SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) #define __NR_ppoll_time64 414 __SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) #define __NR_io_pgetevents_time64 416 -__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +__SC_COMP(__NR_io_pgetevents_time64, sys_io_pgetevents, compat_sys_io_pgetevents_time64) #define __NR_recvmmsg_time64 417 __SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) #define __NR_mq_timedsend_time64 418 diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index d4d86e566e07..535cb68fdb5c 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -2163,6 +2163,15 @@ struct drm_i915_gem_context_param { * supports this per context flag. */ #define I915_CONTEXT_PARAM_LOW_LATENCY 0xe + +/* + * I915_CONTEXT_PARAM_CONTEXT_IMAGE: + * + * Allows userspace to provide own context images. + * + * Note that this is a debug API not available on production kernel builds. + */ +#define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xf /* Must be kept compact -- no holes and well documented */ /** @value: Context parameter value to be set or queried */ @@ -2564,6 +2573,24 @@ struct i915_context_param_engines { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +struct i915_gem_context_param_context_image { + /** @engine: Engine class & instance to be configured. */ + struct i915_engine_class_instance engine; + + /** @flags: One of the supported flags or zero. */ + __u32 flags; +#define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0) + + /** @size: Size of the image blob pointed to by @image. */ + __u32 size; + + /** @mbz: Must be zero. */ + __u32 mbz; + + /** @image: Userspace memory containing the context image. */ + __u64 image; +} __attribute__((packed)); + /** * struct drm_i915_gem_context_create_ext_setparam - Context parameter * to set or query during context creation. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 638c606dfa74..2f082b01ff22 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -41,6 +41,10 @@ */ #define XDP_UMEM_TX_SW_CSUM (1 << 1) +/* Request to reserve tx_metadata_len bytes of per-chunk metadata. 
+ */ +#define XDP_UMEM_TX_METADATA_LEN (1 << 2) + struct sockaddr_xdp { __u16 sxdp_family; __u16 sxdp_flags; diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index e682ab628dfa..d358add1611c 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -81,6 +81,8 @@ enum { #define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_SMC = 256, /* Shared Memory Communications */ +#define IPPROTO_SMC IPPROTO_SMC IPPROTO_MPTCP = 262, /* Multipath TCP connection */ #define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e5af8c692dc0..637efc055145 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -192,11 +192,24 @@ struct kvm_xen_exit { /* Flags that describe what fields in emulation_failure hold valid data. */ #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) +/* + * struct kvm_run can be modified by userspace at any time, so KVM must be + * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM() + * renames fields in struct kvm_run from <symbol> to <symbol>__unsafe when + * compiled into the kernel, ensuring that any use within KVM is obvious and + * gets extra scrutiny. + */ +#ifdef __KERNEL__ +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe +#else +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol +#endif + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 immediate_exit; + __u8 HINT_UNSAFE_IN_KVM(immediate_exit); __u8 padding1[6]; /* out */ @@ -918,6 +931,8 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 #define KVM_CAP_PRE_FAULT_MEMORY 236 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 +#define KVM_CAP_X86_GUEST_MODE 238 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index a246e11988d5..e89d00528f2f 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. 
*/ /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 43742ac5b00d..7c308f04e7a0 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -93,6 +93,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, + NETDEV_A_PAGE_POOL_DMABUF, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -131,6 +132,7 @@ enum { NETDEV_A_QUEUE_IFINDEX, NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, + NETDEV_A_QUEUE_DMABUF, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -174,6 +176,16 @@ enum { }; enum { + NETDEV_A_DMABUF_IFINDEX = 1, + NETDEV_A_DMABUF_QUEUES, + NETDEV_A_DMABUF_FD, + NETDEV_A_DMABUF_ID, + + __NETDEV_A_DMABUF_MAX, + NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) +}; + +enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, NETDEV_CMD_DEV_DEL_NTF, @@ -186,6 +198,7 @@ enum { NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, + NETDEV_CMD_BIND_RX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 3a64499b0f5d..4842c36fdf80 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 67626d535316..887a25286441 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 
0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index c3e4871967bc..2178862bb114 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -100,3 +100,23 @@ bool __bitmap_intersects(const unsigned long *bitmap1, return true; return false; } + +void __bitmap_clear(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 5dbca76b953f..894860111ddb 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1559,10 +1559,12 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, * Clang for BPF target generates func_proto with no * args as a func_proto with a single void arg (e.g., * `int (*f)(void)` vs just `int (*f)()`). We are - * going to pretend there are no args for such case. + * going to emit valid empty args (void) syntax for + * such case. Similarly and conveniently, valid + * no args case can be special-cased here as well. */ - if (vlen == 1 && p->type == 0) { - btf_dump_printf(d, ")"); + if (vlen == 0 || (vlen == 1 && p->type == 0)) { + btf_dump_printf(d, "void)"); return; } diff --git a/tools/memory-model/Documentation/README b/tools/memory-model/Documentation/README index 304162743a5b..9999c1effdb6 100644 --- a/tools/memory-model/Documentation/README +++ b/tools/memory-model/Documentation/README @@ -9,6 +9,8 @@ depending on what you know and what you would like to learn. Please note that the documents later in this list assume that the reader understands the material provided by documents earlier in this list. +If LKMM-specific terms lost you, glossary.txt might help you. 
+ o You are new to Linux-kernel concurrency: simple.txt o You have some background in Linux-kernel concurrency, and would @@ -21,6 +23,9 @@ o You are familiar with the Linux-kernel concurrency primitives that you need, and just want to get started with LKMM litmus tests: litmus-tests.txt +o You would like to access lock-protected shared variables without + having their corresponding locks held: locking.txt + o You are familiar with Linux-kernel concurrency, and would like a detailed intuitive understanding of LKMM, including situations involving more than two threads: recipes.txt @@ -28,12 +33,18 @@ o You are familiar with Linux-kernel concurrency, and would o You would like a detailed understanding of what your compiler can and cannot do to control dependencies: control-dependencies.txt +o You would like to mark concurrent normal accesses to shared + variables so that intentional "racy" accesses can be properly + documented, especially when you are responding to complaints + from KCSAN: access-marking.txt + o You are familiar with Linux-kernel concurrency and the use of LKMM, and would like a quick reference: cheatsheet.txt o You are familiar with Linux-kernel concurrency and the use of LKMM, and would like to learn about LKMM's requirements, - rationale, and implementation: explanation.txt + rationale, and implementation: explanation.txt and + herd-representation.txt o You are interested in the publications related to LKMM, including hardware manuals, academic literature, standards-committee @@ -61,10 +72,21 @@ control-dependencies.txt explanation.txt Detailed description of the memory model. +glossary.txt + Brief definitions of LKMM-related terms. + +herd-representation.txt + The (abstract) representation of the Linux-kernel concurrency + primitives in terms of events. + litmus-tests.txt The format, features, capabilities, and limitations of the litmus tests that LKMM can evaluate. +locking.txt + Rules for accessing lock-protected shared variables outside of + their corresponding critical sections. + ordering.txt Overview of the Linux kernel's low-level memory-ordering primitives by category. diff --git a/tools/memory-model/Documentation/herd-representation.txt b/tools/memory-model/Documentation/herd-representation.txt new file mode 100644 index 000000000000..ed988906f2b7 --- /dev/null +++ b/tools/memory-model/Documentation/herd-representation.txt @@ -0,0 +1,110 @@ +# +# Legend: +# R, a Load event +# W, a Store event +# F, a Fence event +# LKR, a Lock-Read event +# LKW, a Lock-Write event +# UL, an Unlock event +# LF, a Lock-Fail event +# RL, a Read-Locked event +# RU, a Read-Unlocked event +# R*, a Load event included in RMW +# W*, a Store event included in RMW +# SRCU, a Sleepable-Read-Copy-Update event +# +# po, a Program-Order link +# rmw, a Read-Modify-Write link - every rmw link is a po link +# +# By convention, a blank line in a cell means "same as the preceding line". +# +# Disclaimer. The table includes representations of "add" and "and" operations; +# corresponding/identical representations of "sub", "inc", "dec" and "or", "xor", +# "andnot" operations are omitted. 
+# + ------------------------------------------------------------------------------ + | C macro | Events | + ------------------------------------------------------------------------------ + | Non-RMW ops | | + ------------------------------------------------------------------------------ + | READ_ONCE | R[once] | + | atomic_read | | + | WRITE_ONCE | W[once] | + | atomic_set | | + | smp_load_acquire | R[acquire] | + | atomic_read_acquire | | + | smp_store_release | W[release] | + | atomic_set_release | | + | smp_store_mb | W[once] ->po F[mb] | + | smp_mb | F[mb] | + | smp_rmb | F[rmb] | + | smp_wmb | F[wmb] | + | smp_mb__before_atomic | F[before-atomic] | + | smp_mb__after_atomic | F[after-atomic] | + | spin_unlock | UL | + | spin_is_locked | On success: RL | + | | On failure: RU | + | smp_mb__after_spinlock | F[after-spinlock] | + | smp_mb__after_unlock_lock | F[after-unlock-lock] | + | rcu_read_lock | F[rcu-lock] | + | rcu_read_unlock | F[rcu-unlock] | + | synchronize_rcu | F[sync-rcu] | + | rcu_dereference | R[once] | + | rcu_assign_pointer | W[release] | + | srcu_read_lock | R[srcu-lock] | + | srcu_down_read | | + | srcu_read_unlock | W[srcu-unlock] | + | srcu_up_read | | + | synchronize_srcu | SRCU[sync-srcu] | + | smp_mb__after_srcu_read_unlock | F[after-srcu-read-unlock] | + ------------------------------------------------------------------------------ + | RMW ops w/o return value | | + ------------------------------------------------------------------------------ + | atomic_add | R*[noreturn] ->rmw W*[once] | + | atomic_and | | + | spin_lock | LKR ->po LKW | + ------------------------------------------------------------------------------ + | RMW ops w/ return value | | + ------------------------------------------------------------------------------ + | atomic_add_return | F[mb] ->po R*[once] | + | | ->rmw W*[once] ->po F[mb] | + | atomic_fetch_add | | + | atomic_fetch_and | | + | atomic_xchg | | + | xchg | | + | atomic_add_negative | | + | atomic_add_return_relaxed | R*[once] ->rmw W*[once] | + | atomic_fetch_add_relaxed | | + | atomic_fetch_and_relaxed | | + | atomic_xchg_relaxed | | + | xchg_relaxed | | + | atomic_add_negative_relaxed | | + | atomic_add_return_acquire | R*[acquire] ->rmw W*[once] | + | atomic_fetch_add_acquire | | + | atomic_fetch_and_acquire | | + | atomic_xchg_acquire | | + | xchg_acquire | | + | atomic_add_negative_acquire | | + | atomic_add_return_release | R*[once] ->rmw W*[release] | + | atomic_fetch_add_release | | + | atomic_fetch_and_release | | + | atomic_xchg_release | | + | xchg_release | | + | atomic_add_negative_release | | + ------------------------------------------------------------------------------ + | Conditional RMW ops | | + ------------------------------------------------------------------------------ + | atomic_cmpxchg | On success: F[mb] ->po R*[once] | + | | ->rmw W*[once] ->po F[mb] | + | | On failure: R*[once] | + | cmpxchg | | + | atomic_add_unless | | + | atomic_cmpxchg_relaxed | On success: R*[once] ->rmw W*[once] | + | | On failure: R*[once] | + | atomic_cmpxchg_acquire | On success: R*[acquire] ->rmw W*[once] | + | | On failure: R*[once] | + | atomic_cmpxchg_release | On success: R*[once] ->rmw W*[release] | + | | On failure: R*[once] | + | spin_trylock | On success: LKR ->po LKW | + | | On failure: LF | + ------------------------------------------------------------------------------ diff --git a/tools/memory-model/Documentation/simple.txt b/tools/memory-model/Documentation/simple.txt index 4c789ec8334f..21f06c1d1b70 100644 --- 
a/tools/memory-model/Documentation/simple.txt +++ b/tools/memory-model/Documentation/simple.txt @@ -266,5 +266,5 @@ More complex use cases ====================== If the alternatives above do not do what you need, please look at the -recipes-pairs.txt file to peel off the next layer of the memory-ordering +recipes.txt file to peel off the next layer of the memory-ordering onion. diff --git a/tools/net/ynl/lib/.gitignore b/tools/net/ynl/lib/.gitignore index c18dd8d83cee..296c4035dbf2 100644 --- a/tools/net/ynl/lib/.gitignore +++ b/tools/net/ynl/lib/.gitignore @@ -1 +1,2 @@ __pycache__/ +*.d diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index fcb18a5a6d70..e16cef160bc2 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -696,14 +696,14 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) addr.nl_family = AF_NETLINK; if (bind(ys->socket, (struct sockaddr *)&addr, sizeof(addr)) < 0) { __perr(yse, "unable to bind to a socket address"); - goto err_close_sock;; + goto err_close_sock; } memset(&addr, 0, sizeof(addr)); addrlen = sizeof(addr); if (getsockname(ys->socket, (struct sockaddr *)&addr, &addrlen) < 0) { __perr(yse, "unable to read socket address"); - goto err_close_sock;; + goto err_close_sock; } ys->portid = addr.nl_pid; ys->seq = random(); diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index d42c1d605969..c22c22bf2cb7 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -388,6 +388,8 @@ class NetlinkProtocol: def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) + if op is None: + op = ynl.rsp_by_value[msg.cmd()] fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -921,8 +923,7 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - op = self.rsp_by_value[nl_msg.cmd()] - decoded = self.nlproto.decode(self, nl_msg, op) + decoded = self.nlproto.decode(self, nl_msg, None) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -980,7 +981,7 @@ class YnlFamily(SpecFamily): if nl_msg.extack: self._decode_extack(req_msg, op, nl_msg.extack) else: - op = self.rsp_by_value[nl_msg.cmd()] + op = None req_flags = [] if nl_msg.error: diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 3e7b29bd55d5..22609d44c89a 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -79,7 +79,10 @@ int main(int argc, char **argv) goto err_close; printf("Select ifc ($ifindex; or 0 = dump; or -2 ntf check): "); - scanf("%d", &ifindex); + if (scanf("%d", &ifindex) != 1) { + fprintf(stderr, "Error: unable to parse input\n"); + goto err_destroy; + } if (ifindex > 0) { struct netdev_dev_get_req *req; @@ -119,6 +122,7 @@ int main(int argc, char **argv) err_close: fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: ynl_sock_destroy(ys); return 2; } diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 51529fabd517..717530bc9c52 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2668,13 +2668,15 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() + hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.mode == 'kernel': cw.p('#include <net/netlink.h>') cw.p('#include <net/genetlink.h>') cw.nl() if not args.header: if args.out_file: - cw.p(f'#include "{os.path.basename(args.out_file[:-2])}.h"') + cw.p(f'#include "{hdr_file}"') cw.nl() headers = ['uapi/' 
+ parsed.uapi_header] headers += parsed.kernel_family.get('headers', []) @@ -2686,7 +2688,7 @@ def main(): if family_contains_bitfield32(parsed): cw.p('#include <linux/netlink.h>') else: - cw.p(f'#include "{parsed.name}-user.h"') + cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') headers = [parsed.uapi_header] for definition in parsed['definitions']: diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index 3766886c4bca..83dc87c662b6 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -71,3 +71,31 @@ supported by GCC. UBSan detects undefined behaviors of programs at runtime. $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a If UBSan detects any problem at runtime, it outputs a “runtime error:” message. + +4) Cross compilation +==================== +As Multiarch is commonly supported in Linux distributions, we can install +libraries for multiple architectures on the same system and then cross-compile +Linux perf. For example, Aarch64 libraries and toolchains can be installed on +an x86_64 machine, allowing us to compile perf for an Aarch64 target. + +Below is the command for building the perf with dynamic linking. + + $ cd /path/to/Linux + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +For static linking, the option `LDFLAGS="-static"` is required. + + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ + LDFLAGS="-static" -C tools/perf + +In the embedded system world, a use case is to explicitly specify the package +configuration paths for cross building: + + $ PKG_CONFIG_SYSROOT_DIR="/path/to/cross/build/sysroot" \ + PKG_CONFIG_LIBDIR="/usr/lib/:/usr/local/lib" \ + make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the +variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to +the library paths for cross compilation. diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index a4829b6532d8..fa679db61f62 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -152,7 +152,17 @@ ifdef LIBDW_DIR endif DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -lebl -ldl -lz -llzma -lbz2 + DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 -lzstd + + LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) + LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) + LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION))) + + # Elfutils merged libebl.a into libdw.a starting from version 0.177, + # Link libebl.a only if libdw is older than this version. 
+ ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0) + DWARFLIBS += -lebl + endif endif FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS) FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) $(DWARFLIBS) @@ -182,20 +192,15 @@ endif FEATURE_CHECK_CFLAGS-libzstd := $(LIBZSTD_CFLAGS) FEATURE_CHECK_LDFLAGS-libzstd := $(LIBZSTD_LDFLAGS) +# for linking with debug library, run like: +# make DEBUG=1 PKG_CONFIG_PATH=/opt/libtraceevent/(lib|lib64)/pkgconfig + ifneq ($(NO_LIBTRACEEVENT),1) ifeq ($(call get-executable,$(PKG_CONFIG)),) $(error Error: $(PKG_CONFIG) needed by libtraceevent is missing on this system, please install it) endif endif -# for linking with debug library, run like: -# make DEBUG=1 PKG_CONFIG_PATH=/opt/libtraceevent/(lib|lib64)/pkgconfig -FEATURE_CHECK_CFLAGS-libtraceevent := $(shell $(PKG_CONFIG) --cflags libtraceevent 2>/dev/null) -FEATURE_CHECK_LDFLAGS-libtraceevent := $(shell $(PKG_CONFIG) --libs libtraceevent 2>/dev/null) - -FEATURE_CHECK_CFLAGS-libtracefs := $(shell $(PKG_CONFIG) --cflags libtracefs 2>/dev/null) -FEATURE_CHECK_LDFLAGS-libtracefs := $(shell $(PKG_CONFIG) --libs libtracefs 2>/dev/null) - FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi # include ARCH specific config -include $(src-perf)/arch/$(SRCARCH)/Makefile @@ -301,6 +306,11 @@ endif ifdef PYTHON_CONFIG PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) $(PYTHON_CONFIG_LDFLAGS) 2>/dev/null) + # Update the python flags for cross compilation + ifdef CROSS_COMPILE + PYTHON_NATIVE := $(shell echo $(PYTHON_EMBED_LDOPTS) | sed 's/\(-L.*\/\)\(.*-linux-gnu\).*/\2/') + PYTHON_EMBED_LDOPTS := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EMBED_LDOPTS)) + endif PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null) @@ -902,6 +912,9 @@ else PYTHON_SETUPTOOLS_INSTALLED := $(shell $(PYTHON) -c 'import setuptools;' 2> /dev/null && echo "yes" || echo "no") ifeq ($(PYTHON_SETUPTOOLS_INSTALLED), yes) PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])') + ifdef CROSS_COMPILE + PYTHON_EXTENSION_SUFFIX := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EXTENSION_SUFFIX)) + endif LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX) else $(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent) @@ -1206,6 +1219,8 @@ ifneq ($(NO_LIBTRACEEVENT),1) LIBTRACEFS_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEFS_VERSION))) LIBTRACEFS_VERSION_CPP := $(shell expr $(LIBTRACEFS_VERSION_1) \* 255 \* 255 + $(LIBTRACEFS_VERSION_2) \* 255 + $(LIBTRACEFS_VERSION_3)) CFLAGS += -DLIBTRACEFS_VERSION=$(LIBTRACEFS_VERSION_CPP) + else + $(warning libtracefs is missing. Please install libtracefs-dev/libtracefs-devel) endif endif diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 175e4c7898f0..f8148db5fc38 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -193,7 +193,32 @@ HOSTLD ?= ld HOSTAR ?= ar CLANG ?= clang -PKG_CONFIG = $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packges installed with Multiarch. Use it for cross +# compilation if it is existed. 
+ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for modified system root, is required for the cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. + ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + $(warning Missing PKG_CONFIG_LIBDIR, PKG_CONFIG_PATH and PKG_CONFIG_SYSROOT_DIR for cross compilation,) + $(warning set PKG_CONFIG_LIBDIR for using Multiarch libs.) + endif + endif +endif RM = rm -f LN = ln -f diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 3656f1ca7a21..ebae8415dfbb 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -230,8 +230,10 @@ 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 179 64 pread64 sys_pread64 +179 spu pread64 sys_pread64 180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 180 64 pwrite64 sys_pwrite64 +180 spu pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -246,6 +248,7 @@ 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead 191 64 readahead sys_readahead +191 spu readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 @@ -293,6 +296,7 @@ 232 nospu set_tid_address sys_set_tid_address 233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 233 64 fadvise64 sys_fadvise64 +233 spu fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_ni_syscall 236 common epoll_create sys_epoll_create @@ -502,7 +506,7 @@ 412 32 utimensat_time64 sys_utimensat sys_utimensat 413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index bd0fee24ad10..01071182763e 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - 
compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index a396f6e6ab5b..7093ee21c0d1 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 64-bit system call numbers and entry vectors # # The format is: -# <number> <abi> <name> <entry point> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # @@ -68,7 +69,7 @@ 57 common fork sys_fork 58 common vfork sys_vfork 59 64 execve sys_execve -60 common exit sys_exit +60 common exit sys_exit - noreturn 61 common wait4 sys_wait4 62 common kill sys_kill 63 common uname sys_newuname @@ -239,7 +240,7 @@ 228 common clock_gettime sys_clock_gettime 229 common clock_getres sys_clock_getres 230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group +231 common exit_group sys_exit_group - noreturn 232 common epoll_wait sys_epoll_wait 233 common epoll_ctl sys_epoll_ctl 234 common tgkill sys_tgkill @@ -343,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index de76bbc50bfb..9a95871afc95 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <internal/lib.h> +#include <inttypes.h> #include <subcmd/parse-options.h> #include <api/fd/array.h> #include <api/fs/fs.h> @@ -688,9 +689,9 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* lock */ csv_sep, daemon->base, "lock"); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - daemon->start) / 60); + csv_sep, (uint64_t)((curr - daemon->start) / 60)); fprintf(out, "\n"); } else { @@ -700,8 +701,8 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) daemon->base, SESSION_OUTPUT); fprintf(out, " lock: %s/lock\n", daemon->base); - fprintf(out, " up: %lu minutes\n", - (curr - daemon->start) / 60); + fprintf(out, " up: %" PRIu64 " minutes\n", + (uint64_t)((curr - daemon->start) / 60)); } } @@ -727,9 +728,9 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* session ack */ csv_sep, session->base, SESSION_ACK); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - session->start) / 60); + csv_sep, (uint64_t)((curr - session->start) / 60)); fprintf(out, "\n"); } else { @@ -745,8 +746,8 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) session->base, SESSION_CONTROL); fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); - fprintf(out, " up: %lu minutes\n", - (curr - session->start) / 60); + fprintf(out, " up: %" PRIu64 " minutes\n", + (uint64_t)((curr - session->start) / 60)); } } diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- 
a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json index a9939823b14b..0c9b9a2d2958 100644 --- a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json +++ b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json @@ -74,7 +74,7 @@ { "PublicDescription": "Sent SFENCE.VMA with ASID request to other HART event", "ConfigCode": "0x800000000000000c", - "EventName": "FW_SFENCE_VMA_RECEIVED", + "EventName": "FW_SFENCE_VMA_ASID_SENT", "BriefDescription": "Sent SFENCE.VMA with ASID request to other HART event" }, { diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json index 9b4a032186a7..7149caec4f80 100644 --- a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json @@ -36,7 +36,7 @@ "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" }, { - "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" }, { "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 40132655ccd1..c76f53a90a7b 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -456,11 +456,13 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ /** * Test perf_pmu__match() that's used to search for a PMU given a name passed * on the command line. The name that's passed may also be a filename type glob - * match. + * match. If the name does not match, perf_pmu__match() attempts to match the + * alias of the PMU, if provided. 
*/ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu test_pmu; + test_pmu.alias_name = NULL; test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index e30fd55f8e51..cd3b480d20bd 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -26,7 +26,6 @@ static bool is_ignored_symbol(const char *name, char type) * when --all-symbols is specified so exclude them to get a * stable symbol list. */ - "kallsyms_addresses", "kallsyms_offsets", "kallsyms_relative_base", "kallsyms_num_syms", diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 89d16b90370b..df9cdb8bbfb8 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -76,7 +76,7 @@ struct msghdr { __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; - int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; @@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 45e4e64fd664..753971770733 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -329,12 +329,17 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + +#define PROCFS_IOCTL_MAGIC 'f' /* Pagemap ioctl */ -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) +#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg) /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ #define PAGE_IS_WPALLOWED (1 << 0) @@ -393,4 +398,158 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc/<pid>/maps ioctl */ +#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query) + +enum procmap_query_flags { + /* + * VMA permission flags. + * + * Can be used as part of procmap_query.query_flags field to look up + * only VMAs satisfying specified subset of permissions. 
E.g., specifying + * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs, + * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only + * return read/write VMAs, though both executable/non-executable and + * private/shared will be ignored. + * + * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags + * field to specify actual VMA permissions. + */ + PROCMAP_QUERY_VMA_READABLE = 0x01, + PROCMAP_QUERY_VMA_WRITABLE = 0x02, + PROCMAP_QUERY_VMA_EXECUTABLE = 0x04, + PROCMAP_QUERY_VMA_SHARED = 0x08, + /* + * Query modifier flags. + * + * By default VMA that covers provided address is returned, or -ENOENT + * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest + * VMA with vma_start > addr will be returned if no covering VMA is + * found. + * + * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that + * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA + * to iterate all VMAs with file backing. + */ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10, + PROCMAP_QUERY_FILE_BACKED_VMA = 0x20, +}; + +/* + * Input/output argument structured passed into ioctl() call. It can be used + * to query a set of VMAs (Virtual Memory Areas) of a process. + * + * Each field can be one of three kinds, marked in a short comment to the + * right of the field: + * - "in", input argument, user has to provide this value, kernel doesn't modify it; + * - "out", output argument, kernel sets this field with VMA data; + * - "in/out", input and output argument; user provides initial value (used + * to specify maximum allowable buffer size), and kernel sets it to actual + * amount of data written (or zero, if there is no data). + * + * If matching VMA is found (according to criterias specified by + * query_addr/query_flags, all the out fields are filled out, and ioctl() + * returns 0. If there is no matching VMA, -ENOENT will be returned. + * In case of any other error, negative error code other than -ENOENT is + * returned. + * + * Most of the data is similar to the one returned as text in /proc/<pid>/maps + * file, but procmap_query provides more querying flexibility. There are no + * consistency guarantees between subsequent ioctl() calls, but data returned + * for matched VMA is self-consistent. + */ +struct procmap_query { + /* Query struct size, for backwards/forward compatibility */ + __u64 size; + /* + * Query flags, a combination of enum procmap_query_flags values. + * Defines query filtering and behavior, see enum procmap_query_flags. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_flags; /* in */ + /* + * Query address. By default, VMA that covers this address will + * be looked up. PROCMAP_QUERY_* flags above modify this default + * behavior further. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_addr; /* in */ + /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */ + __u64 vma_flags; /* out */ + /* VMA backing page size granularity. */ + __u64 vma_page_size; /* out */ + /* + * VMA file offset. If VMA has file backing, this specifies offset + * within the file that VMA's start address corresponds to. + * Is set to zero if VMA has no backing file. + */ + __u64 vma_offset; /* out */ + /* Backing file's inode number, or zero, if VMA has no backing file. 
*/ + __u64 inode; /* out */ + /* Backing file's device major/minor number, or zero, if VMA has no backing file. */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + /* + * If set to non-zero value, signals the request to return VMA name + * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix + * appended, if file was unlinked from FS) for matched VMA. VMA name + * can also be some special name (e.g., "[heap]", "[stack]") or could + * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME). + * + * Kernel will set this field to zero, if VMA has no associated name. + * Otherwise kernel will return actual amount of bytes filled in + * user-supplied buffer (see vma_name_addr field below), including the + * terminating zero. + * + * If VMA name is longer that user-supplied maximum buffer size, + * -E2BIG error is returned. + * + * If this field is set to non-zero value, vma_name_addr should point + * to valid user space memory buffer of at least vma_name_size bytes. + * If set to zero, vma_name_addr should be set to zero as well + */ + __u32 vma_name_size; /* in/out */ + /* + * If set to non-zero value, signals the request to extract and return + * VMA's backing file's build ID, if the backing file is an ELF file + * and it contains embedded build ID. + * + * Kernel will set this field to zero, if VMA has no backing file, + * backing file is not an ELF file, or ELF file has no build ID + * embedded. + * + * Build ID is a binary value (not a string). Kernel will set + * build_id_size field to exact number of bytes used for build ID. + * If build ID is requested and present, but needs more bytes than + * user-supplied maximum buffer size (see build_id_addr field below), + * -E2BIG error will be returned. + * + * If this field is set to non-zero value, build_id_addr should point + * to valid user space memory buffer of at least build_id_size bytes. + * If set to zero, build_id_addr should be set to zero as well + */ + __u32 build_id_size; /* in/out */ + /* + * User-supplied address of a buffer of at least vma_name_size bytes + * for kernel to fill with matched VMA's name (see vma_name_size field + * description above for details). + * + * Should be set to zero if VMA name should not be returned. + */ + __u64 vma_name_addr; /* in */ + /* + * User-supplied address of a buffer of at least build_id_size bytes + * for kernel to fill with matched VMA's ELF build ID, if available + * (see build_id_size field description above for details). + * + * Should be set to zero if build ID should not be returned. 
+ */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h index ad5478dbad00..225bc366ffcb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/mount.h +++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 __spare1; + __u32 mnt_opts; /* [str] Mount options of the mount */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -172,7 +172,8 @@ struct statmount { __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; @@ -188,10 +189,12 @@ struct mnt_id_req { __u32 spare; __u64 mnt_id; __u64 param; + __u64 mnt_ns_id; }; /* List of all mnt_id_req versions. */ #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ /* * @mask bits for statmount(2) @@ -202,10 +205,13 @@ struct mnt_id_req { #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ /* * Special @mnt_id values that can be passed to listmount */ #define LSMT_ROOT 0xffffffffffffffff /* root mount */ +#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 67626d535316..887a25286441 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define 
STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h index 628d46a0da92..8bf7e8a0eb6f 100644 --- a/tools/perf/trace/beauty/include/uapi/sound/asound.h +++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 18) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -334,7 +334,7 @@ union snd_pcm_sync_id { unsigned char id[16]; unsigned short id16[8]; unsigned int id32[4]; -}; +} __attribute__((deprecated)); struct snd_pcm_info { unsigned int device; /* RO/WR (control): device number */ @@ -348,7 +348,7 @@ struct snd_pcm_info { int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ unsigned int subdevices_count; unsigned int subdevices_avail; - union snd_pcm_sync_id sync; /* hardware synchronization ID */ + unsigned char pad1[16]; /* was: hardware synchronization ID */ unsigned char reserved[64]; /* reserved for future... */ }; @@ -420,7 +420,8 @@ struct snd_pcm_hw_params { unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ - unsigned char reserved[64]; /* reserved for future */ + unsigned char sync[16]; /* R: synchronization ID (perfect sync - one clock source) */ + unsigned char reserved[48]; /* reserved for future */ }; enum { diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index b4cb3fe5cc25..bc4e92c0c08b 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -286,6 +286,9 @@ static void account_end_timestamp(struct lock_contention *con) goto next; for (int i = 0; i < total_cpus; i++) { + if (cpu_data[i].lock == 0) + continue; + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, &cpu_data[i]); } diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 1730b852a947..6d075648d2cc 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1141,7 +1141,7 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node, bool hide_unresolved) { - struct machine *machine = maps__machine(node->ms.maps); + struct machine *machine = node->ms.maps ? 
maps__machine(node->ms.maps) : NULL; maps__put(al->maps); al->maps = maps__get(node->ms.maps); diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 2340c4f6d0c2..67414944f245 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1501,7 +1501,7 @@ void dso__delete(struct dso *dso) auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache); dso_cache__free(dso); dso__free_a2l(dso); - zfree(&RC_CHK_ACCESS(dso)->symsrc_filename); + dso__free_symsrc_filename(dso); nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo); mutex_destroy(dso__lock(dso)); RC_CHK_FREE(dso); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 878c1f441868..ed0068251c65 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -602,6 +602,11 @@ static inline void dso__set_symsrc_filename(struct dso *dso, char *val) RC_CHK_ACCESS(dso)->symsrc_filename = val; } +static inline void dso__free_symsrc_filename(struct dso *dso) +{ + zfree(&RC_CHK_ACCESS(dso)->symsrc_filename); +} + static inline enum dso_binary_type dso__symtab_type(const struct dso *dso) { return RC_CHK_ACCESS(dso)->symtab_type; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3be882b2e845..31a223eaf8e6 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -20,6 +20,7 @@ #include "util/env.h" #include "util/kvm-stat.h" #include "util/kwork.h" +#include "util/sample.h" #include "util/lock-contention.h" #include <internal/lib.h> #include "../builtin.h" diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index f6a6f6a91030..16c2b03831f3 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -413,7 +413,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso, __func__, dso__symsrc_filename(dso), debuglink); - zfree(&dso__symsrc_filename(dso)); + dso__free_symsrc_filename(dso); } dso__set_symsrc_filename(dso, debuglink); } else { diff --git a/tools/power/cpupower/bindings/python/.gitignore b/tools/power/cpupower/bindings/python/.gitignore new file mode 100644 index 000000000000..5c9a1f0212dd --- /dev/null +++ b/tools/power/cpupower/bindings/python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +raw_pylibcpupower_wrap.c +*.o +*.so +*.py +!test_raw_pylibcpupower.py +# git keeps ignoring this file, use git add -f raw_libcpupower.i +!raw_pylibcpupower.i diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile new file mode 100644 index 000000000000..dc09c5b66ead --- /dev/null +++ b/tools/power/cpupower/bindings/python/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Makefile for libcpupower's Python bindings +# +# This Makefile expects you have already run the makefile for cpupower to build +# the .o files in the lib directory for the bindings to be created. + +CC := gcc +HAVE_SWIG := $(shell if which swig >/dev/null 2>&1; then echo 1; else echo 0; fi) +HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; else echo 0; fi) + +LIB_DIR := ../../lib +PY_INCLUDE = $(firstword $(shell python-config --includes)) +OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) + +all: _raw_pylibcpupower.so + +_raw_pylibcpupower.so: raw_pylibcpupower_wrap.o + $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so + +raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c + $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) + +raw_pylibcpupower_wrap.c: raw_pylibcpupower.i +ifeq ($(HAVE_SWIG),0) + $(error "swig was not found. 
Make sure you have it installed and in the PATH to generate the bindings.") +else ifeq ($(HAVE_PYCONFIG),0) + $(error "python-config was not found. Make sure you have it installed and in the PATH to generate the bindings.") +endif + swig -python raw_pylibcpupower.i + +# Will only clean the bindings folder; will not clean the actual cpupower folder +clean: + rm -f raw_pylibcpupower.py raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.o _raw_pylibcpupower.so diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README new file mode 100644 index 000000000000..0a4bb2581e8a --- /dev/null +++ b/tools/power/cpupower/bindings/python/README @@ -0,0 +1,59 @@ +This folder contains the necessary files to build the Python bindings for +libcpupower (aside from the libcpupower object files). + + +requirements +------------ + +* You need the object files in the libcpupower directory compiled by +cpupower's makefile. +* The SWIG program must be installed. +* The Python's development libraries installed. + +Please check that your version of SWIG is compatible with the version of Python +installed on your machine by checking the SWIG changelog on their website. +https://swig.org/ + +Note that while SWIG itself is GPL v3+ licensed; the resulting output, +the bindings code: is permissively licensed + the license of libcpupower's .o +files. For these bindings that means GPL v2. + +Please see https://swig.org/legal.html and the discussion [1] for more details. + +[1] +https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup/ + + +build +----- + +Install SWIG and the Python development files provided by your distribution. + +Build the object files for libcpupower by running make in the cpupower +directory. + +Return to the directory this README is in to run: + +$ make + + +testing +------- + +Please verify the _raw_pylibcpupower.so and raw_pylibcpupower.py files have +been created. + +To run the test script: + +$ python test_raw_pylibcpupower.py + + +credits +------- + +Original Bindings Author: +John B. 
Wyatt IV +jwyatt@redhat.com +sageofredondo@gmail.com + +Copyright (C) 2024 Red Hat diff --git a/tools/power/cpupower/bindings/python/raw_pylibcpupower.i b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i new file mode 100644 index 000000000000..96556d87a745 --- /dev/null +++ b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +%module raw_pylibcpupower +%{ +#include "../../lib/cpupower_intern.h" +#include "../../lib/acpi_cppc.h" +#include "../../lib/cpufreq.h" +#include "../../lib/cpuidle.h" +#include "../../lib/cpupower.h" +#include "../../lib/powercap.h" +%} + +/* + * cpupower_intern.h + */ + +#define PATH_TO_CPU "/sys/devices/system/cpu/" +#define MAX_LINE_LEN 4096 +#define SYSFS_PATH_MAX 255 + +int is_valid_path(const char *path); + +unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); + +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); + +/* + * acpi_cppc.h + */ + +enum acpi_cppc_value { + HIGHEST_PERF, + LOWEST_PERF, + NOMINAL_PERF, + LOWEST_NONLINEAR_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + REFERENCE_PERF, + WRAPAROUND_TIME, + MAX_CPPC_VALUE_FILES +}; + +unsigned long acpi_cppc_get_data(unsigned int cpu, + enum acpi_cppc_value which); + +/* + * cpufreq.h + */ + +struct cpufreq_policy { + unsigned long min; + unsigned long max; + char *governor; +}; + +struct cpufreq_available_governors { + char *governor; + struct cpufreq_available_governors *next; + struct cpufreq_available_governors *first; +}; + +struct cpufreq_available_frequencies { + unsigned long frequency; + struct cpufreq_available_frequencies *next; + struct cpufreq_available_frequencies *first; +}; + + +struct cpufreq_affected_cpus { + unsigned int cpu; + struct cpufreq_affected_cpus *next; + struct cpufreq_affected_cpus *first; +}; + +struct cpufreq_stats { + unsigned long frequency; + unsigned long long time_in_state; + struct cpufreq_stats *next; + struct cpufreq_stats *first; +}; + +unsigned long cpufreq_get_freq_kernel(unsigned int cpu); + +unsigned long cpufreq_get_freq_hardware(unsigned int cpu); + +#define cpufreq_get(cpu) cpufreq_get_freq_kernel(cpu); + +unsigned long cpufreq_get_transition_latency(unsigned int cpu); + +int cpufreq_get_hardware_limits(unsigned int cpu, + unsigned long *min, + unsigned long *max); + +char *cpufreq_get_driver(unsigned int cpu); + +void cpufreq_put_driver(char *ptr); + +struct cpufreq_policy *cpufreq_get_policy(unsigned int cpu); + +void cpufreq_put_policy(struct cpufreq_policy *policy); + +struct cpufreq_available_governors +*cpufreq_get_available_governors(unsigned int cpu); + +void cpufreq_put_available_governors( + struct cpufreq_available_governors *first); + +struct cpufreq_available_frequencies +*cpufreq_get_available_frequencies(unsigned int cpu); + +void cpufreq_put_available_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_available_frequencies +*cpufreq_get_boost_frequencies(unsigned int cpu); + +void cpufreq_put_boost_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_affected_cpus *cpufreq_get_affected_cpus(unsigned + int cpu); + +void cpufreq_put_affected_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_affected_cpus *cpufreq_get_related_cpus(unsigned + int cpu); + +void cpufreq_put_related_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_stats *cpufreq_get_stats(unsigned int cpu, + unsigned long long *total_time); + +void cpufreq_put_stats(struct 
cpufreq_stats *stats); + +unsigned long cpufreq_get_transitions(unsigned int cpu); + +int cpufreq_set_policy(unsigned int cpu, struct cpufreq_policy *policy); + +int cpufreq_modify_policy_min(unsigned int cpu, unsigned long min_freq); + +int cpufreq_modify_policy_max(unsigned int cpu, unsigned long max_freq); + +int cpufreq_modify_policy_governor(unsigned int cpu, char *governor); + +int cpufreq_set_frequency(unsigned int cpu, + unsigned long target_frequency); + +unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, + const char **table, + unsigned int index, + unsigned int size); + +/* + * cpuidle.h + */ + +int cpuidle_is_state_disabled(unsigned int cpu, + unsigned int idlestate); +int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, + unsigned int disable); +unsigned long cpuidle_state_latency(unsigned int cpu, + unsigned int idlestate); +unsigned long cpuidle_state_usage(unsigned int cpu, + unsigned int idlestate); +unsigned long long cpuidle_state_time(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_name(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_desc(unsigned int cpu, + unsigned int idlestate); +unsigned int cpuidle_state_count(unsigned int cpu); + +char *cpuidle_get_governor(void); + +char *cpuidle_get_driver(void); + +/* + * cpupower.h + */ + +struct cpupower_topology { + /* Amount of CPU cores, packages and threads per core in the system */ + unsigned int cores; + unsigned int pkgs; + unsigned int threads; /* per core */ + + /* Array gets mallocated with cores entries, holding per core info */ + struct cpuid_core_info *core_info; +}; + +struct cpuid_core_info { + int pkg; + int core; + int cpu; + + /* flags */ + unsigned int is_online:1; +}; + +int get_cpu_topology(struct cpupower_topology *cpu_top); + +void cpu_topology_release(struct cpupower_topology cpu_top); + +int cpupower_is_cpu_online(unsigned int cpu); + +/* + * powercap.h + */ + +struct powercap_zone { + char name[MAX_LINE_LEN]; + /* + * sys_name relative to PATH_TO_POWERCAP, + * do not forget the / in between + */ + char sys_name[SYSFS_PATH_MAX]; + int tree_depth; + struct powercap_zone *parent; + struct powercap_zone *children[POWERCAP_MAX_CHILD_ZONES]; + /* More possible caps or attributes to be added? 
*/ + uint32_t has_power_uw:1, + has_energy_uj:1; + +}; + +int powercap_walk_zones(struct powercap_zone *zone, + int (*f)(struct powercap_zone *zone)); + +struct powercap_zone *powercap_init_zones(void); + +int powercap_get_enabled(int *mode); + +int powercap_set_enabled(int mode); + +int powercap_get_driver(char *driver, int buflen); + +int powercap_get_max_energy_range_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_energy_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_max_power_range_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_power_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_zone_get_enabled(struct powercap_zone *zone, int *mode); + +int powercap_zone_set_enabled(struct powercap_zone *zone, int mode); diff --git a/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py new file mode 100755 index 000000000000..3d6f62b9556a --- /dev/null +++ b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only + +import raw_pylibcpupower as p + +# Simple function call + +""" +Get cstate count +""" +cpu_cstates_count = p.cpuidle_state_count(0) +if cpu_cstates_count > -1: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +""" +Disable cstate (will fail if the above is 0, ex: a virtual machine) +""" +cstate_disabled = p.cpuidle_state_disable(0, 0, 1) +if cpu_cstates_count == 0: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +match cstate_disabled: + case 0: + print(f"CPU state disabled") + case -1: + print(f"Idlestate not available") + case _: + print(f"Not documented") + + +# Pointer example + +topo = p.cpupower_topology() +total_cpus = p.get_cpu_topology(topo) +if total_cpus > 0: + print(f"Number of total cpus: {total_cpus} and number of cores: {topo.cores}") +else: + print(f"Error: could not get cpu topology") diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index 479c5971aa6d..0ecac009273c 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -116,6 +116,7 @@ enum idlestate_value { IDLESTATE_USAGE, IDLESTATE_POWER, IDLESTATE_LATENCY, + IDLESTATE_RESIDENCY, IDLESTATE_TIME, IDLESTATE_DISABLE, MAX_IDLESTATE_VALUE_FILES @@ -125,6 +126,7 @@ static const char *idlestate_value_files[MAX_IDLESTATE_VALUE_FILES] = { [IDLESTATE_USAGE] = "usage", [IDLESTATE_POWER] = "power", [IDLESTATE_LATENCY] = "latency", + [IDLESTATE_RESIDENCY] = "residency", [IDLESTATE_TIME] = "time", [IDLESTATE_DISABLE] = "disable", }; @@ -254,6 +256,12 @@ unsigned long cpuidle_state_latency(unsigned int cpu, return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_LATENCY); } +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate) +{ + return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_RESIDENCY); +} + unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate) { diff --git a/tools/power/cpupower/lib/cpuidle.h b/tools/power/cpupower/lib/cpuidle.h index 2e10fead2e1e..2ab404d40259 100644 --- a/tools/power/cpupower/lib/cpuidle.h +++ b/tools/power/cpupower/lib/cpuidle.h @@ -8,6 +8,8 @@ int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, unsigned int disable); unsigned long cpuidle_state_latency(unsigned int 
cpu, unsigned int idlestate); +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate); unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate); unsigned long long cpuidle_state_time(unsigned int cpu, diff --git a/tools/power/cpupower/lib/powercap.c b/tools/power/cpupower/lib/powercap.c index a7a59c6bacda..94a0c69e55ef 100644 --- a/tools/power/cpupower/lib/powercap.c +++ b/tools/power/cpupower/lib/powercap.c @@ -78,6 +78,14 @@ int powercap_get_enabled(int *mode) } /* + * TODO: implement function. Returns dummy 0 for now. + */ +int powercap_set_enabled(int mode) +{ + return 0; +} + +/* * Hardcoded, because rapl is the only powercap implementation - * this needs to get more generic if more powercap implementations * should show up diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 44126a87fa7a..e0d17f0de3fe 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -64,6 +64,8 @@ static void cpuidle_cpu_output(unsigned int cpu, int verbose) printf(_("Latency: %lu\n"), cpuidle_state_latency(cpu, idlestate)); + printf(_("Residency: %lu\n"), + cpuidle_state_residency(cpu, idlestate)); printf(_("Usage: %lu\n"), cpuidle_state_usage(cpu, idlestate)); printf(_("Duration: %llu\n"), @@ -115,6 +117,8 @@ static void proc_cpuidle_cpu_output(unsigned int cpu) printf(_("promotion[--] demotion[--] ")); printf(_("latency[%03lu] "), cpuidle_state_latency(cpu, cstate)); + printf(_("residency[%05lu] "), + cpuidle_state_residency(cpu, cstate)); printf(_("usage[%08lu] "), cpuidle_state_usage(cpu, cstate)); printf(_("duration[%020Lu] \n"), diff --git a/tools/power/pm-graph/.gitignore b/tools/power/pm-graph/.gitignore new file mode 100644 index 000000000000..37762a8a06d6 --- /dev/null +++ b/tools/power/pm-graph/.gitignore @@ -0,0 +1,3 @@ +# sleepgraph.py artifacts +suspend-[0-9]*-[0-9]* +suspend-[0-9]*-[0-9]*-x[0-9]* diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile index b5310832c19c..aeddbaf2d4c4 100644 --- a/tools/power/pm-graph/Makefile +++ b/tools/power/pm-graph/Makefile @@ -1,51 +1,86 @@ # SPDX-License-Identifier: GPL-2.0 -PREFIX ?= /usr -DESTDIR ?= +# +# Copyright (c) 2013, Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# Authors: +# Todd Brandt <todd.e.brandt@linux.intel.com> + +# Prefix to the directories we're installing to +DESTDIR ?= + +# Directory definitions. These are default and most probably +# do not need to be changed. 
Please note that DESTDIR is +# added in front of any of them + +BINDIR ?= /usr/bin +MANDIR ?= /usr/share/man +LIBDIR ?= /usr/lib + +# Toolchain: what tools do we use, and what options do they need: +INSTALL = /usr/bin/install +INSTALL_DATA = ${INSTALL} -m 644 all: @echo "Nothing to build" install : uninstall - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph - install sleepgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install bootgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/cgskip.txt $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-x2-proc.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - - install -d $(DESTDIR)$(PREFIX)/bin - ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph - ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph - - install -d $(DESTDIR)$(PREFIX)/share/man/man8 - install bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 - install sleepgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) sleepgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) bootgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/cgskip.txt $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-x2-proc.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + + $(INSTALL) -d $(DESTDIR)$(BINDIR) + ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(BINDIR)/bootgraph + ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(BINDIR)/sleepgraph + + $(INSTALL) -d $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) bootgraph.8 $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) sleepgraph.8 $(DESTDIR)$(MANDIR)/man8 uninstall : - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/bootgraph.8 - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/sleepgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/bootgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/sleepgraph.8 - rm -f $(DESTDIR)$(PREFIX)/bin/bootgraph - rm -f $(DESTDIR)$(PREFIX)/bin/sleepgraph + rm -f $(DESTDIR)$(BINDIR)/bootgraph + rm -f $(DESTDIR)$(BINDIR)/sleepgraph - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/config/* - 
if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/config; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/config/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/config ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/config; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__ ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__ ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph; \ fi; + +help: + @echo 'Building targets:' + @echo ' all - Nothing to build' + @echo ' install - Install the program and create necessary directories' + @echo ' uninstall - Remove installed files and directories' + +.PHONY: all install uninstall help diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index b1e6817f1e54..3946d5254a1f 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -46,6 +46,7 @@ snapshot: turbostat @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h @echo PWD=. > $(SNAPSHOT)/Makefile @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8d37acd39201..067717bce1d4 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--add attributes\fP add column with counter having specified 'attributes'. The 'location' attribute is required, all others are optional. .nf - location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP} + location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf/<device>/<event>\fP} msrDDD is a decimal offset, eg. msr16 msr0xXXX is a hex offset, eg. msr0x10 /sys/path... is an absolute path to a sysfs attribute + <device> is a perf device from /sys/bus/event_source/devices/<device> eg. cstate_core + <event> is a perf event for given device from /sys/bus/event_source/devices/<device>/events/<event> eg. c1-residency + perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP} sample and print the counter for every cpu, core, or package. @@ -52,6 +55,39 @@ name as necessary to disambiguate it from others is necessary. Note that option as the column header. .fi .PP +\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to --add option above, but require PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE". 
+.nf + name="name_string" + For column header. + + type={\fBraw\fP} + 'raw' shows the counter contents in hex. + default: raw + + format={\fBraw\fP | \fBdelta\fP} + 'raw' shows the counter contents in hex. + 'delta' shows the difference in values during the measurement interval. + default: raw + + domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP} + 'cpu' per cpu/thread counter. + 'core' per core counter. + 'package' per package counter. + '%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4. + + offset=\fB%u\fP + '%u' offset within the PMT MMIO region. + + lsb=\fB%u\fP + '%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask. + + msb=\fB%u\fP + '%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask. + + guid=\fB%x\fP + '%x' hex identifier of the PMT MMIO region. +.fi +.PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP \fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. @@ -67,10 +103,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP -+\fB--no-msr\fP Disable all the uses of the MSR driver. -+.PP -+\fB--no-perf\fP Disable all the uses of the perf API. -+.PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP +\fB--no-perf\fP Disable all the uses of the perf API. +.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. @@ -320,7 +356,7 @@ available on all processors. Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL), labeling it with the column header, "PRF_CTRL", and display it only once, -afte the conclusion of a 0.1 second sleep. +after the conclusion of a 0.1 second sleep. .nf sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1 0.101604 sec @@ -333,6 +369,56 @@ CPU PRF_CTRL .fi +.SH ADD PERF COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. +We add a counter showing time spent in C1 core cstate, +labeling it with the column header, "pCPU%c1", and display it only once, +after the conclusion of 0.1 second sleep. +We also show CPU%c1 built-in counter that should show similar values. +.nf +sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1 +0.102448 sec +CPU pCPU%c1 CPU%c1 +- 34.89 34.89 +0 45.99 45.99 +1 45.94 45.94 +2 23.83 23.83 +3 23.84 23.84 + +.fi + +.SH ADD PMT COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number 0. +We add two counters, showing crystal clock count and the DC6 residency. 
+All the parameters passed are based on the metadata found in the PMT XML files. + +For the crystal clock count, we +label it with the column header, "XTAL", +we set the type to 'raw', to read the number of clock ticks in hex, +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +For the DC6 residency counter, we +label it with the column header, "Die%c6", +we set the type to 'txtal_time', to obtain the percent residency value +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +.nf +sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102 +0.104352 sec +CPU XTAL Die%c6 +- 0x0000006d4d957ca7 0.00 +0 0x0000006d4d957ca7 0.00 +0.102448 sec +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9f5d053d4bc6..089220aaa5c9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9,6 +9,30 @@ #define _GNU_SOURCE #include MSRHEADER + +// copied from arch/x86/include/asm/cpu_device_id.h +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) +// end copied section + +#define X86_VENDOR_INTEL 0 + #include INTEL_FAMILY_HEADER #include BUILD_BUG_HEADER #include <stdarg.h> @@ -20,6 +44,7 @@ #include <sys/stat.h> #include <sys/select.h> #include <sys/resource.h> +#include <sys/mman.h> #include <fcntl.h> #include <signal.h> #include <sys/time.h> @@ -55,15 +80,39 @@ */ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define PERF_NAME_BYTES 128 #define MAX_NOFILE 0x8000 +#define COUNTER_KIND_PERF_PREFIX "perf/" +#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX) +#define PERF_DEV_NAME_BYTES 32 +#define PERF_EVT_NAME_BYTES 32 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, 
COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; -enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; -enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; -enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; +enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; + +struct perf_counter_info { + struct perf_counter_info *next; + + /* How to open the counter / What counter it is. */ + char device[PERF_DEV_NAME_BYTES]; + char event[PERF_EVT_NAME_BYTES]; + + /* How to show/format the counter. */ + char name[PERF_NAME_BYTES]; + unsigned int width; + enum counter_scope scope; + enum counter_type type; + enum counter_format format; + double scale; + + /* For reading the counter. */ + int *fd_perf_per_domain; + size_t num_domains; +}; struct sysfs_path { char path[PATH_BYTES]; @@ -144,6 +193,7 @@ struct msr_counter bic[] = { { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -205,11 +255,12 @@ struct msr_counter bic[] = { #define BIC_SAM_mc6 (1ULL << 55) #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) +#define BIC_Diec6 (1ULL << 58) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -252,7 +303,6 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. 
*/ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -267,6 +317,7 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int has_aperf; +unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; @@ -307,7 +358,6 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; -enum amperf_source amperf_source; enum gfx_sysfs_idx { GFX_rc6, @@ -367,7 +417,7 @@ struct platform_features { }; struct platform_data { - unsigned int model; + unsigned int vfm; const struct platform_features *features; }; @@ -910,75 +960,75 @@ static const struct platform_features amd_features_with_rapl = { }; static const struct platform_data turbostat_pdata[] = { - { INTEL_FAM6_NEHALEM, &nhm_features }, - { INTEL_FAM6_NEHALEM_G, &nhm_features }, - { INTEL_FAM6_NEHALEM_EP, &nhm_features }, - { INTEL_FAM6_NEHALEM_EX, &nhx_features }, - { INTEL_FAM6_WESTMERE, &nhm_features }, - { INTEL_FAM6_WESTMERE_EP, &nhm_features }, - { INTEL_FAM6_WESTMERE_EX, &nhx_features }, - { INTEL_FAM6_SANDYBRIDGE, &snb_features }, - { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, - { INTEL_FAM6_IVYBRIDGE, &ivb_features }, - { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, - { INTEL_FAM6_HASWELL, &hsw_features }, - { INTEL_FAM6_HASWELL_X, &hsx_features }, - { INTEL_FAM6_HASWELL_L, &hswl_features }, - { INTEL_FAM6_HASWELL_G, &hswg_features }, - { INTEL_FAM6_BROADWELL, &bdw_features }, - { INTEL_FAM6_BROADWELL_G, &bdwg_features }, - { INTEL_FAM6_BROADWELL_X, &bdx_features }, - { INTEL_FAM6_BROADWELL_D, &bdx_features }, - { INTEL_FAM6_SKYLAKE_L, &skl_features }, - { INTEL_FAM6_SKYLAKE, &skl_features }, - { INTEL_FAM6_SKYLAKE_X, &skx_features }, - { INTEL_FAM6_KABYLAKE_L, &skl_features }, - { INTEL_FAM6_KABYLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE_L, &skl_features }, - { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_X, &icx_features }, - { INTEL_FAM6_ICELAKE_D, &icx_features }, - { INTEL_FAM6_ICELAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, - { INTEL_FAM6_ROCKETLAKE, &cnl_features }, - { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, - { INTEL_FAM6_TIGERLAKE, &cnl_features }, - { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, - { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, - { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, - { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &adl_features }, - { INTEL_FAM6_ALDERLAKE_L, &adl_features }, - { INTEL_FAM6_RAPTORLAKE, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, - { INTEL_FAM6_METEORLAKE, &cnl_features }, - { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE_H, &arl_features }, - { INTEL_FAM6_ARROWLAKE_U, &arl_features }, - { INTEL_FAM6_ARROWLAKE, &arl_features }, - { INTEL_FAM6_LUNARLAKE_M, &arl_features }, - { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, - { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, - { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, - { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, - { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, - { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, - { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, - { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, - { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, - { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, - { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, - { 
INTEL_FAM6_XEON_PHI_KNL, &knl_features }, - { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + { INTEL_NEHALEM, &nhm_features }, + { INTEL_NEHALEM_G, &nhm_features }, + { INTEL_NEHALEM_EP, &nhm_features }, + { INTEL_NEHALEM_EX, &nhx_features }, + { INTEL_WESTMERE, &nhm_features }, + { INTEL_WESTMERE_EP, &nhm_features }, + { INTEL_WESTMERE_EX, &nhx_features }, + { INTEL_SANDYBRIDGE, &snb_features }, + { INTEL_SANDYBRIDGE_X, &snx_features }, + { INTEL_IVYBRIDGE, &ivb_features }, + { INTEL_IVYBRIDGE_X, &ivx_features }, + { INTEL_HASWELL, &hsw_features }, + { INTEL_HASWELL_X, &hsx_features }, + { INTEL_HASWELL_L, &hswl_features }, + { INTEL_HASWELL_G, &hswg_features }, + { INTEL_BROADWELL, &bdw_features }, + { INTEL_BROADWELL_G, &bdwg_features }, + { INTEL_BROADWELL_X, &bdx_features }, + { INTEL_BROADWELL_D, &bdx_features }, + { INTEL_SKYLAKE_L, &skl_features }, + { INTEL_SKYLAKE, &skl_features }, + { INTEL_SKYLAKE_X, &skx_features }, + { INTEL_KABYLAKE_L, &skl_features }, + { INTEL_KABYLAKE, &skl_features }, + { INTEL_COMETLAKE, &skl_features }, + { INTEL_COMETLAKE_L, &skl_features }, + { INTEL_CANNONLAKE_L, &cnl_features }, + { INTEL_ICELAKE_X, &icx_features }, + { INTEL_ICELAKE_D, &icx_features }, + { INTEL_ICELAKE_L, &cnl_features }, + { INTEL_ICELAKE_NNPI, &cnl_features }, + { INTEL_ROCKETLAKE, &cnl_features }, + { INTEL_TIGERLAKE_L, &cnl_features }, + { INTEL_TIGERLAKE, &cnl_features }, + { INTEL_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_EMERALDRAPIDS_X, &spr_features }, + { INTEL_GRANITERAPIDS_X, &spr_features }, + { INTEL_LAKEFIELD, &cnl_features }, + { INTEL_ALDERLAKE, &adl_features }, + { INTEL_ALDERLAKE_L, &adl_features }, + { INTEL_RAPTORLAKE, &adl_features }, + { INTEL_RAPTORLAKE_P, &adl_features }, + { INTEL_RAPTORLAKE_S, &adl_features }, + { INTEL_METEORLAKE, &cnl_features }, + { INTEL_METEORLAKE_L, &cnl_features }, + { INTEL_ARROWLAKE_H, &arl_features }, + { INTEL_ARROWLAKE_U, &arl_features }, + { INTEL_ARROWLAKE, &arl_features }, + { INTEL_LUNARLAKE_M, &arl_features }, + { INTEL_ATOM_SILVERMONT, &slv_features }, + { INTEL_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_ATOM_AIRMONT, &amt_features }, + { INTEL_ATOM_GOLDMONT, &gmt_features }, + { INTEL_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_ATOM_TREMONT, &tmt_features }, + { INTEL_ATOM_TREMONT_L, &tmt_features }, + { INTEL_ATOM_GRACEMONT, &adl_features }, + { INTEL_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_XEON_PHI_KNL, &knl_features }, + { INTEL_XEON_PHI_KNM, &knl_features }, /* * Missing support for - * INTEL_FAM6_ICELAKE - * INTEL_FAM6_ATOM_SILVERMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_NP + * INTEL_ICELAKE + * INTEL_ATOM_SILVERMONT_MID + * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, }; @@ -1003,11 +1053,11 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - if (!genuine_intel || family != 6) + if (!genuine_intel) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (turbostat_pdata[i].model == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } @@ -1034,8 +1084,13 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 +#define PMT_MAX_ADDED_THREAD_COUNTERS 
24 +#define PMT_MAX_ADDED_CORE_COUNTERS 8 +#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 +#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) + /* Indexes used to map data read from perf and MSRs into global variables */ enum rapl_rci_index { RAPL_RCI_INDEX_ENERGY_PKG = 0, @@ -1056,19 +1111,13 @@ enum rapl_unit { struct rapl_counter_info_t { unsigned long long data[NUM_RAPL_COUNTERS]; - enum rapl_source source[NUM_RAPL_COUNTERS]; + enum counter_source source[NUM_RAPL_COUNTERS]; unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; - - union { - /* Active when source == RAPL_SOURCE_MSR */ - struct { - unsigned long long msr[NUM_RAPL_COUNTERS]; - unsigned long long msr_mask[NUM_RAPL_COUNTERS]; - int msr_shift[NUM_RAPL_COUNTERS]; - }; - }; + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; int fd_perf; }; @@ -1224,7 +1273,7 @@ enum ccstate_rci_index { struct cstate_counter_info_t { unsigned long long data[NUM_CSTATE_COUNTERS]; - enum cstate_source source[NUM_CSTATE_COUNTERS]; + enum counter_source source[NUM_CSTATE_COUNTERS]; unsigned long long msr[NUM_CSTATE_COUNTERS]; int fd_perf_core; int fd_perf_pkg; @@ -1361,6 +1410,167 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { }, }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum msr_rci_index { + MSR_RCI_INDEX_APERF = 0, + MSR_RCI_INDEX_MPERF = 1, + MSR_RCI_INDEX_SMI = 2, + NUM_MSR_COUNTERS, +}; + +struct msr_counter_info_t { + unsigned long long data[NUM_MSR_COUNTERS]; + enum counter_source source[NUM_MSR_COUNTERS]; + unsigned long long msr[NUM_MSR_COUNTERS]; + unsigned long long msr_mask[NUM_MSR_COUNTERS]; + int fd_perf; +}; + +struct msr_counter_info_t *msr_counter_info; +unsigned int msr_counter_info_size; + +struct msr_counter_arch_info { + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + bool needed; + bool present; +}; + +enum msr_arch_info_index { + MSR_ARCH_INFO_APERF_INDEX = 0, + MSR_ARCH_INFO_MPERF_INDEX = 1, + MSR_ARCH_INFO_SMI_INDEX = 2, +}; + +static struct msr_counter_arch_info msr_counter_arch_infos[] = { + [MSR_ARCH_INFO_APERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, + + [MSR_ARCH_INFO_MPERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, + + [MSR_ARCH_INFO_SMI_INDEX] = { + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, +}; + +/* Can be redefined when compiling, useful for testing. */ +#ifndef SYSFS_TELEM_PATH +#define SYSFS_TELEM_PATH "/sys/class/intel_pmt" +#endif + +#define PMT_COUNTER_MTL_DC6_OFFSET 120 +#define PMT_COUNTER_MTL_DC6_LSB 0 +#define PMT_COUNTER_MTL_DC6_MSB 63 +#define PMT_MTL_DC6_GUID 0x1a067102 + +#define PMT_COUNTER_NAME_SIZE_BYTES 16 +#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 + +struct pmt_mmio { + struct pmt_mmio *next; + + unsigned int guid; + unsigned int size; + + /* Base pointer to the mmaped memory. 
*/ + void *mmio_base; + + /* + * Offset to be applied to the mmio_base + * to get the beginning of the PMT counters for given GUID. + */ + unsigned long pmt_offset; +} *pmt_mmios; + +enum pmt_datatype { + PMT_TYPE_RAW, + PMT_TYPE_XTAL_TIME, +}; + +struct pmt_domain_info { + /* + * Pointer to the MMIO obtained by applying a counter offset + * to the mmio_base of the mmaped region for the given GUID. + * + * This is where to read the raw value of the counter from. + */ + unsigned long *pcounter; +}; + +struct pmt_counter { + struct pmt_counter *next; + + /* PMT metadata */ + char name[PMT_COUNTER_NAME_SIZE_BYTES]; + enum pmt_datatype type; + enum counter_scope scope; + unsigned int lsb; + unsigned int msb; + + /* BIC-like metadata */ + enum counter_format format; + + unsigned int num_domains; + struct pmt_domain_info *domains; +}; + +unsigned int pmt_counter_get_width(const struct pmt_counter *p) +{ + return (p->msb - p->lsb) + 1; +} + +void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) +{ + struct pmt_domain_info *new_mem; + + new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + if (!new_mem) { + fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); + exit(1); + } + + /* Zero initialize just allocated memory. */ + const size_t num_new_domains = new_size - pcounter->num_domains; + + memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains)); + + pcounter->num_domains = new_size; + pcounter->domains = new_mem; +} + +void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) +{ + /* + * Allocate more memory ahead of time. + * + * Always allocate space for at least 8 elements + * and double the size when growing. 
+ */ + if (new_size < 8) + new_size = 8; + new_size = MAX(new_size, pcounter->num_domains * 2); + + pmt_counter_resize_(pcounter, new_size); +} + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1378,6 +1588,8 @@ struct thread_data { unsigned int flags; bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1391,6 +1603,8 @@ struct core_data { unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1423,7 +1637,10 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; + unsigned long long die_c6; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1558,12 +1775,25 @@ int idx_valid(int idx) } struct sys_counters { + /* MSR added counters */ unsigned int added_thread_counters; unsigned int added_core_counters; unsigned int added_package_counters; struct msr_counter *tp; struct msr_counter *cp; struct msr_counter *pp; + + /* perf added counters */ + unsigned int added_thread_perf_counters; + unsigned int added_core_perf_counters; + unsigned int added_package_perf_counters; + struct perf_counter_info *perf_tp; + struct perf_counter_info *perf_cp; + struct perf_counter_info *perf_pp; + + struct pmt_counter *pmt_tp; + struct pmt_counter *pmt_cp; + struct pmt_counter *pmt_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1747,7 +1977,7 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -1823,6 +2053,23 @@ int probe_msr(int cpu, off_t offset) return 0; } +/* Convert CPU ID to domain ID for given added perf counter. */ +unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu) +{ + switch (pc->scope) { + case SCOPE_CPU: + return cpu; + + case SCOPE_CORE: + return cpus[cpu].physical_core_id; + + case SCOPE_PACKAGE: + return cpus[cpu].physical_package_id; + } + + __builtin_unreachable(); +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1846,9 +2093,12 @@ void help(void) "to print statistics, until interrupted.\n" " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" + " eg. 
--add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " debug messages are printed to stderr\n" " -D, --Dump displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" @@ -1955,6 +2205,8 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) void print_header(char *delim) { struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2012,6 +2264,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_tp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_tp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2052,6 +2338,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_cp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_cp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2096,6 +2416,8 @@ void print_header(char *delim) outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : "")); if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : "")); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_LPI)) outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : "")); if (DO_BIC(BIC_SYS_LPI)) @@ -2147,6 +2469,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_pp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? 
delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_pp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + outp += sprintf(outp, "\n"); } @@ -2267,6 +2623,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *fmt8; int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; char *delim = "\t"; int printed = 0; @@ -2404,6 +2762,51 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + /* Added perf counters */ + for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + if (pp->type == COUNTER_USEC) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->perf_counter[i] / interval_float / 10000); + else + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc); + } + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = t->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2447,6 +2850,44 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), 100.0 * c->perf_counter[i] / tsc); + } + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = c->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2526,6 +2967,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); + if (DO_BIC(BIC_Diec6)) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) outp += @@ -2601,6 +3046,47 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); + } else if (pp->type == COUNTER_K2M) { + outp += + sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = p->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), value_converted); + break; + } + } + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -2654,6 +3140,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2674,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->pc8 = new->pc8 - old->pc8; old->pc9 = new->pc9 - old->pc9; old->pc10 = new->pc10 - old->pc10; + old->die_c6 = new->die_c6 - old->die_c6; old->cpu_lpi = new->cpu_lpi - old->cpu_lpi; old->sys_lpi = new->sys_lpi - old->sys_lpi; old->pkg_temp_c = new->pkg_temp_c; @@ -2714,6 +3203,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->counter[i] = new->counter[i] - old->counter[i]; } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else if (pp->format == FORMAT_AVERAGE) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2721,6 +3226,8 @@ void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -2737,6 +3244,20 @@ void delta_core(struct core_data *new, struct core_data *old) else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -2754,6 +3275,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -2832,6 +3355,21 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2908,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->pc8 = 0; p->pc9 = 0; p->pc10 = 0; + p->die_c6 = 0; p->cpu_lpi = 0; p->sys_lpi = 0; @@ -2934,6 +3473,14 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; + + 
memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); + memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); + memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); + + memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter)); + memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter)); + memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -2954,6 +3501,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -2984,6 +3533,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.counter[i] += t->counter[i]; } + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.threads.perf_counter[i] += t->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] += t->pmt_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3004,6 +3563,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.counter[i] += c->counter[i]; } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.cores.perf_counter[i] += c->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] += c->pmt_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3027,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pc8 += p->pc8; average.packages.pc9 += p->pc9; average.packages.pc10 += p->pc10; + average.packages.die_c6 += p->die_c6; average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; @@ -3055,6 +3625,18 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) else average.packages.counter[i] += p->counter[i]; } + + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.perf_counter[i] = p->perf_counter[i]; + else + average.packages.perf_counter[i] += p->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] += p->pmt_counter[i]; + } + return 0; } @@ -3066,6 +3648,8 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3108,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data average.packages.pc8 /= topo.allowed_packages; average.packages.pc9 /= topo.allowed_packages; average.packages.pc10 /= topo.allowed_packages; + average.packages.die_c6 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -3137,6 +3722,45 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.counter[i] /= topo.allowed_packages; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + 
continue; + if (pp->type == COUNTER_ITEMS) { + if (average.threads.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + continue; + } + average.threads.perf_counter[i] /= topo.allowed_cpus; + } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.cores.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.cores.perf_counter[i] /= topo.allowed_cores; + } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.packages.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.packages.perf_counter[i] /= topo.allowed_packages; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] /= topo.allowed_cpus; + } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] /= topo.allowed_cores; + } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -3382,30 +4006,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned int read_msr_type(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/type"; - const char *const format = "%u"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_aperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_mperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; @@ -3417,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned int read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_perf_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; - const char *const format = "event=%x"; + FILE *fconfig = NULL; char path[128]; + char config_str[64]; + unsigned int config; + unsigned int umask; + bool has_config = false; + bool has_umask = false; + unsigned int ret = -1; snprintf(path, sizeof(path), path_format, subsys, event_name); - return read_perf_counter_info_n(path, format); + fconfig = fopen(path, "r"); + if (!fconfig) + return -1; + + if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str) + goto cleanup_and_exit; + + for (char *pconfig_str = &config_str[0]; pconfig_str;) { + if (sscanf(pconfig_str, "event=%x", &config) == 1) { + has_config = true; + goto next; + } + + if (sscanf(pconfig_str, "umask=%x", &umask) == 1) { + has_umask = true; + goto next; + } + +next: + pconfig_str = strchr(pconfig_str, ','); + if (pconfig_str) { + *pconfig_str = '\0'; + ++pconfig_str; + } + } + + if (!has_umask) + umask = 0; + + if (has_config) + ret = (umask << 8) | config; + +cleanup_and_exit: + fclose(fconfig); + return ret; } static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) @@ -3444,7 +4084,7 
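/*
 * Editor's sketch, not part of the patch: the value read_perf_config()
 * above returns.  For a sysfs event description such as
 * "event=0x3c,umask=0x01" (a hypothetical example string), the parsed
 * fields are packed into a raw perf_event_attr.config value:
 *
 *	unsigned int event = 0x3c, umask = 0x01;     // parsed via sscanf()
 *	unsigned int config = (umask << 8) | event;  // yields 0x013c
 *
 * Entries with no umask field (for example the msr/aperf and msr/mperf
 * events) keep umask == 0, so the returned value is just the event number.
 */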
@@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na return RAPL_UNIT_INVALID; } -static double read_perf_rapl_scale(const char *subsys, const char *event_name) +static double read_perf_scale(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; const char *const format = "%lf"; @@ -3459,130 +4099,12 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name) return scale; } -static struct amperf_group_fd open_amperf_fd(int cpu) -{ - const unsigned int msr_type = read_msr_type(); - const unsigned int aperf_config = read_aperf_config(); - const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; - - fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); - - return fds; -} - -static int get_amperf_fd(int cpu) -{ - assert(fd_amperf_percpu); - - if (fd_amperf_percpu[cpu].aperf) - return fd_amperf_percpu[cpu].aperf; - - fd_amperf_percpu[cpu] = open_amperf_fd(cpu); - - return fd_amperf_percpu[cpu].aperf; -} - -/* Read APERF, MPERF and TSC using the perf API. */ -static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) -{ - union { - struct { - unsigned long nr_entries; - unsigned long aperf; - unsigned long mperf; - }; - - unsigned long as_array[3]; - } cnt; - - const int fd_amperf = get_amperf_fd(cpu); - - /* - * Read the TSC with rdtsc, because we want the absolute value and not - * the offset from the start of the counter. - */ - t->tsc = rdtsc(); - - const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); - - if (n != sizeof(cnt.as_array)) - return -2; - - t->aperf = cnt.aperf * aperf_mperf_multiplier; - t->mperf = cnt.mperf * aperf_mperf_multiplier; - - return 0; -} - -/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ -static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) -{ - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - int aperf_mperf_retry_count = 0; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - -retry: - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. 
- */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); - } - aperf_mperf_retry_count = 0; - - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; - - return 0; -} - size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) { size_t ret = 0; for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) - if (rci->source[i] == RAPL_SOURCE_PERF) + if (rci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3593,7 +4115,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t size_t ret = 0; for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) - if (cci->source[i] == CSTATE_SOURCE_PERF) + if (cci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3611,7 +4133,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3634,14 +4156,14 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { - case RAPL_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case RAPL_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); @@ -3650,8 +4172,8 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct ++pi; break; - case RAPL_SOURCE_MSR: - if (debug) + case COUNTER_SOURCE_MSR: + if (debug >= 2) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); assert(!no_msr); @@ -3709,15 +4231,15 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat struct cstate_counter_info_t *cci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); - memset(perf_data, 0, sizeof(perf_data)); - memset(perf_data_core, 0, sizeof(perf_data_core)); - memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + ZERO_ARRAY(perf_data); + ZERO_ARRAY(perf_data_core); + ZERO_ARRAY(perf_data_pkg); cci = &ccstate_counter_info[cpu]; @@ -3772,30 +4294,28 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { - case CSTATE_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case CSTATE_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); - } cci->data[i] = perf_data[pi]; ++pi; break; - case CSTATE_SOURCE_MSR: + case COUNTER_SOURCE_MSR: assert(!no_msr); if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); - } break; } @@ -3809,7 +4329,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat * when invoked for the thread sibling. 
*/ #define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ - if (cci->source[index] != CSTATE_SOURCE_NONE) \ + if (cci->source[index] != COUNTER_SOURCE_NONE) \ out_counter = cci->data[index]; \ } while (0) @@ -3833,6 +4353,135 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return 0; } +size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) + if (mci->source[i] == COUNTER_SOURCE_PERF) + ++ret; + + return ret; +} + +int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) +{ + unsigned long long perf_data[NUM_MSR_COUNTERS + 1]; + + struct msr_counter_info_t *mci; + + if (debug >= 2) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(msr_counter_info); + assert(cpu <= msr_counter_info_size); + + mci = &msr_counter_info[cpu]; + + ZERO_ARRAY(perf_data); + ZERO_ARRAY(mci->data); + + if (mci->fd_perf != -1) { + const size_t num_perf_counters = msr_counter_info_count_perf(mci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) { + switch (mci->source[i]) { + case COUNTER_SOURCE_NONE: + break; + + case COUNTER_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(mci->fd_perf != -1); + + if (debug >= 2) + fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]); + + mci->data[i] = perf_data[pi]; + + ++pi; + break; + + case COUNTER_SOURCE_MSR: + assert(!no_msr); + + if (get_msr(cpu, mci->msr[i], &mci->data[i])) + return -2 - i; + + mci->data[i] &= mci->msr_mask[i]; + + if (debug >= 2) + fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); + + break; + } + } + + BUILD_BUG_ON(NUM_MSR_COUNTERS != 3); + t->aperf = mci->data[MSR_RCI_INDEX_APERF]; + t->mperf = mci->data[MSR_RCI_INDEX_MPERF]; + t->smi_count = mci->data[MSR_RCI_INDEX_SMI]; + + return 0; +} + +int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size) +{ + unsigned int domain; + unsigned long long value; + int fd_counter; + + for (size_t i = 0; pp; ++i, pp = pp->next) { + domain = cpu_to_domain(pp, cpu); + assert(domain < pp->num_domains); + + fd_counter = pp->fd_perf_per_domain[domain]; + + if (fd_counter == -1) + continue; + + if (read(fd_counter, &value, sizeof(value)) != sizeof(value)) + return 1; + + assert(i < out_size); + out[i] = value * pp->scale; + } + + return 0; +} + +unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) +{ + unsigned long mask; + + if (msb == 63) + mask = 0xffffffffffffffff; + else + mask = ((1 << (msb + 1)) - 1); + + mask -= (1 << lsb) - 1; + + return mask; +} + +unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id) +{ + assert(domain_id < ppmt->num_domains); + + const unsigned long *pmmio = ppmt->domains[domain_id].pcounter; + const unsigned long value = pmmio ? *pmmio : 0; + const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb); + const unsigned long value_shift = ppmt->lsb; + + return (value & value_mask) >> value_shift; +} + /* * get_counters(...) 
* migrate to cpu @@ -3843,6 +4492,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int cpu = t->cpu_id; unsigned long long msr; struct msr_counter *mp; + struct pmt_counter *pp; int i; int status; @@ -3858,24 +4508,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) - || soft_c1_residency_display(BIC_Avg_MHz)) { - int status = -1; - - assert(!no_perf || !no_msr); - - switch (amperf_source) { - case AMPERF_SOURCE_PERF: - status = read_aperf_mperf_tsc_perf(t, cpu); - break; - case AMPERF_SOURCE_MSR: - status = read_aperf_mperf_tsc_msr(t, cpu); - break; - } - - if (status != 0) - return status; - } + get_smi_aperf_mperf(cpu, t); if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) @@ -3883,11 +4516,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; - if (DO_BIC(BIC_SMI)) { - if (get_msr(cpu, MSR_SMI_COUNT, &msr)) - return -5; - t->smi_count = msr & 0xFFFFFFFF; - } get_cstate_counters(cpu, t, c, p); @@ -3896,6 +4524,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next) + t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id); + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -3934,6 +4568,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next) + c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -4006,6 +4646,13 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } + + if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next) + p->pmt_counter[i] = pmt_read_counter(pp, p->package_id); + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -4469,25 +5116,6 @@ void free_fd_percpu(void) fd_percpu = NULL; } -void free_fd_amperf_percpu(void) -{ - int i; - - if (!fd_amperf_percpu) - return; - - for (i = 0; i < topo.max_cpu_num + 1; ++i) { - if (fd_amperf_percpu[i].mperf != 0) - close(fd_amperf_percpu[i].mperf); - - if (fd_amperf_percpu[i].aperf != 0) - close(fd_amperf_percpu[i].aperf); - } - - free(fd_amperf_percpu); - fd_amperf_percpu = NULL; -} - void free_fd_instr_count_percpu(void) { if (!fd_instr_count_percpu) @@ -4522,6 +5150,21 @@ void free_fd_cstate(void) ccstate_counter_info_size = 0; } +void free_fd_msr(void) +{ + if (!msr_counter_info) + return; + + for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) { + if (msr_counter_info[cpu].fd_perf != -1) + close(msr_counter_info[cpu].fd_perf); + } + + free(msr_counter_info); + msr_counter_info = NULL; + msr_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if 
(!rapl_counter_info_perdomain) @@ -4539,6 +5182,36 @@ void free_fd_rapl_percpu(void) rapl_counter_info_perdomain_size = 0; } +void free_fd_added_perf_counters_(struct perf_counter_info *pp) +{ + if (!pp) + return; + + if (!pp->fd_perf_per_domain) + return; + + while (pp) { + for (size_t domain = 0; domain < pp->num_domains; ++domain) { + if (pp->fd_perf_per_domain[domain] != -1) { + close(pp->fd_perf_per_domain[domain]); + pp->fd_perf_per_domain[domain] = -1; + } + } + + free(pp->fd_perf_per_domain); + pp->fd_perf_per_domain = NULL; + + pp = pp->next; + } +} + +void free_fd_added_perf_counters(void) +{ + free_fd_added_perf_counters_(sys.perf_tp); + free_fd_added_perf_counters_(sys.perf_cp); + free_fd_added_perf_counters_(sys.perf_pp); +} + void free_all_buffers(void) { int i; @@ -4581,9 +5254,10 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); - free_fd_amperf_percpu(); + free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); + free_fd_added_perf_counters(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4918,16 +5592,22 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); +void added_perf_counters_init(void); +void pmt_init(void); void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + msr_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); + pmt_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -6779,22 +7459,13 @@ static int has_instr_count_access(void) return has_access; } -bool is_aperf_access_required(void) -{ - return BIC_IS_ENABLED(BIC_Avg_MHz) - || BIC_IS_ENABLED(BIC_Busy) - || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC) - || BIC_IS_ENABLED(BIC_CPU_c1); -} - int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, double *scale_, enum rapl_unit *unit_) { if (no_perf) return -1; - const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name); if (scale == 0.0) return -1; @@ -6805,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const unsigned int rapl_type = read_perf_type(cai->perf_subsys); - const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -6826,7 +7497,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct { int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; @@ -6846,14 +7517,6 @@ void linux_perf_init(void) if (fd_instr_count_percpu == NULL) err(-1, "calloc fd_instr_count_percpu"); } - - const bool aperf_required = is_aperf_access_required(); - - if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { - fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); - if (fd_amperf_percpu == NULL) - err(-1, "calloc fd_amperf_percpu"); - } } void rapl_perf_init(void) @@ -6875,7 +7538,7 @@ void rapl_perf_init(void) rci->fd_perf = -1; for (size_t i = 0; i < 
NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; - rci->source[i] = RAPL_SOURCE_NONE; + rci->source[i] = COUNTER_SOURCE_NONE; } } @@ -6917,14 +7580,14 @@ void rapl_perf_init(void) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { - rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->source[cai->rci_index] = COUNTER_SOURCE_PERF; rci->scale[cai->rci_index] = scale * cai->compat_scale; rci->unit[cai->rci_index] = unit; rci->flags[cai->rci_index] = cai->flags; /* Use MSR for this counter */ } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { - rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->source[cai->rci_index] = COUNTER_SOURCE_MSR; rci->msr[cai->rci_index] = cai->msr; rci->msr_mask[cai->rci_index] = cai->msr_mask; rci->msr_shift[cai->rci_index] = cai->msr_shift; @@ -6934,7 +7597,7 @@ void rapl_perf_init(void) } } - if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE) has_counter = 1; } @@ -6946,75 +7609,11 @@ void rapl_perf_init(void) free(domain_visited); } -static int has_amperf_access_via_msr(void) -{ - if (no_msr) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_APERF)) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_MPERF)) - return 0; - - return 1; -} - -static int has_amperf_access_via_perf(void) -{ - struct amperf_group_fd fds; - - /* - * Cache the last result, so we don't warn the user multiple times - * - * Negative means cached, no access - * Zero means not cached - * Positive means cached, has access - */ - static int has_access_cached; - - if (no_perf) - return 0; - - if (has_access_cached != 0) - return has_access_cached > 0; - - fds = open_amperf_fd(base_cpu); - has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); - - if (fds.aperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "APERF perf counter", "--no-perf"); - else - close(fds.aperf); - - if (fds.mperf == -1) - warnx("Failed to access %s. 
Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "MPERF perf counter", "--no-perf"); - else - close(fds.mperf); - - if (has_access_cached == 0) - has_access_cached = -1; - - return has_access_cached > 0; -} - -/* Check if we can access APERF and MPERF */ +/* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - if (!is_aperf_access_required()) - return 0; - - if (!no_msr && has_amperf_access_via_msr()) - return 1; - - if (!no_perf && has_amperf_access_via_perf()) - return 1; - - return 0; + return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) @@ -7039,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); @@ -7057,12 +7656,120 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st { int ret = add_cstate_perf_counter_(cpu, cci, cai); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; } +int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + if (no_perf) + return -1; + + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + int ret = add_msr_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu); + + return ret; +} + +void msr_perf_init_(void) +{ + const int mci_num = topo.max_cpu_num + 1; + + msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info)); + if (!msr_counter_info) + err(1, "calloc msr_counter_info"); + msr_counter_info_size = mci_num; + + for (int cpu = 0; cpu < mci_num; ++cpu) + msr_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) { + + struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx]; + + cai->present = false; + + for (int cpu = 0; cpu < mci_num; ++cpu) { + + struct msr_counter_info_t *const cci = &msr_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + if (cai->needed) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) { + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; + cai->present = true; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + cci->msr_mask[cai->rci_index] = cai->msr_mask; + cai->present = true; + } + } + } + } +} + +/* Initialize data for 
reading perf counters from the MSR group. */ +void msr_perf_init(void) +{ + bool need_amperf = false, need_smi = false; + const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1); + + need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1; + + if (BIC_IS_ENABLED(BIC_SMI)) + need_smi = true; + + /* Enable needed counters */ + msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi; + + msr_perf_init_(); + + const bool has_amperf = has_amperf_access(); + const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present; + + has_aperf_access = has_amperf; + + if (has_amperf) { + BIC_PRESENT(BIC_Avg_MHz); + BIC_PRESENT(BIC_Busy); + BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_SMI); + } + + if (has_smi) + BIC_PRESENT(BIC_SMI); +} + void cstate_perf_init_(bool soft_c1) { bool has_counter; @@ -7127,17 +7834,17 @@ void cstate_perf_init_(bool soft_c1) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { - cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; /* User MSR for this counter */ } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit && probe_msr(cpu, cai->msr) == 0) { - cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } } - if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = true; pkg_visited[pkg_id] = true; @@ -7320,12 +8027,6 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf && has_amperf_access()) { - BIC_PRESENT(BIC_Avg_MHz); - BIC_PRESENT(BIC_Busy); - BIC_PRESENT(BIC_Bzy_MHz); - BIC_PRESENT(BIC_IPC); - } do_dts = eax & (1 << 0); if (do_dts) BIC_PRESENT(BIC_CoreTmp); @@ -7442,6 +8143,11 @@ static void counter_info_init(void) if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) { + msr_counter_arch_infos[i].present = false; + msr_counter_arch_infos[i].needed = false; + } } void probe_pm_features(void) @@ -7817,100 +8523,446 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } -static void set_amperf_source(void) +bool has_added_counters(void) { - amperf_source = AMPERF_SOURCE_PERF; + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ - const bool aperf_required = is_aperf_access_required(); + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} - if (no_perf || !aperf_required || !has_amperf_access_via_perf()) - amperf_source = AMPERF_SOURCE_MSR; +void check_msr_access(void) +{ + check_dev_msr(); + check_msr_permission(); - if (quiet || !debug) - return; + if (no_msr) + bic_disable_msr_access(); +} - fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? 
"msr" : "perf"); +void check_perf_access(void) +{ + if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; } -bool has_added_counters(void) +int added_perf_counters_init_(struct perf_counter_info *pinfo) +{ + size_t num_domains = 0; + unsigned int next_domain; + bool *domain_visited; + unsigned int perf_type, perf_config; + double perf_scale; + int fd_perf; + + if (!pinfo) + return 0; + + const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1)); + + domain_visited = calloc(max_num_domains, sizeof(*domain_visited)); + + while (pinfo) { + switch (pinfo->scope) { + case SCOPE_CPU: + num_domains = topo.max_cpu_num + 1; + break; + + case SCOPE_CORE: + num_domains = topo.max_core_id + 1; + break; + + case SCOPE_PACKAGE: + num_domains = topo.max_package_id + 1; + break; + } + + /* Allocate buffer for file descriptor for each domain. */ + pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain)); + if (!pinfo->fd_perf_per_domain) + errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain"); + + for (size_t i = 0; i < num_domains; ++i) + pinfo->fd_perf_per_domain[i] = -1; + + pinfo->num_domains = num_domains; + pinfo->scale = 1.0; + + memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + next_domain = cpu_to_domain(pinfo, cpu); + + assert(next_domain < num_domains); + + if (cpu_is_not_allowed(cpu)) + continue; + + if (domain_visited[next_domain]) + continue; + + perf_type = read_perf_type(pinfo->device); + if (perf_type == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "type"); + continue; + } + + perf_config = read_perf_config(pinfo->device, pinfo->event); + if (perf_config == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "config"); + continue; + } + + /* Scale is not required, some counters just don't have it. 
*/ + perf_scale = read_perf_scale(pinfo->device, pinfo->event); + if (perf_scale == 0.0) + perf_scale = 1.0; + + fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); + if (fd_perf == -1) { + warnx("%s: perf/%s/%s: failed to open counter on cpu%d", + __func__, pinfo->device, pinfo->event, cpu); + continue; + } + + domain_visited[next_domain] = 1; + pinfo->fd_perf_per_domain[next_domain] = fd_perf; + pinfo->scale = perf_scale; + + if (debug) + fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + } + + pinfo = pinfo->next; + } + + free(domain_visited); + + return 0; +} + +void added_perf_counters_init(void) +{ + if (added_perf_counters_init_(sys.perf_tp)) + errx(1, "%s: %s", __func__, "thread"); + + if (added_perf_counters_init_(sys.perf_cp)) + errx(1, "%s: %s", __func__, "core"); + + if (added_perf_counters_init_(sys.perf_pp)) + errx(1, "%s: %s", __func__, "package"); +} + +int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output) +{ + int fd_telem_info; + FILE *file_telem_info; + unsigned long value; + + fd_telem_info = openat(fd_dir, info_filename, O_RDONLY); + if (fd_telem_info == -1) + return -1; + + file_telem_info = fdopen(fd_telem_info, "r"); + if (file_telem_info == NULL) { + close(fd_telem_info); + return -1; + } + + if (fscanf(file_telem_info, format, &value) != 1) { + fclose(file_telem_info); + return -1; + } + + fclose(file_telem_info); + + *output = value; + + return 0; +} + +struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) { + DIR *dirp; + struct dirent *entry; + struct stat st; + unsigned int telem_idx; + int fd_telem_dir, fd_pmt; + unsigned long guid, size, offset; + size_t mmap_size; + void *mmio; + struct pmt_mmio *ret = NULL; + + if (stat(SYSFS_TELEM_PATH, &st) == -1) + return NULL; + + dirp = opendir(SYSFS_TELEM_PATH); + if (dirp == NULL) + return NULL; + + for (;;) { + entry = readdir(dirp); + + if (entry == NULL) + break; + + if (strcmp(entry->d_name, ".") == 0) + continue; + + if (strcmp(entry->d_name, "..") == 0) + continue; + + if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1) + continue; + + if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) { + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY); + if (fd_telem_dir == -1) { + break; + } + + if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { + close(fd_telem_dir); + break; + } + + if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) { + close(fd_telem_dir); + break; + } + + if (guid != target_guid) { + close(fd_telem_dir); + continue; + } + + if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) { + close(fd_telem_dir); + break; + } + + assert(offset == 0); + + fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY); + if (fd_pmt == -1) + goto loop_cleanup_and_break; + + mmap_size = (size + 0x1000UL) & (~0x1000UL); + mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); + if (mmio != MAP_FAILED) { + + if (debug) + fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio); + + ret = calloc(1, sizeof(*ret)); + + if (!ret) { + fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__); + exit(1); + } + + ret->guid = guid; + ret->mmio_base = mmio; + ret->pmt_offset = offset; + ret->size = size; + + ret->next = pmt_mmios; + pmt_mmios = ret; + } + +loop_cleanup_and_break: + close(fd_pmt); + close(fd_telem_dir); + break; + } + + closedir(dirp); + + return ret; 
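/*
 * Editor's sketch, not part of the patch: the directory walk above matches
 * entries under /sys/class/intel_pmt/; for a matching device the files
 * consulted look like the following (values illustrative, not quoted from
 * the patch):
 *
 *	/sys/class/intel_pmt/telem2/guid    -> 0x1a067102
 *	/sys/class/intel_pmt/telem2/size    -> telemetry region size in bytes
 *	/sys/class/intel_pmt/telem2/offset  -> 0 (asserted above)
 *	/sys/class/intel_pmt/telem2/telem   -> file that is mmap()ed read-only
 *
 * Only the first entry whose guid matches target_guid is mapped; the result
 * is prepended to the pmt_mmios list so pmt_mmio_find() can reuse it later.
 */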
+} + +struct pmt_mmio *pmt_mmio_find(unsigned int guid) +{ + struct pmt_mmio *pmmio = pmt_mmios; + + while (pmmio) { + if (pmmio->guid == guid) + return pmmio; + + pmmio = pmmio->next; + } + + return NULL; +} + +void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +{ + char *ret; + + /* Get base of mmaped PMT file. */ + ret = (char *)pmmio->mmio_base; + /* - * It only makes sense to call this after the command line is parsed, - * otherwise sys structure is not populated. + * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. + * It's not guaranteed that the mmaped memory begins with the telemetry data + * - we might have to apply the offset first. */ + ret += pmmio->pmt_offset; - return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; + /* Apply the counter offset to get the address to the mmaped counter. */ + ret += counter_offset; + + return ret; } -bool is_msr_access_required(void) +struct pmt_mmio *pmt_add_guid(unsigned int guid) { - if (no_msr) - return false; - - if (has_added_counters()) - return true; - - return BIC_IS_ENABLED(BIC_SMI) - || BIC_IS_ENABLED(BIC_CPU_c1) - || BIC_IS_ENABLED(BIC_CPU_c3) - || BIC_IS_ENABLED(BIC_CPU_c6) - || BIC_IS_ENABLED(BIC_CPU_c7) - || BIC_IS_ENABLED(BIC_Mod_c6) - || BIC_IS_ENABLED(BIC_CoreTmp) - || BIC_IS_ENABLED(BIC_Totl_c0) - || BIC_IS_ENABLED(BIC_Any_c0) - || BIC_IS_ENABLED(BIC_GFX_c0) - || BIC_IS_ENABLED(BIC_CPUGFX) - || BIC_IS_ENABLED(BIC_Pkgpc3) - || BIC_IS_ENABLED(BIC_Pkgpc6) - || BIC_IS_ENABLED(BIC_Pkgpc2) - || BIC_IS_ENABLED(BIC_Pkgpc7) - || BIC_IS_ENABLED(BIC_Pkgpc8) - || BIC_IS_ENABLED(BIC_Pkgpc9) - || BIC_IS_ENABLED(BIC_Pkgpc10) - /* TODO: Multiplex access with perf */ - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_PkgWatt) - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_GFXWatt) - || BIC_IS_ENABLED(BIC_RAMWatt) - || BIC_IS_ENABLED(BIC_Pkg_J) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_GFX_J) - || BIC_IS_ENABLED(BIC_RAM_J) - || BIC_IS_ENABLED(BIC_PKG__) - || BIC_IS_ENABLED(BIC_RAM__) - || BIC_IS_ENABLED(BIC_PkgTmp) - || (is_aperf_access_required() && !has_amperf_access_via_perf()); + struct pmt_mmio *ret; + + ret = pmt_mmio_find(guid); + if (!ret) + ret = pmt_mmio_open(guid); + + return ret; } -void check_msr_access(void) +enum pmt_open_mode { + PMT_OPEN_TRY, /* Open failure is not an error. */ + PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */ +}; + +struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name) { - if (!is_msr_access_required()) - no_msr = 1; + while (pcounter) { + if (strcmp(pcounter->name, name) == 0) + break; - check_dev_msr(); - check_msr_permission(); + pcounter = pcounter->next; + } - if (no_msr) - bic_disable_msr_access(); + return pcounter; } -void check_perf_access(void) +struct pmt_counter **pmt_get_scope_root(enum counter_scope scope) { - const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + switch (scope) { + case SCOPE_CPU: + return &sys.pmt_tp; + case SCOPE_CORE: + return &sys.pmt_cp; + case SCOPE_PACKAGE: + return &sys.pmt_pp; + } - if (no_perf || !intrcount_required || !has_instr_count_access()) - bic_enabled &= ~BIC_IPC; + __builtin_unreachable(); +} - const bool aperf_required = is_aperf_access_required(); +void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id) +{ + /* Make sure the new domain fits. 
*/ + if (domain_id >= pcounter->num_domains) + pmt_counter_resize(pcounter, domain_id + 1); - if (!aperf_required || !has_amperf_access()) { - bic_enabled &= ~BIC_Avg_MHz; - bic_enabled &= ~BIC_Busy; - bic_enabled &= ~BIC_Bzy_MHz; - bic_enabled &= ~BIC_IPC; + assert(pcounter->domains); + assert(domain_id < pcounter->num_domains); + + pcounter->domains[domain_id].pcounter = pmmio; +} + +int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) +{ + struct pmt_mmio *mmio; + struct pmt_counter *pcounter; + struct pmt_counter **const pmt_root = pmt_get_scope_root(scope); + bool new_counter = false; + int conflict = 0; + + if (lsb > msb) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name); + exit(1); + } + + if (msb >= 64) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name); + exit(1); + } + + mmio = pmt_add_guid(guid); + if (!mmio) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid); + exit(1); + } + + return 1; + } + + if (offset >= mmio->size) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size); + exit(1); + } + + return 1; + } + + pcounter = pmt_find_counter(*pmt_root, name); + if (!pcounter) { + pcounter = calloc(1, sizeof(*pcounter)); + new_counter = true; + } + + if (new_counter) { + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1); + pcounter->type = type; + pcounter->scope = scope; + pcounter->lsb = lsb; + pcounter->msb = msb; + pcounter->format = format; + } else { + conflict += pcounter->type != type; + conflict += pcounter->scope != scope; + conflict += pcounter->lsb != lsb; + conflict += pcounter->msb != msb; + conflict += pcounter->format != format; + } + + if (conflict) { + fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", + __func__, name); + exit(1); + } + + pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id); + + if (new_counter) { + pcounter->next = *pmt_root; + *pmt_root = pcounter; + } + + return 0; +} + +void pmt_init(void) +{ + if (BIC_IS_ENABLED(BIC_Diec6)) { + pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, + PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, + 0, PMT_OPEN_TRY); } } @@ -7923,16 +8975,18 @@ void turbostat_init() process_cpuid(); counter_info_init(); probe_pm_features(); - set_amperf_source(); + msr_perf_init(); linux_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); + pmt_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (DO_BIC(BIC_IPC)) - (void)get_instr_count_fd(base_cpu); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1) + BIC_PRESENT(BIC_IPC); /* * If TSC tweak is needed, but couldn't get it, @@ -8017,7 +9071,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2024.07.26 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -8049,7 +9103,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) for (mp = head; mp; mp = mp->next) { if (debug) 
- printf("%s: %s %s\n", __func__, name, mp->name); + fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name); if (!strncmp(name, mp->name, strlen(mp->name))) return mp; } @@ -8066,8 +9120,8 @@ int add_counter(unsigned int msr_num, char *path, char *name, errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); if (debug) - printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, - path, name, width, scope, type, format, flags, id); + fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", + __func__, msr_num, path, name, width, scope, type, format, flags, id); switch (scope) { @@ -8075,7 +9129,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.tp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { @@ -8087,7 +9141,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.cp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { @@ -8099,7 +9153,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.pp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { @@ -8116,6 +9170,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; strncpy(msrp->name, name, NAME_BYTES - 1); msrp->width = width; @@ -8156,11 +9211,106 @@ int add_counter(unsigned int msr_num, char *path, char *name, return 0; } -void parse_add_command(char *add_command) +/* + * Initialize the fields used for identifying and opening the counter. + * + * Defer the initialization of any runtime buffers for actually reading + * the counters for when we initialize all perf counters, so we can later + * easily call re_initialize(). 
+ */ +struct perf_counter_info *make_perf_counter_info(const char *perf_device, + const char *perf_event, + const char *name, + unsigned int width, + enum counter_scope scope, + enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + pinfo = calloc(1, sizeof(*pinfo)); + if (!pinfo) + errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event); + + strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1); + strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1); + + strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1); + pinfo->width = width; + pinfo->scope = scope; + pinfo->type = type; + pinfo->format = format; + + return pinfo; +} + +int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width, + enum counter_scope scope, enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + switch (scope) { + case SCOPE_CPU: + if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_CORE: + if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_PACKAGE: + if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + } + + pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format); + + if (!pinfo) + return -1; + + switch (scope) { + case SCOPE_CPU: + pinfo->next = sys.perf_tp; + sys.perf_tp = pinfo; + ++sys.added_thread_perf_counters; + break; + + case SCOPE_CORE: + pinfo->next = sys.perf_cp; + sys.perf_cp = pinfo; + ++sys.added_core_perf_counters; + break; + + case SCOPE_PACKAGE: + pinfo->next = sys.perf_pp; + sys.perf_pp = pinfo; + ++sys.added_package_perf_counters; + break; + } + + // FIXME: we might not have debug here yet + if (debug) + fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + + return 0; +} + +void parse_add_command_msr(char *add_command) { int msr_num = 0; char *path = NULL; - char name_buffer[NAME_BYTES] = ""; + char perf_device[PERF_DEV_NAME_BYTES] = ""; + char perf_event[PERF_EVT_NAME_BYTES] = ""; + char name_buffer[PERF_NAME_BYTES] = ""; int width = 64; int fail = 0; enum counter_scope scope = SCOPE_CPU; @@ -8175,6 +9325,11 @@ void parse_add_command(char *add_command) if (sscanf(add_command, "msr%d", &msr_num) == 1) goto next; + BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31); + BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31); + if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2) + goto next; + if (*add_command == '/') { path = add_command; goto next; @@ -8222,7 +9377,8 @@ void parse_add_command(char *add_command) goto next; } - if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { /* 18 < NAME_BYTES */ + BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18); + if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { char *eos; eos = strchr(name_buffer, ','); @@ -8239,21 +9395,33 @@ next: } } - if ((msr_num == 0) && (path == NULL)) { - fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n"); + if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) { + fprintf(stderr, "--add: (msrDDD | msr0xXXX 
| /path_to_counter | perf/device/event ) required\n"); fail++; } + /* Test for non-empty perf_device and perf_event */ + const bool is_perf_counter = perf_device[0] && perf_event[0]; + /* generate default column header */ if (*name_buffer == '\0') { - if (width == 32) - sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); - else - sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + if (is_perf_counter) { + snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event); + } else { + if (width == 32) + sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + else + sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + } } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) - fail++; + if (is_perf_counter) { + if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format)) + fail++; + } else { + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) + fail++; + } if (fail) { help(); @@ -8261,6 +9429,195 @@ next: } } +bool starts_with(const char *str, const char *prefix) +{ + return strncmp(prefix, str, strlen(prefix)) == 0; +} + +void parse_add_command_pmt(char *add_command) +{ + char *name = NULL; + char *type_name = NULL; + char *format_name = NULL; + unsigned int offset; + unsigned int lsb; + unsigned int msb; + unsigned int guid; + unsigned int domain_id; + enum counter_scope scope = 0; + enum pmt_datatype type = PMT_TYPE_RAW; + enum counter_format format = FORMAT_RAW; + bool has_offset = false; + bool has_lsb = false; + bool has_msb = false; + bool has_format = true; /* Format has a default value. */ + bool has_guid = false; + bool has_scope = false; + bool has_type = true; /* Type has a default value. */ + + /* Consume the "pmt," prefix. */ + add_command = strchr(add_command, ','); + if (!add_command) { + help(); + exit(1); + } + ++add_command; + + while (add_command) { + if (starts_with(add_command, "name=")) { + name = add_command + strlen("name="); + goto next; + } + + if (starts_with(add_command, "type=")) { + type_name = add_command + strlen("type="); + goto next; + } + + if (starts_with(add_command, "domain=")) { + const size_t prefix_len = strlen("domain="); + + if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) { + scope = SCOPE_CPU; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) { + scope = SCOPE_CORE; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) { + scope = SCOPE_PACKAGE; + has_scope = true; + } + + if (!has_scope) { + printf("%s: invalid value for scope. 
Expected cpu%%u, core%%u or package%%u.\n", + __func__); + exit(1); + } + + goto next; + } + + if (starts_with(add_command, "format=")) { + format_name = add_command + strlen("format="); + goto next; + } + + if (sscanf(add_command, "offset=%u", &offset) == 1) { + has_offset = true; + goto next; + } + + if (sscanf(add_command, "lsb=%u", &lsb) == 1) { + has_lsb = true; + goto next; + } + + if (sscanf(add_command, "msb=%u", &msb) == 1) { + has_msb = true; + goto next; + } + + if (sscanf(add_command, "guid=%x", &guid) == 1) { + has_guid = true; + goto next; + } + +next: + add_command = strchr(add_command, ','); + if (add_command) { + *add_command = '\0'; + add_command++; + } + } + + if (!name) { + printf("%s: missing %s\n", __func__, "name"); + exit(1); + } + + if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { + printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES); + exit(1); + } + + if (format_name) { + has_format = false; + + if (strcmp("raw", format_name) == 0) { + format = FORMAT_RAW; + has_format = true; + } + + if (strcmp("delta", format_name) == 0) { + format = FORMAT_DELTA; + has_format = true; + } + + if (!has_format) { + fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name); + exit(1); + } + } + + if (type_name) { + has_type = false; + + if (strcmp("raw", type_name) == 0) { + type = PMT_TYPE_RAW; + has_type = true; + } + + if (strcmp("txtal_time", type_name) == 0) { + type = PMT_TYPE_XTAL_TIME; + has_type = true; + } + + if (!has_type) { + printf("%s: invalid %s: %s\n", __func__, "type", type_name); + exit(1); + } + } + + if (!has_offset) { + printf("%s : missing %s\n", __func__, "offset"); + exit(1); + } + + if (!has_lsb) { + printf("%s: missing %s\n", __func__, "lsb"); + exit(1); + } + + if (!has_msb) { + printf("%s: missing %s\n", __func__, "msb"); + exit(1); + } + + if (!has_guid) { + printf("%s: missing %s\n", __func__, "guid"); + exit(1); + } + + if (!has_scope) { + printf("%s: missing %s\n", __func__, "scope"); + exit(1); + } + + if (lsb > msb) { + printf("%s: lsb > msb doesn't make sense\n", __func__); + exit(1); + } + + pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); +} + +void parse_add_command(char *add_command) +{ + if (strncmp(add_command, "pmt", strlen("pmt")) == 0) + return parse_add_command_pmt(add_command); + return parse_add_command_msr(add_command); +} + int is_deferred_add(char *name) { int i; diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh index 4ef1397927bb..8a5df3f22550 100755 --- a/tools/rcu/rcu-updaters.sh +++ b/tools/rcu/rcu-updaters.sh @@ -21,12 +21,10 @@ fi bpftrace -e 'kprobe:kvfree_call_rcu, kprobe:call_rcu, kprobe:call_rcu_tasks, - kprobe:call_rcu_tasks_rude, kprobe:call_rcu_tasks_trace, kprobe:call_srcu, kprobe:rcu_barrier, kprobe:rcu_barrier_tasks, - kprobe:rcu_barrier_tasks_rude, kprobe:rcu_barrier_tasks_trace, kprobe:srcu_barrier, kprobe:synchronize_rcu, diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph index 57d78f6df041..f14bdfedee8f 100755 --- a/tools/sound/dapm-graph +++ b/tools/sound/dapm-graph @@ -8,6 +8,8 @@ set -eu +STYLE_COMPONENT_ON="color=dodgerblue;style=bold" +STYLE_COMPONENT_OFF="color=gray40;style=filled;fillcolor=gray90" STYLE_NODE_ON="shape=box,style=bold,color=green4" STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" @@ -132,11 +134,17 @@ process_dapm_widget() # Collect any links. 
We could use "in" links or "out" links, # let's use "in" links if echo "${line}" | grep -q '^in '; then + local w_route=$(echo "$line" | awk -F\" '{print $2}') local w_src=$(echo "$line" | awk -F\" '{print $6 "_" $4}' | sed 's/^(null)_/ROOT_/') dbg_echo " - Input route from: ${w_src}" - echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + dbg_echo " - Route: ${w_route}" + local w_edge_attrs="" + if [ "${w_route}" != "static" ]; then + w_edge_attrs=" [label=\"${w_route}\"]" + fi + echo " \"${w_src}\" -> \"$w_tag\"${w_edge_attrs}" >> "${links_file}" fi done @@ -150,16 +158,20 @@ process_dapm_widget() # # $1 = temporary work dir # $2 = component directory -# $3 = forced component name (extracted for path if empty) +# $3 = "ROOT" for the root card directory, empty otherwise process_dapm_component() { local tmp_dir="${1}" local c_dir="${2}" local c_name="${3}" + local is_component=0 local dot_file="${tmp_dir}/main.dot" local links_file="${tmp_dir}/links.dot" + local c_attribs="" if [ -z "${c_name}" ]; then + is_component=1 + # Extract directory name into component name: # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" c_name="$(basename $(dirname "${c_dir}"))" @@ -167,11 +179,23 @@ process_dapm_component() dbg_echo " * Component: ${c_name}" - echo "" >> "${dot_file}" - echo " subgraph \"${c_name}\" {" >> "${dot_file}" - echo " cluster = true" >> "${dot_file}" - echo " label = \"${c_name}\"" >> "${dot_file}" - echo " color=dodgerblue" >> "${dot_file}" + if [ ${is_component} = 1 ]; then + if [ -f "${c_dir}/bias_level" ]; then + c_onoff=$(sed -n -e 1p "${c_dir}/bias_level" | awk '{print $1}') + dbg_echo " - bias_level: ${c_onoff}" + if [ "$c_onoff" = "On" ]; then + c_attribs="${STYLE_COMPONENT_ON}" + elif [ "$c_onoff" = "Off" ]; then + c_attribs="${STYLE_COMPONENT_OFF}" + fi + fi + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " ${c_attribs}" >> "${dot_file}" + fi # Create empty file to ensure it will exist in all cases >"${links_file}" @@ -181,7 +205,9 @@ process_dapm_component() process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" done - echo " }" >> "${dot_file}" + if [ ${is_component} = 1 ]; then + echo " }" >> "${dot_file}" + fi cat "${links_file}" >> "${dot_file}" } @@ -200,7 +226,7 @@ process_dapm_tree() echo "digraph G {" > "${dot_file}" echo " fontname=\"sans-serif\"" >> "${dot_file}" echo " node [fontname=\"sans-serif\"]" >> "${dot_file}" - + echo " edge [fontname=\"sans-serif\"]" >> "${dot_file}" # Process root directory (no component) process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" diff --git a/tools/spi/spidev_fdx.c b/tools/spi/spidev_fdx.c index 7d2a867cd4ae..bc9c4f6c3ba8 100644 --- a/tools/spi/spidev_fdx.c +++ b/tools/spi/spidev_fdx.c @@ -99,7 +99,7 @@ static void dumpstat(const char *name, int fd) return; } - printf("%s: spi mode 0x%x, %d bits %sper word, %d Hz max\n", + printf("%s: spi mode 0x%x, %d bits %sper word, %u Hz max\n", name, mode, bits, lsb ? 
"(lsb first) " : "", speed); } diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 030b388800f0..3d1ca9e38b1f 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,6 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat +ldflags-y += --wrap=cxl_setup_parent_dport DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index eaf091a3d331..129f179b0ac5 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -385,19 +385,21 @@ struct cxl_test_gen_media { struct cxl_test_gen_media gen_media = { .id = CXL_EVENT_GEN_MEDIA_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_gen_media), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_gen_media), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x2000), + .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, + .transaction_type = CXL_GMER_TRANS_HOST_WRITE, + /* .validity_flags = <set below> */ + .channel = 1, + .rank = 30, }, - .phys_addr = cpu_to_le64(0x2000), - .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, - .transaction_type = CXL_GMER_TRANS_HOST_WRITE, - /* .validity_flags = <set below> */ - .channel = 1, - .rank = 30 }, }; @@ -409,18 +411,20 @@ struct cxl_test_dram { struct cxl_test_dram dram = { .id = CXL_EVENT_DRAM_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_dram), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_dram), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x8000), + .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, + .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, + /* .validity_flags = <set below> */ + .channel = 1, }, - .phys_addr = cpu_to_le64(0x8000), - .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, - .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, - /* .validity_flags = <set below> */ - .channel = 1, .bank_group = 5, .bank = 2, .column = {0xDE, 0xAD}, @@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds, static void cxl_mock_add_event_logs(struct mock_event_store *mes) { put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK, - &gen_media.rec.validity_flags); + &gen_media.rec.media_hdr.validity_flags); put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN, - &dram.rec.validity_flags); + &dram.rec.media_hdr.validity_flags); mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed); mes_add_event(mes, CXL_EVENT_TYPE_INFO, @@ -1131,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds) return (count >= poison_inject_dev_max); } -static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) +static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) { 
+ /* Return EBUSY to match the CXL driver handling */ if (mock_poison_dev_max_injected(cxlds)) { dev_dbg(cxlds->dev, "Device poison injection limit has been reached: %d\n", - MOCK_INJECT_DEV_MAX); - return false; + poison_inject_dev_max); + return -EBUSY; } for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) { if (!mock_poison_list[i].cxlds) { mock_poison_list[i].cxlds = cxlds; mock_poison_list[i].dpa = dpa; - return true; + return 0; } } dev_dbg(cxlds->dev, "Mock test poison injection limit has been reached: %d\n", MOCK_INJECT_TEST_MAX); - return false; + return -ENXIO; } static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa) @@ -1175,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds, dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa); return 0; } - if (!mock_poison_add(cxlds, dpa)) - return -ENXIO; - return 0; + return mock_poison_add(cxlds, dpa); } static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa) diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6f737941dc0e..d619672faa49 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -299,6 +299,18 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); +void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (!ops || !ops->is_mock_port(dport->dport_dev)) + cxl_setup_parent_dport(host, dport); + + put_cxl_mock_ops(index); +} +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); MODULE_IMPORT_NS(CXL); diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 7254c110ff23..61931c4926fd 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -72,7 +72,8 @@ class LinuxSourceTreeOperations: raise ConfigError(e.output.decode()) def make(self, jobs: int, build_dir: str, make_options: Optional[List[str]]) -> None: - command = ['make', 'ARCH=' + self._linux_arch, 'O=' + build_dir, '--jobs=' + str(jobs)] + command = ['make', 'all', 'compile_commands.json', 'ARCH=' + self._linux_arch, + 'O=' + build_dir, '--jobs=' + str(jobs)] if make_options: command.extend(make_options) if self._cross_compile: diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 7527f738b4a1..d1acd7d58850 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -5,8 +5,8 @@ CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \ LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu TARGETS = main idr-test multiorder xarray maple -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \ - slab.o maple.o +LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o +CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS) OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ iteration_check_2.o benchmark.o diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c deleted file mode 100644 index 66ec4a24a203..000000000000 --- a/tools/testing/radix-tree/bitmap.c +++ /dev/null @@ -1,23 +0,0 @@ -/* lib/bitmap.c pulls in at least two other files. 
*/ - -#include <linux/bitmap.h> - -void bitmap_clear(unsigned long *map, unsigned int start, int len) -{ - unsigned long *p = map + BIT_WORD(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); - unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_LONG; - mask_to_clear = ~0UL; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_WORD_MASK(size); - *p &= ~mask_to_clear; - } -} diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bc8fe9e8f7f2..b38199965f99 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +TARGETS += acct TARGETS += alsa TARGETS += amd-pstate TARGETS += arm64 @@ -65,9 +66,11 @@ TARGETS += net/af_unix TARGETS += net/forwarding TARGETS += net/hsr TARGETS += net/mptcp +TARGETS += net/netfilter TARGETS += net/openvswitch +TARGETS += net/packetdrill +TARGETS += net/rds TARGETS += net/tcp_ao -TARGETS += net/netfilter TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd @@ -107,7 +110,6 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty TARGETS += uevent -TARGETS += user TARGETS += user_events TARGETS += vDSO TARGETS += mm diff --git a/tools/testing/selftests/acct/.gitignore b/tools/testing/selftests/acct/.gitignore new file mode 100644 index 000000000000..7e78aac19038 --- /dev/null +++ b/tools/testing/selftests/acct/.gitignore @@ -0,0 +1,3 @@ +acct_syscall +config +process_log
\ No newline at end of file diff --git a/tools/testing/selftests/acct/Makefile b/tools/testing/selftests/acct/Makefile new file mode 100644 index 000000000000..7e025099cf65 --- /dev/null +++ b/tools/testing/selftests/acct/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := acct_syscall +CFLAGS += -Wall + +include ../lib.mk
\ No newline at end of file diff --git a/tools/testing/selftests/acct/acct_syscall.c b/tools/testing/selftests/acct/acct_syscall.c new file mode 100644 index 000000000000..e44e8fe1f4a3 --- /dev/null +++ b/tools/testing/selftests/acct/acct_syscall.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* kselftest for acct() system call + * The acct() system call enables or disables process accounting. + */ + +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <sys/wait.h> + +#include "../kselftest.h" + +int main(void) +{ + char filename[] = "process_log"; + FILE *fp; + pid_t child_pid; + int sz; + + // Setting up kselftest framework + ksft_print_header(); + ksft_set_plan(1); + + // Check if test is run a root + if (geteuid()) { + ksft_test_result_skip("This test needs root to run!\n"); + return 1; + } + + // Create file to log closed processes + fp = fopen(filename, "w"); + + if (!fp) { + ksft_test_result_error("%s.\n", strerror(errno)); + ksft_finished(); + return 1; + } + + acct(filename); + + // Handle error conditions + if (errno) { + ksft_test_result_error("%s.\n", strerror(errno)); + fclose(fp); + ksft_finished(); + return 1; + } + + // Create child process and wait for it to terminate. + + child_pid = fork(); + + if (child_pid < 0) { + ksft_test_result_error("Creating a child process to log failed\n"); + acct(NULL); + return 1; + } else if (child_pid > 0) { + wait(NULL); + fseek(fp, 0L, SEEK_END); + sz = ftell(fp); + + acct(NULL); + + if (sz <= 0) { + ksft_test_result_fail("Terminated child process not logged\n"); + ksft_exit_fail(); + return 1; + } + + ksft_test_result_pass("Successfully logged terminated process.\n"); + fclose(fp); + ksft_exit_pass(); + return 0; + } + + return 1; +} diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index c1ce39874e2b..25be68025290 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -12,9 +12,9 @@ LDLIBS+=-lpthread OVERRIDE_TARGETS = 1 -TEST_GEN_PROGS := mixer-test pcm-test test-pcmtest-driver +TEST_GEN_PROGS := mixer-test pcm-test test-pcmtest-driver utimer-test -TEST_GEN_PROGS_EXTENDED := libatest.so +TEST_GEN_PROGS_EXTENDED := libatest.so global-timer TEST_FILES := conf.d pcm-test.conf diff --git a/tools/testing/selftests/alsa/global-timer.c b/tools/testing/selftests/alsa/global-timer.c new file mode 100644 index 000000000000..c15ec0ba851a --- /dev/null +++ b/tools/testing/selftests/alsa/global-timer.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This tool is used by the utimer test, and it allows us to + * count the ticks of a global timer in a certain time frame + * (which is set by `timeout` parameter). 
+ * + * Author: Ivan Orlov <ivan.orlov0322@gmail.com> + */ +#include <stdio.h> +#include <stdlib.h> +#include <alsa/asoundlib.h> +#include <time.h> + +static int ticked; +static void async_callback(snd_async_handler_t *ahandler) +{ + ticked++; +} + +static char timer_name[64]; +static void bind_to_timer(int device, int subdevice, int timeout) +{ + snd_timer_t *handle; + snd_timer_params_t *params; + snd_async_handler_t *ahandler; + + time_t end; + + sprintf(timer_name, "hw:CLASS=%d,SCLASS=%d,DEV=%d,SUBDEV=%d", + SND_TIMER_CLASS_GLOBAL, SND_TIMER_SCLASS_NONE, + device, subdevice); + + snd_timer_params_alloca(¶ms); + + if (snd_timer_open(&handle, timer_name, SND_TIMER_OPEN_NONBLOCK) < 0) { + perror("Can't open the timer"); + exit(EXIT_FAILURE); + } + + snd_timer_params_set_auto_start(params, 1); + snd_timer_params_set_ticks(params, 1); + if (snd_timer_params(handle, params) < 0) { + perror("Can't set timer params"); + exit(EXIT_FAILURE); + } + + if (snd_async_add_timer_handler(&ahandler, handle, async_callback, NULL) < 0) { + perror("Can't create a handler"); + exit(EXIT_FAILURE); + } + end = time(NULL) + timeout; + if (snd_timer_start(handle) < 0) { + perror("Failed to start the timer"); + exit(EXIT_FAILURE); + } + printf("Timer has started\n"); + while (time(NULL) <= end) { + /* + * Waiting for the timeout to elapse. Can't use sleep here, as it gets + * constantly interrupted by the signal from the timer (SIGIO) + */ + } + snd_timer_stop(handle); + snd_timer_close(handle); +} + +int main(int argc, char *argv[]) +{ + int device, subdevice, timeout; + + if (argc < 4) { + perror("Usage: %s <device> <subdevice> <timeout>"); + return EXIT_FAILURE; + } + + setlinebuf(stdout); + + device = atoi(argv[1]); + subdevice = atoi(argv[2]); + timeout = atoi(argv[3]); + + bind_to_timer(device, subdevice, timeout); + + printf("Total ticks count: %d\n", ticked); + + return EXIT_SUCCESS; +} diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c new file mode 100644 index 000000000000..32ee3ce57721 --- /dev/null +++ b/tools/testing/selftests/alsa/utimer-test.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This test covers the functionality of userspace-driven ALSA timers. Such timers + * are purely virtual (so they don't directly depend on the hardware), and they could be + * created and triggered by userspace applications. 
+ * + * Author: Ivan Orlov <ivan.orlov0322@gmail.com> + */ +#include "../kselftest_harness.h" +#include <sound/asound.h> +#include <unistd.h> +#include <fcntl.h> +#include <limits.h> +#include <sys/ioctl.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#define FRAME_RATE 8000 +#define PERIOD_SIZE 4410 +#define UTIMER_DEFAULT_ID -1 +#define UTIMER_DEFAULT_FD -1 +#define NANO 1000000000ULL +#define TICKS_COUNT 10 +#define TICKS_RECORDING_DELTA 5 +#define TIMER_OUTPUT_BUF_LEN 1024 +#define TIMER_FREQ_SEC 1 +#define RESULT_PREFIX_LEN strlen("Total ticks count: ") + +enum timer_app_event { + TIMER_APP_STARTED, + TIMER_APP_RESULT, + TIMER_NO_EVENT, +}; + +FIXTURE(timer_f) { + struct snd_timer_uinfo *utimer_info; +}; + +FIXTURE_SETUP(timer_f) { + int timer_dev_fd; + + if (geteuid()) + SKIP(return, "This test needs root to run!"); + + self->utimer_info = calloc(1, sizeof(*self->utimer_info)); + ASSERT_NE(NULL, self->utimer_info); + + /* Resolution is the time the period of frames takes in nanoseconds */ + self->utimer_info->resolution = (NANO / FRAME_RATE * PERIOD_SIZE); + + timer_dev_fd = open("/dev/snd/timer", O_RDONLY); + ASSERT_GE(timer_dev_fd, 0); + + ASSERT_EQ(ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, self->utimer_info), 0); + ASSERT_GE(self->utimer_info->fd, 0); + + close(timer_dev_fd); +} + +FIXTURE_TEARDOWN(timer_f) { + close(self->utimer_info->fd); + free(self->utimer_info); +} + +static void *ticking_func(void *data) +{ + int i; + int *fd = (int *)data; + + for (i = 0; i < TICKS_COUNT; i++) { + /* Well, trigger the timer! */ + ioctl(*fd, SNDRV_TIMER_IOCTL_TRIGGER, NULL); + sleep(TIMER_FREQ_SEC); + } + + return NULL; +} + +static enum timer_app_event parse_timer_output(const char *s) +{ + if (strstr(s, "Timer has started")) + return TIMER_APP_STARTED; + if (strstr(s, "Total ticks count")) + return TIMER_APP_RESULT; + + return TIMER_NO_EVENT; +} + +static int parse_timer_result(const char *s) +{ + char *end; + long d; + + d = strtol(s + RESULT_PREFIX_LEN, &end, 10); + if (end == s + RESULT_PREFIX_LEN) + return -1; + + return d; +} + +/* + * This test triggers the timer and counts ticks at the same time. The amount + * of the timer trigger calls should be equal to the amount of ticks received. 
+ */ +TEST_F(timer_f, utimer) { + char command[64]; + pthread_t ticking_thread; + int total_ticks = 0; + FILE *rfp; + char *buf = malloc(TIMER_OUTPUT_BUF_LEN); + + ASSERT_NE(buf, NULL); + + /* The timeout should be the ticks interval * count of ticks + some delta */ + sprintf(command, "./global-timer %d %d %d", SNDRV_TIMER_GLOBAL_UDRIVEN, + self->utimer_info->id, TICKS_COUNT * TIMER_FREQ_SEC + TICKS_RECORDING_DELTA); + + rfp = popen(command, "r"); + while (fgets(buf, TIMER_OUTPUT_BUF_LEN, rfp)) { + buf[TIMER_OUTPUT_BUF_LEN - 1] = 0; + switch (parse_timer_output(buf)) { + case TIMER_APP_STARTED: + /* global-timer waits for timer to trigger, so start the ticking thread */ + pthread_create(&ticking_thread, NULL, ticking_func, + &self->utimer_info->fd); + break; + case TIMER_APP_RESULT: + total_ticks = parse_timer_result(buf); + break; + case TIMER_NO_EVENT: + break; + } + } + pthread_join(ticking_thread, NULL); + ASSERT_EQ(total_ticks, TICKS_COUNT); + pclose(rfp); +} + +TEST(wrong_timers_test) { + int timer_dev_fd; + int utimer_fd; + size_t i; + struct snd_timer_uinfo wrong_timer = { + .resolution = 0, + .id = UTIMER_DEFAULT_ID, + .fd = UTIMER_DEFAULT_FD, + }; + + timer_dev_fd = open("/dev/snd/timer", O_RDONLY); + ASSERT_GE(timer_dev_fd, 0); + + utimer_fd = ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, &wrong_timer); + ASSERT_LT(utimer_fd, 0); + /* Check that id was not updated */ + ASSERT_EQ(wrong_timer.id, UTIMER_DEFAULT_ID); + + /* Test the NULL as an argument is processed correctly */ + ASSERT_LT(ioctl(timer_dev_fd, SNDRV_TIMER_IOCTL_CREATE, NULL), 0); + + close(timer_dev_fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index d8909b2b535a..f2d6007a2b98 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -156,6 +156,12 @@ static void pmull_sigill(void) asm volatile(".inst 0x0ee0e000" : : : ); } +static void poe_sigill(void) +{ + /* mrs x0, POR_EL0 */ + asm volatile("mrs x0, S3_3_C10_C2_4" : : : "x0"); +} + static void rng_sigill(void) { asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0"); @@ -602,6 +608,14 @@ static const struct hwcap_data { .sigill_fn = pmull_sigill, }, { + .name = "POE", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_POE, + .cpuinfo = "poe", + .sigill_fn = poe_sigill, + .sigill_reliable = true, + }, + { .name = "RNG", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_RNG, diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index 4c941270d8de..b51d21f78cf9 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -156,17 +156,17 @@ static void test_hw_debug(pid_t child, int type, const char *type_name) /* Zero is not currently architecturally valid */ ksft_test_result(arch, "%s_arch_set\n", type_name); } else { - ksft_test_result_skip("%s_arch_set\n"); + ksft_test_result_skip("%s_arch_set\n", type_name); } } static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_perror("PTRACE_TRACEME"); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_perror("raise(SIGSTOP)"); return EXIT_SUCCESS; } diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 1ce5b5eac386..b2f2bfd5c6aa 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ 
b/tools/testing/selftests/arm64/signal/.gitignore @@ -2,6 +2,7 @@ mangle_* fake_sigreturn_* fpmr_* +poe_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index 8f5febaf1a9a..edb3613513b8 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -23,7 +23,7 @@ $(TEST_GEN_PROGS): $(PROGS) # Common test-unit targets to build common-layout test-cases executables # Needs secondary expansion to properly include the testcase c-file in pre-reqs COMMON_SOURCES := test_signals.c test_signals_utils.c testcases/testcases.c \ - signals.S + signals.S sve_helpers.c COMMON_HEADERS := test_signals.h test_signals_utils.h testcases/testcases.h .SECONDEXPANSION: diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.c b/tools/testing/selftests/arm64/signal/sve_helpers.c new file mode 100644 index 000000000000..0acc121af306 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#include <stdbool.h> +#include <kselftest.h> +#include <asm/sigcontext.h> +#include <sys/prctl.h> + +unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls) +{ + int vq, vl; + int pr_set_vl = use_sme ? PR_SME_SET_VL : PR_SVE_SET_VL; + int len_mask = use_sme ? PR_SME_VL_LEN_MASK : PR_SVE_VL_LEN_MASK; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(pr_set_vl, vq * 16); + if (vl == -1) + return KSFT_FAIL; + + vl &= len_mask; + + /* + * Unlike SVE, SME does not require the minimum vector length + * to be implemented, or the VLs to be consecutive, so any call + * to the prctl might return the single implemented VL, which + * might be larger than 16. So to avoid this loop never + * terminating, bail out here when we find a higher VL than + * we asked for. + * See the ARM ARM, DDI 0487K.a, B1.4.2: I_QQRNR and I_NWYBP. + */ + if (vq < sve_vq_from_vl(vl)) + break; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + if (nvls < min_vls) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return KSFT_SKIP; + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h new file mode 100644 index 000000000000..50948ce471cc --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#ifndef __SVE_HELPERS_H__ +#define __SVE_HELPERS_H__ + +#include <stdbool.h> + +#define VLS_USE_SVE false +#define VLS_USE_SME true + +extern unsigned int vls[]; +extern unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls); + +#endif diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index ebd5815b54bb..dfd6a2badf9f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -6,44 +6,28 @@ * handler, this is not supported and is expected to segfault. 
*/ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static int fake_sigreturn_ssve_change_vl(struct tdescr *td, @@ -51,30 +35,30 @@ static int fake_sigreturn_ssve_change_vl(struct tdescr *td, { size_t resv_sz, offset; struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); - struct sve_context *sve; + struct za_context *za; /* Get a signal context with a SME ZA frame in it */ if (!get_current_context(td, &sf.uc, sizeof(sf.uc))) return 1; resv_sz = GET_SF_RESV_SIZE(sf); - head = get_header(head, SVE_MAGIC, resv_sz, &offset); + head = get_header(head, ZA_MAGIC, resv_sz, &offset); if (!head) { - fprintf(stderr, "No SVE context\n"); + fprintf(stderr, "No ZA context\n"); return 1; } - if (head->size != sizeof(struct sve_context)) { + if (head->size != sizeof(struct za_context)) { fprintf(stderr, "Register data present, aborting\n"); return 1; } - sve = (struct sve_context *)head; + za = (struct za_context *)head; /* No changes are supported; init left us at minimum VL so go to max */ fprintf(stderr, "Attempting to change VL from %d to %d\n", - sve->vl, vls[0]); - sve->vl = vls[0]; + za->vl, vls[0]); + za->vl = vls[0]; fake_sigreturn(&sf, sizeof(sf), 0); diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c index e2a452190511..e1ccf8f85a70 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c @@ -12,40 +12,22 @@ #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); + if (res == KSFT_SKIP) td->result = KSFT_SKIP; - return false; - } - return true; + return false; } static int fake_sigreturn_sve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c new file mode 100644 index 000000000000..36bd9940ee05 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: 
GPL-2.0 +/* + * Copyright (C) 2023 Arm Limited + * + * Verify that the POR_EL0 register context in signal frames is set up as + * expected. + */ + +#include <signal.h> +#include <ucontext.h> +#include <sys/auxv.h> +#include <sys/prctl.h> +#include <unistd.h> +#include <asm/sigcontext.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile( + "mrs %0, " SYS_POR_EL0 "\n" + : "=r"(val) + : + : ); + + return val; +} + +int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct poe_context *poe_ctx; + size_t offset; + bool in_sigframe; + bool have_poe; + __u64 orig_poe; + + have_poe = getauxval(AT_HWCAP2) & HWCAP2_POE; + if (have_poe) + orig_poe = get_por_el0(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + poe_ctx = (struct poe_context *) + get_header(head, POE_MAGIC, td->live_sz, &offset); + + in_sigframe = poe_ctx != NULL; + + fprintf(stderr, "POR_EL0 sigframe %s on system %s POE\n", + in_sigframe ? "present" : "absent", + have_poe ? "with" : "without"); + + td->pass = (in_sigframe == have_poe); + + /* + * Check that the value we read back was the one present at + * the time that the signal was triggered. + */ + if (have_poe && poe_ctx) { + if (poe_ctx->por_el0 != orig_poe) { + fprintf(stderr, "POR_EL0 in frame is %llx, was %llx\n", + poe_ctx->por_el0, orig_poe); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "POR_EL0", + .descr = "Validate that POR_EL0 is present as expected", + .timeout = 3, + .run = poe_present, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 3d37daafcff5..6dbe48cf8b09 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -6,51 +6,31 @@ * set up as expected. */ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_ssve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c index 9dc5f128bbc0..5557e116e973 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c @@ -6,51 +6,31 @@ * signal frames is set up as expected when enabled simultaneously. 
*/ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c index 8b16eabbb769..8143eb1c58c1 100644 --- a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c @@ -6,47 +6,31 @@ * expected. */ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_sve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 674b88cc8c39..e6daa94fcd2e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -6,29 +6,6 @@ #include "testcases.h" -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset) -{ - size_t offs = 0; - struct _aarch64_ctx *found = NULL; - - if (!head || resv_sz < HDR_SZ) - return found; - - while (offs <= resv_sz - HDR_SZ && - head->magic != magic && head->magic) { - offs += head->size; - head = GET_RESV_NEXT_HEAD(head); - } - if (head->magic == magic) { - found = head; - if (offset) - *offset = offs; - } - - return found; -} - bool validate_extra_context(struct extra_context *extra, char **err, void **extra_data, size_t *extra_size) { @@ -184,6 +161,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (head->size != sizeof(struct esr_context)) *err = "Bad size for esr_context"; break; + case POE_MAGIC: + if (head->size != sizeof(struct poe_context)) + *err = "Bad size for poe_context"; + break; case TPIDR2_MAGIC: if (head->size != sizeof(struct 
tpidr2_context)) *err = "Bad size for tpidr2_context"; diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 7727126347e0..9872b8912714 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -26,6 +26,9 @@ #define HDR_SZ \ sizeof(struct _aarch64_ctx) +#define GET_UC_RESV_HEAD(uc) \ + (struct _aarch64_ctx *)(&(uc->uc_mcontext.__reserved)) + #define GET_SF_RESV_HEAD(sf) \ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved) @@ -88,8 +91,29 @@ struct fake_sigframe { bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err); -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset); +static inline struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, + size_t resv_sz, size_t *offset) +{ + size_t offs = 0; + struct _aarch64_ctx *found = NULL; + + if (!head || resv_sz < HDR_SZ) + return found; + + while (offs <= resv_sz - HDR_SZ && + head->magic != magic && head->magic) { + offs += head->size; + head = GET_RESV_NEXT_HEAD(head); + } + if (head->magic == magic) { + found = head; + if (offset) + *offset = offs; + } + + return found; +} + static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head, size_t resv_sz, diff --git a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c index 4d6f94b6178f..ce26e9c2fa5e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c @@ -6,47 +6,31 @@ * expected. */ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index 174ad6656696..b9e13f27f1f9 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -6,51 +6,31 @@ * expected. 
*/ +#include <kselftest.h> #include <signal.h> #include <ucontext.h> #include <sys/prctl.h> #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_za_regs(void) diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 3c7c3e79aa93..901349da680f 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -1,6 +1,5 @@ bpf_cookie/multi_kprobe_attach_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3 bpf_cookie/multi_kprobe_link_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3 -fexit_sleep # The test never returns. The remaining tests cannot start. kprobe_multi_bench_attach # needs CONFIG_FPROBE kprobe_multi_test # needs CONFIG_FPROBE module_attach # prog 'kprobe_multi': failed to auto-attach: -95 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..81d4757ecd4c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp # Make sure we are able to include and link libbpf against c++. 
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h index 11ee801e75e7..6c3b4d4f173a 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -34,6 +34,12 @@ DECLARE_TRACE(bpf_testmod_test_write_bare, TP_ARGS(task, ctx) ); +/* Used in bpf_testmod_test_read() to test __nullable suffix */ +DECLARE_TRACE(bpf_testmod_test_nullable_bare, + TP_PROTO(struct bpf_testmod_test_read_ctx *ctx__nullable), + TP_ARGS(ctx__nullable) +); + #undef BPF_TESTMOD_DECLARE_TRACE #ifdef DECLARE_TRACE_WRITABLE #define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..22807fd78fe6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -356,6 +356,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, if (bpf_testmod_loop_test(101) > 100) trace_bpf_testmod_test_read(current, &ctx); + trace_bpf_testmod_test_nullable_bare(NULL); + /* Magic number to enable writable tp */ if (len == 64) { struct bpf_testmod_test_writable_ctx writable = { @@ -432,7 +434,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct testmod_uprobe { struct path path; - loff_t offset; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -446,25 +448,25 @@ static int testmod_register_uprobe(loff_t offset) { int err = -EBUSY; - if (uprobe.offset) + if (uprobe.uprobe) return -EBUSY; mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) + if (uprobe.uprobe) goto out; err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); if (err) goto out; - err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (err) + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); path_put(&uprobe.path); - else - uprobe.offset = offset; - + uprobe.uprobe = NULL; + } out: mutex_unlock(&testmod_uprobe_mutex); return err; @@ -474,10 +476,11 @@ static void testmod_unregister_uprobe(void) { mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) { - uprobe_unregister(d_real_inode(uprobe.path.dentry), - uprobe.offset, &uprobe.consumer); - uprobe.offset = 0; + if (uprobe.uprobe) { + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); + path_put(&uprobe.path); + uprobe.uprobe = NULL; } mutex_unlock(&testmod_uprobe_mutex); diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 00965a6e83bb..61de88cf4ad0 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3551,6 +3551,40 @@ static struct btf_raw_test raw_tests[] = { BTF_STR_SEC("\0x\0?.foo bar:buz"), }, { + .descr = "datasec: name with non-printable first char not is ok", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x 
*/ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC ?.data */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0\7foo"), + .err_str = "Invalid name", + .btf_load_err = true, +}, +{ + .descr = "datasec: name '\\0' is not ok", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC \0 */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0"), + .err_str = "Invalid name", + .btf_load_err = true, +}, +{ .descr = "type name '?foo' is not ok", .raw_types = { /* union ?foo; */ diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 7cfac53c0d58..b614a5272dfd 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -9,6 +9,7 @@ enum test_setup_type { SETUP_SYSCALL_SLEEP, SETUP_SKB_PROG, + SETUP_SKB_PROG_TP, }; static struct { @@ -28,6 +29,7 @@ static struct { {"test_dynptr_clone", SETUP_SKB_PROG}, {"test_dynptr_skb_no_buff", SETUP_SKB_PROG}, {"test_dynptr_skb_strcmp", SETUP_SKB_PROG}, + {"test_dynptr_skb_tp_btf", SETUP_SKB_PROG_TP}, }; static void verify_success(const char *prog_name, enum test_setup_type setup_type) @@ -35,7 +37,7 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ struct dynptr_success *skel; struct bpf_program *prog; struct bpf_link *link; - int err; + int err; skel = dynptr_success__open(); if (!ASSERT_OK_PTR(skel, "dynptr_success__open")) @@ -47,7 +49,7 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) goto cleanup; - bpf_program__set_autoload(prog, true); + bpf_program__set_autoload(prog, true); err = dynptr_success__load(skel); if (!ASSERT_OK(err, "dynptr_success__load")) @@ -87,6 +89,37 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ break; } + case SETUP_SKB_PROG_TP: + { + struct __sk_buff skb = {}; + struct bpf_object *obj; + int aux_prog_fd; + + /* Just use its test_run to trigger kfree_skb tracepoint */ + err = bpf_prog_test_load("./test_pkt_access.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &aux_prog_fd); + if (!ASSERT_OK(err, "prog_load sched cls")) + goto cleanup; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + ); + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto cleanup; + + err = bpf_prog_test_run_opts(aux_prog_fd, &topts); + bpf_link__destroy(link); + + if (!ASSERT_OK(err, "test_run")) + goto cleanup; + + break; + } } ASSERT_EQ(skel->bss->err, 0, "err"); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index f949647dbbc2..552a0875ca6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -21,13 +21,13 @@ static int do_sleep(void *skel) } #define STACK_SIZE (1024 * 1024) -static char child_stack[STACK_SIZE]; void test_fexit_sleep(void) { struct fexit_sleep_lskel *fexit_skel = NULL; int wstatus, duration = 0; pid_t cpid; + char 
*child_stack = NULL; int err, fexit_cnt; fexit_skel = fexit_sleep_lskel__open_and_load(); @@ -38,6 +38,11 @@ void test_fexit_sleep(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto cleanup; + child_stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (!ASSERT_NEQ(child_stack, MAP_FAILED, "mmap")) + goto cleanup; + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) goto cleanup; @@ -78,5 +83,6 @@ void test_fexit_sleep(void) goto cleanup; cleanup: + munmap(child_stack, STACK_SIZE); fexit_sleep_lskel__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..6b3078dd5645 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -378,8 +378,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -407,8 +407,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -436,8 +436,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 99, .tcp.dest = 9090, diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index 7d4a9b3d3722..e12255121c15 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -154,6 +154,51 @@ err_out: close(sfd); } +static void test_nonstandard_opt(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + struct bpf_link *getsockopt_link = NULL; + int sfd = -1, fd = -1, cfd = -1, flags; + socklen_t flagslen = sizeof(flags); + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + + fd = connect_to_fd(sfd, 0); + if (!ASSERT_GE(fd, 0, "connect_to_fd_server")) + goto err_out; + + /* cgroup/getsockopt prog will intercept getsockopt() below and + * retrieve the tcp socket bpf_sock_ops_cb_flags value for the + * accept()ed socket; this was set earlier in the passive established + * callback for the accept()ed socket via bpf_setsockopt(). 
+ */ + getsockopt_link = bpf_program__attach_cgroup(skel->progs._getsockopt, cg_fd); + if (!ASSERT_OK_PTR(getsockopt_link, "getsockopt prog")) + goto err_out; + + cfd = accept(sfd, NULL, 0); + if (!ASSERT_GE(cfd, 0, "accept")) + goto err_out; + + if (!ASSERT_OK(getsockopt(cfd, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, &flagslen), + "getsockopt_flags")) + goto err_out; + ASSERT_EQ(flags & BPF_SOCK_OPS_STATE_CB_FLAG, BPF_SOCK_OPS_STATE_CB_FLAG, + "cb_flags_set"); +err_out: + close(sfd); + if (fd != -1) + close(fd); + if (cfd != -1) + close(cfd); + bpf_link__destroy(getsockopt_link); +} + void test_setget_sockopt(void) { cg_fd = test__join_cgroup(CG_NAME); @@ -191,6 +236,8 @@ void test_setget_sockopt(void) test_udp(AF_INET); test_ktls(AF_INET6); test_ktls(AF_INET); + test_nonstandard_opt(AF_INET); + test_nonstandard_opt(AF_INET6); done: setget_sockopt__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1337153eb0ad..82bfb266741c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -451,11 +451,11 @@ out: #define MAX_EVENTS 10 static void test_sockmap_skb_verdict_shutdown(void) { + int n, err, map, verdict, c1 = -1, p1 = -1; struct epoll_event ev, events[MAX_EVENTS]; - int n, err, map, verdict, s, c1 = -1, p1 = -1; struct test_sockmap_pass_prog *skel; - int epollfd; int zero = 0; + int epollfd; char b; skel = test_sockmap_pass_prog__open_and_load(); @@ -469,10 +469,7 @@ static void test_sockmap_skb_verdict_shutdown(void) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (s < 0) - goto out; - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); if (err < 0) goto out; @@ -506,8 +503,8 @@ out: static void test_sockmap_skb_verdict_fionread(bool pass_prog) { + int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; - int err, map, verdict, s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; @@ -534,11 +531,8 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; - err = create_socket_pairs(s, AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); - if (!ASSERT_OK(err, "create_socket_pairs(s)")) + err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); @@ -570,16 +564,12 @@ out: static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, s, c1, p1, zero = 0, sent, recvd, avail; + int err, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - return; - - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); - if (!ASSERT_OK(err, "create_pairs(s)")) + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index e880f97bc44d..38e35c72bdaa 
100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -3,6 +3,9 @@ #include <linux/vm_sockets.h> +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + #define IO_TIMEOUT_SEC 30 #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 @@ -14,6 +17,17 @@ #define __always_unused __attribute__((__unused__)) +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + #define _FAIL(errnum, fmt...) \ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -179,6 +193,14 @@ __ret; \ }) +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + static inline int poll_connect(int fd, unsigned int timeout_sec) { struct timeval timeout = { .tv_sec = timeout_sec }; @@ -312,54 +334,6 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int create_pair(int s, int family, int sotype, int *c, int *p) -{ - struct sockaddr_storage addr; - socklen_t len; - int err = 0; - - len = sizeof(addr); - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - *c = xsocket(family, sotype, 0); - if (*c < 0) - return errno; - err = xconnect(*c, sockaddr(&addr), len); - if (err) { - err = errno; - goto close_cli0; - } - - *p = xaccept_nonblock(s, NULL, NULL); - if (*p < 0) { - err = errno; - goto close_cli0; - } - return err; -close_cli0: - close(*c); - return err; -} - -static inline int create_socket_pairs(int s, int family, int sotype, - int *c0, int *c1, int *p0, int *p1) -{ - int err; - - err = create_pair(s, family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(s, family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - return err; -} - static inline int enable_reuseport(int s, int progfd) { int err, one = 1; @@ -412,5 +386,84 @@ static inline int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + __close_fd int s, c = -1, p = -1; + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + return err; + + c = xsocket(family, sotype, 0); + if (c < 0) + return c; + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + return err; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + return err; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + return err; + + err = xconnect(s, sockaddr(&addr), len); + if (err) + return err; + + *p0 = take_fd(s); + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + return p; + + *p0 = take_fd(p); + break; + default: + FAIL("Unsupported socket type %#x", sotype); + return -EOPNOTSUPP; + } + + *p1 = take_fd(c); + return 0; +} + +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = 
create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} #endif // __SOCKMAP_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index e91b59366030..da5a6fb03b69 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -29,6 +29,8 @@ #include "sockmap_helpers.h" +#define NO_FLAGS 0 + static void test_insert_invalid(struct test_sockmap_listen *skel __always_unused, int family, int sotype, int mapfd) { @@ -675,7 +677,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { const char *log_prefix = redir_mode_str(mode); - int s, c0, c1, p0, p1; + int c0, c1, p0, p1; unsigned int pass; int err, n; u32 key; @@ -683,13 +685,10 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, zero_verdict_count(verd_mapfd); - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - return; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + return; err = add_to_sockmap(sock_mapfd, p0, p1); if (err) @@ -720,8 +719,6 @@ close: xclose(c1); xclose(p0); xclose(c0); -close_srv: - xclose(s); } static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, @@ -907,7 +904,7 @@ static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *sk static void redir_partial(int family, int sotype, int sock_map, int parser_map) { - int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; + int c0 = -1, c1 = -1, p0 = -1, p1 = -1; int err, n, key, value; char buf[] = "abc"; @@ -917,13 +914,10 @@ static void redir_partial(int family, int sotype, int sock_map, int parser_map) if (err) return; - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - goto clean_parser_map; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + goto clean_parser_map; err = add_to_sockmap(sock_map, p0, p1); if (err) @@ -942,8 +936,6 @@ close: xclose(p0); xclose(c1); xclose(p1); -close_srv: - xclose(s); clean_parser_map: key = 0; @@ -1376,7 +1368,8 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1, int sock_mapfd, int nop_mapfd, - int verd_mapfd, enum redir_mode mode) + int verd_mapfd, enum redir_mode mode, + int send_flags) { const char *log_prefix = redir_mode_str(mode); unsigned int pass; @@ -1396,12 +1389,11 @@ static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1, return; } - n = write(cli1, "a", 1); - if (n < 0) - FAIL_ERRNO("%s: write", log_prefix); - if (n == 0) - FAIL("%s: incomplete write", log_prefix); - if (n < 1) + /* Last byte is OOB data when send_flags has MSG_OOB bit set */ + n = xsend(cli1, "ab", 2, send_flags); + if (n >= 0 && n < 2) + FAIL("%s: incomplete send", log_prefix); + if (n < 2) return; key = SK_PASS; @@ -1416,6 +1408,25 @@ static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1, FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) FAIL("%s: incomplete recv", log_prefix); + + if (send_flags & MSG_OOB) { + /* Check that we can't read OOB while in sockmap */ + errno = 0; + n = recv(peer1, &b, 1, 
MSG_OOB | MSG_DONTWAIT); + if (n != -1 || errno != EOPNOTSUPP) + FAIL("%s: recv(MSG_OOB): expected EOPNOTSUPP: retval=%d errno=%d", + log_prefix, n, errno); + + /* Remove peer1 from sockmap */ + xbpf_map_delete_elem(sock_mapfd, &(int){ 1 }); + + /* Check that OOB was dropped on redirect */ + errno = 0; + n = recv(peer1, &b, 1, MSG_OOB | MSG_DONTWAIT); + if (n != -1 || errno != EINVAL) + FAIL("%s: recv(MSG_OOB): expected EINVAL: retval=%d errno=%d", + log_prefix, n, errno); + } } static void unix_redir_to_connected(int sotype, int sock_mapfd, @@ -1432,7 +1443,8 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd, goto close0; c1 = sfd[0], p1 = sfd[1]; - pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode); + pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, + mode, NO_FLAGS); xclose(c1); xclose(p1); @@ -1478,49 +1490,7 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma /* Returns two connected loopback vsock sockets */ static int vsock_socketpair_connectible(int sotype, int *v0, int *v1) { - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int s, p, c; - - s = socket_loopback(AF_VSOCK, sotype); - if (s < 0) - return -1; - - c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0); - if (c == -1) - goto close_srv; - - if (getsockname(s, sockaddr(&addr), &len) < 0) - goto close_cli; - - if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - goto close_cli; - } - - len = sizeof(addr); - p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC); - if (p < 0) - goto close_cli; - - if (poll_connect(c, IO_TIMEOUT_SEC) < 0) { - FAIL_ERRNO("poll_connect"); - goto close_acc; - } - - *v0 = p; - *v1 = c; - - return 0; - -close_acc: - close(p); -close_cli: - close(c); -close_srv: - close(s); - - return -1; + return create_pair(AF_VSOCK, sotype | SOCK_NONBLOCK, v0, v1); } static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd, @@ -1669,44 +1639,7 @@ static void test_reuseport(struct test_sockmap_listen *skel, static int inet_socketpair(int family, int type, int *s, int *c) { - struct sockaddr_storage addr; - socklen_t len; - int p0, c0; - int err; - - p0 = socket_loopback(family, type | SOCK_NONBLOCK); - if (p0 < 0) - return p0; - - len = sizeof(addr); - err = xgetsockname(p0, sockaddr(&addr), &len); - if (err) - goto close_peer0; - - c0 = xsocket(family, type | SOCK_NONBLOCK, 0); - if (c0 < 0) { - err = c0; - goto close_peer0; - } - err = xconnect(c0, sockaddr(&addr), len); - if (err) - goto close_cli0; - err = xgetsockname(c0, sockaddr(&addr), &len); - if (err) - goto close_cli0; - err = xconnect(p0, sockaddr(&addr), len); - if (err) - goto close_cli0; - - *s = p0; - *c = c0; - return 0; - -close_cli0: - xclose(c0); -close_peer0: - xclose(p0); - return err; + return create_pair(family, type | SOCK_NONBLOCK, s, c); } static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, @@ -1722,7 +1655,8 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, if (err) goto close_cli0; - pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode); + pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, + mode, NO_FLAGS); xclose(c1); xclose(p1); @@ -1772,15 +1706,16 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; int err; - if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + if (socketpair(AF_UNIX, type | SOCK_NONBLOCK, 0, sfd)) 
return; c0 = sfd[0], p0 = sfd[1]; - err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); + err = inet_socketpair(family, type, &p1, &c1); if (err) goto close; - pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode); + pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, + mode, NO_FLAGS); xclose(c1); xclose(p1); @@ -1815,32 +1750,30 @@ static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } -static void unix_inet_redir_to_connected(int family, int type, - int sock_mapfd, int nop_mapfd, - int verd_mapfd, - enum redir_mode mode) +static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, + int nop_mapfd, int verd_mapfd, + enum redir_mode mode, int send_flags) { int c0, c1, p0, p1; int sfd[2]; int err; - err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); + err = inet_socketpair(family, type, &p0, &c0); if (err) return; - if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + if (socketpair(AF_UNIX, type | SOCK_NONBLOCK, 0, sfd)) goto close_cli0; c1 = sfd[0], p1 = sfd[1]; - pairs_redir_to_connected(c0, p0, c1, p1, - sock_mapfd, nop_mapfd, verd_mapfd, mode); + pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, nop_mapfd, + verd_mapfd, mode, send_flags); xclose(c1); xclose(p1); close_cli0: xclose(c0); xclose(p0); - } static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, @@ -1859,31 +1792,42 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, skel->bss->test_ingress = false; unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, -1, verdict_map, - REDIR_EGRESS); - unix_inet_redir_to_connected(family, SOCK_DGRAM, + REDIR_EGRESS, NO_FLAGS); + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, -1, verdict_map, - REDIR_EGRESS); + REDIR_EGRESS, NO_FLAGS); unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, nop_map, verdict_map, - REDIR_EGRESS); + REDIR_EGRESS, NO_FLAGS); + unix_inet_redir_to_connected(family, SOCK_STREAM, + sock_map, nop_map, verdict_map, + REDIR_EGRESS, NO_FLAGS); + + /* MSG_OOB not supported by AF_UNIX SOCK_DGRAM */ unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, nop_map, verdict_map, - REDIR_EGRESS); + REDIR_EGRESS, MSG_OOB); + skel->bss->test_ingress = true; unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, -1, verdict_map, - REDIR_INGRESS); + REDIR_INGRESS, NO_FLAGS); unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, -1, verdict_map, - REDIR_INGRESS); + REDIR_INGRESS, NO_FLAGS); unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, nop_map, verdict_map, - REDIR_INGRESS); + REDIR_INGRESS, NO_FLAGS); + unix_inet_redir_to_connected(family, SOCK_STREAM, + sock_map, nop_map, verdict_map, + REDIR_INGRESS, NO_FLAGS); + + /* MSG_OOB not supported by AF_UNIX SOCK_DGRAM */ unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, nop_map, verdict_map, - REDIR_INGRESS); + REDIR_INGRESS, MSG_OOB); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } diff --git a/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c b/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c new file mode 100644 index 000000000000..accc42e01f8a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "test_tp_btf_nullable.skel.h" + +void test_tp_btf_nullable(void) +{ + if (!env.has_testmod) { + test__skip(); + return; + } + + 
RUN_TESTS(test_tp_btf_nullable); +} diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index bd8c75b620c2..c397336fe1ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -216,7 +216,7 @@ static void test_uretprobe_regs_change(void) } #ifndef __NR_uretprobe -#define __NR_uretprobe 467 +#define __NR_uretprobe 335 #endif __naked unsigned long uretprobe_syscall_call_1(void) @@ -253,7 +253,7 @@ static void test_uretprobe_syscall_call(void) struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c; - if (ASSERT_OK(pipe(go), "pipe")) + if (!ASSERT_OK(pipe(go), "pipe")) return; skel = uprobe_syscall_executed__open_and_load(); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index f76b5d67a3ee..c87ee2bf558c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -68,7 +68,8 @@ static int open_xsk(int ifindex, struct xsk *xsk) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM, + .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM | + XDP_UMEM_TX_METADATA_LEN, .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c index ba97165bdb28..a657651eba52 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c @@ -14,9 +14,9 @@ typedef int *ptr_arr_t[6]; typedef int *ptr_multiarr_t[7][8][9][10]; -typedef int * (*fn_ptr_arr_t[11])(); +typedef int * (*fn_ptr_arr_t[11])(void); -typedef int * (*fn_ptr_multiarr_t[12][13])(); +typedef int * (*fn_ptr_multiarr_t[12][13])(void); struct root_struct { arr_t _1; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index ad21ee8c7e23..29d01fff32bd 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -100,7 +100,7 @@ typedef void (*printf_fn_t)(const char *, ...); * `int -> char *` function and returns pointer to a char. 
Equivalent: * typedef char * (*fn_input_t)(int); * typedef char * (*fn_output_outer_t)(fn_input_t); - * typedef const fn_output_outer_t (* fn_output_inner_t)(); + * typedef const fn_output_outer_t (* fn_output_inner_t)(void); * typedef const fn_output_inner_t fn_ptr_arr2_t[5]; */ /* ----- START-EXPECTED-OUTPUT ----- */ @@ -127,7 +127,7 @@ typedef void (* (*signal_t)(int, void (*)(int)))(int); typedef char * (*fn_ptr_arr1_t[10])(int **); -typedef char * (* (* const fn_ptr_arr2_t[5])())(char * (*)(int)); +typedef char * (* (* const fn_ptr_arr2_t[5])(void))(char * (*)(int)); struct struct_w_typedefs { int_t a; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index e35bc1eac52a..c3bc186af21e 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -6,6 +6,7 @@ #include <stdbool.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include <linux/if_ether.h> #include "bpf_misc.h" #include "bpf_kfuncs.h" @@ -1254,6 +1255,30 @@ int skb_invalid_ctx(void *ctx) return 0; } +SEC("fentry/skb_tx_error") +__failure __msg("must be referenced or trusted") +int BPF_PROG(skb_invalid_ctx_fentry, void *skb) +{ + struct bpf_dynptr ptr; + + /* this should fail */ + bpf_dynptr_from_skb(skb, 0, &ptr); + + return 0; +} + +SEC("fexit/skb_tx_error") +__failure __msg("must be referenced or trusted") +int BPF_PROG(skb_invalid_ctx_fexit, void *skb) +{ + struct bpf_dynptr ptr; + + /* this should fail */ + bpf_dynptr_from_skb(skb, 0, &ptr); + + return 0; +} + /* Reject writes to dynptr slot for uninit arg */ SEC("?raw_tp") __failure __msg("potential write to dynptr at off=-16") diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 5985920d162e..bfcc85686cf0 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -5,6 +5,7 @@ #include <stdbool.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include "bpf_misc.h" #include "bpf_kfuncs.h" #include "errno.h" @@ -544,3 +545,25 @@ int test_dynptr_skb_strcmp(struct __sk_buff *skb) return 1; } + +SEC("tp_btf/kfree_skb") +int BPF_PROG(test_dynptr_skb_tp_btf, void *skb, void *location) +{ + __u8 write_data[2] = {1, 2}; + struct bpf_dynptr ptr; + int ret; + + if (bpf_dynptr_from_skb(skb, 0, &ptr)) { + err = 1; + return 1; + } + + /* since tp_btf skbs are read only, writes should fail */ + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); + if (ret != -EINVAL) { + err = 2; + return 1; + } + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 16bdc3e25591..ef70b88bccb2 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1432,4 +1432,58 @@ int iter_arr_with_actual_elem_count(const void *ctx) return sum; } +__u32 upper, select_n, result; +__u64 global; + +static __noinline bool nest_2(char *str) +{ + /* some insns (including branch insns) to ensure stacksafe() is triggered + * in nest_2(). This way, stacksafe() can compare frame associated with nest_1(). 
+ */ + if (str[0] == 't') + return true; + if (str[1] == 'e') + return true; + if (str[2] == 's') + return true; + if (str[3] == 't') + return true; + return false; +} + +static __noinline bool nest_1(int n) +{ + /* case 0: allocate stack, case 1: no allocate stack */ + switch (n) { + case 0: { + char comm[16]; + + if (bpf_get_current_comm(comm, 16)) + return false; + return nest_2(comm); + } + case 1: + return nest_2((char *)&global); + default: + return false; + } +} + +SEC("raw_tp") +__success +int iter_subprog_check_stacksafe(const void *ctx) +{ + long i; + + bpf_for(i, 0, upper) { + if (!nest_1(select_n)) { + result = 1; + return 0; + } + } + + result = 2; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 60518aed1ffc..6dd4318debbf 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -59,6 +59,8 @@ static const struct sockopt_test sol_tcp_tests[] = { { .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, }, { .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, }, { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, }, + { .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS, + .expected = BPF_SOCK_OPS_ALL_CB_FLAGS, }, { .opt = 0, }, }; @@ -353,11 +355,30 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, return 1; } +SEC("cgroup/getsockopt") +int _getsockopt(struct bpf_sockopt *ctx) +{ + struct bpf_sock *sk = ctx->sk; + int *optval = ctx->optval; + struct tcp_sock *tp; + + if (!sk || ctx->level != SOL_TCP || ctx->optname != TCP_BPF_SOCK_OPS_CB_FLAGS) + return 1; + + tp = bpf_core_cast(sk, struct tcp_sock); + if (ctx->optval + sizeof(int) <= ctx->optval_end) { + *optval = tp->bpf_sock_ops_cb_flags; + ctx->retval = 0; + } + return 1; +} + SEC("sockops") int skops_sockopt(struct bpf_sock_ops *skops) { struct bpf_sock *bpf_sk = skops->sk; struct sock *sk; + int flags; if (!bpf_sk) return 1; @@ -384,9 +405,8 @@ int skops_sockopt(struct bpf_sock_ops *skops) nr_passive += !(bpf_test_sockopt(skops, sk) || test_tcp_maxseg(skops, sk) || test_tcp_saved_syn(skops, sk)); - bpf_sock_ops_cb_flags_set(skops, - skops->bpf_sock_ops_cb_flags | - BPF_SOCK_OPS_STATE_CB_FLAG); + flags = skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_STATE_CB_FLAG; + bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, sizeof(flags)); break; case BPF_SOCK_OPS_STATE_CB: if (skops->args[1] == BPF_TCP_CLOSE_WAIT) diff --git a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c index 44ee0d037f95..eb5cca1fce16 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c @@ -486,17 +486,10 @@ static int tcp_validate_cookie(struct tcp_syncookie *ctx) goto err; mssind = (cookie & (3 << 6)) >> 6; - if (ctx->ipv4) { - if (mssind > ARRAY_SIZE(msstab4)) - goto err; - + if (ctx->ipv4) ctx->attrs.mss = msstab4[mssind]; - } else { - if (mssind > ARRAY_SIZE(msstab6)) - goto err; - + else ctx->attrs.mss = msstab6[mssind]; - } ctx->attrs.snd_wscale = cookie & BPF_SYNCOOKIE_WSCALE_MASK; ctx->attrs.rcv_wscale = ctx->attrs.snd_wscale; diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c new file mode 100644 index 000000000000..bba3e37f749b --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "../bpf_testmod/bpf_testmod.h" +#include "bpf_misc.h" + +SEC("tp_btf/bpf_testmod_test_nullable_bare") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) +{ + return nullable_ctx->len; +} + +SEC("tp_btf/bpf_testmod_test_nullable_bare") +int BPF_PROG(handle_tp_btf_nullable_bare2, struct bpf_testmod_test_read_ctx *nullable_ctx) +{ + if (nullable_ctx) + return nullable_ctx->len; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8144fd145237..1ee0ef114f9d 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -324,6 +324,25 @@ out: return zc_avail; } +#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" +static unsigned int get_max_skb_frags(void) +{ + unsigned int max_skb_frags = 0; + FILE *file; + + file = fopen(MAX_SKB_FRAGS_PATH, "r"); + if (!file) { + ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); + return 0; + } + + if (fscanf(file, "%u", &max_skb_frags) != 1) + ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); + + fclose(file); + return max_skb_frags; +} + static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"busy-poll", no_argument, 0, 'b'}, @@ -2244,13 +2263,24 @@ static int testapp_poll_rxq_tmout(struct test_spec *test) static int testapp_too_many_frags(struct test_spec *test) { - struct pkt pkts[2 * XSK_DESC__MAX_SKB_FRAGS + 2] = {}; + struct pkt *pkts; u32 max_frags, i; + int ret; - if (test->mode == TEST_MODE_ZC) + if (test->mode == TEST_MODE_ZC) { max_frags = test->ifobj_tx->xdp_zc_max_segs; - else - max_frags = XSK_DESC__MAX_SKB_FRAGS; + } else { + max_frags = get_max_skb_frags(); + if (!max_frags) { + ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n"); + max_frags = 17; + } + max_frags += 1; + } + + pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); + if (!pkts) + return TEST_FAILURE; test->mtu = MAX_ETH_JUMBO_SIZE; @@ -2280,7 +2310,10 @@ static int testapp_too_many_frags(struct test_spec *test) pkts[2 * max_frags + 1].valid = true; pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); - return testapp_validate_traffic(test); + ret = testapp_validate_traffic(test); + + free(pkts); + return ret; } static int xsk_load_xdp_programs(struct ifobject *ifobj) diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 885c948c5d83..e46e823f6a1a 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -55,7 +55,6 @@ #define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) #define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) #define XSK_DESC__INVALID_OPTION (0xffff) -#define XSK_DESC__MAX_SKB_FRAGS 18 #define HUGEPAGE_SIZE (2 * 1024 * 1024) #define PKT_DUMP_NB_TO_PRINT 16 #define RUN_ALL_TESTS UINT_MAX diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7c08cc153367..03c1bdaed2c3 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -84,6 +84,20 @@ echo member > test/cpuset.cpus.partition echo "" > test/cpuset.cpus [[ 
$RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!" +# +# If isolated CPUs have been reserved at boot time (as shown in +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# that will be used by this script for testing purpose. If not, some of +# the tests may fail incorrectly. These isolated CPUs will also be removed +# before being compared with the expected results. +# +BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +if [[ -n "$BOOT_ISOLCPUS" ]] +then + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" + echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" +fi cleanup() { online_cpus @@ -321,7 +335,7 @@ TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # - # Incorrect change to cpuset.cpus invalidates partition root + # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. @@ -365,6 +379,16 @@ TEST_MATRIX=( # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # Child partition root that try to take all CPUs from parent partition + # with tasks will remain invalid. + " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1,A2:1-4 A1:P1,A2:P1" + " C1-4:P1:S+ P1 . . T C1-4 . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + + # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't + # affect cpuset.cpus.exclusive.effective. + " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4,XA2:3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -632,7 +656,8 @@ check_cgroup_states() # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may # not be included in the cpuset.cpus.isolated control file which contains -# only CPUs in isolated partitions. +# only CPUs in isolated partitions as well as those that are isolated at +# boot time. 
# # $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>} # <isolcpus1> - expected sched/domains value # @@ -659,18 +684,21 @@ check_isolcpus() fi # - # Check the debug isolated cpumask, if present + # Check cpuset.cpus.isolated cpumask # - [[ -f $ISCPUS ]] && { + if [[ -z "$BOOT_ISOLCPUS" ]] + then + ISOLCPUS=$(cat $ISCPUS) + else + ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + fi + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 ISOLCPUS=$(cat $ISCPUS) - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { - # Take a 50ms pause and try again - pause 0.05 - ISOLCPUS=$(cat $ISCPUS) - } - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 - ISOLCPUS= } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= # # Use the sched domain in debugfs to check isolated CPUs, if available @@ -703,6 +731,9 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + [[ -n "$BOOT_ISOLCPUS" ]] && + ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -720,7 +751,8 @@ test_fail() } # -# Check to see if there are unexpected isolated CPUs left +# Check to see if there are unexpected isolated CPUs left beyond the boot +# time isolated ones. # null_isolcpus_check() { diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh new file mode 100755 index 000000000000..42a6628fb8bc --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Basic test for cpuset v1 interfaces write/read +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +write_test() { + dir=$1 + interface=$2 + value=$3 + original=$(cat $dir/$interface) + echo "testing $interface $value" + echo $value > $dir/$interface + new=$(cat $dir/$interface) + [[ $value -ne $(cat $dir/$interface) ]] && { + echo "$interface write $value failed: new:$new" + exit 1 + } +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" 
+ +# +# Create a test cpuset, read write test +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR + +ITF_MATRIX=( + #interface value expect root_only + 'cpuset.cpus 0-1 0-1 0' + 'cpuset.mem_exclusive 1 1 0' + 'cpuset.mem_exclusive 0 0 0' + 'cpuset.mem_hardwall 1 1 0' + 'cpuset.mem_hardwall 0 0 0' + 'cpuset.memory_migrate 1 1 0' + 'cpuset.memory_migrate 0 0 0' + 'cpuset.memory_spread_page 1 1 0' + 'cpuset.memory_spread_page 0 0 0' + 'cpuset.memory_spread_slab 1 1 0' + 'cpuset.memory_spread_slab 0 0 0' + 'cpuset.mems 0 0 0' + 'cpuset.sched_load_balance 1 1 0' + 'cpuset.sched_load_balance 0 0 0' + 'cpuset.sched_relax_domain_level 2 2 0' + 'cpuset.memory_pressure_enabled 1 1 1' + 'cpuset.memory_pressure_enabled 0 0 1' +) + +run_test() +{ + cnt="${ITF_MATRIX[@]}" + for i in "${ITF_MATRIX[@]}" ; do + args=($i) + root_only=${args[3]} + [[ $root_only -eq 1 ]] && { + write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}" + continue + } + write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}" + done +} + +run_test +rmdir $CPUSET/$TDIR +echo "Test PASSED" +exit 0 diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile index ce262d097269..8e99f87f5d7c 100644 --- a/tools/testing/selftests/core/Makefile +++ b/tools/testing/selftests/core/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := close_range_test +TEST_GEN_PROGS := close_range_test unshare_test include ../lib.mk diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 991c473e3859..e0d9851fe1c9 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -26,6 +26,10 @@ #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) #endif +#ifndef F_CREATED_QUERY +#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -589,4 +593,74 @@ TEST(close_range_cloexec_unshare_syzbot) EXPECT_EQ(close(fd3), 0); } +TEST(close_range_bitmap_corruption) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* get the first 128 descriptors open */ + for (int i = 2; i < 128; i++) + EXPECT_GE(dup2(0, i), 0); + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* unshare and truncate descriptor table down to 64 */ + if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE)) + exit(EXIT_FAILURE); + + ASSERT_EQ(fcntl(64, F_GETFD), -1); + /* ... and verify that the range 64..127 is not + stuck "fully used" according to secondary bitmap */ + EXPECT_EQ(dup(0), 64) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(fcntl_created) +{ + for (int i = 0; i < 101; i++) { + int fd; + char path[PATH_MAX]; + + fd = open("/dev/null", O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + SKIP(return, + "Skipping test since /dev/null does not exist"); + } + + /* We didn't create "/dev/null". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + + sprintf(path, "aaaa_%d", i); + fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0600); + ASSERT_GE(fd, 0); + + /* We created "aaaa_%d". 
*/ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 1); + close(fd); + + fd = open(path, O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0); + + /* We're opening it again, so no positive creation check. */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + unlink(path); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/core/unshare_test.c b/tools/testing/selftests/core/unshare_test.c new file mode 100644 index 000000000000..7fec9dfb1b0e --- /dev/null +++ b/tools/testing/selftests/core/unshare_test.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <linux/kernel.h> +#include <limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <unistd.h> +#include <sys/resource.h> +#include <linux/close_range.h> + +#include "../kselftest_harness.h" +#include "../clone3/clone3_selftests.h" + +TEST(unshare_EMFILE) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + int fd; + ssize_t n, n2; + static char buf[512], buf2[512]; + struct rlimit rlimit; + int nr_open; + + fd = open("/proc/sys/fs/nr_open", O_RDWR); + ASSERT_GE(fd, 0); + + n = read(fd, buf, sizeof(buf)); + ASSERT_GT(n, 0); + ASSERT_EQ(buf[n - 1], '\n'); + + ASSERT_EQ(sscanf(buf, "%d", &nr_open), 1); + + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); + + /* bump fs.nr_open */ + n2 = sprintf(buf2, "%d\n", nr_open + 1024); + lseek(fd, 0, SEEK_SET); + write(fd, buf2, n2); + + /* bump ulimit -n */ + rlimit.rlim_cur = nr_open + 1024; + rlimit.rlim_max = nr_open + 1024; + EXPECT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get a descriptor past the old fs.nr_open */ + EXPECT_GE(dup2(2, nr_open + 64), 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + EXPECT_GE(pid, 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + if (pid == 0) { + int err; + + /* restore fs.nr_open */ + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + /* ... and now unshare(CLONE_FILES) must fail with EMFILE */ + err = unshare(CLONE_FILES); + EXPECT_EQ(err, -1) + exit(EXIT_FAILURE); + EXPECT_EQ(errno, EMFILE) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index a8b1dbc0a3a5..e350c521b467 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -231,6 +231,21 @@ do_suspend() for i in `seq 1 $2`; do printf "Starting $1\n" + + if [ "$3" = "rtc" ]; then + if ! command -v rtcwake &> /dev/null; then + printf "rtcwake could not be found, please install it.\n" + return 1 + fi + + rtcwake -m $filename -s 15 + + if [ $? 
-ne 0 ]; then + printf "Failed to suspend using RTC wake alarm\n" + return 1 + fi + fi + echo $filename > $SYSFS/power/state printf "Came out of $1\n" diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index a0eb84cf7167..f12ff7416e41 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -24,6 +24,8 @@ helpme() [-t <basic: Basic cpufreq testing suspend: suspend/resume, hibernate: hibernate/resume, + suspend_rtc: suspend/resume back using the RTC wakeup alarm, + hibernate_rtc: hibernate/resume back using the RTC wakeup alarm, modtest: test driver or governor modules. Only to be used with -d or -g options, sptest1: Simple governor switch to produce lockdep. sptest2: Concurrent governor switch to produce lockdep. @@ -76,7 +78,8 @@ parse_arguments() helpme ;; - t) # --func_type (Function to perform: basic, suspend, hibernate, modtest, sptest1/2/3/4 (default: basic)) + t) # --func_type (Function to perform: basic, suspend, hibernate, + # suspend_rtc, hibernate_rtc, modtest, sptest1/2/3/4 (default: basic)) FUNC=$OPTARG ;; @@ -121,6 +124,14 @@ do_test() do_suspend "hibernate" 1 ;; + "suspend_rtc") + do_suspend "suspend" 1 rtc + ;; + + "hibernate_rtc") + do_suspend "hibernate" 1 rtc + ;; + "modtest") # Do we have modules in place? if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 5f541522364f..5d0a809dc2df 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -29,9 +29,11 @@ static int check_vgem(int fd) version.name = name; ret = ioctl(fd, DRM_IOCTL_VERSION, &version); - if (ret) + if (ret || version.name_len != 4) return 0; + name[4] = '\0'; + return !strcmp(name, "vgem"); } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index e54f382bcb02..39fb97a8c1df 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -1,8 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_INCLUDES := $(wildcard lib/py/*.py) +TEST_INCLUDES := $(wildcard lib/py/*.py) \ + ../../net/net_helper.sh \ + ../../net/lib.sh \ TEST_PROGS := \ + netcons_basic.sh \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index f6a58ce8a230..a2d8af60876d 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -1,2 +1,6 @@ CONFIG_IPV6=y CONFIG_NETDEVSIM=m +CONFIG_CONFIGFS_FS=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py index 026d98976c35..05b6fbb3fcdd 100755 --- a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno import time import os from lib.py import ksft_run, ksft_exit, ksft_pr @@ -61,7 +62,7 @@ def test_pp_alloc(cfg, netdevnl): try: stats = get_stats() except NlError as e: - if e.nl_msg.error == -95: + if e.nl_msg.error == -errno.EOPNOTSUPP: stats = {} else: raise diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 
931dbc36ca43..9d7adb3cf33b 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -3,7 +3,7 @@ import datetime import random -from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ge, ksft_lt +from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ne, ksft_ge, ksft_lt from lib.py import NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily from lib.py import KsftSkipEx @@ -19,6 +19,15 @@ def _rss_key_rand(length): return [random.randint(0, 255) for _ in range(length)] +def _rss_key_check(cfg, data=None, context=0): + if data is None: + data = get_rss(cfg, context=context) + if 'rss-hash-key' not in data: + return + non_zero = [x for x in data['rss-hash-key'] if x != 0] + ksft_eq(bool(non_zero), True, comment=f"RSS key is all zero {data['rss-hash-key']}") + + def get_rss(cfg, context=0): return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0] @@ -81,17 +90,18 @@ def _send_traffic_check(cfg, port, name, params): ksft_ge(directed, 20000, f"traffic on {name}: " + str(cnts)) if params.get('noise'): ksft_lt(sum(cnts[i] for i in params['noise']), directed / 2, - "traffic on other queues:" + str(cnts)) + f"traffic on other queues ({name}): " + str(cnts)) if params.get('empty'): ksft_eq(sum(cnts[i] for i in params['empty']), 0, - "traffic on inactive queues: " + str(cnts)) + f"traffic on inactive queues ({name}): " + str(cnts)) def test_rss_key_indir(cfg): """Test basics like updating the main RSS key and indirection table.""" - if len(_get_rx_cnts(cfg)) < 2: - KsftSkipEx("Device has only one queue (or doesn't support queue stats)") + qcnt = len(_get_rx_cnts(cfg)) + if qcnt < 3: + KsftSkipEx("Device has fewer than 3 queues (or doesn't support queue stats)") data = get_rss(cfg) want_keys = ['rss-hash-key', 'rss-hash-function', 'rss-indirection-table'] @@ -101,6 +111,7 @@ def test_rss_key_indir(cfg): if not data[k]: raise KsftFailEx(f"ethtool results empty for '{k}': {data[k]}") + _rss_key_check(cfg, data=data) key_len = len(data['rss-hash-key']) # Set the key @@ -110,9 +121,26 @@ def test_rss_key_indir(cfg): data = get_rss(cfg) ksft_eq(key, data['rss-hash-key']) + # Set the indirection table and the key together + key = _rss_key_rand(key_len) + ethtool(f"-X {cfg.ifname} equal 3 hkey " + _rss_key_str(key)) + reset_indir = defer(ethtool, f"-X {cfg.ifname} default") + + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(2, max(data['rss-indirection-table'])) + + # Reset indirection table and set the key + key = _rss_key_rand(key_len) + ethtool(f"-X {cfg.ifname} default hkey " + _rss_key_str(key)) + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(qcnt - 1, max(data['rss-indirection-table'])) + # Set the indirection table ethtool(f"-X {cfg.ifname} equal 2") - reset_indir = defer(ethtool, f"-X {cfg.ifname} default") data = get_rss(cfg) ksft_eq(0, min(data['rss-indirection-table'])) ksft_eq(1, max(data['rss-indirection-table'])) @@ -274,6 +302,78 @@ def test_hitless_key_update(cfg): ksft_eq(carrier1 - carrier0, 0) +def test_rss_context_dump(cfg): + """ + Test dumping RSS contexts. This test mostly exercises the kernel APIs.
+ """ + + # Get a random key of the right size + data = get_rss(cfg) + if 'rss-hash-key' in data: + key_data = _rss_key_rand(len(data['rss-hash-key'])) + key = _rss_key_str(key_data) + else: + key_data = [] + key = "ba:ad" + + ids = [] + try: + ids.append(ethtool_create(cfg, "-X", f"context new")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new weight 1 1")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new hkey {key}")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + except CmdExitFailure: + if not ids: + raise KsftSkipEx("Unable to add any contexts") + ksft_pr(f"Added only {len(ids)} out of 3 contexts") + + expect_tuples = set([(cfg.ifname, -1)] + [(cfg.ifname, ctx_id) for ctx_id in ids]) + + # Dump all + ctxs = cfg.ethnl.rss_get({}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # Sanity-check the results + for data in ctxs: + ksft_ne(set(data['indir']), {0}, "indir table is all zero") + ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero") + + # More specific checks + if len(ids) > 1 and data.get('context') == ids[1]: + ksft_eq(set(data['indir']), {0, 1}, + "ctx1 - indir table mismatch") + if len(ids) > 2 and data.get('context') == ids[2]: + ksft_eq(data['hkey'], bytes(key_data), "ctx2 - key mismatch") + + # Ifindex filter + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ctx_tuples = set(tuples) + ksft_eq(len(tuples), len(ctx_tuples), "duplicates in context dump") + ksft_eq(expect_tuples, ctx_tuples) + + # Skip ctx 0 + expect_tuples.remove((cfg.ifname, -1)) + + ctxs = cfg.ethnl.rss_get({'start-context': 1}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # And finally both with ifindex and skip main + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}, 'start-context': 1}, dump=True) + ctx_tuples = set([(c['header']['dev-name'], c.get('context', -1)) for c in ctxs]) + ksft_eq(expect_tuples, ctx_tuples) + + def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): """ Test separating traffic into RSS contexts. 
@@ -317,8 +417,11 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): ctx_cnt = i break + _rss_key_check(cfg, context=ctx_id) + if not create_with_cfg: ethtool(f"-X {cfg.ifname} context {ctx_id} {want_cfg}") + _rss_key_check(cfg, context=ctx_id) # Sanity check the context we just created data = get_rss(cfg, ctx_id) @@ -511,7 +614,7 @@ def main() -> None: ksft_run([test_rss_key_indir, test_rss_queue_reconfigure, test_rss_resize, test_hitless_key_update, test_rss_context, test_rss_context4, test_rss_context32, - test_rss_context_queue_reconfigure, + test_rss_context_dump, test_rss_context_queue_reconfigure, test_rss_context_overlap, test_rss_context_overlap2, test_rss_context_out_of_order, test_rss_context4_create_with_cfg], args=(cfg, )) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index a5e800b8f103..1ea9bb695e94 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -4,6 +4,7 @@ import os import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx +from lib.py import ksft_setup from lib.py import cmd, ethtool, ip from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -14,7 +15,7 @@ def _load_env_file(src_path): src_dir = Path(src_path).parent.resolve() if not (src_dir / "net.config").exists(): - return env + return ksft_setup(env) with open((src_dir / "net.config").as_posix(), 'r') as fp: for line in fp.readlines(): @@ -30,7 +31,7 @@ def _load_env_file(src_path): if len(pair) != 2: raise Exception("Can't parse configuration line:", full_file) env[pair[0]] = pair[1] - return env + return ksft_setup(env) class NetDrvEnv: diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 877cd6df94a1..fe905a7f34b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 lib_dir=$(dirname $0)/../../../net/forwarding +ethtool_lib_dir=$(dirname $0)/../hw ALL_TESTS=" autoneg @@ -11,7 +12,7 @@ ALL_TESTS=" NUM_NETIFS=2 : ${TIMEOUT:=30000} # ms source $lib_dir/lib.sh -source $lib_dir/ethtool_lib.sh +source $ethtool_lib_dir/ethtool_lib.sh setup_prepare() { diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh new file mode 100755 index 000000000000..06021b2059b7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test creates two netdevsim virtual interfaces, assigns one of them (the +# "destination interface") to a new namespace, and assigns IP addresses to both +# interfaces. +# +# It listens on the destination interface using socat and configures a dynamic +# target on netconsole, pointing to the destination IP address. +# +# Finally, it checks whether the message was received properly on the +# destination interface. Note that this test may pollute the kernel log buffer +# (dmesg) and relies on dynamic configuration and namespaces being configured. 
+# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +# Simple script to test dynamic targets in netconsole +SRCIF="" # to be populated later +SRCIP=192.168.1.1 +DSTIF="" # to be populated later +DSTIP=192.168.1.2 + +PORT="6666" +MSG="netconsole selftest" +TARGET=$(mktemp -u netcons_XXXXX) +DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk) +NETCONS_CONFIGFS="/sys/kernel/config/netconsole" +NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" +# NAMESPACE will be populated by setup_ns with a random value +NAMESPACE="" + +# IDs for netdevsim +NSIM_DEV_1_ID=$((256 + RANDOM % 256)) +NSIM_DEV_2_ID=$((512 + RANDOM % 256)) + +# Used to create and delete namespaces +source "${SCRIPTDIR}"/../../net/lib.sh +source "${SCRIPTDIR}"/../../net/net_helper.sh + +# Create netdevsim interfaces +create_ifaces() { + local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device + + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" + udevadm settle 2> /dev/null || true + + local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID" + local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID" + + # These are global variables + SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \ + -path "$NSIM1"/net -exec basename {} \;) + DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \ + -path "$NSIM2"/net -exec basename {} \;) +} + +link_ifaces() { + local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" + local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex) + local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex) + + exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}" + exec {INITNS_FD}</proc/self/ns/net + + # Bind the dst interface to the namespace + ip link set "${DSTIF}" netns "${NAMESPACE}" + + # Linking one device to the other one (in the other namespace) + if !
echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX" > $NSIM_DEV_SYS_LINK + then + echo "linking netdevsim1 with netdevsim2 should succeed" + cleanup + exit "${ksft_skip}" + fi +} + +function configure_ip() { + # Configure the IPs for both interfaces + ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}" + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up + + ip addr add "${SRCIP}"/24 dev "${SRCIF}" + ip link set "${SRCIF}" up +} + +function set_network() { + # setup_ns function is coming from lib.sh + setup_ns NAMESPACE + + # Create both interfaces, and assign the destination to a different + # namespace + create_ifaces + + # Link both interfaces back to back + link_ifaces + + configure_ip +} + +function create_dynamic_target() { + DSTMAC=$(ip netns exec "${NAMESPACE}" \ + ip link show "${DSTIF}" | awk '/ether/ {print $2}') + + # Create a dynamic target + mkdir "${NETCONS_PATH}" + + echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip + echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip + echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac + echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + + echo 1 > "${NETCONS_PATH}"/enabled +} + +function cleanup() { + local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" + + # delete netconsole dynamic reconfiguration + echo 0 > "${NETCONS_PATH}"/enabled + # Remove the configfs entry + rmdir "${NETCONS_PATH}" + + # Delete netdevsim devices + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL" + + # this is coming from lib.sh + cleanup_all_ns + + # Restoring printk configurations + echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk +} + +function listen_port_and_save_to() { + local OUTPUT=${1} + # Just wait for 2 seconds + timeout 2 ip netns exec "${NAMESPACE}" \ + socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}" +} + +function validate_result() { + local TMPFILENAME="$1" + + # Check if the file exists + if [ ! -f "$TMPFILENAME" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${TMPFILENAME}"; then + echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2 + cat "${TMPFILENAME}" >&2 + exit "${ksft_fail}" + fi + + # Delete the file once it is validated, otherwise keep it + # for debugging purposes + rm "${TMPFILENAME}" + exit "${ksft_pass}" +} + +function check_for_dependencies() { + if [ "$(id -u)" -ne 0 ]; then + echo "This test must be run as root" >&2 + exit "${ksft_skip}" + fi + + if ! which socat > /dev/null ; then + echo "SKIP: socat(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which ip > /dev/null ; then + echo "SKIP: ip(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which udevadm > /dev/null ; then + echo "SKIP: udevadm(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if [ ! -d "${NETCONS_CONFIGFS}" ]; then + echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2 + exit "${ksft_skip}" + fi + + if ip link show "${DSTIF}" 2> /dev/null; then + echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2 + exit "${ksft_skip}" + fi + + if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then + echo "SKIP: IPs already in use. 
Skipping it" >&2 + exit "${ksft_skip}" + fi +} + +# ========== # +# Start here # +# ========== # +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Listed for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + +# Make sure the message was received in the dst part +# and exit +validate_result "${OUTPUT_FILE}" diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 820b8e0a22c6..63e3c045a3b2 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -1,10 +1,13 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import ksft_disruptive from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv +from lib.py import ip, defer ethnl = EthtoolFamily() netfam = NetdevFamily() @@ -17,7 +20,7 @@ def check_pause(cfg) -> None: try: ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("pause not supported by the device") raise @@ -32,7 +35,7 @@ def check_fec(cfg) -> None: try: ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("FEC not supported by the device") raise @@ -117,7 +120,7 @@ def qstat_by_ifindex(cfg) -> None: # loopback has no stats with ksft_raises(NlError) as cm: netfam.qstats_get({"ifindex": 1}, dump=True) - ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.error, -errno.EOPNOTSUPP) ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') # Try to get stats for lowest unused ifindex but not 0 @@ -133,9 +136,31 @@ def qstat_by_ifindex(cfg) -> None: ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') +@ksft_disruptive +def check_down(cfg) -> None: + try: + qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + except NlError as e: + if e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("qstats not supported by the device") + raise + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + qstat2 = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + for k, v in qstat.items(): + ksft_ge(qstat2[k], qstat[k], comment=f"{k} went backwards on device down") + + # exercise per-queue API to make sure that "device down" state + # is handled correctly and doesn't crash + netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True) + + def main() -> None: with 
NetDrvEnv(__file__) as cfg: - ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, + check_down], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c index ea0cdc37b44f..7ee7492138c6 100644 --- a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c +++ b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c @@ -257,12 +257,6 @@ TEST_F(attest_fixture, att_inval_addr) att_inval_addr_test(&self->uvio_attest.meas_addr, _metadata, self); } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { int fd = open(UV_PATH, O_ACCMODE); diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 6418ded40bdd..071e03532cba 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -117,7 +117,7 @@ static int check_execveat_invoked_rc(int fd, const char *path, int flags, } if ((WEXITSTATUS(status) != expected_rc) && (WEXITSTATUS(status) != expected_rc2)) { - ksft_print_msg("child %d exited with %d not %d nor %d\n", + ksft_print_msg("child %d exited with %d neither %d nor %d\n", child, WEXITSTATUS(status), expected_rc, expected_rc2); ksft_test_result_fail("%s\n", test_name); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index e044f5fc57fd..70cb0c8b21cf 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -319,8 +319,11 @@ static void test_listmount_ns(void) * Tell our parent how many mounts we have, and then wait for it * to tell us we're done. */ - write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); - read(parent_ready_pipe[0], &cval, sizeof(cval)); + if (write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)) != + sizeof(nr_mounts)) + ret = NSID_ERROR; + if (read(parent_ready_pipe[0], &cval, sizeof(cval)) != sizeof(cval)) + ret = NSID_ERROR; exit(NSID_PASS); } diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index c45094d1e1d2..094419e190c2 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -6,6 +6,18 @@ original_group=`stat -c "%g" .` original_owner=`stat -c "%u" .` mount_point=`stat -c '%m' .` + +# If stat -c '%m' does not work (e.g. busybox) or failed, try to use the +# current working directory (which should be a tracefs) as the mount point. +if [ ! -d "$mount_point" ]; then + if mount | grep -qw $PWD ; then + mount_point=$PWD + else + # If PWD doesn't work, that is an environmental problem. + exit_unresolved + fi +fi + mount_options=`mount | grep "$mount_point" | sed -e 's/.*(\(.*\)).*/\1/'` # find another owner and group that is not the original @@ -83,32 +95,38 @@ run_tests() { done } -mount -o remount,"$new_options" . +# Run the tests twice as leftovers can cause issues +for loop in 1 2 ; do -run_tests + echo "Running iteration $loop" -mount -o remount,"$mount_options" . + mount -o remount,"$new_options" . -for d in "." 
"events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do - test "$d" $original_group -done + run_tests + + mount -o remount,"$mount_options" . + + for d in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do + test "$d" $original_group + done # check instances as well -chgrp $other_group instances + chgrp $other_group instances -instance="$(mktemp -u test-XXXXXX)" + instance="$(mktemp -u test-XXXXXX)" -mkdir instances/$instance + mkdir instances/$instance -cd instances/$instance + cd instances/$instance -run_tests + run_tests -cd ../.. + cd ../.. -rmdir instances/$instance + rmdir instances/$instance -chgrp $original_group instances + chgrp $original_group instances +done exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc new file mode 100644 index 000000000000..a275decdc880 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc @@ -0,0 +1,26 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - add/remove/test uprobe events +# requires: uprobe_events + +echo 0 > events/enable +echo > dynamic_events + +echo 'cat /proc/$$/maps' | /bin/sh | \ + grep "r-xp .*/bin/.*sh$" | \ + awk '{printf "p:myevent %s:0x%s\n", $6,$3 }' >> uprobe_events + +grep -q myevent uprobe_events +test -d events/uprobes/myevent + +echo 1 > events/uprobes/myevent/enable +echo 'ls' | /bin/sh > /dev/null +echo 0 > events/uprobes/myevent/enable +grep -q myevent trace + +echo "-:myevent" >> uprobe_events +! grep -q myevent uprobe_events + +echo > uprobe_events + +clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 073a748b9380..263f6b798c85 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,14 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="sched_tick" +if grep '^sched_tick\b' available_filter_functions; then + FUNC2="sched_tick" +elif grep '^scheduler_tick\b' available_filter_functions; then + FUNC2="scheduler_tick" +else + exit_unresolved +fi + ALL_FUNCS="#### all functions enabled ####" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc index e21c9c27ece4..77f4c07cdcb8 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event char type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index 93217d459556..39001073f7ed 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event string type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) diff --git 
a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index dc0408a831d0..c4bc2aa508c3 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -532,6 +532,7 @@ static void load_programs(const struct test_program programs[], FIXTURE_DATA(hid_bpf) * self, const FIXTURE_VARIANT(hid_bpf) * variant) { + struct bpf_map *iter_map; int err = -EINVAL; ASSERT_LE(progs_count, ARRAY_SIZE(self->hid_links)) @@ -564,6 +565,13 @@ static void load_programs(const struct test_program programs[], *ops_hid_id = self->hid_id; } + /* we disable the auto-attach feature of all maps because we + * only want the tested one to be manually attached in the next + * call to bpf_map__attach_struct_ops() + */ + bpf_object__for_each_map(iter_map, *self->skel->skeleton->obj) + bpf_map__set_autoattach(iter_map, false); + err = hid__load(self->skel); ASSERT_OK(err) TH_LOG("hid_skel_load failed: %d", err); @@ -687,6 +695,24 @@ TEST_F(hid_bpf, subprog_raw_event) } /* + * Attach hid_first_event to the given uhid device, + * attempt at re-attaching it, we should not lock and + * return an invalid struct bpf_link + */ +TEST_F(hid_bpf, multiple_attach) +{ + const struct test_program progs[] = { + { .name = "hid_first_event" }, + }; + struct bpf_link *link; + + LOAD_PROGRAMS(progs); + + link = bpf_map__attach_struct_ops(self->skel->maps.first_event); + ASSERT_NULL(link) TH_LOG("unexpected return value when re-attaching the struct_ops"); +} + +/* * Ensures that we can attach/detach programs */ TEST_F(hid_bpf, test_attach_detach) @@ -1331,12 +1357,6 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { /* Use libbpf 1.0 API mode */ diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index ee9bbbcf751b..5ecc845ef792 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -455,7 +455,7 @@ struct { __type(value, struct elem); } hmap SEC(".maps"); -static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +static int wq_cb_sleepable(void *map, int *key, void *work) { __u8 buf[9] = {2, 3, 4, 5, 6, 7, 8, 9, 10}; struct hid_bpf_ctx *hid_ctx; diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index cfe37f491906..e5db897586bb 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -114,7 +114,7 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + int (callback_fn)(void *map, int *key, void *wq), unsigned int flags__k, void *aux__ign) __ksym; #define bpf_wq_set_callback(timer, cb, flags) \ bpf_wq_set_callback_impl(timer, cb, flags, NULL) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6343f4053bd4..4927b9add5ad 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -825,7 +825,7 @@ TEST_F(iommufd_ioas, copy_area) { struct iommu_ioas_copy 
copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .src_ioas_id = self->ioas_id, .length = PAGE_SIZE, @@ -1318,7 +1318,7 @@ TEST_F(iommufd_ioas, copy_sweep) { struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .src_ioas_id = self->ioas_id, .dst_iova = MOCK_APERTURE_START, .length = MOCK_PAGE_SIZE, @@ -1608,7 +1608,7 @@ TEST_F(iommufd_mock_domain, user_copy) }; struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .dst_iova = MOCK_APERTURE_START, .length = BUFFER_SIZE, diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index b8967b6e29d5..29fedf609611 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -61,6 +61,7 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* * gcc cpuid.h provides __cpuid_count() since v4.4. * Clang/LLVM cpuid.h provides __cpuid_count() since v3.4.0. @@ -75,6 +76,7 @@ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "0" (level), "2" (count)) #endif +#endif /* end arch */ /* define kselftest exit codes */ #define KSFT_PASS 0 @@ -371,15 +373,7 @@ static inline __noreturn __printf(1, 2) void ksft_exit_fail_msg(const char *msg, static inline __noreturn void ksft_exit_fail_perror(const char *msg) { -#ifndef NOLIBC ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno); -#else - /* - * nolibc doesn't provide strerror() and it seems - * inappropriate to add one, just print the errno. - */ - ksft_exit_fail_msg("%s: %d)\n", msg, errno); -#endif } static inline __noreturn void ksft_exit_xfail(void) diff --git a/tools/testing/selftests/kselftest/ksft.py b/tools/testing/selftests/kselftest/ksft.py index cd89fb2bc10e..bf215790a89d 100644 --- a/tools/testing/selftests/kselftest/ksft.py +++ b/tools/testing/selftests/kselftest/ksft.py @@ -70,7 +70,7 @@ def test_result(condition, description=""): def finished(): - if ksft_cnt["pass"] == ksft_num_tests: + if ksft_cnt["pass"] + ksft_cnt["skip"] == ksft_num_tests: exit_code = KSFT_PASS else: exit_code = KSFT_FAIL diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 74954f6a8f94..2c3c58e65a41 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -111,8 +111,11 @@ run_one() stdbuf="/usr/bin/stdbuf --output=L " fi eval kselftest_cmd_args="\$${kselftest_cmd_args_ref:-}" - cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" - if [ ! -x "$TEST" ]; then + if [ -x "$TEST" ]; then + cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" + elif [ -x "./ksft_runner.sh" ]; then + cmd="$stdbuf ./ksft_runner.sh ./$BASENAME_TEST" + else echo "# Warning: file $TEST is not executable" if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ] diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 40723a6a083f..a5a72415e37b 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -488,12 +488,6 @@ * Use once to append a main() to the test file. 
*/ #define TEST_HARNESS_MAIN \ - static void __attribute__((constructor)) \ - __constructor_order_last(void) \ - { \ - if (!__constructor_order) \ - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \ - } \ int main(int argc, char **argv) { \ return test_harness_run(argc, argv); \ } @@ -824,7 +818,7 @@ item->prev = item; \ return; \ } \ - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + if (__constructor_order_forward) { \ item->next = NULL; \ item->prev = head->prev; \ item->prev->next = item; \ @@ -888,10 +882,7 @@ struct __test_xfail { } static struct __fixture_metadata *__fixture_list = &_fixture_global; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 +static bool __constructor_order_forward; static inline void __register_fixture(struct __fixture_metadata *f) { @@ -942,7 +933,7 @@ static inline bool __test_passed(struct __test_metadata *metadata) * list so tests are run in source declaration order. * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html * However, it seems not all toolchains do this correctly, so use - * __constructor_order to detect which direction is called first + * __constructor_order_forward to detect which direction is called first * and adjust list building logic to get things running in the right * direction. */ @@ -1337,8 +1328,7 @@ static int test_harness_run(int argc, char **argv) static void __attribute__((constructor)) __constructor_order_first(void) { - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; + __constructor_order_forward = true; } #endif /* __KSELFTEST_HARNESS_H */ diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..0c4b254ab56b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -152,6 +152,7 @@ TEST_GEN_PROGS_x86_64 += pre_fault_memory_test TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/hypercalls TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test @@ -163,6 +164,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access +TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer TEST_GEN_PROGS_aarch64 += demand_paging_test diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c new file mode 100644 index 000000000000..a36a7e2db434 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c @@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch_timer_edge_cases.c - Tests the aarch64 timer IRQ functionality. + * + * The test validates some edge cases related to the arch-timer: + * - timers above the max TVAL value. + * - timers in the past. + * - moving counters ahead and behind pending timers. + * - reprogramming timers. + * - timers fired multiple times. + * - masking/unmasking using the timer control mask. + * + * Copyright (c) 2021, Google LLC.
+ */ + +#define _GNU_SOURCE + +#include <pthread.h> +#include <sys/sysinfo.h> + +#include "arch_timer.h" +#include "gic.h" +#include "vgic.h" + +static const uint64_t CVAL_MAX = ~0ULL; +/* tval is a signed 32-bit int. */ +static const int32_t TVAL_MAX = INT32_MAX; +static const int32_t TVAL_MIN = INT32_MIN; + +/* After how much time we say there is no IRQ. */ +static const uint32_t TIMEOUT_NO_IRQ_US = 50000; + +/* A nice counter value to use as the starting one for most tests. */ +static const uint64_t DEF_CNT = (CVAL_MAX / 2); + +/* Number of runs. */ +static const uint32_t NR_TEST_ITERS_DEF = 5; + +/* Default wait test time in ms. */ +static const uint32_t WAIT_TEST_MS = 10; + +/* Default "long" wait test time in ms. */ +static const uint32_t LONG_WAIT_TEST_MS = 100; + +/* Shared with IRQ handler. */ +struct test_vcpu_shared_data { + atomic_t handled; + atomic_t spurious; +} shared_data; + +struct test_args { + /* Virtual or physical timer and counter tests. */ + enum arch_timer timer; + /* Delay used for most timer tests. */ + uint64_t wait_ms; + /* Delay used in the test_long_timer_delays test. */ + uint64_t long_wait_ms; + /* Number of iterations. */ + int iterations; + /* Whether to test the physical timer. */ + bool test_physical; + /* Whether to test the virtual timer. */ + bool test_virtual; +}; + +struct test_args test_args = { + .wait_ms = WAIT_TEST_MS, + .long_wait_ms = LONG_WAIT_TEST_MS, + .iterations = NR_TEST_ITERS_DEF, + .test_physical = true, + .test_virtual = true, +}; + +static int vtimer_irq, ptimer_irq; + +enum sync_cmd { + SET_COUNTER_VALUE, + USERSPACE_USLEEP, + USERSPACE_SCHED_YIELD, + USERSPACE_MIGRATE_SELF, + NO_USERSPACE_CMD, +}; + +typedef void (*sleep_method_t)(enum arch_timer timer, uint64_t usec); + +static void sleep_poll(enum arch_timer timer, uint64_t usec); +static void sleep_sched_poll(enum arch_timer timer, uint64_t usec); +static void sleep_in_userspace(enum arch_timer timer, uint64_t usec); +static void sleep_migrate(enum arch_timer timer, uint64_t usec); + +sleep_method_t sleep_method[] = { + sleep_poll, + sleep_sched_poll, + sleep_migrate, + sleep_in_userspace, +}; + +typedef void (*irq_wait_method_t)(void); + +static void wait_for_non_spurious_irq(void); +static void wait_poll_for_irq(void); +static void wait_sched_poll_for_irq(void); +static void wait_migrate_poll_for_irq(void); + +irq_wait_method_t irq_wait_method[] = { + wait_for_non_spurious_irq, + wait_poll_for_irq, + wait_sched_poll_for_irq, + wait_migrate_poll_for_irq, +}; + +enum timer_view { + TIMER_CVAL, + TIMER_TVAL, +}; + +static void assert_irqs_handled(uint32_t n) +{ + int h = atomic_read(&shared_data.handled); + + __GUEST_ASSERT(h == n, "Handled %d IRQS but expected %d", h, n); +} + +static void userspace_cmd(uint64_t cmd) +{ + GUEST_SYNC_ARGS(cmd, 0, 0, 0, 0); +} + +static void userspace_migrate_vcpu(void) +{ + userspace_cmd(USERSPACE_MIGRATE_SELF); +} + +static void userspace_sleep(uint64_t usecs) +{ + GUEST_SYNC_ARGS(USERSPACE_USLEEP, usecs, 0, 0, 0); +} + +static void set_counter(enum arch_timer timer, uint64_t counter) +{ + GUEST_SYNC_ARGS(SET_COUNTER_VALUE, counter, timer, 0, 0); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid = gic_get_and_ack_irq(); + enum arch_timer timer; + uint64_t cnt, cval; + uint32_t ctl; + bool timer_condition, istatus; + + if (intid == IAR_SPURIOUS) { + atomic_inc(&shared_data.spurious); + goto out; + } + + if (intid == ptimer_irq) + timer = PHYSICAL; + else if (intid == vtimer_irq) + timer = VIRTUAL; + else + goto 
out; + + ctl = timer_get_ctl(timer); + cval = timer_get_cval(timer); + cnt = timer_get_cntct(timer); + timer_condition = cnt >= cval; + istatus = (ctl & CTL_ISTATUS) && (ctl & CTL_ENABLE); + GUEST_ASSERT_EQ(timer_condition, istatus); + + /* Disable and mask the timer. */ + timer_set_ctl(timer, CTL_IMASK); + + atomic_inc(&shared_data.handled); + +out: + gic_set_eoi(intid); +} + +static void set_cval_irq(enum arch_timer timer, uint64_t cval_cycles, + uint32_t ctl) +{ + atomic_set(&shared_data.handled, 0); + atomic_set(&shared_data.spurious, 0); + timer_set_cval(timer, cval_cycles); + timer_set_ctl(timer, ctl); +} + +static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles, + uint32_t ctl) +{ + atomic_set(&shared_data.handled, 0); + atomic_set(&shared_data.spurious, 0); + timer_set_ctl(timer, ctl); + timer_set_tval(timer, tval_cycles); +} + +static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl, + enum timer_view tv) +{ + switch (tv) { + case TIMER_CVAL: + set_cval_irq(timer, xval, ctl); + break; + case TIMER_TVAL: + set_tval_irq(timer, xval, ctl); + break; + default: + GUEST_FAIL("Could not get timer %d", timer); + } +} + +/* + * Note that this can theoretically hang forever, so we rely on having + * a timeout mechanism in the "runner", like: + * tools/testing/selftests/kselftest/runner.sh. + */ +static void wait_for_non_spurious_irq(void) +{ + int h; + + local_irq_disable(); + + for (h = atomic_read(&shared_data.handled); h == atomic_read(&shared_data.handled);) { + wfi(); + local_irq_enable(); + isb(); /* handle IRQ */ + local_irq_disable(); + } +} + +/* + * Wait for a non-spurious IRQ by polling in the guest or in + * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD). + * + * Note that this can theoretically hang forever, so we rely on having + * a timeout mechanism in the "runner", like: + * tools/testing/selftests/kselftest/runner.sh. + */ +static void poll_for_non_spurious_irq(enum sync_cmd usp_cmd) +{ + int h; + + local_irq_disable(); + + h = atomic_read(&shared_data.handled); + + local_irq_enable(); + while (h == atomic_read(&shared_data.handled)) { + if (usp_cmd == NO_USERSPACE_CMD) + cpu_relax(); + else + userspace_cmd(usp_cmd); + } + local_irq_disable(); +} + +static void wait_poll_for_irq(void) +{ + poll_for_non_spurious_irq(NO_USERSPACE_CMD); +} + +static void wait_sched_poll_for_irq(void) +{ + poll_for_non_spurious_irq(USERSPACE_SCHED_YIELD); +} + +static void wait_migrate_poll_for_irq(void) +{ + poll_for_non_spurious_irq(USERSPACE_MIGRATE_SELF); +} + +/* + * Sleep for usec microseconds by polling in the guest or in + * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD). + */ +static void guest_poll(enum arch_timer test_timer, uint64_t usec, + enum sync_cmd usp_cmd) +{ + uint64_t cycles = usec_to_cycles(usec); + /* Whichever timer we are testing with, sleep with the other.
*/ + enum arch_timer sleep_timer = 1 - test_timer; + uint64_t start = timer_get_cntct(sleep_timer); + + while ((timer_get_cntct(sleep_timer) - start) < cycles) { + if (usp_cmd == NO_USERSPACE_CMD) + cpu_relax(); + else + userspace_cmd(usp_cmd); + } +} + +static void sleep_poll(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, NO_USERSPACE_CMD); +} + +static void sleep_sched_poll(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, USERSPACE_SCHED_YIELD); +} + +static void sleep_migrate(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, USERSPACE_MIGRATE_SELF); +} + +static void sleep_in_userspace(enum arch_timer timer, uint64_t usec) +{ + userspace_sleep(usec); +} + +/* + * Reset the timer state to some nice values like the counter not being close + * to the edge, and the control register masked and disabled. + */ +static void reset_timer_state(enum arch_timer timer, uint64_t cnt) +{ + set_counter(timer, cnt); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timer_xval(enum arch_timer timer, uint64_t xval, + enum timer_view tv, irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + local_irq_disable(); + + if (reset_state) + reset_timer_state(timer, reset_cnt); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * The test_timer_* functions will program the timer, wait for it, and assert + * the firing of the correct IRQ. + * + * These functions don't have a timeout and return as soon as they receive an + * IRQ. They can hang (forever), so we rely on having a timeout mechanism in + * the "runner", like: tools/testing/selftests/kselftest/runner.sh. + */ + +static void test_timer_cval(enum arch_timer timer, uint64_t cval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, cval, TIMER_CVAL, wm, reset_state, reset_cnt); +} + +static void test_timer_tval(enum arch_timer timer, int32_t tval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, (uint64_t) tval, TIMER_TVAL, wm, reset_state, + reset_cnt); +} + +static void test_xval_check_no_irq(enum arch_timer timer, uint64_t xval, + uint64_t usec, enum timer_view timer_view, + sleep_method_t guest_sleep) +{ + local_irq_disable(); + + set_xval_irq(timer, xval, CTL_ENABLE | CTL_IMASK, timer_view); + guest_sleep(timer, usec); + + local_irq_enable(); + isb(); + + /* Assume success (no IRQ) after waiting usec microseconds */ + assert_irqs_handled(0); +} + +static void test_cval_no_irq(enum arch_timer timer, uint64_t cval, + uint64_t usec, sleep_method_t wm) +{ + test_xval_check_no_irq(timer, cval, usec, TIMER_CVAL, wm); +} + +static void test_tval_no_irq(enum arch_timer timer, int32_t tval, uint64_t usec, + sleep_method_t wm) +{ + /* tval will be cast to an int32_t in test_xval_check_no_irq */ + test_xval_check_no_irq(timer, (uint64_t) tval, usec, TIMER_TVAL, wm); +} + +/* Test masking/unmasking a timer using the timer mask (not the IRQ mask). */ +static void test_timer_control_mask_then_unmask(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Unmask the timer, and then get an IRQ. */ + local_irq_disable(); + timer_set_ctl(timer, CTL_ENABLE); + /* This method re-enables IRQs to handle the one we're looking for. 
*/ + wait_for_non_spurious_irq(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Check that timer control masks actually mask a timer being fired. */ +static void test_timer_control_masks(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + + /* Local IRQs are not masked at this point. */ + + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + sleep_poll(timer, TIMEOUT_NO_IRQ_US); + + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_fire_a_timer_multiple_times(enum arch_timer timer, + irq_wait_method_t wm, int num) +{ + int i; + + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_tval_irq(timer, 0, CTL_ENABLE); + + for (i = 1; i <= num; i++) { + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + /* The IRQ handler masked and disabled the timer. + * Enable and unmask it again. + */ + timer_set_ctl(timer, CTL_ENABLE); + + assert_irqs_handled(i); + } + + local_irq_enable(); +} + +static void test_timers_fired_multiple_times(enum arch_timer timer) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) + test_fire_a_timer_multiple_times(timer, irq_wait_method[i], 10); +} + +/* + * Set a timer for tval=delta_1_ms then reprogram it to + * tval=delta_2_ms. Check that the timer fires. There is no + * timeout for the wait: we use the wfi instruction. + */ +static void test_reprogramming_timer(enum arch_timer timer, irq_wait_method_t wm, + int32_t delta_1_ms, int32_t delta_2_ms) +{ + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + /* Program the timer to DEF_CNT + delta_1_ms. */ + set_tval_irq(timer, msec_to_cycles(delta_1_ms), CTL_ENABLE); + + /* Reprogram the timer to DEF_CNT + delta_2_ms. */ + timer_set_tval(timer, msec_to_cycles(delta_2_ms)); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + /* The IRQ should arrive at DEF_CNT + delta_2_ms (or after). */ + GUEST_ASSERT(timer_get_cntct(timer) >= + DEF_CNT + msec_to_cycles(delta_2_ms)); + + local_irq_enable(); + assert_irqs_handled(1); +}; + +static void test_reprogram_timers(enum arch_timer timer) +{ + int i; + uint64_t base_wait = test_args.wait_ms; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + /* + * Ensure reprogramming works whether going from a + * longer time to a shorter or vice versa. + */ + test_reprogramming_timer(timer, irq_wait_method[i], 2 * base_wait, + base_wait); + test_reprogramming_timer(timer, irq_wait_method[i], base_wait, + 2 * base_wait); + } +} + +static void test_basic_functionality(enum arch_timer timer) +{ + int32_t tval = (int32_t) msec_to_cycles(test_args.wait_ms); + uint64_t cval = DEF_CNT + msec_to_cycles(test_args.wait_ms); + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + } +} + +/* + * This test checks basic timer behavior without actually firing timers, things + * like: the relationship between cval and tval, tval down-counting.
+ */ +static void timers_sanity_checks(enum arch_timer timer, bool use_sched) +{ + reset_timer_state(timer, DEF_CNT); + + local_irq_disable(); + + /* cval in the past */ + timer_set_cval(timer, + timer_get_cntct(timer) - + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) < 0); + + /* tval in the past */ + timer_set_tval(timer, -1); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_cval(timer) < timer_get_cntct(timer)); + + /* tval larger than TVAL_MAX. This requires programming with + * timer_set_cval instead so the value is expressible + */ + timer_set_cval(timer, + timer_get_cntct(timer) + TVAL_MAX + + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) <= 0); + + /* + * tval larger than 2 * TVAL_MAX. + * Twice the TVAL_MAX completely loops around the TVAL. + */ + timer_set_cval(timer, + timer_get_cntct(timer) + 2ULL * TVAL_MAX + + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) <= + msec_to_cycles(test_args.wait_ms)); + + /* negative tval that rollovers from 0. */ + set_counter(timer, msec_to_cycles(1)); + timer_set_tval(timer, -1 * msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_cval(timer) >= (CVAL_MAX - msec_to_cycles(test_args.wait_ms))); + + /* tval should keep down-counting from 0 to -1. */ + timer_set_tval(timer, 0); + sleep_poll(timer, 1); + GUEST_ASSERT(timer_get_tval(timer) < 0); + + local_irq_enable(); + + /* Mask and disable any pending timer. */ + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timers_sanity_checks(enum arch_timer timer) +{ + timers_sanity_checks(timer, false); + /* Check how KVM saves/restores these edge-case values. */ + timers_sanity_checks(timer, true); +} + +static void test_set_cnt_after_tval_max(enum arch_timer timer, irq_wait_method_t wm) +{ + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_cval_irq(timer, + (uint64_t) TVAL_MAX + + msec_to_cycles(test_args.wait_ms) / 2, CTL_ENABLE); + + set_counter(timer, TVAL_MAX); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Test timers set for: cval = now + TVAL_MAX + wait_ms / 2 */ +static void test_timers_above_tval_max(enum arch_timer timer) +{ + uint64_t cval; + int i; + + /* + * Test that the system is not implementing cval in terms of + * tval. If that was the case, setting a cval to "cval = now + * + TVAL_MAX + wait_ms" would wrap to "cval = now + + * wait_ms", and the timer would fire immediately. Test that it + * doesn't. + */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + reset_timer_state(timer, DEF_CNT); + cval = timer_get_cntct(timer) + TVAL_MAX + + msec_to_cycles(test_args.wait_ms); + test_cval_no_irq(timer, cval, + msecs_to_usecs(test_args.wait_ms) + + TIMEOUT_NO_IRQ_US, sleep_method[i]); + } + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + /* Get the IRQ by moving the counter forward. */ + test_set_cnt_after_tval_max(timer, irq_wait_method[i]); + } +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. 
+ */ +static void test_set_cnt_after_xval(enum arch_timer timer, uint64_t cnt_1, + uint64_t xval, uint64_t cnt_2, + irq_wait_method_t wm, enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. + */ +static void test_set_cnt_after_xval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t xval, + uint64_t cnt_2, + sleep_method_t guest_sleep, + enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + guest_sleep(timer, TIMEOUT_NO_IRQ_US); + + local_irq_enable(); + isb(); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_set_cnt_after_tval(enum arch_timer timer, uint64_t cnt_1, + int32_t tval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, tval, cnt_2, wm, TIMER_TVAL); +} + +static void test_set_cnt_after_cval(enum arch_timer timer, uint64_t cnt_1, + uint64_t cval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, cval, cnt_2, wm, TIMER_CVAL); +} + +static void test_set_cnt_after_tval_no_irq(enum arch_timer timer, + uint64_t cnt_1, int32_t tval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, tval, cnt_2, wm, + TIMER_TVAL); +} + +static void test_set_cnt_after_cval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t cval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, cval, cnt_2, wm, + TIMER_CVAL); +} + +/* Set a timer and then move the counter ahead of it. */ +static void test_move_counters_ahead_of_timers(enum arch_timer timer) +{ + int i; + int32_t tval; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_set_cnt_after_cval(timer, 0, DEF_CNT, DEF_CNT + 1, wm); + test_set_cnt_after_cval(timer, CVAL_MAX, 1, 2, wm); + + /* Move counter ahead of negative tval. */ + test_set_cnt_after_tval(timer, 0, -1, DEF_CNT + 1, wm); + test_set_cnt_after_tval(timer, 0, -1, TVAL_MAX, wm); + tval = TVAL_MAX; + test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1, + wm); + } + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); + } +} + +/* + * Program a timer, mask it, and then change the tval or counter to cancel it. + * Unmask it and check that nothing fires. 
+ */ +static void test_move_counters_behind_timers(enum arch_timer timer) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, DEF_CNT, DEF_CNT - 1, 0, + sm); + test_set_cnt_after_tval_no_irq(timer, DEF_CNT, -1, 0, sm); + } +} + +static void test_timers_in_the_past(enum arch_timer timer) +{ + int32_t tval = -1 * (int32_t) msec_to_cycles(test_args.wait_ms); + uint64_t cval; + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + /* Set a timer wait_ms in the past. */ + cval = DEF_CNT - msec_to_cycles(test_args.wait_ms); + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + + /* Set a timer to counter=0 (in the past) */ + test_timer_cval(timer, 0, wm, true, DEF_CNT); + + /* Set a timer for tval=0 (now) */ + test_timer_tval(timer, 0, wm, true, DEF_CNT); + + /* Set a timer to as far in the past as possible */ + test_timer_tval(timer, TVAL_MIN, wm, true, DEF_CNT); + } + + /* + * Set the counter to wait_ms, and a tval to -wait_ms. There should be no + * IRQ as that tval means cval=CVAL_MAX-wait_ms. + */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + set_counter(timer, msec_to_cycles(test_args.wait_ms)); + test_tval_no_irq(timer, tval, TIMEOUT_NO_IRQ_US, sm); + } +} + +static void test_long_timer_delays(enum arch_timer timer) +{ + int32_t tval = (int32_t) msec_to_cycles(test_args.long_wait_ms); + uint64_t cval = DEF_CNT + msec_to_cycles(test_args.long_wait_ms); + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + } +} + +static void guest_run_iteration(enum arch_timer timer) +{ + test_basic_functionality(timer); + test_timers_sanity_checks(timer); + + test_timers_above_tval_max(timer); + test_timers_in_the_past(timer); + + test_move_counters_ahead_of_timers(timer); + test_move_counters_behind_timers(timer); + test_reprogram_timers(timer); + + test_timers_fired_multiple_times(timer); + + test_timer_control_mask_then_unmask(timer); + test_timer_control_masks(timer); +} + +static void guest_code(enum arch_timer timer) +{ + int i; + + local_irq_disable(); + + gic_init(GIC_V3, 1); + + timer_set_ctl(VIRTUAL, CTL_IMASK); + timer_set_ctl(PHYSICAL, CTL_IMASK); + + gic_irq_enable(vtimer_irq); + gic_irq_enable(ptimer_irq); + local_irq_enable(); + + for (i = 0; i < test_args.iterations; i++) { + GUEST_SYNC(i); + guest_run_iteration(timer); + } + + test_long_timer_delays(timer); + GUEST_DONE(); +} + +static uint32_t next_pcpu(void) +{ + uint32_t max = get_nprocs(); + uint32_t cur = sched_getcpu(); + uint32_t next = cur; + cpu_set_t cpuset; + + TEST_ASSERT(max > 1, "Need at least two physical cpus"); + + sched_getaffinity(0, sizeof(cpuset), &cpuset); + + do { + next = (next + 1) % CPU_SETSIZE; + } while (!CPU_ISSET(next, &cpuset)); + + return next; +} + +static void migrate_self(uint32_t new_pcpu) +{ + int ret; + cpu_set_t cpuset; + pthread_t thread; + + thread = pthread_self(); + + CPU_ZERO(&cpuset); + CPU_SET(new_pcpu, &cpuset); + + pr_debug("Migrating from %u to %u\n", sched_getcpu(), new_pcpu); + + ret = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); + + TEST_ASSERT(ret == 0, "Failed to migrate to pCPU: %u; ret: %d\n", + new_pcpu, ret); +} + +static void kvm_set_cntxct(struct kvm_vcpu *vcpu,
uint64_t cnt, + enum arch_timer timer) +{ + if (timer == PHYSICAL) + vcpu_set_reg(vcpu, KVM_REG_ARM_PTIMER_CNT, cnt); + else + vcpu_set_reg(vcpu, KVM_REG_ARM_TIMER_CNT, cnt); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + enum sync_cmd cmd = uc->args[1]; + uint64_t val = uc->args[2]; + enum arch_timer timer = uc->args[3]; + + switch (cmd) { + case SET_COUNTER_VALUE: + kvm_set_cntxct(vcpu, val, timer); + break; + case USERSPACE_USLEEP: + usleep(val); + break; + case USERSPACE_SCHED_YIELD: + sched_yield(); + break; + case USERSPACE_MIGRATE_SELF: + migrate_self(next_pcpu()); + break; + default: + break; + } +} + +static void test_run(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + /* Start on CPU 0 */ + migrate_self(0); + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(vcpu, &uc); + break; + case UCALL_DONE: + goto out; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto out; + default: + TEST_FAIL("Unexpected guest exit\n"); + } + } + + out: + return; +} + +static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); + + sync_global_to_guest(vm, ptimer_irq); + sync_global_to_guest(vm, vtimer_irq); + + pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); +} + +static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, + enum arch_timer timer) +{ + *vm = vm_create_with_one_vcpu(vcpu, guest_code); + TEST_ASSERT(*vm, "Failed to create the test VM\n"); + + vm_init_descriptor_tables(*vm); + vm_install_exception_handler(*vm, VECTOR_IRQ_CURRENT, + guest_irq_handler); + + vcpu_init_descriptor_tables(*vcpu); + vcpu_args_set(*vcpu, 1, timer); + + test_init_timer_irq(*vm, *vcpu); + vgic_v3_setup(*vm, 1, 64); + sync_global_to_guest(*vm, test_args); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n" + , name); + pr_info("\t-i: Number of iterations (default: %u)\n", + NR_TEST_ITERS_DEF); + pr_info("\t-b: Test both physical and virtual timers (default: true)\n"); + pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n", + LONG_WAIT_TEST_MS); + pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n", + WAIT_TEST_MS); + pr_info("\t-p: Test physical timer (default: true)\n"); + pr_info("\t-v: Test virtual timer (default: true)\n"); + pr_info("\t-h: Print this help message\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "bhi:l:pvw:")) != -1) { + switch (opt) { + case 'b': + test_args.test_physical = true; + test_args.test_virtual = true; + break; + case 'i': + test_args.iterations = + atoi_positive("Number of iterations", optarg); + break; + case 'l': + test_args.long_wait_ms = + atoi_positive("Long wait time", optarg); + break; + case 'p': + test_args.test_physical = true; + test_args.test_virtual = false; + break; + case 'v': + test_args.test_virtual = true; + test_args.test_physical = false; + break; + case 'w': + test_args.wait_ms = atoi_positive("Wait time", optarg); + break; + case 'h': + default: + goto err; + } + } + + return true; + + err: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Tell stdout not 
to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (test_args.test_virtual) { + test_vm_create(&vm, &vcpu, VIRTUAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + if (test_args.test_physical) { + test_vm_create(&vm, &vcpu, PHYSICAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d721760..d43fb3f49050 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -32,13 +32,25 @@ static struct feature_id_reg feat_id_regs[] = { { ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, 1 }, { ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, 1 } }; @@ -468,6 +480,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ @@ -475,6 +488,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c new file mode 100644 index 000000000000..943d65fc6b0b --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that, on a GICv3 system, not configuring GICv3 correctly +// results in all of the sysregs generating an UNDEF exception. + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +static volatile bool handled; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + handled = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +#define __check_sr_write(r) \ + do { \ + handled = false; \ + dsb(sy); \ + write_sysreg_s(0, SYS_ ## r); \ + isb(); \ + } while(0) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(handled, #r " no read trap"); \ + } while(0) + +#define check_sr_write(r) \ + do { \ + __check_sr_write(r); \ + __GUEST_ASSERT(handled, #r " no write trap"); \ + } while(0) + +#define check_sr_rw(r) \ + do { \ + check_sr_read(r); \ + check_sr_write(r); \ + } while(0) + +static void guest_code(void) +{ + uint64_t val; + + /* + * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having + * hidden the feature at runtime without any other userspace action. + */ + __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + read_sysreg(id_aa64pfr0_el1)) == 0, + "GICv3 wrongly advertised"); + + /* + * Access all GICv3 registers, and fail if we don't get an UNDEF. 
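+	 * With no vGIC device created for this VM, KVM hides the feature
+	 * (ID_AA64PFR0_EL1.GIC reads as 0, as checked above), so the accesses
+	 * below should UNDEF and land in the EC_UNKNOWN handler installed by
+	 * the host side of this test.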
+	 * Note that we happily access all the APxRn registers without
+	 * checking their existence, as all we want to see is a failure.
+	 */
+	check_sr_rw(ICC_PMR_EL1);
+	check_sr_read(ICC_IAR0_EL1);
+	check_sr_write(ICC_EOIR0_EL1);
+	check_sr_rw(ICC_HPPIR0_EL1);
+	check_sr_rw(ICC_BPR0_EL1);
+	check_sr_rw(ICC_AP0R0_EL1);
+	check_sr_rw(ICC_AP0R1_EL1);
+	check_sr_rw(ICC_AP0R2_EL1);
+	check_sr_rw(ICC_AP0R3_EL1);
+	check_sr_rw(ICC_AP1R0_EL1);
+	check_sr_rw(ICC_AP1R1_EL1);
+	check_sr_rw(ICC_AP1R2_EL1);
+	check_sr_rw(ICC_AP1R3_EL1);
+	check_sr_write(ICC_DIR_EL1);
+	check_sr_read(ICC_RPR_EL1);
+	check_sr_write(ICC_SGI1R_EL1);
+	check_sr_write(ICC_ASGI1R_EL1);
+	check_sr_write(ICC_SGI0R_EL1);
+	check_sr_read(ICC_IAR1_EL1);
+	check_sr_write(ICC_EOIR1_EL1);
+	check_sr_rw(ICC_HPPIR1_EL1);
+	check_sr_rw(ICC_BPR1_EL1);
+	check_sr_rw(ICC_CTLR_EL1);
+	check_sr_rw(ICC_IGRPEN0_EL1);
+	check_sr_rw(ICC_IGRPEN1_EL1);
+
+	/*
+	 * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can
+	 * be RAO/WI. Engage in non-fatal accesses, starting with a
+	 * write of 0 to try and disable SRE, and let's see if it
+	 * sticks.
+	 */
+	__check_sr_write(ICC_SRE_EL1);
+	if (!handled)
+		GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n");
+
+	val = __check_sr_read(ICC_SRE_EL1);
+	if (!handled) {
+		__GUEST_ASSERT((val & BIT(0)),
+			       "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n");
+		GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n");
+	}
+
+	GUEST_DONE();
+}
+
+static void guest_undef_handler(struct ex_regs *regs)
+{
+	/* Success, we've gracefully exploded! */
+	handled = true;
+	regs->pc += 4;
+}
+
+static void test_run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	do {
+		vcpu_run(vcpu);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_PRINTF:
+			printf("%s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			break;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	} while (uc.cmd != UCALL_DONE);
+}
+
+static void test_guest_no_gicv3(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	/* Create a VM without a GICv3 */
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vcpu);
+
+	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+				ESR_EC_UNKNOWN, guest_undef_handler);
+
+	test_run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t pfr0;
+
+	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+	vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &pfr0);
+	__TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0),
+		       "GICv3 not supported.");
+	kvm_vm_free(vm);
+
+	test_guest_no_gicv3();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
index d20981663831..2a3fe7914b72 100644
--- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
@@ -126,6 +126,7 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = {
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0),
+	REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0),
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index a51dbd2a5f84..f4ac28d53747 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -269,13 +269,12 @@ static void guest_inject(struct test_args *args, KVM_INJECT_MULTI(cmd, first_intid, num); while (irq_handled < num) { - asm volatile("wfi\n" - "msr daifclr, #2\n" - /* handle IRQ */ - "msr daifset, #2\n" - : : : "memory"); + wfi(); + local_irq_enable(); + isb(); /* handle IRQ */ + local_irq_disable(); } - asm volatile("msr daifclr, #2" : : : "memory"); + local_irq_enable(); GUEST_ASSERT_EQ(irq_handled, num); for (i = first_intid; i < num + first_intid; i++) diff --git a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h index b3e97525cb55..bf461de34785 100644 --- a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h +++ b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h @@ -79,7 +79,7 @@ static inline uint64_t timer_get_cval(enum arch_timer timer) return 0; } -static inline void timer_set_tval(enum arch_timer timer, uint32_t tval) +static inline void timer_set_tval(enum arch_timer timer, int32_t tval) { switch (timer) { case VIRTUAL: @@ -95,6 +95,22 @@ static inline void timer_set_tval(enum arch_timer timer, uint32_t tval) isb(); } +static inline int32_t timer_get_tval(enum arch_timer timer) +{ + isb(); + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_tval_el0); + case PHYSICAL: + return read_sysreg(cntp_tval_el0); + default: + GUEST_FAIL("Could not get timer %d\n", timer); + } + + /* We should not reach here */ + return 0; +} + static inline void timer_set_ctl(enum arch_timer timer, uint32_t ctl) { switch (timer) { diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9b20a355d81a..de977d131082 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -243,4 +243,7 @@ void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, struct arm_smccc_res *res); +/* Execute a Wait For Interrupt instruction. */ +void wfi(void); + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 0ac7cc89f38c..fe4dc3693112 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -639,3 +639,9 @@ void vm_vaddr_populate_bitmap(struct kvm_vm *vm) sparsebit_set_num(vm->vpages_valid, 0, (1ULL << vm->va_bits) >> vm->page_shift); } + +/* Helper to call wfi instruction. 
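+ * WFI may suspend the core until a wake-up event such as a pending
+ * interrupt arrives; the wake-up happens even while PSTATE.I masks IRQs,
+ * which is what callers rely on before unmasking and handling the IRQ.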
*/ +void wfi(void) +{ + asm volatile("wfi"); +} diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index f92c2fb23fcd..8e34f7fa44e9 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -961,10 +961,10 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zbkb, ZBKB); KVM_ISA_EXT_SIMPLE_CONFIG(zbkc, ZBKC); KVM_ISA_EXT_SIMPLE_CONFIG(zbkx, ZBKX); KVM_ISA_EXT_SIMPLE_CONFIG(zbs, ZBS); -KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA), -KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB), -KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD), -KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF), +KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA); +KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB); +KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD); +KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF); KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP); KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA); KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH); diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 69849acd95b0..618cd2442390 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -184,6 +184,33 @@ static void test_apic_id(void) kvm_vm_free(vm); } +static void test_x2apic_id(void) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + + /* + * Try stuffing a modified x2APIC ID, KVM should ignore the value and + * always return the vCPU's default/readonly x2APIC ID. + */ + for (i = 0; i <= 0xff; i++) { + *(u32 *)(lapic.regs + APIC_ID) = i << 24; + *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic); + TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24, + "x2APIC ID should be fully readonly"); + } + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { struct xapic_vcpu x = { @@ -211,4 +238,5 @@ int main(int argc, char *argv[]) kvm_vm_free(vm); test_apic_id(); + test_x2apic_id(); } diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 3c1e9f35b531..3b26bf3cf5b9 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include <errno.h> #include <fcntl.h> +#include <linux/keyctl.h> #include <linux/landlock.h> #include <string.h> #include <sys/prctl.h> @@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer) ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); } +TEST(cred_transfer) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + }; + int ruleset_fd, dir_fd; + pid_t child; + int status; + + drop_caps(_metadata); + + dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + EXPECT_LE(0, dir_fd); + EXPECT_EQ(0, close(dir_fd)); + + /* Denies opening directories. */ + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Checks ruleset enforcement. 
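+	 * The ruleset handles LANDLOCK_ACCESS_FS_READ_DIR but grants it to no
+	 * path, so opening any directory must now fail with EACCES.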
*/ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, + 0, 0)) + { + TH_LOG("Failed to join session keyring: %s", strerror(errno)); + } + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + /* Checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* + * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a + * different session keyring in the child, so make that happen. + */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, + NULL, 0, 0, 0)); + + /* + * KEYCTL_SESSION_TO_PARENT installs credentials on the parent + * that never go through the cred_prepare hook, this path uses + * cred_transfer instead. + */ + EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, + 0, 0, 0)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + _exit(_metadata->exit_code); + return; + } + + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 0086efaa7b68..29af19c4e9f9 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -2,6 +2,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_INET=y CONFIG_IPV6=y +CONFIG_KEYS=y CONFIG_NET=y CONFIG_NET_NS=y CONFIG_OVERLAY_FS=y diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index ee71fc99d5b5..c52fe3ad8e98 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,6 +4,5 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh strscpy.sh - +TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 645839b50b0a..dc15aba8d0a3 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,5 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m -CONFIG_TEST_STRSCPY=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/strscpy.sh b/tools/testing/selftests/lib/strscpy.sh deleted file mode 100755 index be60ef6e1a7f..000000000000 --- a/tools/testing/selftests/lib/strscpy.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0+ -$(dirname $0)/../kselftest/module.sh "strscpy*" test_strscpy diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh index 65c9c058458d..bd13257bfdfe 100755 --- a/tools/testing/selftests/livepatch/test-livepatch.sh +++ b/tools/testing/selftests/livepatch/test-livepatch.sh @@ -139,11 +139,8 @@ load_lp $MOD_REPLACE replace=1 grep 'live patched' /proc/cmdline > /dev/kmsg grep 'live patched' /proc/meminfo > /dev/kmsg -mods=(/sys/kernel/livepatch/*) -nmods=${#mods[@]} -if [ "$nmods" -ne 1 ]; then - die "Expecting only one moduled listed, found $nmods" -fi +loop_until 
'mods=(/sys/kernel/livepatch/*); nmods=${#mods[@]}; [[ "$nmods" -eq 1 ]]' || + die "Expecting only one moduled listed, found $nmods" # These modules were disabled by the atomic replace for mod in $MOD_LIVEPATCH3 $MOD_LIVEPATCH2 $MOD_LIVEPATCH1; do diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c index 06d24d4679a6..1cc8a977c711 100644 --- a/tools/testing/selftests/lsm/lsm_list_modules_test.c +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -128,6 +128,9 @@ TEST(correct_lsm_list_modules) case LSM_ID_EVM: name = "evm"; break; + case LSM_ID_IPE: + name = "ipe"; + break; default: name = "INVALID"; break; diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 064e7b125643..da030b43e43b 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -50,3 +50,4 @@ hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test seal_elf +droppable diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index e1aa09ddaa3d..e10b87376fde 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -53,7 +53,9 @@ TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64)) TEST_GEN_FILES += memfd_secret +endif TEST_GEN_FILES += migration TEST_GEN_FILES += mkdirty TEST_GEN_FILES += mlock-random-test @@ -76,6 +78,7 @@ TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_dio +TEST_GEN_FILES += droppable ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty @@ -87,6 +90,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) VMTARGETS := protection_keys +VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64) @@ -103,13 +107,13 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index e140558e6f53..2c3a0eb6b22d 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -89,9 +89,10 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size, int fd, ret = -1; int compaction_index = 0; char nr_hugepages[20] = {0}; - char init_nr_hugepages[20] = {0}; + char init_nr_hugepages[24] = {0}; - sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages); + snprintf(init_nr_hugepages, sizeof(init_nr_hugepages), + "%lu", initial_nr_hugepages); /* We want to test with 80% of available memory. 
Else, OOM killer comes in to play */ diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c new file mode 100644 index 000000000000..f3d9ecf96890 --- /dev/null +++ b/tools/testing/selftests/mm/droppable.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <linux/mman.h> + +#include "../kselftest.h" + +int main(int argc, char *argv[]) +{ + size_t alloc_size = 134217728; + size_t page_size = getpagesize(); + void *alloc; + pid_t child; + + ksft_print_header(); + ksft_set_plan(1); + + alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + assert(alloc != MAP_FAILED); + memset(alloc, 'A', alloc_size); + for (size_t i = 0; i < alloc_size; i += page_size) + assert(*(uint8_t *)(alloc + i)); + + child = fork(); + assert(child >= 0); + if (!child) { + for (;;) + *(char *)malloc(page_size) = 'B'; + } + + for (bool done = false; !done;) { + for (size_t i = 0; i < alloc_size; i += page_size) { + if (!*(uint8_t *)(alloc + i)) { + done = true; + break; + } + } + } + kill(child, SIGTERM); + + ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); + exit(KSFT_PASS); +} diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 1b03bcfaefdf..5a3a9bcba640 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -22,8 +22,10 @@ #define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) +#endif #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de47..bfcea5cf9a48 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -81,17 +81,6 @@ static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static int sys_munmap(void *ptr, size_t size) { int sret; @@ -172,7 +161,7 @@ static void setup_single_address(int size, void **ptrOut) { void *ptr; - ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); *ptrOut = ptr; } @@ -181,7 +170,7 @@ static void setup_single_address_rw(int size, void **ptrOut) void *ptr; unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; - ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); *ptrOut = ptr; } @@ -205,7 +194,7 @@ bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; @@ -481,8 +470,8 @@ static void test_seal_zero_address(void) int prot; /* use mmap to change protection. */ - ptr = sys_mmap(0, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ptr = mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); FAIL_TEST_IF_FALSE(ptr == 0); size = get_vma_size(ptr, &prot); @@ -1209,8 +1198,8 @@ static void test_seal_mmap_overwrite_prot(bool seal) } /* use mmap to change protection. */ - ret2 = sys_mmap(ptr, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1240,8 +1229,8 @@ static void test_seal_mmap_expand(bool seal) } /* use mmap to expand. */ - ret2 = sys_mmap(ptr, size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1268,8 +1257,8 @@ static void test_seal_mmap_shrink(bool seal) } /* use mmap to shrink. 
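	 * (when the region is sealed, the MAP_FIXED remap below must be
	 * refused with EPERM)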
*/ - ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1650,7 +1639,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal) ret = fallocate(fd, 0, 0, size); FAIL_TEST_IF_FALSE(!ret); - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, fd, 0); FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); if (seal) { @@ -1680,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal) int ret; unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, -1, 0); FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h new file mode 100644 index 000000000000..580e1b0bb38e --- /dev/null +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + */ + +#ifndef _PKEYS_ARM64_H +#define _PKEYS_ARM64_H + +#include "vm_util.h" +/* for signal frame parsing */ +#include "../arm64/signal/testcases/testcases.h" + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 288 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 289 +# define SYS_pkey_free 290 +#endif +#define MCONTEXT_IP(mc) mc.pc +#define MCONTEXT_TRAPNO(mc) -1 + +#define PKEY_MASK 0xf + +#define POE_NONE 0x0 +#define POE_X 0x2 +#define POE_RX 0x3 +#define POE_RWX 0x7 + +#define NR_PKEYS 8 +#define NR_RESERVED_PKEYS 1 /* pkey-0 */ + +#define PKEY_ALLOW_ALL 0x77777777 + +#define PKEY_BITS_PER_PKEY 4 +#define PAGE_SIZE sysconf(_SC_PAGESIZE) +#undef HPAGE_SIZE +#define HPAGE_SIZE default_huge_page_size() + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg = 0; + + // POR_EL0 + asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 por = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + // POR_EL0 + asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#define set_pkey_bits set_pkey_bits +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + u64 new_val = POE_RWX; + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + + if (flags & PKEY_DISABLE_ACCESS) + new_val = POE_X; + else if (flags & PKEY_DISABLE_WRITE) + new_val = POE_RX; + + /* OR in new bits for pkey */ + reg |= new_val << shift; + + return reg; +} + +#define get_pkey_bits get_pkey_bits +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + 
u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest four, then + * mask off all the other higher bits + */ + u32 perm = (reg >> shift) & PKEY_MASK; + + if (perm == POE_X) + return PKEY_DISABLE_ACCESS; + if (perm == POE_RX) + return PKEY_DISABLE_WRITE; + return 0; +} + +static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + struct poe_context *poe_ctx = + (struct poe_context *) get_header(ctx, POE_MAGIC, + sizeof(uctxt->uc_mcontext), NULL); + if (poe_ctx) + poe_ctx->por_el0 = pkey; +} + +#endif /* _PKEYS_ARM64_H */ diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..9ab6a3ee153b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -79,7 +79,18 @@ extern void abort_hooks(void); } \ } while (0) -__attribute__((noinline)) int read_ptr(int *ptr); +#define barrier() __asm__ __volatile__("": : :"memory") +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); @@ -91,12 +102,17 @@ void record_pkey_malloc(void *ptr, long size, int prot); #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */ +#ifndef PKEY_MASK #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif +#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@ -106,7 +122,9 @@ static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif +#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@ -116,6 +134,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) */ return ((reg >> shift) & PKEY_MASK); } +#endif extern u64 shadow_pkey_reg; diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index ae5df26104e5..3d0c0bdae5bc 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -8,7 +8,10 @@ # define SYS_pkey_free 385 #endif #define REG_IP_IDX PT_NIP +#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO] #define REG_TRAPNO PT_TRAP +#define MCONTEXT_FPREGS #define gregs gp_regs #define fpregs fp_regs #define si_pkey_offset 0x20 diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 814758e109c0..5f28e26a2511 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,10 @@ #endif +#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO] +#define MCONTEXT_FPREGS + #ifndef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x1 #endif diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c new file mode 100644 index 000000000000..a8088b645ad6 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * The testcases in this file exercise various flows related to signal handling, + * using an alternate signal stack, with the default pkey (pkey 0) disabled. + * + * Compile with: + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <errno.h> +#include <sys/syscall.h> +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <pthread.h> +#include <limits.h> + +#include "pkey-helpers.h" + +#define STACK_SIZE PTHREAD_STACK_MIN + +void expected_pkey_fault(int pkey) {} + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +siginfo_t siginfo = {0}; + +/* + * We need to use inline assembly instead of glibc's syscall because glibc's + * syscall will attempt to access the PLT in order to call a library function + * which is protected by MPK 0 which we don't have access to. + */ +static inline __always_inline +long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) +{ + unsigned long ret; +#ifdef __x86_64__ + register long r10 asm("r10") = a4; + register long r8 asm("r8") = a5; + register long r9 asm("r9") = a6; + asm volatile ("syscall" + : "=a"(ret) + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) + : "rcx", "r11", "memory"); +#elif defined __i386__ + asm volatile ("int $0x80" + : "=a"(ret) + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) + : "memory"); +#else +# error syscall_raw() not implemented +#endif + return ret; +} + +static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); +} + +static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); +} + +static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) +{ + /* + * pkru should be the init_pkru value which enabled MPK 0 so + * we can use library functions. + */ + printf("%s invoked.\n", __func__); +} + +static void raise_sigusr2(void) +{ + pid_t tid = 0; + + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); + + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); + + /* + * We should return from the signal handler here and be able to + * return to the interrupted thread. 
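+	 * The handler itself runs with the init PKRU value (see
+	 * sigusr2_handler()), so it can call into libc even though this
+	 * thread raised the signal with pkey 0 disabled.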
+ */ +} + +static void *thread_segv_with_pkey0_disabled(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* Segfault (with SEGV_MAPERR) */ + *(int *) (0x1) = 1; + return NULL; +} + +static void *thread_segv_pkuerr_stack(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* After we disable MPK 0, we can't access the stack to return */ + return NULL; +} + +static void *thread_segv_maperr_ptr(void *ptr) +{ + stack_t *stack = ptr; + int *bad = (int *)1; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 1 is enabled. */ + __write_pkey_reg(0x55555551); + + /* Segfault */ + *bad = 1; + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0. + */ +static void test_sigsegv_handler_with_pkey0_disabled(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0, which renders them inaccessible when MPK 0 + * is disabled. So just the return from the thread should cause a + * segfault with SEGV_PKUERR. + */ +static void test_sigsegv_handler_cannot_access_stack(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_PKUERR, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler that uses an alternate signal stack + * is correctly invoked for a thread which uses a non-zero MPK to protect + * its own stack, and disables all other MPKs (including 0). 
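+ *
+ * The thread is created with a raw clone() call on a stack covered by a
+ * freshly allocated pkey, while its alternate signal stack stays under
+ * pkey 0; the expected result is a SEGV_MAPERR fault on address 1
+ * delivered to the handler running on that alternate stack.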
+ */ +static void test_sigsegv_handler_with_different_pkey_for_stack(void) +{ + struct sigaction sa; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + + sa.sa_sigaction = sigsegv_handler; + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* Allow access to MPK 0 and MPK 1 */ + __write_pkey_reg(0x55555550); + + /* Protect the new stack with MPK 1 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + memset(&siginfo, 0, sizeof(siginfo)); + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_segv_maperr_ptr(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the PKRU value set by the application is correctly + * restored upon return from signal handling. + */ +static void test_pkru_preserved_after_sigusr1(void) +{ + struct sigaction sa; + unsigned long pkru = 0x45454544; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigusr1_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + __write_pkey_reg(pkru); + + raise(SIGUSR1); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + /* Ensure the pkru value is the same after returning from signal. */ + ksft_test_result(pkru == __read_pkey_reg() && + siginfo.si_signo == SIGUSR1, + "%s\n", __func__); +} + +static noinline void *thread_sigusr2_self(void *ptr) +{ + /* + * A const char array like "Resuming after SIGUSR2" won't be stored on + * the stack and the code could access it via an offset from the program + * counter. This makes sure it's on the function's stack frame. + */ + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', + 'a', 'f', 't', 'e', 'r', ' ', + 'S', 'I', 'G', 'U', 'S', 'R', '2', + '.', '.', '.', '\n', '\0'}; + stack_t *stack = ptr; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 2 is enabled. 
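+	 * Each pkey owns two PKRU bits (bit 0: access-disable, bit 1:
+	 * write-disable); 0x55555545 leaves both bits clear only for key 2.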
*/ + __write_pkey_reg(0x55555545); + + raise_sigusr2(); + + /* Do something, to show the thread resumed execution after the signal */ + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); + + /* + * We can't return to test_pkru_sigreturn because it + * will attempt to use a %rbp value which is on the stack + * of the main thread. + */ + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that sigreturn is able to restore altstack even if the thread had + * disabled pkey 0. + */ +static void test_pkru_sigreturn(void) +{ + struct sigaction sa = {0}; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + /* + * For this testcase, we do not want to handle SIGSEGV. Reset handler + * to default so that the application can crash if it receives SIGSEGV. + */ + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = sigusr2_handler; + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* + * Allow access to MPK 0 and MPK 2. The child thread (to be created + * later in this flow) will have its stack protected by MPK 2, whereas + * the current thread's stack is protected by the default MPK 0. Hence + * both need to be enabled. + */ + __write_pkey_reg(0x55555544); + + /* Protect the stack with MPK 2 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_sigusr2_self(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + child_pid = ret; + /* Check that thread exited */ + do { + sched_yield(); + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); + } while (ret != -ESRCH && ret != -EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +static void (*pkey_tests[])(void) = { + test_sigsegv_handler_with_pkey0_disabled, + test_sigsegv_handler_cannot_access_stack, + test_sigsegv_handler_with_different_pkey_for_stack, + test_pkru_preserved_after_sigusr1, + test_pkru_sigreturn +}; + +int main(int argc, char *argv[]) +{ + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(pkey_tests)); + + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) + (*pkey_tests[i])(); + + ksft_finished(); + return 0; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..4990f7ab4cb7 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -147,7 +147,7 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code 
handles * execute-only memory properly. */ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@ -212,7 +212,6 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -242,8 +241,6 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -253,7 +250,6 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -273,8 +269,6 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) @@ -314,7 +308,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@ -328,9 +324,11 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@ -359,7 +357,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #endif /* arch */ dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -389,6 +389,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@ -902,7 +904,9 @@ void expected_pkey_fault(int pkey) * test program continue. We now have to restore it. 
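	 *
	 * On arm64 the signal handler rewrites POR_EL0 to PKEY_ALLOW_ALL via
	 * aarch64_write_signal_pkey(), so that is the value checked for below
	 * instead of the shadow register.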
*/ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@ -950,16 +954,6 @@ void close_test_fds(void) nr_test_fds = 0; } -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; @@ -1492,6 +1486,11 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1665,6 +1664,84 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif +#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. */ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1700,7 +1777,7 
@@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif }; diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 03ac4f2e1cce..36045edb10de 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -374,8 +374,11 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +if [ -x ./memfd_secret ] +then (echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix CATEGORY="memfd_secret" run_test ./memfd_secret +fi # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 CATEGORY="ksm" run_test ./ksm_tests -H -s 100 diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 7aa1366063e4..d9f8ba8d5050 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -30,17 +30,6 @@ static int sys_mseal(void *start, size_t len) return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) { int sret; @@ -56,7 +45,7 @@ static bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index fa7eabfaf841..896b3f73fc53 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -293,6 +293,20 @@ static int run_test(struct testcase *test, int count) return ret; } +#ifdef __aarch64__ +/* Check if userspace VA > 48 bits */ +static int high_address_present(void) +{ + void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (ptr == MAP_FAILED) + return 0; + + munmap(ptr, 1); + return 1; +} +#endif + static int supported_arch(void) { #if defined(__powerpc64__) @@ -300,7 +314,7 @@ static int supported_arch(void) #elif defined(__x86_64__) return 1; #elif defined(__aarch64__) - return 1; + return high_address_present(); #else return 0; #endif diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 666ab7d9390b..1c04c780db66 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -17,6 +17,7 @@ ipv6_flowlabel ipv6_flowlabel_mgr log.txt msg_zerocopy +ncdevmem nettest psock_fanout psock_snd @@ -34,6 +35,7 @@ scm_pidfd scm_rights sk_bind_sendto_listen sk_connect_zero_addr +sk_so_peek_off socket so_incoming_cpu so_netns_cookie diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..649f1fe0dc46 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -56,7 +56,7 @@ TEST_PROGS += ip_local_port_range.sh TEST_PROGS 
+= rps_default_mask.sh TEST_PROGS += big_tcp.sh TEST_PROGS += netns-sysctl.sh -TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh +TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh xfrm_policy_add_speed.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite @@ -80,12 +80,14 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_GEN_PROGS += sk_so_peek_off TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += ip_local_port_range -TEST_GEN_FILES += bind_wildcard +TEST_GEN_PROGS += bind_wildcard +TEST_GEN_PROGS += bind_timewait TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh TEST_PROGS += test_vxlan_nolocalbypass.sh @@ -95,6 +97,11 @@ TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py +# YNL files, must be before "include ..lib.mk" +EXTRA_CLEAN += $(OUTPUT)/libynl.a +YNL_GEN_FILES := ncdevmem +TEST_GEN_FILES += $(YNL_GEN_FILES) + TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh @@ -104,6 +111,10 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk +# YNL build +YNL_GENS := netdev +include ynl.mk + $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 16d0c172eaeb..3ed3882a93b8 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -209,7 +209,7 @@ static void __sendpair(struct __test_metadata *_metadata, static void __recvpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, - const void *expected_buf, int expected_len, + const char *expected_buf, int expected_len, int buf_len, int flags) { int i, ret[2], recv_errno[2], expected_errno = 0; @@ -525,6 +525,29 @@ TEST_F(msg_oob, ex_oob_drop_2) } } +TEST_F(msg_oob, ex_oob_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("", -EAGAIN, 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 386ebd829df5..899dbad0104b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -4304,14 +4304,7 @@ elif [ "$TESTS" = "ipv6" ]; then TESTS="$TESTS_IPV6" fi -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! 
which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" declare -i nfail=0 declare -i nsuccess=0 diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index ac0b2c6a5761..77c83d9508d3 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -78,7 +78,12 @@ log_test() else ret=1 nfail=$((nfail+1)) - printf "TEST: %-60s [FAIL]\n" "${msg}" + if [[ $rc -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "${msg}" + else + printf "TEST: %-60s [FAIL]\n" "${msg}" + fi + if [ "$VERBOSE" = "1" ]; then echo " rc=$rc, expected $expected" fi @@ -923,6 +928,29 @@ ipv6_grp_fcnal() ipv6_grp_refs log_test $? 0 "Nexthop group replace refcounts" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP" + rc=$? + else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_res_grp_fcnal() @@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal() check_nexthop_bucket "list id 102" \ "id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" log_test $? 0 "Nexthop buckets updated after replace - nECMP" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(: + )type resilient buckets 32 idle_timer 0 $(: + )unbalanced_timer 0" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP unbalanced_time 0" + rc=$? + else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_fcnal_runtime() diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 7c01f58a20de..1d58b3b87465 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -35,18 +35,13 @@ log_test() local expected=$2 local msg="$3" - $IP rule show | grep -q l3mdev - if [ $? 
-eq 0 ]; then - msg="$msg (VRF)" - fi - if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-60s [ OK ]\n" "${msg}" + printf " TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-60s [FAIL]\n" "${msg}" + printf " TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -56,39 +51,6 @@ log_test() fi } -log_section() -{ - echo - echo "######################################################################" - echo "TEST SECTION: $*" - echo "######################################################################" -} - -check_nettest() -{ - if which nettest > /dev/null 2>&1; then - return 0 - fi - - # Add the selftest directory to PATH if not already done - if [ "${SELFTEST_PATH}" = "" ]; then - SELFTEST_PATH="$(dirname $0)" - PATH="${PATH}:${SELFTEST_PATH}" - - # Now retry with the new path - if which nettest > /dev/null 2>&1; then - return 0 - fi - - if [ "${ret}" -eq 0 ]; then - ret="${ksft_skip}" - fi - echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" - fi - - return 1 -} - setup() { set -e @@ -187,12 +149,17 @@ fib_rule6_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP -6 rule add $match table $RTABLE $IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule6 check: $description" + $IP -6 route get $GW_IP6 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule6 check: $nomatch_description" + fib_rule6_del_by_pref "$match" log_test $? 0 "rule6 del by pref: $description" } @@ -213,18 +180,27 @@ fib_rule6_test_reject() fib_rule6_test() { + local ext_name=$1; shift + local getnomatch local getmatch local match local cnt + echo + echo "IPv6 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink match="oif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" match="from $SRC_IP6 iif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table" + getnomatch="from $SRC_IP6 iif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -238,44 +214,89 @@ fib_rule6_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule6_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" + done + + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. 
+ match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule6_test_match_n_redirect "$match" \ + "from $SRC_IP6 iif $DEV $getmatch" \ + "from $SRC_IP6 iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" + done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule6_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? -eq 0 ]; then match="sport 666 dport 777" - fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto tcp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match" + getnomatch="ipproto udp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto ipv6-icmp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match" + getnomatch="ipproto tcp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto ipv6-icmp match" \ + "ipproto tcp no match" + fi + + fib_check_iproute_support "dscp" "tos" + if [ $? -eq 0 ]; then + match="dscp 0x3f" + getmatch="tos 0xfc" + getnomatch="tos 0xf4" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "dscp redirect to table" \ + "dscp no redirect to table" + + match="dscp 0x3f" + getmatch="from $SRC_IP6 iif $DEV tos 0xfc" + getnomatch="from $SRC_IP6 iif $DEV tos 0xf4" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif dscp redirect to table" \ + "iif dscp no redirect to table" fi } fib_rule6_vrf_test() { setup_vrf - fib_rule6_test + fib_rule6_test "- with VRF" cleanup_vrf } @@ -285,10 +306,8 @@ fib_rule6_connect_test() { local dsfield - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi + echo + echo "IPv6 FIB rule connect tests" setup_peer $IP -6 rule add dsfield 0x04 table $RTABLE_PEER @@ -306,7 +325,45 @@ fib_rule6_connect_test() log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D \ + -Q 0x20 -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield udp no connect (dsfield 0x20)" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield tcp no connect (dsfield 0x20)" + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER + + ip rule help 2>&1 | grep -q dscp + if [ $?
-ne 0 ]; then + echo "SKIP: iproute2 iprule too old, missing dscp match" + cleanup_peer + return + fi + + $IP -6 rule add dscp 0x3f table $RTABLE_PEER + + nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xfc \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dscp udp connect" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xfc \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dscp tcp connect" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xf4 \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dscp udp no connect" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xf4 \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dscp tcp no connect" + + $IP -6 rule del dscp 0x3f table $RTABLE_PEER + cleanup_peer } @@ -326,12 +383,17 @@ fib_rule4_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP rule add $match table $RTABLE $IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule4 check: $description" + $IP route get $GW_IP4 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule4 check: $nomatch_description" + fib_rule4_del_by_pref "$match" log_test $? 0 "rule4 del by pref: $description" } @@ -352,23 +414,31 @@ fib_rule4_test_reject() fib_rule4_test() { + local ext_name=$1; shift + local getnomatch local getmatch local match local cnt + echo + echo "IPv4 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink match="oif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" - # need enable forwarding and disable rp_filter temporarily as all the - # addresses are in the same subnet and egress device == ingress device. + # Enable forwarding and disable rp_filter as all the addresses are in + # the same subnet and egress device == ingress device. ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1 ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0 match="from $SRC_IP iif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table" - ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0 + getnomatch="from $SRC_IP iif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -382,44 +452,90 @@ fib_rule4_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule4_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" + done + + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. 
+ match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule4_test_match_n_redirect "$match" \ + "from $SRC_IP iif $DEV $getmatch" \ + "from $SRC_IP iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule4_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? -eq 0 ]; then match="sport 666 dport 777" - fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto tcp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match" + getnomatch="ipproto udp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" \ + "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto icmp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match" + getnomatch="ipproto tcp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto icmp match" \ + "ipproto tcp no match" + fi + + fib_check_iproute_support "dscp" "tos" + if [ $? -eq 0 ]; then + match="dscp 0x3f" + getmatch="tos 0xfc" + getnomatch="tos 0xf4" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "dscp redirect to table" \ + "dscp no redirect to table" + + match="dscp 0x3f" + getmatch="from $SRC_IP iif $DEV tos 0xfc" + getnomatch="from $SRC_IP iif $DEV tos 0xf4" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif dscp redirect to table" \ + "iif dscp no redirect to table" fi } fib_rule4_vrf_test() { setup_vrf - fib_rule4_test + fib_rule4_test "- with VRF" cleanup_vrf } @@ -429,10 +545,8 @@ fib_rule4_connect_test() { local dsfield - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi + echo + echo "IPv4 FIB rule connect tests" setup_peer $IP -4 rule add dsfield 0x04 table $RTABLE_PEER @@ -450,16 +564,46 @@ fib_rule4_connect_test() log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 1 "rule4 dsfield udp no connect (dsfield 0x20)" + + nettest -q -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 
1 "rule4 dsfield tcp no connect (dsfield 0x20)" + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER - cleanup_peer -} -run_fibrule_tests() -{ - log_section "IPv4 fib rule" - fib_rule4_test - log_section "IPv6 fib rule" - fib_rule6_test + ip rule help 2>&1 | grep -q dscp + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 iprule too old, missing dscp match" + cleanup_peer + return + fi + + $IP -4 rule add dscp 0x3f table $RTABLE_PEER + + nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xfc \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dscp udp connect" + + nettest -q -B -t 5 -N $testns -O $peerns -Q 0xfc \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dscp tcp connect" + + nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xf4 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 1 "rule4 dscp udp no connect" + + nettest -q -B -t 5 -N $testns -O $peerns -Q 0xf4 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 1 "rule4 dscp tcp no connect" + + $IP -4 rule del dscp 0x3f table $RTABLE_PEER + + cleanup_peer } ################################################################################ # usage @@ -495,6 +639,8 @@ if [ ! -x "$(command -v ip)" ]; then exit $ksft_skip fi +check_gen_prog "nettest" + # start clean cleanup &> /dev/null setup diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index 7fdb6a9ca543..a652429bfd53 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -6,7 +6,7 @@ to easily create and test complex environments. Unfortunately, these namespaces can not be used with actual switching ASICs, as their ports can not be migrated to other network namespaces -(NETIF_F_NETNS_LOCAL) and most of them probably do not support the +(dev->netns_local) and most of them probably do not support the L1-separation provided by namespaces. However, a similar kind of flexibility can be achieved by using VRFs and diff --git a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh index 0760a34b7114..a21b7085da2e 100755 --- a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh +++ b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh @@ -178,6 +178,22 @@ fdb_del() check_err $? "Failed to remove a FDB entry of type ${type}" } +check_fdb_n_learned_support() +{ + if ! ip link help bridge 2>&1 | grep -q "fdb_max_learned"; then + echo "SKIP: iproute2 too old, missing bridge max learned support" + exit $ksft_skip + fi + + ip link add dev br0 type bridge + local learned=$(fdb_get_n_learned) + ip link del dev br0 + if [ "$learned" == "null" ]; then + echo "SKIP: kernel too old; bridge fdb_n_learned feature not supported." 
+ exit $ksft_skip + fi +} + check_accounting_one_type() { local type=$1 is_counted=$2 overrides_learned=$3 @@ -274,6 +290,8 @@ check_limit() { done } +check_fdb_n_learned_support + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index 64bd00fe9a4f..90f8a244ea90 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -142,6 +142,58 @@ extern_learn() bridge fdb del de:ad:be:ef:13:37 dev $swp1 master vlan 1 &> /dev/null } +other_tpid() +{ + local mac=de:ad:be:ef:13:37 + + # Test that packets with TPID 802.1ad VID 3 + TPID 802.1Q VID 5 are + # classified as untagged by a bridge with vlan_protocol 802.1Q, and + # are processed in the PVID of the ingress port (here 1). Not VID 3, + # and not VID 5. + RET=0 + + tc qdisc add dev $h2 clsact + tc filter add dev $h2 ingress protocol all pref 1 handle 101 \ + flower dst_mac $mac action drop + ip link set $h2 promisc on + ethtool -K $h2 rx-vlan-filter off rx-vlan-stag-filter off + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + # Match on 'self' addresses as well, for those drivers which + # do not push their learned addresses to the bridge software + # database + bridge -j fdb show $swp1 | \ + jq -e ".[] | select(.mac == \"$(mac_get $h1)\") | select(.vlan == 1)" &> /dev/null + check_err $? "FDB entry was not learned when it should" + + log_test "FDB entry in PVID for VLAN-tagged with other TPID" + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was not forwarded when it should" + log_test "Reception of VLAN with other TPID as untagged" + + bridge vlan del dev $swp1 vid 1 + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was forwarded when it should not" + log_test "Reception of VLAN with other TPID as untagged (no PVID)" + + bridge vlan add dev $swp1 vid 1 pvid untagged + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh index 1783c10215e5..7d531f7091e6 100755 --- a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh @@ -224,10 +224,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label.
- for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh index 9788bd0f6e8b..dda11a4a9450 100755 --- a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh @@ -319,10 +319,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh index 2ab9eaaa5532..e28b4a079e52 100755 --- a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh @@ -321,10 +321,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ff96bb7535ff..c992e385159c 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -500,6 +500,11 @@ check_err_fail() fi } +xfail() +{ + FAIL_TO_XFAIL=yes "$@" +} + xfail_on_slow() { if [[ $KSFT_MACHINE_SLOW = yes ]]; then @@ -509,6 +514,13 @@ xfail_on_slow() fi } +omit_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW != yes ]]; then + "$@" + fi +} + xfail_on_veth() { local dev=$1; shift @@ -1113,6 +1125,39 @@ mac_get() ip -j link show dev $if_name | jq -r '.[]["address"]' } +ether_addr_to_u64() +{ + local addr="$1" + local order="$((1 << 40))" + local val=0 + local byte + + addr="${addr//:/ }" + + for byte in $addr; do + byte="0x$byte" + val=$((val + order * byte)) + order=$((order >> 8)) + done + + printf "0x%x" $val +} + +u64_to_ether_addr() +{ + local val=$1 + local byte + local i + + for ((i = 40; i >= 0; i -= 8)); do + byte=$(((val & (0xff << i)) >> i)) + printf "%02x" $byte + if [ $i -ne 0 ]; then + printf ":" + fi + done +} + ipv6_lladdr_get() { local if_name=$1 @@ -2229,3 +2274,22 @@ absval() echo $((v > 0 ? 
v : -v)) } + +has_unicast_flt() +{ + local dev=$1; shift + local mac_addr=$(mac_get $dev) + local tmp=$(ether_addr_to_u64 $mac_addr) + local promisc + + ip link set $dev up + ip link add link $dev name macvlan-tmp type macvlan mode private + ip link set macvlan-tmp address $(u64_to_ether_addr $((tmp + 1))) + ip link set macvlan-tmp up + + promisc=$(ip -j -d link show dev $dev | jq -r '.[].promiscuity') + + ip link del macvlan-tmp + + [[ $promisc == 1 ]] && echo "no" || echo "yes" +} diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 4b364cdf3ef0..c35548767756 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -1,7 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="standalone bridge" +ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ + vlan_over_vlan_unaware_bridged_port vlan_over_vlan_aware_bridged_port \ + vlan_over_vlan_unaware_bridge vlan_over_vlan_aware_bridge" NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes @@ -37,9 +39,68 @@ UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05" UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06" UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07" -NON_IP_MC="01:02:03:04:05:06" -NON_IP_PKT="00:04 48:45:4c:4f" -BC="ff:ff:ff:ff:ff:ff" +PTP_1588_L2_SYNC=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00" +PTP_1588_L2_FOLLOW_UP=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c5 f1 17 97 ed f0" +PTP_1588_L2_PDELAY_REQ=" \ +01:80:c2:00:00:0e 00:00:de:ad:be:ef 88:f7 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 06 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00" +PTP_1588_IPV4_SYNC=" \ +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9a 40 00 01 11 cb 88 c0 00 02 01 e0 00 \ +01 81 01 3f 01 3f 00 34 a3 c8 00 02 00 2c 00 00 \ +02 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" +PTP_1588_IPV4_FOLLOW_UP=" +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9b 40 00 01 11 cb 87 c0 00 02 01 e0 00 \ +01 81 01 40 01 40 00 34 a3 c8 08 02 00 2c 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 02 00 00 00 66 83 \ +c6 0f 1d 9a 61 87" +PTP_1588_IPV4_PDELAY_REQ=" \ +01:00:5e:00:00:6b 00:00:de:ad:be:ef 08:00 45 00 \ +00 52 35 a9 40 00 01 11 a1 85 c0 00 02 01 e0 00 \ +00 6b 01 3f 01 3f 00 3e a2 bc 02 02 00 36 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 01 05 7f 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_SYNC=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 06 \ +7c 2f 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 3f 01 3f 00 36 2e 92 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_FOLLOW_UP=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 0a \ +00 bc 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 40 
01 40 00 36 2e 92 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c6 2a 32 09 bd 74 00 00" +PTP_1588_IPV6_PDELAY_REQ=" \ +33:33:00:00:00:6b 00:00:de:ad:be:ef 86:dd 60 0c \ +5c fd 00 40 11 01 fe 80 00 00 00 00 00 00 3c 37 \ +63 ff fe cf 17 0e ff 02 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 6b 01 3f 01 3f 00 40 b4 54 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 01 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" # Disable promisc to ensure we don't receive unknown MAC DA packets export TCPDUMP_EXTRA_FLAGS="-pl" @@ -47,13 +108,15 @@ export TCPDUMP_EXTRA_FLAGS="-pl" h1=${NETIFS[p1]} h2=${NETIFS[p2]} -send_non_ip() +send_raw() { - local if_name=$1 - local smac=$2 - local dmac=$3 + local if_name=$1; shift + local pkt="$1"; shift + local smac=$(mac_get $if_name) + + pkt="${pkt/00:00:de:ad:be:ef/$smac}" - $MZ -q $if_name "$dmac $smac $NON_IP_PKT" + $MZ -q $if_name "$pkt" } send_uc_ipv4() @@ -68,10 +131,11 @@ send_uc_ipv4() check_rcv() { - local if_name=$1 - local type=$2 - local pattern=$3 - local should_receive=$4 + local if_name=$1; shift + local type=$1; shift + local pattern=$1; shift + local should_receive=$1; shift + local test_name="$1"; shift local should_fail= [ $should_receive = true ] && should_fail=0 || should_fail=1 @@ -81,7 +145,7 @@ check_rcv() check_err_fail "$should_fail" "$?" "reception" - log_test "$if_name: $type" + log_test "$test_name: $type" } mc_route_prepare() @@ -104,44 +168,78 @@ mc_route_destroy() run_test() { - local rcv_if_name=$1 - local smac=$(mac_get $h1) + local send_if_name=$1; shift + local rcv_if_name=$1; shift + local skip_ptp=$1; shift + local no_unicast_flt=$1; shift + local test_name="$1"; shift + local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) + local should_receive tcpdump_start $rcv_if_name - mc_route_prepare $h1 + mc_route_prepare $send_if_name mc_route_prepare $rcv_if_name - send_uc_ipv4 $h1 $rcv_dmac - send_uc_ipv4 $h1 $MACVLAN_ADDR - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1 + send_uc_ipv4 $send_if_name $rcv_dmac + send_uc_ipv4 $send_if_name $MACVLAN_ADDR + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR1 ip link set dev $rcv_if_name promisc on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR2 ip link set dev $rcv_if_name promisc off mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR - mc_send $h1 $JOINED_IPV4_MC_ADDR + mc_send $send_if_name $JOINED_IPV4_MC_ADDR mc_leave mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR - mc_send $h1 $JOINED_IPV6_MC_ADDR + mc_send $send_if_name $JOINED_IPV6_MC_ADDR mc_leave - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR1 ip link set dev $rcv_if_name allmulticast on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR3 ip link set dev $rcv_if_name allmulticast off mc_route_destroy $rcv_if_name - mc_route_destroy $h1 + mc_route_destroy $send_if_name + + if [ $skip_ptp = false ]; then + ip maddress add 01:1b:19:00:00:00 dev $rcv_if_name + send_raw 
$send_if_name "$PTP_1588_L2_SYNC" + send_raw $send_if_name "$PTP_1588_L2_FOLLOW_UP" + ip maddress del 01:1b:19:00:00:00 dev $rcv_if_name + + ip maddress add 01:80:c2:00:00:0e dev $rcv_if_name + send_raw $send_if_name "$PTP_1588_L2_PDELAY_REQ" + ip maddress del 01:80:c2:00:00:0e dev $rcv_if_name + + mc_join $rcv_if_name 224.0.1.129 + send_raw $send_if_name "$PTP_1588_IPV4_SYNC" + send_raw $send_if_name "$PTP_1588_IPV4_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name 224.0.0.107 + send_raw $send_if_name "$PTP_1588_IPV4_PDELAY_REQ" + mc_leave + + mc_join $rcv_if_name ff0e::181 + send_raw $send_if_name "$PTP_1588_IPV6_SYNC" + send_raw $send_if_name "$PTP_1588_IPV6_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name ff02::6b + send_raw $send_if_name "$PTP_1588_IPV6_PDELAY_REQ" + mc_leave + fi sleep 1 @@ -149,61 +247,99 @@ run_test() check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \ "$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \ "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name \ - "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name \ "Multicast IPv4 to unknown group" \ "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to joined group" \ "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" + + if [ $skip_ptp = false ]; then + check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 
transport, Follow-Up" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + fi tcpdump_cleanup $rcv_if_name } @@ -228,62 +364,217 @@ h2_destroy() simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 } +h1_vlan_create() +{ + simple_if_init $h1 + vlan_create $h1 100 v$h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_vlan_destroy() +{ + vlan_destroy $h1 100 + simple_if_fini $h1 +} + +h2_vlan_create() +{ + simple_if_init $h2 + vlan_create $h2 100 v$h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_vlan_destroy() +{ + vlan_destroy $h2 100 + simple_if_fini $h2 +} + bridge_create() { - ip link add br0 type bridge + local vlan_filtering=$1 + + ip link add br0 type bridge vlan_filtering $vlan_filtering ip link set br0 address $BRIDGE_ADDR ip link set br0 up ip link set $h2 master br0 ip link set $h2 up - - simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 } bridge_destroy() { - simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 - ip link del br0 } -standalone() +macvlan_create() { - h1_create - h2_create + local lower=$1 - ip link add link $h2 name macvlan0 type macvlan mode private + ip link add link $lower name macvlan0 type macvlan mode private ip link set macvlan0 address $MACVLAN_ADDR ip link set macvlan0 up +} - run_test $h2 - +macvlan_destroy() +{ ip link del macvlan0 +} + +standalone() +{ + local no_unicast_flt=true + local skip_ptp=false + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + + h1_create + h2_create + macvlan_create $h2 + + run_test $h1 $h2 $skip_ptp $no_unicast_flt "$h2" + + macvlan_destroy h2_destroy h1_destroy } -bridge() +test_bridge() { + local no_unicast_flt=true + local vlan_filtering=$1 + local skip_ptp=true + h1_create - bridge_create + bridge_create $vlan_filtering + simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 + macvlan_create br0 - ip link add link br0 name macvlan0 type macvlan mode private - ip link set macvlan0 address $MACVLAN_ADDR - ip link set macvlan0 up + run_test $h1 br0 $skip_ptp $no_unicast_flt \ + "vlan_filtering=$vlan_filtering bridge" - run_test br0 + macvlan_destroy + simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 + bridge_destroy + h1_destroy +} - ip link del macvlan0 +vlan_unaware_bridge() +{ + test_bridge 0 +} + +vlan_aware_bridge() +{ + test_bridge 1 +} + +test_vlan() +{ + local no_unicast_flt=true + local skip_ptp=false + + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + + h1_vlan_create + h2_vlan_create + macvlan_create $h2.100 + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt "VLAN upper" + + 
macvlan_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_bridged_port() +{ + local no_unicast_flt=true + local vlan_filtering=$1 + local skip_ptp=false + + # br_manage_promisc() will not force a single vlan_filtering port to + # promiscuous mode, so we should still expect unicast filtering to take + # place if the device can do it. + if [ $(has_unicast_flt $h2) = yes ] && [ $vlan_filtering = 1 ]; then + no_unicast_flt=false + fi + + h1_vlan_create + h2_vlan_create + bridge_create $vlan_filtering + macvlan_create $h2.100 + + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridged port" + + macvlan_destroy bridge_destroy - h1_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridged_port() +{ + vlan_over_bridged_port 0 +} + +vlan_over_vlan_aware_bridged_port() +{ + vlan_over_bridged_port 1 +} + +vlan_over_bridge() +{ + local no_unicast_flt=true + local vlan_filtering=$1 + local skip_ptp=true + + h1_vlan_create + bridge_create $vlan_filtering + simple_if_init br0 + vlan_create br0 100 vbr0 $H2_IPV4/24 $H2_IPV6/64 + macvlan_create br0.100 + + if [ $vlan_filtering = 1 ]; then + bridge vlan add dev $h2 vid 100 master + bridge vlan add dev br0 vid 100 self + fi + + run_test $h1.100 br0.100 $skip_ptp $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridge" + + if [ $vlan_filtering = 1 ]; then + bridge vlan del dev br0 vid 100 self + bridge vlan del dev $h2 vid 100 master + fi + + macvlan_destroy + vlan_destroy br0 100 + simple_if_fini br0 + bridge_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridge() +{ + vlan_over_bridge 0 +} + +vlan_over_vlan_aware_bridge() +{ + vlan_over_bridge 1 } cleanup() { pre_cleanup + + ip link set $h2 down + ip link set $h1 down + vrf_cleanup } diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh index af3b398d13f0..9e677aa64a06 100755 --- a/tools/testing/selftests/net/forwarding/no_forwarding.sh +++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh @@ -233,6 +233,9 @@ cleanup() { pre_cleanup + ip link set dev $swp2 down + ip link set dev $swp1 down + h2_destroy h1_destroy diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 2ba44247c60a..a7d8399c8d4f 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test ping_ipv4_blackhole ping_ipv6_blackhole nh_stats_test_v4 @@ -226,9 +227,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -242,7 +245,8 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -258,9 +262,11 @@ multipath4_test() multipath6_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local 
t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -275,7 +281,8 @@ multipath6_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -313,6 +320,23 @@ multipath_test() multipath6_test "Weighted MP 11:45" 11 45 } +multipath16_test() +{ + check_nhgw16 104 || return + + log_info "Running 16-bit IPv4 multipath tests" + multipath4_test "65535:65535" 65535 65535 + multipath4_test "128:512" 128 512 + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + log_info "Running 16-bit IPv6 multipath tests" + multipath6_test "65535:65535" 65535 65535 + multipath6_test "128:512" 128 512 + omit_on_slow \ + multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 +} + ping_ipv4_blackhole() { RET=0 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 2903294d8bca..507b2852dabe 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -117,3 +117,16 @@ __nh_stats_test_v6() $MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2 sysctl_restore net.ipv6.fib_multipath_hash_policy } + +check_nhgw16() +{ + local nhid=$1; shift + + ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null + if (( $? )); then + log_test_skip "16-bit multipath tests" \ + "iproute2 or the kernel do not support 16-bit next hop weights" + return 1 + fi + ip nexthop del id 9999 ||: +} diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index cd9e346436fc..88ddae05b39d 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test nh_stats_test_v4 nh_stats_test_v6 " @@ -228,9 +229,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -243,7 +246,8 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -258,9 +262,11 @@ multipath4_test() multipath6_l4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -273,7 +279,8 @@ multipath6_l4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -371,6 +378,41 @@ multipath_test() ip nexthop replace id 106 group 104,1/105,1 type resilient } +multipath16_test() +{ + check_nhgw16 104 
|| return + + log_info "Running 16-bit IPv4 multipath tests" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 0 + + ip nexthop replace id 103 group 101,65535/102,65535 type resilient + multipath4_test "65535:65535" 65535 65535 + + ip nexthop replace id 103 group 101,128/102,512 type resilient + multipath4_test "128:512" 128 512 + + ip nexthop replace id 103 group 101,255/102,65535 type resilient + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running 16-bit IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104/105 type resilient idle_timer 0 + + ip nexthop replace id 106 group 104,65535/105,65535 type resilient + multipath6_l4_test "65535:65535" 65535 65535 + + ip nexthop replace id 106 group 104,128/105,512 type resilient + multipath6_l4_test "128:512" 128 512 + + ip nexthop replace id 106 group 104,255/105,65535 type resilient + omit_on_slow \ + multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 106 group 104,1/105,1 type resilient +} + nh_stats_test_v4() { __nh_stats_test_v4 resilient diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh index e2be354167a1..46f365b557b7 100755 --- a/tools/testing/selftests/net/forwarding/router_multipath.sh +++ b/tools/testing/selftests/net/forwarding/router_multipath.sh @@ -180,6 +180,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -217,6 +218,7 @@ multipath6_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 589629636502..ea89e558672d 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -4,7 +4,8 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ gact_trap_test mirred_egress_to_ingress_test \ - mirred_egress_to_ingress_tcp_test" + mirred_egress_to_ingress_tcp_test \ + ingress_2nd_vlan_push egress_2nd_vlan_push" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -244,6 +245,49 @@ mirred_egress_to_ingress_tcp_test() log_test "mirred_egress_to_ingress_tcp ($tcflags)" } +ingress_2nd_vlan_push() +{ + tc filter add dev $swp1 ingress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 1 \ + action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $swp1 ingress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -Q 10 -q + + tc_check_packets "dev $swp1 ingress" 30 1 + check_err $? 
"No double-vlan packets received" + + tc filter del dev $swp1 ingress pref 20 chain 0 handle 20 flower + tc filter del dev $swp1 ingress pref 30 chain 5 handle 30 flower + + log_test "ingress_2nd_vlan_push ($tcflags)" +} + +egress_2nd_vlan_push() +{ + tc filter add dev $h1 egress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 0 \ + action vlan push id 10 protocol 0x8100 \ + pipe action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $h1 egress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $h1 egress" 30 1 + check_err $? "No double-vlan packets received" + + tc filter del dev $h1 egress pref 20 chain 0 handle 20 flower + tc filter del dev $h1 egress pref 30 chain 5 handle 30 flower + + log_test "egress_2nd_vlan_push ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index d0219032f773..be8707bfb46e 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -125,6 +125,21 @@ slowwait_for_counter() slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@" } +# Check for existence of tools which are built as part of selftests +# but may also already exist in $PATH +check_gen_prog() +{ + local prog_name=$1; shift + + if ! which $prog_name >/dev/null 2>/dev/null; then + PATH=$PWD:$PATH + if ! which $prog_name >/dev/null; then + echo "'$prog_name' command not found; skipping tests" + exit $ksft_skip + fi + fi +} + remove_ns_list() { local item=$1 @@ -146,6 +161,7 @@ cleanup_ns() for ns in "$@"; do [ -z "${ns}" ] && continue + ip netns pids "${ns}" 2> /dev/null | xargs -r kill || true ip netns delete "${ns}" &> /dev/null || true if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then echo "Warn: Failed to remove namespace $ns" diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c index b9f3fc3c3426..e0a34e5e8dd5 100644 --- a/tools/testing/selftests/net/lib/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -654,10 +654,16 @@ static int recv_verify_packet_ipv4(void *nh, int len) { struct iphdr *iph = nh; uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + uint16_t ip_len; if (len < sizeof(*iph) || iph->protocol != proto) return -1; + ip_len = ntohs(iph->tot_len); + if (ip_len > len || ip_len < sizeof(*iph)) + return -1; + + len = ip_len; iph_addr_p = &iph->saddr; if (proto == IPPROTO_TCP) return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); @@ -669,16 +675,22 @@ static int recv_verify_packet_ipv6(void *nh, int len) { struct ipv6hdr *ip6h = nh; uint16_t proto = cfg_encap ? 
IPPROTO_UDP : cfg_proto; + uint16_t ip_len; if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) return -1; + ip_len = ntohs(ip6h->payload_len); + if (ip_len > len - sizeof(*ip6h)) + return -1; + + len = ip_len; iph_addr_p = &ip6h->saddr; if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); + return recv_verify_packet_tcp(ip6h + 1, len); else - return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); + return recv_verify_packet_udp(ip6h + 1, len); } /* return whether auxdata includes TP_STATUS_CSUM_VALID */ diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f26c20df9db4..477ae76de93d 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 import builtins +import functools import inspect import sys import time @@ -10,6 +11,7 @@ from .utils import global_defer_queue KSFT_RESULT = None KSFT_RESULT_ALL = True +KSFT_DISRUPTIVE = True class KsftFailEx(Exception): @@ -32,8 +34,18 @@ def _fail(*args): global KSFT_RESULT KSFT_RESULT = False - frame = inspect.stack()[2] - ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + stack = inspect.stack() + started = False + for frame in reversed(stack[2:]): + # Start printing from the test case function + if not started: + if frame.function == 'ksft_run': + started = True + continue + + ksft_pr("Check| At " + frame.filename + ", line " + str(frame.lineno) + + ", in " + frame.function + ":") + ksft_pr("Check| " + frame.code_context[0].strip()) ksft_pr(*args) @@ -43,6 +55,12 @@ def ksft_eq(a, b, comment=""): _fail("Check failed", a, "!=", b, comment) +def ksft_ne(a, b, comment=""): + global KSFT_RESULT + if a == b: + _fail("Check failed", a, "==", b, comment) + + def ksft_true(a, comment=""): if not a: _fail("Check failed", a, "does not eval to True", comment) @@ -127,6 +145,44 @@ def ksft_flush_defer(): KSFT_RESULT = False +def ksft_disruptive(func): + """ + Decorator that marks the test as disruptive (e.g. the test + that can down the interface). Disruptive tests can be skipped + by passing DISRUPTIVE=False environment variable. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not KSFT_DISRUPTIVE: + raise KsftSkipEx(f"marked as disruptive") + return func(*args, **kwargs) + return wrapper + + +def ksft_setup(env): + """ + Setup test framework global state from the environment. 
+ """ + + def get_bool(env, name): + value = env.get(name, "").lower() + if value in ["yes", "true"]: + return True + if value in ["no", "false"]: + return False + try: + return bool(int(value)) + except: + raise Exception(f"failed to parse {name}") + + if "DISRUPTIVE" in env: + global KSFT_DISRUPTIVE + KSFT_DISRUPTIVE = get_bool(env, "DISRUPTIVE") + + return env + + def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases = cases or [] diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 7b936a926859..5d796622e730 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -11,6 +11,8 @@ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq TEST_FILES := mptcp_lib.sh settings +TEST_INCLUDES := ../lib.sh ../net_helper.sh + EXTRA_CLEAN := *.pcap include ../../lib.mk diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 776d43a6922d..2bd0c1eb70c5 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -284,7 +284,7 @@ echo "b" | \ ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} -w 20 \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 -chk_msk_nr 2 "after MPC handshake " +chk_msk_nr 2 "after MPC handshake" chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index d2043ec3bf6d..4209b9569039 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1115,11 +1115,11 @@ again: return 1; } - if (--cfg_repeat > 0) { - if (cfg_input) - close(fd); + if (cfg_input) + close(fd); + + if (--cfg_repeat > 0) goto again; - } return 0; } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index b77fb7065bfb..57325d57e4c6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -345,9 +345,11 @@ do_transfer() local addr_port addr_port=$(printf "%s:%d" ${connect_addr} ${port}) - local result_msg - result_msg="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" - mptcp_lib_print_title "${result_msg}" + local pretty_title + pretty_title="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" + mptcp_lib_print_title "${pretty_title}" + + local tap_title="${connector_ns:0:3} ${cl_proto} -> ${listener_ns:0:3} (${addr_port}) ${srv_proto}" if $capture; then local capuser @@ -431,7 +433,6 @@ do_transfer() local duration duration=$((stop-start)) - result_msg+=" # time=${duration}ms" printf "(duration %05sms) " "${duration}" if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" @@ -444,7 +445,7 @@ do_transfer() echo cat "$capout" - mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" return 1 fi @@ -544,12 +545,12 @@ do_transfer() if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then mptcp_lib_pr_ok "${extra:1}" - mptcp_lib_result_pass "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_pass "${TEST_GROUP}: ${tap_title}" else if [ -n "${extra}" ]; then mptcp_lib_print_warn "${extra:1}" fi - mptcp_lib_result_fail 
"${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" fi cat "$capout" @@ -848,6 +849,8 @@ stop_if_error() make_file "$cin" "client" make_file "$sin" "server" +mptcp_lib_subtests_last_ts_reset + check_mptcp_disabled stop_if_error "The kernel configuration is not valid for MPTCP" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 108aeeb84ef1..e8d0a01b4144 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -61,6 +61,16 @@ unset sflags unset fastclose unset fullmesh unset speed +unset join_csum_ns1 +unset join_csum_ns2 +unset join_fail_nr +unset join_rst_nr +unset join_infi_nr +unset join_corrupted_pkts +unset join_syn_tx +unset join_create_err +unset join_bind_err +unset join_connect_err # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" @@ -196,6 +206,22 @@ print_skip() mptcp_lib_pr_skip "${@}" } +# $1: check name; $2: rc +print_results() +{ + local check="${1}" + local rc=${2} + + print_check "${check}" + if [ ${rc} = ${KSFT_PASS} ]; then + print_ok + elif [ ${rc} = ${KSFT_SKIP} ]; then + print_skip + else + fail_test "see above" + fi +} + # [ $1: fail msg ] mark_as_skipped() { @@ -337,7 +363,7 @@ reset_with_checksum() local ns1_enable=$1 local ns2_enable=$2 - reset "checksum test ${1} ${2}" || return 1 + reset "checksum test ${ns1_enable} ${ns2_enable}" || return 1 ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable @@ -420,12 +446,17 @@ reset_with_fail() fi } +start_events() +{ + mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid + mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid +} + reset_with_events() { reset "${1}" || return 1 - mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid - mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid + start_events } reset_with_tcp_filter() @@ -436,9 +467,10 @@ reset_with_tcp_filter() local ns="${!1}" local src="${2}" local target="${3}" + local chain="${4:-INPUT}" if ! 
ip netns exec "${ns}" ${iptables} \ - -A INPUT \ + -A "${chain}" \ -s "${src}" \ -p tcp \ -j "${target}"; then @@ -661,7 +693,7 @@ pm_nl_check_endpoint() done if [ -z "${id}" ]; then - test_fail "bad test - missing endpoint id" + fail_test "bad test - missing endpoint id" return fi @@ -833,7 +865,7 @@ chk_cestab_nr() local cestab=$2 local count - print_check "cestab $cestab" + print_check "currently established: $cestab" count=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPCurrEstab") if [ -z "$count" ]; then print_skip @@ -1109,28 +1141,29 @@ chk_csum_nr() csum_ns2=${csum_ns2:1} fi - print_check "sum" + print_check "checksum server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns1" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then extra_msg+=" ns1=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } || - { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns1" else print_ok fi - print_check "csum" + + print_check "checksum client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns2" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then extra_msg+=" ns2=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } || - { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns2" else print_ok @@ -1147,6 +1180,8 @@ chk_fail_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 + local tx="server" + local rx="client" local extra_msg="" local allow_tx_lost=0 local allow_rx_lost=0 @@ -1154,7 +1189,8 @@ chk_fail_nr() if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi if [[ "${fail_tx}" = "-"* ]]; then @@ -1166,29 +1202,29 @@ chk_fail_nr() fail_rx=${fail_rx:1} fi - print_check "ftx" + print_check "fail tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") - if [ "$count" != "$fail_tx" ]; then - extra_msg+=",tx=$count" + if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then + extra_msg+=" tx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] TX expected $fail_tx" else print_ok fi - print_check "failrx" + print_check "fail rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") - if [ "$count" != "$fail_rx" ]; then - extra_msg+=",rx=$count" + if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then + extra_msg+=" rx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] RX expected $fail_rx" else print_ok @@ -1205,37 +1241,35 @@ chk_fclose_nr() local count local ns_tx=$ns2 local ns_rx=$ns1 - local extra_msg="" + local tx="client" + local rx="server" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns1 ns_rx=$ns2 - 
extra_msg="invert" + tx="server" + rx="client" fi - print_check "ctx" + print_check "fast close tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_tx" ]; then - extra_msg+=",tx=$count" fail_test "got $count MP_FASTCLOSE[s] TX expected $fclose_tx" else print_ok fi - print_check "fclzrx" + print_check "fast close rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_rx" ]; then - extra_msg+=",rx=$count" fail_test "got $count MP_FASTCLOSE[s] RX expected $fclose_rx" else print_ok fi - - print_info "$extra_msg" } chk_rst_nr() @@ -1246,15 +1280,17 @@ chk_rst_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 - local extra_msg="" + local tx="server" + local rx="client" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi - print_check "rtx" + print_check "reset tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx") if [ -z "$count" ]; then print_skip @@ -1266,7 +1302,7 @@ chk_rst_nr() print_ok fi - print_check "rstrx" + print_check "reset rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx") if [ -z "$count" ]; then print_skip @@ -1277,8 +1313,6 @@ chk_rst_nr() else print_ok fi - - print_info "$extra_msg" } chk_infi_nr() @@ -1287,7 +1321,7 @@ chk_infi_nr() local infi_rx=$2 local count - print_check "itx" + print_check "infi tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then print_skip @@ -1297,7 +1331,7 @@ chk_infi_nr() print_ok fi - print_check "infirx" + print_check "infi rx server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx") if [ -z "$count" ]; then print_skip @@ -1308,17 +1342,66 @@ chk_infi_nr() fi } +chk_join_tx_nr() +{ + local syn_tx=${join_syn_tx:-0} + local create=${join_create_err:-0} + local bind=${join_bind_err:-0} + local connect=${join_connect_err:-0} + local rc=${KSFT_PASS} + local count + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTx") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$syn_tx" ]; then + rc=${KSFT_FAIL} + print_check "syn tx" + fail_test "got $count JOIN[s] syn tx expected $syn_tx" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$create" ]; then + rc=${KSFT_FAIL} + print_check "syn tx create socket error" + fail_test "got $count JOIN[s] syn tx create socket error expected $create" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$bind" ]; then + rc=${KSFT_FAIL} + print_check "syn tx bind error" + fail_test "got $count JOIN[s] syn tx bind error expected $bind" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$connect" ]; then + rc=${KSFT_FAIL} + print_check "syn tx connect error" + fail_test "got $count JOIN[s] syn tx connect error expected $connect" + fi + + print_results "join Tx" ${rc} +} + chk_join_nr() { local syn_nr=$1 local syn_ack_nr=$2 local ack_nr=$3 - local csum_ns1=${4:-0} - local csum_ns2=${5:-0} - local fail_nr=${6:-0} - local rst_nr=${7:-0} - local infi_nr=${8:-0} - local corrupted_pkts=${9:-0} + local csum_ns1=${join_csum_ns1:-0} + local csum_ns2=${join_csum_ns2:-0} + local fail_nr=${join_fail_nr:-0} + local 
rst_nr=${join_rst_nr:-0} + local infi_nr=${join_infi_nr:-0} + local corrupted_pkts=${join_corrupted_pkts:-0} + local rc=${KSFT_PASS} local count local with_cookie @@ -1326,43 +1409,44 @@ chk_join_nr() print_info "${corrupted_pkts} corrupted pkts" fi - print_check "syn" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_nr" ]; then - fail_test "got $count JOIN[s] syn expected $syn_nr" - else - print_ok + rc=${KSFT_FAIL} + print_check "syn rx" + fail_test "got $count JOIN[s] syn rx expected $syn_nr" fi - print_check "synack" with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies) count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_ack_nr" ]; then # simult connections exceeding the limit with cookie enabled could go up to # synack validation as the conn limit can be enforced reliably only after # the subflow creation - if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then - print_ok - else - fail_test "got $count JOIN[s] synack expected $syn_ack_nr" + if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then + rc=${KSFT_FAIL} + print_check "synack rx" + fail_test "got $count JOIN[s] synack rx expected $syn_ack_nr" fi - else - print_ok fi - print_check "ack" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$ack_nr" ]; then - fail_test "got $count JOIN[s] ack expected $ack_nr" - else - print_ok + rc=${KSFT_FAIL} + print_check "ack rx" + fail_test "got $count JOIN[s] ack rx expected $ack_nr" fi + + print_results "join Rx" ${rc} + + join_syn_tx="${join_syn_tx:-${syn_nr}}" \ + chk_join_tx_nr + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr @@ -1415,18 +1499,30 @@ chk_add_nr() local add_nr=$1 local echo_nr=$2 local port_nr=${3:-0} - local syn_nr=${4:-$port_nr} - local syn_ack_nr=${5:-$port_nr} - local ack_nr=${6:-$port_nr} - local mis_syn_nr=${7:-0} - local mis_ack_nr=${8:-0} + local ns_invert=${4:-""} + local syn_nr=$port_nr + local syn_ack_nr=$port_nr + local ack_nr=$port_nr + local mis_syn_nr=0 + local mis_ack_nr=0 + local ns_tx=$ns1 + local ns_rx=$ns2 + local tx="" + local rx="" local count local timeout - timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) + if [[ $ns_invert = "invert" ]]; then + ns_tx=$ns2 + ns_rx=$ns1 + tx=" client" + rx=" server" + fi + + timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) - print_check "add" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtAddAddr") + print_check "add addr rx${rx}" + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip # if the test configured a short timeout tolerate greater then expected @@ -1437,8 +1533,8 @@ chk_add_nr() print_ok fi - print_check "echo" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtEchoAdd") + print_check "add addr echo rx${tx}" + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip elif [ "$count" != "$echo_nr" ]; then @@ -1448,8 +1544,8 @@ chk_add_nr() fi if [ $port_nr -gt 0 ]; then - print_check "pt" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtPortAdd") + print_check "add addr rx with port${rx}" + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip elif [ 
"$count" != "$port_nr" ]; then @@ -1458,8 +1554,8 @@ chk_add_nr() print_ok fi - print_check "syn" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortSynRx") + print_check "syn rx port${tx}" + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$syn_nr" ]; then @@ -1469,8 +1565,8 @@ chk_add_nr() print_ok fi - print_check "synack" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinPortSynAckRx") + print_check "synack rx port${rx}" + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$syn_ack_nr" ]; then @@ -1480,8 +1576,8 @@ chk_add_nr() print_ok fi - print_check "ack" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortAckRx") + print_check "ack rx port${tx}" + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$ack_nr" ]; then @@ -1491,8 +1587,8 @@ chk_add_nr() print_ok fi - print_check "syn" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortSynRx") + print_check "syn rx port mismatch${tx}" + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$mis_syn_nr" ]; then @@ -1502,8 +1598,8 @@ chk_add_nr() print_ok fi - print_check "ack" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortAckRx") + print_check "ack rx port mismatch${tx}" + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$mis_ack_nr" ]; then @@ -1524,7 +1620,7 @@ chk_add_tx_nr() timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) - print_check "add TX" + print_check "add addr tx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtAddAddrTx") if [ -z "$count" ]; then print_skip @@ -1536,7 +1632,7 @@ chk_add_tx_nr() print_ok fi - print_check "echo TX" + print_check "add addr echo tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtEchoAddTx") if [ -z "$count" ]; then print_skip @@ -1556,6 +1652,8 @@ chk_rm_nr() local count local addr_ns=$ns1 local subflow_ns=$ns2 + local addr="server" + local subflow="client" local extra_msg="" shift 2 @@ -1565,16 +1663,14 @@ chk_rm_nr() shift done - if [ -z $invert ]; then - addr_ns=$ns1 - subflow_ns=$ns2 - elif [ $invert = "true" ]; then + if [ "$invert" = "true" ]; then addr_ns=$ns2 subflow_ns=$ns1 - extra_msg="invert" + addr="client" + subflow="server" fi - print_check "rm" + print_check "rm addr rx ${addr}" count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr") if [ -z "$count" ]; then print_skip @@ -1584,7 +1680,7 @@ chk_rm_nr() print_ok fi - print_check "rmsf" + print_check "rm subflow ${subflow}" count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow") if [ -z "$count" ]; then print_skip @@ -1598,7 +1694,7 @@ chk_rm_nr() count=$((count + cnt)) if [ "$count" != "$rm_subflow_nr" ]; then suffix="$count in [$rm_subflow_nr:$((rm_subflow_nr*2))]" - extra_msg+=" simult" + extra_msg="simult" fi if [ $count -ge "$rm_subflow_nr" ] && \ [ "$count" -le "$((rm_subflow_nr *2 ))" ]; then @@ -1619,7 +1715,7 @@ chk_rm_tx_nr() { local rm_addr_tx_nr=$1 - print_check "rm TX" + print_check "rm addr tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx") if [ -z "$count" ]; then print_skip @@ -1634,9 +1730,11 @@ chk_prio_nr() { local mp_prio_nr_tx=$1 local mp_prio_nr_rx=$2 + local mpj_syn=$3 + local mpj_syn_ack=$4 local count - print_check "ptx" + print_check "mp_prio tx server" 
count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx") if [ -z "$count" ]; then print_skip @@ -1646,7 +1744,7 @@ chk_prio_nr() print_ok fi - print_check "prx" + print_check "mp_prio rx client" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx") if [ -z "$count" ]; then print_skip @@ -1655,6 +1753,26 @@ chk_prio_nr() else print_ok fi + + print_check "syn backup" + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn" ]; then + fail_test "got $count JOIN[s] syn with Backup expected $mpj_syn" + else + print_ok + fi + + print_check "synack backup" + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn_ack" ]; then + fail_test "got $count JOIN[s] synack with Backup expected $mpj_syn_ack" + else + print_ok + fi } chk_subflow_nr() @@ -1869,9 +1987,11 @@ subflows_error_tests() pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.12.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + join_bind_err=1 \ + chk_join_nr 0 0 0 fi # multiple subflows, with subflow creation error @@ -1883,7 +2003,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, with subflow timeout on MPJ @@ -1895,7 +2016,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, check that the endpoint corresponding to @@ -1916,7 +2038,8 @@ subflows_error_tests() # additional subflow could be created only if the PM select # the later endpoint, skipping the already used one - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi } @@ -1955,6 +2078,21 @@ signal_address_tests() chk_add_nr 1 1 fi + # uncommon: subflow and signal flags on the same endpoint + # or because the user wrongly picked both, but still expects the client + # to create additional subflows + if reset "subflow and signal together"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 0 2 + pm_nl_add_endpoint $ns2 10.0.3.2 flags signal,subflow + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 0 invert # only initiated by ns2 + chk_add_nr 0 0 0 # none initiated by ns1 + chk_rst_nr 0 0 invert # no RST sent by the client + chk_rst_nr 0 0 # no RST sent by the server + fi + # accept and use add_addr with additional subflows if reset "multiple subflows and signal"; then pm_nl_set_limits $ns1 0 3 @@ -1987,7 +2125,8 @@ signal_address_tests() pm_nl_add_endpoint $ns1 10.0.14.1 flags signal pm_nl_set_limits $ns2 3 3 run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 fi @@ -2155,7 +2294,8 @@ add_addr_timeout_tests() pm_nl_set_limits $ns2 2 2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 chk_add_nr 8 0 fi } @@ -2255,7 +2395,8 @@ remove_tests() pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 join_connect_err=1 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -2320,7 +2461,8 @@ remove_tests() pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + 
join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -2612,33 +2754,46 @@ backup_tests() sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 fi # single address, backup if reset "single address, backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup + pm_nl_set_limits $ns2 1 1 + sflags=nobackup speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 + chk_prio_nr 1 0 0 1 + fi + + # single address, switch to backup + if reset "single address, switch to backup" && + continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 1 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal pm_nl_set_limits $ns2 1 1 sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi # single address with port, backup if reset "single address with port, backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup port 10100 pm_nl_set_limits $ns2 1 1 - sflags=backup speed=slow \ + sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 0 0 1 fi if reset "mpc backup" && @@ -2647,17 +2802,26 @@ backup_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc backup both sides" && continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then - pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + pm_nl_add_endpoint $ns1 10.0.1.1 flags signal,backup pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup + + # 10.0.2.2 (non-backup) -> 10.0.1.1 (backup) + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow + # 10.0.1.2 (backup) -> 10.0.2.1 (non-backup) + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + ip -net "$ns2" route add 10.0.2.1 via 10.0.1.1 dev ns2eth1 # force this path + speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_join_nr 2 2 2 + chk_prio_nr 1 1 1 1 fi if reset "mpc switch to backup" && @@ -2666,7 +2830,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc switch to backup both sides" && @@ -2676,7 +2840,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi } @@ -2868,37 +3032,16 @@ syncookies_tests() checksum_tests() { - # checksum test 0 0 - if reset_with_checksum 0 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 1 - if reset_with_checksum 1 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 0 1 - if reset_with_checksum 0 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 0 - if reset_with_checksum 1 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi + local checksum_enable + for 
checksum_enable in "0 0" "1 1" "0 1" "1 0"; do + # checksum test 0 0, 1 1, 0 1, 1 0 + if reset_with_checksum ${checksum_enable}; then + pm_nl_set_limits $ns1 0 1 + pm_nl_set_limits $ns2 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 0 0 0 + fi + done } deny_join_id0_tests() @@ -2987,6 +3130,9 @@ fullmesh_tests() pm_nl_set_limits $ns1 1 3 pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh + fi fullmesh=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -3053,7 +3199,7 @@ fullmesh_tests() addr_nr_ns2=1 sflags=backup,fullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi @@ -3066,7 +3212,7 @@ fullmesh_tests() sflags=nobackup,nofullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi } @@ -3086,7 +3232,8 @@ fastclose_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 0 0 0 1 + join_rst_nr=1 \ + chk_join_nr 0 0 0 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi @@ -3105,7 +3252,10 @@ fail_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" + join_csum_ns1=+1 join_csum_ns2=+0 \ + join_fail_nr=1 join_rst_nr=0 join_infi_nr=1 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 0 0 0 chk_fail_nr 1 -1 invert fi @@ -3118,7 +3268,10 @@ fail_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow test_linkfail=1024 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 1 0 1 1 0 "$(pedit_action_pkts)" + join_csum_ns1=1 join_csum_ns2=0 \ + join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 1 1 1 fi } @@ -3260,6 +3413,36 @@ userspace_pm_chk_get_addr() fi } +# $1: ns ; $2: event type ; $3: count +chk_evt_nr() +{ + local ns=${1} + local evt_name="${2}" + local exp="${3}" + + local evts="${evts_ns1}" + local evt="${!evt_name}" + local count + + evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_ + [ "${ns}" == "ns2" ] && evts="${evts_ns2}" + + print_check "event ${ns} ${evt_name} (${exp})" + + if [[ "${evt_name}" = "LISTENER_"* ]] && + ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then + print_skip "event not supported" + return + fi + + count=$(grep -cw "type:${evt}" "${evts}") + if [ "${count}" != "${exp}" ]; then + fail_test "got ${count} events, expected ${exp}" + else + print_ok + fi +} + userspace_tests() { # userspace pm type prevents add_addr @@ -3318,7 +3501,7 @@ userspace_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 0 - chk_prio_nr 0 0 + chk_prio_nr 0 0 0 0 fi # userspace pm type prevents rm_addr @@ -3340,8 +3523,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns1 userspace_pm_add_addr $ns1 10.0.2.1 10 @@ -3356,14 +3539,12 @@ userspace_tests() "signal" userspace_pm_chk_get_addr "${ns1}" "10" "id 10 flags signal 10.0.2.1" userspace_pm_chk_get_addr "${ns1}" "20" "id 20 flags signal 10.0.3.1" - userspace_pm_rm_addr $ns1 10 userspace_pm_rm_sf $ns1 "::ffff:10.0.2.1" $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" \ - "id 20 flags signal 10.0.3.1" "after rm_addr 10" + "id 20 flags signal 10.0.3.1" "after rm_sf 10" userspace_pm_rm_addr $ns1 20 - userspace_pm_rm_sf $ns1 10.0.3.1 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" "" "after rm_addr 20" - chk_rm_nr 2 2 invert + chk_rm_nr 1 1 invert chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids @@ -3375,8 +3556,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 userspace_pm_add_sf $ns2 10.0.3.2 20 @@ -3387,12 +3568,11 @@ userspace_tests() "id 20 flags subflow 10.0.3.2" \ "subflow" userspace_pm_chk_get_addr "${ns2}" "20" "id 20 flags subflow 10.0.3.2" - userspace_pm_rm_addr $ns2 20 userspace_pm_rm_sf $ns2 10.0.3.2 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns2}" \ "" \ - "after rm_addr 20" - chk_rm_nr 1 1 + "after rm_sf 20" + chk_rm_nr 0 1 chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids @@ -3404,8 +3584,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 chk_mptcp_info subflows 0 subflows 0 @@ -3425,8 +3605,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 userspace_pm_add_sf $ns2 10.0.3.2 20 @@ -3449,8 +3629,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 userspace_pm_add_addr $ns1 10.0.2.1 10 @@ -3480,8 +3660,8 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - speed=slow \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3500,31 +3680,185 @@ endpoint_tests() mptcp_lib_kill_wait $tests_pid fi - if reset "delete and re-add" && + if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 1 1 - pm_nl_set_limits $ns2 1 1 + start_events + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 0 3 + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - test_linkfail=4 speed=20 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns2 pm_nl_check_endpoint "creation" \ $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 - chk_subflow_nr "before delete" 2 + chk_subflow_nr "before delete id 2" 2 chk_mptcp_info subflows 1 subflows 1 pm_nl_del_endpoint $ns2 2 10.0.2.2 sleep 0.5 - chk_subflow_nr "after delete" 1 + chk_subflow_nr "after delete id 2" 1 chk_mptcp_info subflows 0 subflows 0 - pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow wait_mpj $ns2 - chk_subflow_nr "after re-add" 2 + chk_subflow_nr "after re-add id 2" 2 chk_mptcp_info subflows 1 subflows 1 + + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_attempt_fail $ns2 + chk_subflow_nr "after new reject" 2 + chk_mptcp_info subflows 1 subflows 1 + + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_del_endpoint $ns2 3 10.0.3.2 + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after no reject" 3 + chk_mptcp_info subflows 2 subflows 2 + + local i + for i in $(seq 3); do + pm_nl_del_endpoint $ns2 1 10.0.1.2 + sleep 0.5 + chk_subflow_nr "after delete id 0 ($i)" 2 + chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf + + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after re-add id 0 ($i)" 3 + chk_mptcp_info subflows 3 subflows 3 + done + + mptcp_lib_kill_wait $tests_pid + + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 4 + + chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab + + join_syn_tx=7 \ + chk_join_nr 6 6 6 + chk_rm_nr 4 4 + fi + + # remove and re-add + if reset_with_events "delete re-add signal" && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 3 3 + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null + local tests_pid=$! 
+ + wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns1 10.0.2.1 id 1 flags signal + chk_subflow_nr "before delete" 2 + chk_mptcp_info subflows 1 subflows 1 + + pm_nl_del_endpoint $ns1 1 10.0.2.1 + pm_nl_del_endpoint $ns1 2 224.0.0.1 + sleep 0.5 + chk_subflow_nr "after delete" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add" 3 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_del_endpoint $ns1 42 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add ID 0" 3 + chk_mptcp_info subflows 3 subflows 3 + + pm_nl_del_endpoint $ns1 99 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after re-delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-re-add ID 0" 3 + chk_mptcp_info subflows 3 subflows 3 + mptcp_lib_kill_wait $tests_pid + + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 3 + + chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3 + + join_connect_err=1 \ + chk_join_nr 5 5 5 + chk_add_nr 6 6 + chk_rm_nr 4 3 invert + fi + + # flush and re-add + if reset_with_tcp_filter "flush re-add" ns2 10.0.3.2 REJECT OUTPUT && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + { test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null + local tests_pid=$! 
+ + wait_attempt_fail $ns2 + chk_subflow_nr "before flush" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_flush_endpoint $ns2 + pm_nl_flush_endpoint $ns1 + wait_rm_addr $ns2 0 + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal + wait_mpj $ns2 mptcp_lib_kill_wait $tests_pid + + join_syn_tx=3 join_connect_err=1 \ + chk_join_nr 2 2 2 + chk_add_nr 2 2 + chk_rm_nr 1 0 invert fi } @@ -3627,9 +3961,11 @@ if [ ${#tests[@]} -eq 0 ]; then tests=("${all_tests_names[@]}") fi +mptcp_lib_subtests_last_ts_reset for subtests in "${tests[@]}"; do "${subtests}" done +append_prev_results if [ ${ret} -ne 0 ]; then echo @@ -3640,7 +3976,6 @@ if [ ${ret} -ne 0 ]; then echo fi -append_prev_results mptcp_lib_result_print_all_tap exit $ret diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 438280e68434..975d4d4c862a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -12,10 +12,14 @@ readonly KSFT_SKIP=4 readonly KSFT_TEST="${MPTCP_LIB_KSFT_TEST:-$(basename "${0}" .sh)}" # These variables are used in some selftests, read-only +declare -rx MPTCP_LIB_EVENT_CREATED=1 # MPTCP_EVENT_CREATED +declare -rx MPTCP_LIB_EVENT_ESTABLISHED=2 # MPTCP_EVENT_ESTABLISHED +declare -rx MPTCP_LIB_EVENT_CLOSED=3 # MPTCP_EVENT_CLOSED declare -rx MPTCP_LIB_EVENT_ANNOUNCED=6 # MPTCP_EVENT_ANNOUNCED declare -rx MPTCP_LIB_EVENT_REMOVED=7 # MPTCP_EVENT_REMOVED declare -rx MPTCP_LIB_EVENT_SUB_ESTABLISHED=10 # MPTCP_EVENT_SUB_ESTABLISHED declare -rx MPTCP_LIB_EVENT_SUB_CLOSED=11 # MPTCP_EVENT_SUB_CLOSED +declare -rx MPTCP_LIB_EVENT_SUB_PRIORITY=13 # MPTCP_EVENT_SUB_PRIORITY declare -rx MPTCP_LIB_EVENT_LISTENER_CREATED=15 # MPTCP_EVENT_LISTENER_CREATED declare -rx MPTCP_LIB_EVENT_LISTENER_CLOSED=16 # MPTCP_EVENT_LISTENER_CLOSED @@ -25,6 +29,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_SUBTEST_FLAKY=0 +MPTCP_LIB_SUBTESTS_LAST_TS_MS= MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -201,6 +206,11 @@ mptcp_lib_kversion_ge() { mptcp_lib_fail_if_expected_feature "kernel version ${1} lower than ${v}" } +mptcp_lib_subtests_last_ts_reset() { + MPTCP_LIB_SUBTESTS_LAST_TS_MS="$(date +%s%3N)" +} +mptcp_lib_subtests_last_ts_reset + __mptcp_lib_result_check_duplicated() { local subtest @@ -215,13 +225,22 @@ __mptcp_lib_result_check_duplicated() { __mptcp_lib_result_add() { local result="${1}" + local time="time=" + local ts_prev_ms shift local id=$((${#MPTCP_LIB_SUBTESTS[@]} + 1)) __mptcp_lib_result_check_duplicated "${*}" - MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*}") + # not to add two '#' + [[ "${*}" != *"#"* ]] && time="# ${time}" + + ts_prev_ms="${MPTCP_LIB_SUBTESTS_LAST_TS_MS}" + mptcp_lib_subtests_last_ts_reset + time+="$((MPTCP_LIB_SUBTESTS_LAST_TS_MS - ts_prev_ms))ms" + + MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*} ${time}") } # $1: test name diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 68899a303a1a..5e8d5b83e2d0 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -349,6 +349,7 @@ init make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT +mptcp_lib_subtests_last_ts_reset run_tests 
$ns1 $ns2 10.0.1.1 run_tests $ns1 $ns2 dead:beef:1::1 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2757378b1b13..2e6648a2b2c0 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -137,6 +137,8 @@ check() fi } +mptcp_lib_subtests_last_ts_reset + check "show_endpoints" "" "defaults addr list" default_limits="$(get_limits)" diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7ad5a59adff2..994a556f46c1 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -19,12 +19,6 @@ #include "linux/mptcp.h" -#ifndef MPTCP_PM_NAME -#define MPTCP_PM_NAME "mptcp_pm" -#endif -#ifndef MPTCP_PM_EVENTS -#define MPTCP_PM_EVENTS "mptcp_pm_events" -#endif #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -116,7 +110,7 @@ static int capture_events(int fd, int event_group) if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &event_group, sizeof(event_group)) < 0) - error(1, errno, "could not join the " MPTCP_PM_EVENTS " mcast group"); + error(1, errno, "could not join the " MPTCP_PM_EV_GRP_NAME " mcast group"); do { FD_ZERO(&rfds); @@ -288,7 +282,7 @@ static int genl_parse_getfamily(struct nlmsghdr *nlh, int *pm_family, if (grp->rta_type == CTRL_ATTR_MCAST_GRP_ID) *events_mcast_grp = *(__u32 *)RTA_DATA(grp); else if (grp->rta_type == CTRL_ATTR_MCAST_GRP_NAME && - !strcmp(RTA_DATA(grp), MPTCP_PM_EVENTS)) + !strcmp(RTA_DATA(grp), MPTCP_PM_EV_GRP_NAME)) got_events_grp = 1; grp = RTA_NEXT(grp, grp_len); diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f74e1c3c126d..8fa77c8e9b65 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -286,6 +286,7 @@ while getopts "bcdhi" option;do done setup +mptcp_lib_subtests_last_ts_reset run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 9cb05978269d..3651f73451cf 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -150,6 +150,7 @@ mptcp_lib_events "${ns2}" "${client_evts}" client_evts_pid server_evts=$(mktemp) mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid sleep 0.5 +mptcp_lib_subtests_last_ts_reset print_title "Init" print_test "Created network namespaces ns1, ns2" diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c new file mode 100644 index 000000000000..64d6805381c5 --- /dev/null +++ b/tools/testing/selftests/net/ncdevmem.c @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <linux/uio.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#define __iovec_defined +#include <fcntl.h> +#include <malloc.h> +#include <error.h> + +#include <arpa/inet.h> +#include <sys/socket.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> + +#include <linux/memfd.h> +#include <linux/dma-buf.h> +#include <linux/udmabuf.h> +#include <libmnl/libmnl.h> +#include <linux/types.h> +#include <linux/netlink.h> +#include <linux/genetlink.h> +#include 
<linux/netdev.h> +#include <time.h> +#include <net/if.h> + +#include "netdev-user.h" +#include <ynl.h> + +#define PAGE_SHIFT 12 +#define TEST_PREFIX "ncdevmem" +#define NUM_PAGES 16000 + +#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 +#endif + +/* + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. + * + * Usage: + * + * On server: + * ncdevmem -s <server IP> -c <client IP> -f eth1 -l -p 5201 -v 7 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ + * tr \\n \\0 | \ + * head -c 5G | \ + * nc <server IP> 5201 -p 5201 + * + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. + */ + +static char *server_ip = "192.168.1.4"; +static char *client_ip = "192.168.1.2"; +static char *port = "5201"; +static size_t do_validation; +static int start_queue = 8; +static int num_queues = 8; +static char *ifname = "eth1"; +static unsigned int ifindex; +static unsigned int dmabuf_id; + +void print_bytes(void *ptr, size_t size) +{ + unsigned char *p = ptr; + int i; + + for (i = 0; i < size; i++) + printf("%02hhX ", p[i]); + printf("\n"); +} + +void print_nonzero_bytes(void *ptr, size_t size) +{ + unsigned char *p = ptr; + unsigned int i; + + for (i = 0; i < size; i++) + putchar(p[i]); + printf("\n"); +} + +void validate_buffer(void *line, size_t size) +{ + static unsigned char seed = 1; + unsigned char *ptr = line; + int errors = 0; + size_t i; + + for (i = 0; i < size; i++) { + if (ptr[i] != seed) { + fprintf(stderr, + "Failed validation: expected=%u, actual=%u, index=%lu\n", + seed, ptr[i], i); + errors++; + if (errors > 20) + error(1, 0, "validation failed."); + } + seed++; + if (seed == do_validation) + seed = 0; + } + + fprintf(stdout, "Validated buffer\n"); +} + +#define run_command(cmd, ...) \ + ({ \ + char command[256]; \ + memset(command, 0, sizeof(command)); \ + snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ + printf("Running: %s\n", command); \ + system(command); \ + }) + +static int reset_flow_steering(void) +{ + int ret = 0; + + ret = run_command("sudo ethtool -K %s ntuple off", ifname); + if (ret) + return ret; + + return run_command("sudo ethtool -K %s ntuple on", ifname); +} + +static int configure_headersplit(bool on) +{ + return run_command("sudo ethtool -G %s tcp-data-split %s", ifname, + on ? 
"on" : "off"); +} + +static int configure_rss(void) +{ + return run_command("sudo ethtool -X %s equal %d", ifname, start_queue); +} + +static int configure_channels(unsigned int rx, unsigned int tx) +{ + return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx); +} + +static int configure_flow_steering(void) +{ + return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", + ifname, client_ip, server_ip, port, port, start_queue); +} + +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + struct netdev_queue_id *queues, + unsigned int n_queue_index, struct ynl_sock **ys) +{ + struct netdev_bind_rx_req *req = NULL; + struct netdev_bind_rx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = netdev_bind_rx_req_alloc(); + netdev_bind_rx_req_set_ifindex(req, ifindex); + netdev_bind_rx_req_set_fd(req, dmabuf_fd); + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); + + rsp = netdev_bind_rx(*ys, req); + if (!rsp) { + perror("netdev_bind_rx"); + goto err_close; + } + + if (!rsp->_present.id) { + perror("id not present"); + goto err_close; + } + + printf("got dmabuf id=%d\n", rsp->id); + dmabuf_id = rsp->id; + + netdev_bind_rx_req_free(req); + netdev_bind_rx_rsp_free(rsp); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_rx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + +static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) +{ + struct udmabuf_create create; + int ret; + + *devfd = open("/dev/udmabuf", O_RDWR); + if (*devfd < 0) { + error(70, 0, + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + } + + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (*memfd < 0) + error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX); + + /* Required for udmabuf */ + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) + error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + + ret = ftruncate(*memfd, dmabuf_size); + if (ret == -1) + error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + + memset(&create, 0, sizeof(create)); + + create.memfd = *memfd; + create.offset = 0; + create.size = dmabuf_size; + *buf = ioctl(*devfd, UDMABUF_CREATE, &create); + if (*buf < 0) + error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); +} + +int do_server(void) +{ + char ctrl_data[sizeof(int) * 20000]; + struct netdev_queue_id *queues; + size_t non_page_aligned_frags = 0; + struct sockaddr_in client_addr; + struct sockaddr_in server_sin; + size_t page_aligned_frags = 0; + int devfd, memfd, buf, ret; + size_t total_received = 0; + socklen_t client_addr_len; + bool is_devmem = false; + char *buf_mem = NULL; + struct ynl_sock *ys; + size_t dmabuf_size; + char iobuf[819200]; + char buffer[256]; + int socket_fd; + int client_fd; + size_t i = 0; + int opt = 1; + + dmabuf_size = getpagesize() * NUM_PAGES; + + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); + + if (reset_flow_steering()) + error(1, 0, "Failed to reset flow steering\n"); + + /* Configure RSS to divert all traffic from our devmem queues */ + if (configure_rss()) + error(1, 0, "Failed to configure rss\n"); + + /* Flow steer our devmem flows to start_queue */ + if (configure_flow_steering()) + error(1, 0, "Failed to configure flow steering\n"); + + sleep(1); + + queues = malloc(sizeof(*queues) * 
num_queues); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + error(1, 0, "Failed to bind\n"); + + buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED, + buf, 0); + if (buf_mem == MAP_FAILED) + error(1, 0, "mmap()"); + + server_sin.sin_family = AF_INET; + server_sin.sin_port = htons(atoi(port)); + + ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); + if (socket < 0) + error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX); + + socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); + if (socket < 0) + error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, + sizeof(opt)); + if (ret) + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, + sizeof(opt)); + if (ret) + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); + + printf("binding to address %s:%d\n", server_ip, + ntohs(server_sin.sin_port)); + + ret = bind(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) + error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); + + ret = listen(socket_fd, 1); + if (ret) + error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); + + client_addr_len = sizeof(client_addr); + + inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, + sizeof(buffer)); + printf("Waiting or connection on %s:%d\n", buffer, + ntohs(server_sin.sin_port)); + client_fd = accept(socket_fd, &client_addr, &client_addr_len); + + inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, + sizeof(buffer)); + printf("Got connection from %s:%d\n", buffer, + ntohs(client_addr.sin_port)); + + while (1) { + struct iovec iov = { .iov_base = iobuf, + .iov_len = sizeof(iobuf) }; + struct dmabuf_cmsg *dmabuf_cmsg = NULL; + struct dma_buf_sync sync = { 0 }; + struct cmsghdr *cm = NULL; + struct msghdr msg = { 0 }; + struct dmabuf_token token; + ssize_t ret; + + is_devmem = false; + printf("\n\n"); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); + printf("recvmsg ret=%ld\n", ret); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + continue; + if (ret < 0) { + perror("recvmsg"); + continue; + } + if (ret == 0) { + printf("client exited\n"); + goto cleanup; + } + + i++; + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_SOCKET || + (cm->cmsg_type != SCM_DEVMEM_DMABUF && + cm->cmsg_type != SCM_DEVMEM_LINEAR)) { + fprintf(stdout, "skipping non-devmem cmsg\n"); + continue; + } + + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm); + is_devmem = true; + + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) { + /* TODO: process data copied from skb's linear + * buffer. + */ + fprintf(stdout, + "SCM_DEVMEM_LINEAR. 
dmabuf_cmsg->frag_size=%u\n", + dmabuf_cmsg->frag_size); + + continue; + } + + token.token_start = dmabuf_cmsg->frag_token; + token.token_count = 1; + + total_received += dmabuf_cmsg->frag_size; + printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", + dmabuf_cmsg->frag_offset >> PAGE_SHIFT, + dmabuf_cmsg->frag_offset % getpagesize(), + dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size, + dmabuf_cmsg->frag_token, total_received, + dmabuf_cmsg->dmabuf_id); + + if (dmabuf_cmsg->dmabuf_id != dmabuf_id) + error(1, 0, + "received on wrong dmabuf_id: flow steering error\n"); + + if (dmabuf_cmsg->frag_size % getpagesize()) + non_page_aligned_frags++; + else + page_aligned_frags++; + + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + if (do_validation) + validate_buffer( + ((unsigned char *)buf_mem) + + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size); + else + print_nonzero_bytes( + ((unsigned char *)buf_mem) + + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size); + + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + ret = setsockopt(client_fd, SOL_SOCKET, + SO_DEVMEM_DONTNEED, &token, + sizeof(token)); + if (ret != 1) + error(1, 0, + "SO_DEVMEM_DONTNEED not enough tokens"); + } + if (!is_devmem) + error(1, 0, "flow steering error\n"); + + printf("total_received=%lu\n", total_received); + } + + fprintf(stdout, "%s: ok\n", TEST_PREFIX); + + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + +cleanup: + + munmap(buf_mem, dmabuf_size); + close(client_fd); + close(socket_fd); + close(buf); + close(memfd); + close(devfd); + ynl_sock_destroy(ys); + + return 0; +} + +void run_devmem_tests(void) +{ + struct netdev_queue_id *queues; + int devfd, memfd, buf; + struct ynl_sock *ys; + size_t dmabuf_size; + size_t i = 0; + + dmabuf_size = getpagesize() * NUM_PAGES; + + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); + + /* Configure RSS to divert all traffic from our devmem queues */ + if (configure_rss()) + error(1, 0, "rss error\n"); + + queues = calloc(num_queues, sizeof(*queues)); + + if (configure_headersplit(1)) + error(1, 0, "Failed to configure header split\n"); + + if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + error(1, 0, "Binding empty queues array should have failed\n"); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (configure_headersplit(0)) + error(1, 0, "Failed to configure header split\n"); + + if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + error(1, 0, "Configure dmabuf with header split off should have failed\n"); + + if (configure_headersplit(1)) + error(1, 0, "Failed to configure header split\n"); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + error(1, 0, "Failed to bind\n"); + + /* Deactivating a bound queue should not be legal */ + if (!configure_channels(num_queues, num_queues - 1)) + error(1, 0, "Deactivating a bound queue should be illegal.\n"); + + /* Closing the 
netlink socket does an implicit unbind */ + ynl_sock_destroy(ys); +} + +int main(int argc, char *argv[]) +{ + int is_server = 0, opt; + + while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) { + switch (opt) { + case 'l': + is_server = 1; + break; + case 's': + server_ip = optarg; + break; + case 'c': + client_ip = optarg; + break; + case 'p': + port = optarg; + break; + case 'v': + do_validation = atoll(optarg); + break; + case 'q': + num_queues = atoi(optarg); + break; + case 't': + start_queue = atoi(optarg); + break; + case 'f': + ifname = optarg; + break; + case '?': + printf("unknown option: %c\n", optopt); + break; + } + } + + ifindex = if_nametoindex(ifname); + + for (; optind < argc; optind++) + printf("extra arguments: %s\n", argv[optind]); + + run_devmem_tests(); + + if (is_server) + return do_server(); + + return 0; +} diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index e3afcb424710..438f7b2acc5f 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -67,8 +67,12 @@ kci_net_setup() return $ksft_skip fi - # TODO what ipaddr to set ? DHCP ? - echo "SKIP: $netdev: set IP address" + if [ "$veth_created" ]; then + echo "XFAIL: $netdev: set IP address unsupported for veth*" + else + # TODO what ipaddr to set ? DHCP ? + echo "SKIP: $netdev: set IP address" + fi return $ksft_skip } @@ -86,7 +90,7 @@ kci_netdev_ethtool_test() ret=$? if [ $ret -ne 0 ];then if [ $ret -eq "$1" ];then - echo "SKIP: $netdev: ethtool $2 not supported" + echo "XFAIL: $netdev: ethtool $2 not supported" return $ksft_skip else echo "FAIL: $netdev: ethtool $2" @@ -124,11 +128,45 @@ kci_netdev_ethtool() return 1 fi echo "PASS: $netdev: ethtool list features" - #TODO for each non fixed features, try to turn them on/off + + while read -r FEATURE VALUE FIXED; do + [ "$FEATURE" != "Features" ] || continue # Skip "Features" + [ "$FIXED" != "[fixed]" ] || continue # Skip fixed features + feature="${FEATURE%:*}" + + ethtool --offload "$netdev" "$feature" off + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned off feature: $feature" + else + echo "FAIL: $netdev: Failed to turn off feature:" \ + "$feature" + fi + + ethtool --offload "$netdev" "$feature" on + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned on feature: $feature" + else + echo "FAIL: $netdev: Failed to turn on feature:" \ + "$feature" + fi + + #restore the feature to its initial state + ethtool --offload "$netdev" "$feature" "$VALUE" + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Restore feature $feature" \ + "to initial state $VALUE" + else + echo "FAIL: $netdev: Failed to restore feature" \ + "$feature to initial state $VALUE" + fi + + done < "$TMP_ETHTOOL_FEATURES" + rm "$TMP_ETHTOOL_FEATURES" kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev" kci_netdev_ethtool_test 94 'stats' "ethtool -S $netdev" + return 0 } @@ -196,10 +234,24 @@ if [ ! -e "$TMP_LIST_NETDEV" ];then fi ip link show |grep '^[0-9]' | grep -oE '[[:space:]].*eth[0-9]*:|[[:space:]].*enp[0-9]s[0-9]:' | cut -d\ -f2 | cut -d: -f1> "$TMP_LIST_NETDEV" + +if [ ! 
-s "$TMP_LIST_NETDEV" ]; then + echo "No valid network device found, creating veth pair" + ip link add veth0 type veth peer name veth1 + echo "veth0" > "$TMP_LIST_NETDEV" + veth_created=1 +fi + while read netdev do kci_test_netdev "$netdev" done < "$TMP_LIST_NETDEV" +#clean up veth interface pair if it was created +if [ "$veth_created" ]; then + ip link delete veth0 + echo "Removed veth pair" +fi + rm "$TMP_LIST_NETDEV" exit 0 diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 47945b2b3f92..d13fb5ea3e89 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -7,6 +7,7 @@ MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += br_netfilter_queue.sh TEST_PROGS += conntrack_icmp_related.sh TEST_PROGS += conntrack_ipip_mtu.sh TEST_PROGS += conntrack_tcp_unreplied.sh diff --git a/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh new file mode 100755 index 000000000000..6a764d70ab06 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +setup_ns c1 c2 c3 sender + +trap cleanup EXIT + +nf_queue_wait() +{ + grep -q "^ *$1 " "/proc/self/net/netfilter/nfnetlink_queue" +} + +port_add() { + ns="$1" + dev="$2" + a="$3" + + ip link add name "$dev" type veth peer name "$dev" netns "$ns" + + ip -net "$ns" addr add 192.168.1."$a"/24 dev "$dev" + ip -net "$ns" link set "$dev" up + + ip link set "$dev" master br0 + ip link set "$dev" up +} + +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } + +ip link add br0 type bridge +ip addr add 192.168.1.254/24 dev br0 + +port_add "$c1" "c1" 1 +port_add "$c2" "c2" 2 +port_add "$c3" "c3" 3 +port_add "$sender" "sender" 253 + +ip link set br0 up + +modprobe -q br_netfilter + +sysctl net.bridge.bridge-nf-call-iptables=1 || exit 1 + +ip netns exec "$sender" ping -I sender -c1 192.168.1.1 || exit 1 +ip netns exec "$sender" ping -I sender -c1 192.168.1.2 || exit 2 +ip netns exec "$sender" ping -I sender -c1 192.168.1.3 || exit 3 + +nft -f /dev/stdin <<EOF +table ip filter { + chain forward { + type filter hook forward priority 0; policy accept; + ct state new counter + ip protocol icmp counter queue num 0 bypass + } +} +EOF +./nf_queue -t 5 > /dev/null & + +busywait 5000 nf_queue_wait + +for i in $(seq 1 5); do conntrack -F > /dev/null 2> /dev/null; sleep 0.1 ; done & +ip netns exec "$sender" ping -I sender -f -c 50 -b 192.168.1.255 + +read t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo PASS: kernel not tainted +else + echo ERROR: kernel is tainted + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 63ef80ef47a4..b2dd4db45215 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -87,3 +87,5 @@ CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y CONFIG_NET_PKTGEN=m CONFIG_TUN=m +CONFIG_INET_DIAG=m +CONFIG_SCTP_DIAG=m diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index c61d23a8c88d..d66e3c4dfec6 100755 --- 
a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -8,7 +8,7 @@ source lib.sh ret=0 -timeout=2 +timeout=5 cleanup() { @@ -25,6 +25,9 @@ cleanup() } checktool "nft --version" "test without nft tool" +checktool "socat -h" "run test without socat" + +modprobe -q sctp trap cleanup EXIT @@ -36,7 +39,9 @@ TMPFILE2=$(mktemp) TMPFILE3=$(mktemp) TMPINPUT=$(mktemp) -dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" +COUNT=200 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && COUNT=25 +dd conv=sparse status=none if=/dev/zero bs=1M count=$COUNT of="$TMPINPUT" if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" @@ -250,45 +255,49 @@ listener_ready() test_tcp_forward() { - ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 2 & local nfqpid=$! timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 2 ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" + kill "$nfqpid" } test_tcp_localhost() { - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp via loopback" - wait 2>/dev/null + kill "$nfqpid" } test_tcp_localhost_connectclose() { - ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & + local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + timeout 10 ip netns exec "$nsrouter" ./connect_close -p 23456 -t 3 + + kill "$nfqpid" wait && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null } test_tcp_localhost_requeue() @@ -353,7 +362,7 @@ table inet filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 1 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 @@ -363,6 +372,7 @@ EOF for n in output post; do for d in tvrf eth0; do if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + kill "$nfqpid" echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 ip netns exec "$ns1" nft list ruleset ret=1 @@ -371,8 +381,96 @@ EOF done done - wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null + kill "$nfqpid" + echo "PASS: icmp+nfqueue via vrf" +} + +sctp_listener_ready() +{ + ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345 +} + +check_output_files() +{ + local f1="$1" + local f2="$2" + local err="$3" + + if ! 
cmp "$f1" "$f2" ; then + echo "FAIL: $err: input and output file differ" 1>&2 + echo -n " Input file" 1>&2 + ls -l "$f1" 1>&2 + echo -n "Output file" 1>&2 + ls -l "$f2" 1>&2 + ret=1 + fi +} + +test_sctp_forward() +{ + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table inet sctpq { + chain forward { + type filter hook forward priority 0; policy accept; + sctp dport 12345 queue num 10 + } +} +EOF + timeout 60 ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$nsrouter" ./nf_queue -q 10 -G & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$nsrouter" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + wait "$rpid" && echo "PASS: sctp and nfqueue in forward chain" + kill "$nfqpid" + + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp forward" +} + +test_sctp_output() +{ + ip netns exec "$ns1" nft -f /dev/stdin <<EOF +table inet sctpq { + chain output { + type filter hook output priority 0; policy accept; + sctp dport 12345 queue num 11 + } +} +EOF + # reduce test file size, software segmentation causes sk wmem increase. + dd conv=sparse status=none if=/dev/zero bs=1M count=$((COUNT/2)) of="$TMPINPUT" + + timeout 60 ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$ns1" ./nf_queue -q 11 & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$ns1" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + # must wait before checking completeness of output file. + wait "$rpid" && echo "PASS: sctp and nfqueue in output chain with GSO" + kill "$nfqpid" + + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output" } test_queue_removal() @@ -388,7 +486,7 @@ table ip filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 0 @@ -443,11 +541,16 @@ test_queue 10 # same. We queue to a second program as well. 
load_ruleset "filter2" 20 test_queue 20 +ip netns exec "$ns1" nft flush ruleset test_tcp_forward test_tcp_localhost test_tcp_localhost_connectclose test_tcp_localhost_requeue +test_sctp_forward +test_sctp_output + +# should be last, adds vrf device in ns1 and changes routes test_icmp_vrf test_queue_removal diff --git a/tools/testing/selftests/net/packetdrill/Makefile b/tools/testing/selftests/net/packetdrill/Makefile new file mode 100644 index 000000000000..31cfb666ba8b --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := ksft_runner.sh \ + defaults.sh \ + set_sysctls.py \ + ../../kselftest/ktap_helpers.sh + +TEST_PROGS := $(wildcard *.pkt) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/packetdrill/config b/tools/testing/selftests/net/packetdrill/config new file mode 100644 index 000000000000..0237ed98f3c0 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/config @@ -0,0 +1,11 @@ +CONFIG_IPV6=y +CONFIG_HZ_1000=y +CONFIG_HZ=1000 +CONFIG_NET_NS=y +CONFIG_NET_SCH_FIFO=y +CONFIG_NET_SCH_FQ=y +CONFIG_PROC_SYSCTL=y +CONFIG_SYN_COOKIES=y +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_MD5SIG=y +CONFIG_TUN=y diff --git a/tools/testing/selftests/net/packetdrill/defaults.sh b/tools/testing/selftests/net/packetdrill/defaults.sh new file mode 100755 index 000000000000..1095a7b22f44 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/defaults.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Set standard production config values that relate to TCP behavior. + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP timestamps. +sysctl -q net.ipv4.tcp_timestamps=1 + +# TCP SYN(ACK) retry thresholds +sysctl -q net.ipv4.tcp_syn_retries=5 +sysctl -q net.ipv4.tcp_synack_retries=5 + +# TCP Forward RTO-Recovery, RFC 5682. +sysctl -q net.ipv4.tcp_frto=2 + +# TCP Selective Acknowledgements (SACK) +sysctl -q net.ipv4.tcp_sack=1 + +# TCP Duplicate Selective Acknowledgements (DSACK) +sysctl -q net.ipv4.tcp_dsack=1 + +# TCP FACK (Forward Acknowldgement) +sysctl -q net.ipv4.tcp_fack=0 + +# TCP reordering degree ("dupthresh" threshold for entering Fast Recovery). +sysctl -q net.ipv4.tcp_reordering=3 + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP RACK and TLP. +sysctl -q net.ipv4.tcp_early_retrans=4 net.ipv4.tcp_recovery=1 + +# TCP method for deciding when to defer sending to accumulate big TSO packets. +sysctl -q net.ipv4.tcp_tso_win_divisor=3 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_pacing_ss_ratio=200 +sysctl -q net.ipv4.tcp_pacing_ca_ratio=120 +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +sysctl -q net.ipv4.tcp_fastopen=0x70403 +sysctl -q net.ipv4.tcp_fastopen_key=a1a1a1a1-b2b2b2b2-c3c3c3c3-d4d4d4d4 + +sysctl -q net.ipv4.tcp_syncookies=1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. 
+tc qdisc add dev tun0 root pfifo + diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh new file mode 100755 index 000000000000..7478c0c0c9aa --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source "$(dirname $(realpath $0))/../../kselftest/ktap_helpers.sh" + +readonly ipv4_args=('--ip_version=ipv4 ' + '--local_ip=192.168.0.1 ' + '--gateway_ip=192.168.0.1 ' + '--netmask_ip=255.255.0.0 ' + '--remote_ip=192.0.2.1 ' + '-D CMSG_LEVEL_IP=SOL_IP ' + '-D CMSG_TYPE_RECVERR=IP_RECVERR ') + +readonly ipv6_args=('--ip_version=ipv6 ' + '--mtu=1520 ' + '--local_ip=fd3d:0a0b:17d6::1 ' + '--gateway_ip=fd3d:0a0b:17d6:8888::1 ' + '--remote_ip=fd3d:fa7b:d17d::1 ' + '-D CMSG_LEVEL_IP=SOL_IPV6 ' + '-D CMSG_TYPE_RECVERR=IPV6_RECVERR ') + +if [ $# -ne 1 ]; then + ktap_exit_fail_msg "usage: $0 <script>" + exit "$KSFT_FAIL" +fi +script="$1" + +if [ -z "$(which packetdrill)" ]; then + ktap_skip_all "packetdrill not found in PATH" + exit "$KSFT_SKIP" +fi + +ktap_print_header +ktap_set_plan 2 + +unshare -n packetdrill ${ipv4_args[@]} $(basename $script) > /dev/null \ + && ktap_test_pass "ipv4" || ktap_test_fail "ipv4" +unshare -n packetdrill ${ipv6_args[@]} $(basename $script) > /dev/null \ + && ktap_test_pass "ipv6" || ktap_test_fail "ipv6" + +ktap_finished diff --git a/tools/testing/selftests/net/packetdrill/set_sysctls.py b/tools/testing/selftests/net/packetdrill/set_sysctls.py new file mode 100755 index 000000000000..5ddf456ae973 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/set_sysctls.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Sets sysctl values and writes a file that restores them. + +The arguments are of the form "<proc-file>=<val>" separated by spaces. +The program first reads the current value of the proc-file and creates +a shell script named "/tmp/sysctl_restore_${PACKETDRILL_PID}.sh" which +restores the values when executed. It then sets the new values. + +PACKETDRILL_PID is set by packetdrill to the pid of itself, so a .pkt +file could restore sysctls by running `/tmp/sysctl_restore_${PPID}.sh` +at the end. +""" + +import os +import subprocess +import sys + +filename = '/tmp/sysctl_restore_%s.sh' % os.environ['PACKETDRILL_PID'] + +# Open file for restoring sysctl values +restore_file = open(filename, 'w') +print('#!/bin/bash', file=restore_file) + +for a in sys.argv[1:]: + sysctl = a.split('=') + # sysctl[0] contains the proc-file name, sysctl[1] the new value + + # read current value and add restore command to file + cur_val = subprocess.check_output(['cat', sysctl[0]], universal_newlines=True) + print('echo "%s" > %s' % (cur_val.strip(), sysctl[0]), file=restore_file) + + # set new value + cmd = 'echo "%s" > %s' % (sysctl[1], sysctl[0]) + os.system(cmd) + +os.system('chmod u+x %s' % filename) diff --git a/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt new file mode 100644 index 000000000000..df49c67645ac --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP_INQ and TCP_CM_INQ on the client side. +`./defaults.sh +` + +// Create a socket and set it to non-blocking. 
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Connect to the server and enable TCP_INQ. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 setsockopt(3, SOL_TCP, TCP_INQ, [1], 4) = 0 + + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.01 < S. 0:0(0) ack 1 win 5792 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 7> + +0 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700> + +// Now we have 10K of data ready on the socket. + +0 < . 1:10001(10000) ack 1 win 514 + +0 > . 1:1(0) ack 10001 <nop,nop,TS val 200 ecr 700> + +// We read 1K and we should have 9K ready to read. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{..., 1000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=9000}]}, 0) = 1000 +// We read 9K and we should have no further data ready to read. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{..., 9000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=0}]}, 0) = 9000 + +// Server sends more data and closes the connections. + +0 < F. 10001:20001(10000) ack 1 win 514 + +0 > . 1:1(0) ack 20002 <nop,nop,TS val 200 ecr 700> + +// We read 10K and we should have one "fake" byte because the connection is +// closed. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{..., 10000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=1}]}, 0) = 10000 +// Now, receive EOF. + +0 read(3, ..., 2000) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt new file mode 100644 index 000000000000..04a5e2590c62 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP_INQ and TCP_CM_INQ on the server side. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.01 < . 1:1(0) ack 1 win 514 + +// Accept the connection and enable TCP_INQ. + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_TCP, TCP_INQ, [1], 4) = 0 + +// Now we have 10K of data ready on the socket. + +0 < . 1:10001(10000) ack 1 win 514 + +0 > . 1:1(0) ack 10001 + +// We read 2K and we should have 8K ready to read. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 2000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=8000}]}, 0) = 2000 +// We read 8K and we should have no further data ready to read. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 8000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=0}]}, 0) = 8000 +// Client sends more data and closes the connections. + +0 < F. 10001:20001(10000) ack 1 win 514 + +0 > . 1:1(0) ack 20002 + +// We read 10K and we should have one "fake" byte because the connection is +// closed. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 10000}], + msg_flags=0, + msg_control=[{cmsg_level=SOL_TCP, + cmsg_type=TCP_CM_INQ, + cmsg_data=1}]}, 0) = 10000 +// Now, receive error. 
+ +0 read(3, ..., 2000) = -1 ENOTCONN (Transport endpoint is not connected) diff --git a/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt new file mode 100644 index 000000000000..25dfef95d3f8 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test what happens when client does not provide MD5 on SYN, +// but then does on the ACK that completes the three-way handshake. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> +// Ooh, weird: client provides MD5 option on the ACK: + +.01 < . 1:1(0) ack 1 win 514 <md5 000102030405060708090a0b0c0d0e0f,nop,nop> + +.01 < . 1:1(0) ack 1 win 514 <md5 000102030405060708090a0b0c0d0e0f,nop,nop> + +// The TCP listener refcount should be 2, but on buggy kernels it can be 0: + +0 `grep " 0A " /proc/net/tcp /proc/net/tcp6 | grep ":1F90"` + +// Now here comes the legit ACK: + +.01 < . 1:1(0) ack 1 win 514 + +// Make sure the connection is OK: + +0 accept(3, ..., ...) = 4 + + +.01 write(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt new file mode 100644 index 000000000000..795c476d222d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when not application-limited, so that +// the cwnd continues to grow. +// In this variant, the receiver ACKs every packet. + +// Set up config. To keep things simple, disable the +// mechanism that defers sending in order to send bigger TSO packets. +`./defaults.sh +sysctl -q net.ipv4.tcp_tso_win_divisor=100` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 30000) = 30000 + +0 > P. 1:10001(10000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + ++.105 < . 1:1(0) ack 1001 win 257 + +0 > P. 10001:12001(2000) ack 1 + + +0 < . 1:1(0) ack 2001 win 257 + +0 > P. 12001:14001(2000) ack 1 + ++.005 < . 1:1(0) ack 3001 win 257 + +0 > P. 14001:16001(2000) ack 1 + + +0 < . 1:1(0) ack 4001 win 257 + +0 > P. 16001:18001(2000) ack 1 + ++.005 < . 1:1(0) ack 5001 win 257 + +0 > P. 18001:20001(2000) ack 1 + + +0 < . 1:1(0) ack 6001 win 257 + +0 > P. 20001:22001(2000) ack 1 + ++.005 < . 1:1(0) ack 7001 win 257 + +0 > P. 22001:24001(2000) ack 1 + + +0 < . 1:1(0) ack 8001 win 257 + +0 > P. 24001:26001(2000) ack 1 + ++.005 < . 1:1(0) ack 9001 win 257 + +0 > P. 26001:28001(2000) ack 1 + + +0 < . 1:1(0) ack 10001 win 257 + +0 > P. 
28001:30001(2000) ack 1 + + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt new file mode 100644 index 000000000000..9212ae1fd0f2 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when an outstanding flight of packets is +// less than the current cwnd, and not big enough to bump up cwnd. +// +// In this variant, the receiver ACKs every other packet, +// approximating standard delayed ACKs. + +// Set up config. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +// Only send 5 packets. + +0 write(4, ..., 5000) = 5000 + +0 > P. 1:5001(5000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + + +0 < . 1:1(0) ack 2001 win 257 + +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }% + + +0 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }% + + +0 < . 1:1(0) ack 5001 win 257 + +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt new file mode 100644 index 000000000000..416c901ddf51 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when an outstanding flight of packets is +// less than the current cwnd, but still big enough that in slow +// start we want to increase our cwnd a little. +// +// In this variant, the receiver ACKs every other packet, +// approximating standard delayed ACKs. + +// Set up config. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +// Only send 6 packets. + +0 write(4, ..., 6000) = 6000 + +0 > P. 1:6001(6000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + + +0 < . 1:1(0) ack 2001 win 257 + +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }% + + +0 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }% + + +0 < . 
1:1(0) ack 6001 win 257 + +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt new file mode 100644 index 000000000000..a894b7d4559c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when not application-limited, so that +// the cwnd continues to grow. +// In this variant, the receiver ACKs every other packet, +// approximating standard delayed ACKs. + +// Set up config. To keep things simple, disable the +// mechanism that defers sending in order to send bigger TSO packets. +`./defaults.sh +sysctl -q net.ipv4.tcp_tso_win_divisor=100` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 30000) = 30000 + +0 > P. 1:10001(10000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + ++.105 < . 1:1(0) ack 2001 win 257 + +0 > P. 10001:14001(4000) ack 1 + ++.005 < . 1:1(0) ack 4001 win 257 + +0 > P. 14001:18001(4000) ack 1 + ++.005 < . 1:1(0) ack 6001 win 257 + +0 > P. 18001:22001(4000) ack 1 + ++.005 < . 1:1(0) ack 8001 win 257 + +0 > P. 22001:26001(4000) ack 1 + ++.005 < . 1:1(0) ack 10001 win 257 + +0 > P. 26001:30001(4000) ack 1 + + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt new file mode 100644 index 000000000000..065fae9e9abd --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when not application-limited, so that +// the cwnd continues to grow. +// In this variant, the receiver sends one ACK per 4 packets. + +// Set up config. To keep things simple, disable the +// mechanism that defers sending in order to send bigger TSO packets. +`./defaults.sh +sysctl -q net.ipv4.tcp_tso_win_divisor=100` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 30000) = 30000 + +0 > P. 1:10001(10000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + + +.11 < . 1:1(0) ack 4001 win 257 + +0 > P. 10001:18001(8000) ack 1 + + +.01 < . 1:1(0) ack 8001 win 257 + +0 > P. 18001:26001(8000) ack 1 + ++.005 < . 1:1(0) ack 10001 win 257 + +0 > P. 
26001:30001(4000) ack 1 + + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt new file mode 100644 index 000000000000..11b213be1138 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start after idle +// This test expects tso size to be at least initial cwnd * mss + +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_slow_start_after_idle=1 \ + /proc/sys/net/ipv4/tcp_min_tso_segs=10` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 511 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 26000) = 26000 + +0 > P. 1:5001(5000) ack 1 + +0 > P. 5001:10001(5000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + + +.1 < . 1:1(0) ack 10001 win 511 + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% + +0 > P. 10001:20001(10000) ack 1 + +0 > P. 20001:26001(6000) ack 1 + + +.1 < . 1:1(0) ack 26001 win 511 + +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% + + +2 write(4, ..., 20000) = 20000 +// If slow start after idle works properly, we should send 5 MSS here (cwnd/2) + +0 > P. 26001:31001(5000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + +// Reset sysctls +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt new file mode 100644 index 000000000000..577ed8c8852c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start after window update +// This test expects tso size to be at least initial cwnd * mss + +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_slow_start_after_idle=1 \ + /proc/sys/net/ipv4/tcp_min_tso_segs=10` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 511 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 26000) = 26000 + +0 > P. 1:5001(5000) ack 1 + +0 > P. 5001:10001(5000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + + +.1 < . 1:1(0) ack 10001 win 511 + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% + +0 > P. 10001:20001(10000) ack 1 + +0 > P. 20001:26001(6000) ack 1 + + +.1 < . 1:1(0) ack 26001 win 0 + +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% + + +0 write(4, ..., 20000) = 20000 +// 1st win0 probe ++.3~+.310 > . 26000:26000(0) ack 1 + +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% + +// 2nd win0 probe ++.6~+.620 > . 26000:26000(0) ack 1 + +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% + +// 3rd win0 probe ++1.2~+1.240 > . 
26000:26000(0) ack 1 + +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% + + +.9 < . 1:1(0) ack 26001 win 511 + +0 > P. 26001:31001(5000) ack 1 + +// Reset sysctls +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt new file mode 100644 index 000000000000..869f32c35a2a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when application-limited: in this case, +// with IW10, if we don't fully use our cwnd but instead +// send just 9 packets, then cwnd should grow to twice that +// value, or 18 packets. + +// Set up config. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 9000) = 9000 + +0 > P. 1:9001(9000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + ++.105 < . 1:1(0) ack 2001 win 257 + +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 6001 win 257 + +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 8001 win 257 + +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 9001 win 257 + +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt new file mode 100644 index 000000000000..0f77b7955db6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when application-limited: in this case, +// with IW10, if we send exactly 10 packets then cwnd should grow to 20. + +// Set up config. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + ++.105 < . 1:1(0) ack 2001 win 257 + +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 6001 win 257 + +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }% + ++.005 < . 1:1(0) ack 8001 win 257 + +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }% + ++.005 < . 
1:1(0) ack 10001 win 257 + +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt new file mode 100644 index 000000000000..7e9c83d617c2 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test of slow start when not application-limited, so that +// the cwnd continues to grow, even if TSQ triggers. +// In this variant, the receiver ACKs every other packet, +// approximating standard delayed ACKs. + +// Note we use FQ/pacing to check if TCP Small Queues is not hurting + +`./defaults.sh +tc qdisc replace dev tun0 root fq +sysctl -q net/ipv4/tcp_pacing_ss_ratio=200 +sysctl -e -q net.ipv4.tcp_min_tso_segs=2` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 500 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 + + +0 write(4, ..., 40000) = 40000 +// This might change if we cook the initial packet with 10 MSS. + +0 > P. 1:2921(2920) ack 1 + +0 > P. 2921:5841(2920) ack 1 + +0 > P. 5841:8761(2920) ack 1 + +0 > P. 8761:11681(2920) ack 1 + +0 > P. 11681:14601(2920) ack 1 + +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% + ++.105 < . 1:1(0) ack 2921 win 500 + +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }% + +// Note: after this commit : "net_sched: sch_fq: account for schedule/timers drifts" +// FQ notices that this packet missed the 'time to send next packet' computed +// when prior packet (11681:14601(2920)) was sent. +// So FQ will allow following packet to be sent a bit earlier (quantum/2) +// (FQ commit allows an application/cwnd limited flow to get at most quantum/2 extra credit) + +0 > P. 14601:17521(2920) ack 1 + ++.003 < . 1:1(0) ack 5841 win 500 + +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }% + ++.001 > P. 17521:20441(2920) ack 1 + ++.001 < . 1:1(0) ack 8761 win 500 + +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }% + +// remaining packets are delivered at a constant rate. ++.007 > P. 20441:23361(2920) ack 1 + ++.002 < . 1:1(0) ack 11681 win 500 + +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }% ++.001 < . 1:1(0) ack 14601 win 500 + ++.004 > P. 23361:26281(2920) ack 1 + ++.007 > P. 26281:29201(2920) ack 1 + + +0 %{ assert tcpi_snd_cwnd == 20, 'cwnd=%d' % tcpi_snd_cwnd }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt new file mode 100644 index 000000000000..a82c8899d36b --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +// basic zerocopy test: +// +// send a packet with MSG_ZEROCOPY and receive the notification ID +// repeat and verify IDs are consecutive + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 
1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 4001:8001(4000) ack 1 + +0 < . 1:1(0) ack 8001 win 257 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=1, + ee_data=1}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt new file mode 100644 index 000000000000..c01915e7f4a1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +// batch zerocopy test: +// +// send multiple packets, then read one range of all notifications. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_MARK, [666], 4) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 4001:8001(4000) ack 1 + +0 < . 1:1(0) ack 8001 win 257 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=1}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt new file mode 100644 index 000000000000..6509882932e9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Minimal client-side zerocopy test + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0...0 connect(4, ..., ...) = 0 + + +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8> + +0 < S. 0:0(0) ack 1 win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > . 1:1(0) ack 1 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 
1:1(0) ack 4001 win 257 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt new file mode 100644 index 000000000000..2cd78755cb2a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +// send with MSG_ZEROCOPY on a non-established socket +// +// verify that a send in state TCP_CLOSE correctly aborts the zerocopy +// operation, specifically it does not increment the zerocopy counter. +// +// First send on a closed socket and wait for (absent) notification. +// Then connect and send and verify that notification nr. is zero. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = -1 EPIPE (Broken pipe) + + +0.1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[]}, MSG_ERRQUEUE) = -1 EAGAIN (Resource temporarily unavailable) + + +0...0 connect(4, ..., ...) = 0 + + +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8> + +0 < S. 0:0(0) ack 1 win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > . 1:1(0) ack 1 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt new file mode 100644 index 000000000000..7671c20e01cf --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +// epoll zerocopy test: +// +// EPOLLERR is known to be not edge-triggered unlike EPOLLIN and EPOLLOUT but +// it is not level-triggered either. +// +// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR +// is correctly fired only once, when EPOLLET is set. send another packet with +// MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 epoll_create(1) = 5 + +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, {events=EPOLLOUT|EPOLLET, fd=4}) = 0 + +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 
4001:8001(4000) ack 1 + +0 < . 1:1(0) ack 8001 win 257 + +// receive only one EPOLLERR for the two sends above. + +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 8001:12001(4000) ack 1 + +0 < . 1:1(0) ack 12001 win 257 + +// receive only one EPOLLERR for the third send above. + +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=2}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt new file mode 100644 index 000000000000..fadc480fdb7f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +// epoll zerocopy test: +// +// EPOLLERR is known to be not edge-triggered unlike EPOLLIN and EPOLLOUT but +// it is not level-triggered either. this tests verify that the same behavior is +// maintained when we have EPOLLEXCLUSIVE. +// +// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR +// is correctly fired only once, when EPOLLET is set. send another packet with +// MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 epoll_create(1) = 5 + +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, + {events=EPOLLOUT|EPOLLET|EPOLLEXCLUSIVE, fd=4}) = 0 + +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 4001:8001(4000) ack 1 + +0 < . 1:1(0) ack 8001 win 257 + +// receive only one EPOLLERR for the two sends above. + +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 8001:12001(4000) ack 1 + +0 < . 1:1(0) ack 12001 win 257 + +// receive only one EPOLLERR for the third send above. 
+ +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=2}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt new file mode 100644 index 000000000000..5bfa0d1d2f4a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// epoll zerocopy test: +// +// This is a test to confirm that EPOLLERR is only fired once for an FD when +// EPOLLONESHOT is set. +// +// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR +// is correctly fired only once, when EPOLLONESHOT is set. send another packet +// with MSG_ZEROCOPY. confirm that EPOLLERR is not fired. Rearm the FD and +// confirm that EPOLLERR is correctly set. +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 epoll_create(1) = 5 + +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, + {events=EPOLLOUT|EPOLLET|EPOLLONESHOT, fd=4}) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 1:4001(4000) ack 1 + +0 < . 1:1(0) ack 4001 win 257 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 4001:8001(4000) ack 1 + +0 < . 1:1(0) ack 8001 win 257 + +// receive only one EPOLLERR for the two sends above. + +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000 + +0 > P. 8001:12001(4000) ack 1 + +0 < . 1:1(0) ack 12001 win 257 + +// receive no EPOLLERR for the third send above. + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + +// rearm the FD and verify the EPOLLERR is fired again. + +0 epoll_ctl(5, EPOLL_CTL_MOD, 4, {events=EPOLLOUT|EPOLLONESHOT, fd=4}) = 0 + +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1 + +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0 + + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=2}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt new file mode 100644 index 000000000000..4a73bbf46961 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +// Fastopen client zerocopy test: +// +// send data with MSG_FASTOPEN | MSG_ZEROCOPY and verify that the +// kernel returns the notification ID. +// +// Fastopen requires a stored cookie. Create two sockets. 
The first +// one will have no data in the initial send. On return 0 the +// zerocopy notification counter is not incremented. Verify this too. + +`./defaults.sh` + +// Send a FastOpen request, no cookie yet so no data in SYN + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 sendto(3, ..., 500, MSG_FASTOPEN|MSG_ZEROCOPY, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 1000 ecr 0,nop,wscale 8,FO,nop,nop> + +.01 < S. 123:123(0) ack 1 win 14600 <mss 940,TS val 2000 ecr 1000,sackOK,nop,wscale 6, FO abcd1234,nop,nop> + +0 > . 1:1(0) ack 1 <nop,nop,TS val 1001 ecr 2000> + +// Read from error queue: no zerocopy notification + +1 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[]}, MSG_ERRQUEUE) = -1 EAGAIN (Resource temporarily unavailable) + + +.01 close(3) = 0 + +0 > F. 1:1(0) ack 1 <nop,nop,TS val 1002 ecr 2000> + +.01 < F. 1:1(0) ack 2 win 92 <nop,nop,TS val 2001 ecr 1002> + +0 > . 2:2(0) ack 2 <nop,nop,TS val 1003 ecr 2001> + +// Send another Fastopen request, now SYN will have data + +.07 `sysctl -q net.ipv4.tcp_timestamps=0` + +.1 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5 + +0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 setsockopt(5, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 sendto(5, ..., 500, MSG_FASTOPEN|MSG_ZEROCOPY, ..., ...) = 500 + +0 > S 0:500(500) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO abcd1234,nop,nop> + +.05 < S. 5678:5678(0) ack 501 win 14600 <mss 1460,nop,nop,sackOK,nop,wscale 6> + +0 > . 501:501(0) ack 1 + +// Read from error queue: now has first zerocopy notification + +0.5 recvmsg(5, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt new file mode 100644 index 000000000000..36086c5877ce --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +// Fastopen server zerocopy test: +// +// send data with MSG_FASTOPEN | MSG_ZEROCOPY and verify that the +// kernel returns the notification ID. + +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x207` + +// Set up a TFO server listening socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +.1 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN, [2], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +// Client sends a SYN with data. + +.1 < S 0:1000(1000) win 32792 <mss 1460,sackOK,nop,nop> + +0 > S. 0:0(0) ack 1001 <mss 1460,nop,nop,sackOK> + +// Server accepts and replies with data. ++.005 accept(3, ..., ...) = 4 + +0 read(4, ..., 1024) = 1000 + +0 sendto(4, ..., 1000, MSG_ZEROCOPY, ..., ...) = 1000 + +0 > P. 1:1001(1000) ack 1001 + +.05 < . 
1001:1001(0) ack 1001 win 32792 + +// Read from error queue: now has first zerocopy notification + +0.1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 + +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt new file mode 100644 index 000000000000..672f817faca0 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +// tcp_MAX_SKB_FRAGS test +// +// Verify that sending an iovec of tcp_MAX_SKB_FRAGS + 1 elements will +// 1) fit in a single packet without zerocopy +// 2) spill over into a second packet with zerocopy, +// because each iovec element becomes a frag +// 3) the PSH bit is set on an skb when it runs out of fragments + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + + // Each pinned zerocopy page is fully accounted to skb->truesize. + // This test generates a worst case packet with each frag storing + // one byte, but increasing truesize with a page (64KB on PPC). + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [2000000], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + // send an iov of 18 elements: just becomes a linear skb + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(18)=[{..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}], + msg_flags=0}, 0) = 18 + + +0 > P. 1:19(18) ack 1 + +0 < . 1:1(0) ack 19 win 257 + + // send a zerocopy iov of 18 elements: + +1 sendmsg(4, {msg_name(...)=..., + msg_iov(18)=[{..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}], + msg_flags=0}, MSG_ZEROCOPY) = 18 + + // verify that it is split in one skb of 17 frags + 1 of 1 frag + // verify that both have the PSH bit set + +0 > P. 19:36(17) ack 1 + +0 < . 1:1(0) ack 36 win 257 + + +0 > P. 36:37(1) ack 1 + +0 < . 
1:1(0) ack 37 win 257 + + +1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 + + // send a zerocopy iov of 64 elements: + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(64)=[{..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}, + {..., 1}, {..., 1}, {..., 1}, {..., 1}], + msg_flags=0}, MSG_ZEROCOPY) = 64 + + // verify that it is split in skbs with 17 frags + +0 > P. 37:54(17) ack 1 + +0 < . 1:1(0) ack 54 win 257 + + +0 > P. 54:71(17) ack 1 + +0 < . 1:1(0) ack 71 win 257 + + +0 > P. 71:88(17) ack 1 + +0 < . 1:1(0) ack 88 win 257 + + +0 > P. 88:101(13) ack 1 + +0 < . 1:1(0) ack 101 win 257 + + +1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=1, + ee_data=1}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt new file mode 100644 index 000000000000..a9a1ac0aea4f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +// small packet zerocopy test: +// +// verify that SO_EE_CODE_ZEROCOPY_COPIED is set on zerocopy +// packets of all sizes, including the smallest payload, 1B. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + // send 1B + +0 send(4, ..., 1, MSG_ZEROCOPY) = 1 + +0 > P. 1:2(1) ack 1 + +0 < . 1:1(0) ack 2 win 257 + + +1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=0, + ee_data=0}} + ]}, MSG_ERRQUEUE) = 0 + + // send 1B again + +0 send(4, ..., 1, MSG_ZEROCOPY) = 1 + +0 > P. 2:3(1) ack 1 + +0 < . 
1:1(0) ack 3 win 257 + + +1 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE, + msg_control=[ + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=0, + ee_origin=SO_EE_ORIGIN_ZEROCOPY, + ee_type=0, + ee_code=SO_EE_CODE_ZEROCOPY_COPIED, + ee_info=1, + ee_data=1}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 5175c0c83a23..569bce8b6383 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -681,13 +681,7 @@ setup_xfrm() { } setup_nettest_xfrm() { - if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - return 1 - fi - fi + check_gen_prog "nettest" [ ${1} -eq 6 ] && proto="-6" || proto="" port=${2} @@ -1447,7 +1441,7 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { size=$(du -sb $tmpoutfile) size=${size%%/tmp/*} - [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1 + [ $size -ne 1048576 ] && err "File size $size mismatches expected value in locally bridged vxlan test" && return 1 done rm -f "$tmpoutfile" diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 1a736f700be4..4f31e92ebd96 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -165,9 +165,9 @@ static void sock_fanout_set_ebpf(int fd) attr.insns = (unsigned long) prog; attr.insn_cnt = ARRAY_SIZE(prog); attr.license = (unsigned long) "GPL"; - attr.log_buf = (unsigned long) log_buf, - attr.log_size = sizeof(log_buf), - attr.log_level = 1, + attr.log_buf = (unsigned long) log_buf; + attr.log_size = sizeof(log_buf); + attr.log_level = 1; pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); if (pfd < 0) { diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile new file mode 100644 index 000000000000..da9714bc7aad --- /dev/null +++ b/tools/testing/selftests/net/rds/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +all: + @echo mk_build_dir="$(shell pwd)" > include.sh + +TEST_PROGS := run.sh \ + include.sh \ + test.py + +EXTRA_CLEAN := /tmp/rds_logs + +include ../../lib.mk diff --git a/tools/testing/selftests/net/rds/README.txt b/tools/testing/selftests/net/rds/README.txt new file mode 100644 index 000000000000..cbde2951ab13 --- /dev/null +++ b/tools/testing/selftests/net/rds/README.txt @@ -0,0 +1,41 @@ +RDS self-tests +============== + +These scripts provide a coverage test for RDS-TCP by creating two +network namespaces and running rds packets between them. A loopback +network is provisioned with optional probability of packet loss or +corruption. A workload of 50000 hashes, each 64 characters in size, +are passed over an RDS socket on this test network. A passing test means +the RDS-TCP stack was able to recover properly. The provided config.sh +can be used to compile the kernel with the necessary gcov options. The +kernel may optionally be configured to omit the coverage report as well. + +USAGE: + run.sh [-d logdir] [-l packet_loss] [-c packet_corruption] + [-u packet_duplcate] + +OPTIONS: + -d Log directory. Defaults to tools/testing/selftests/net/rds/rds_logs + + -l Simulates a percentage of packet loss + + -c Simulates a percentage of packet corruption + + -u Simulates a percentage of packet duplication. 
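
Under the hood, test.py applies these percentages as a netem qdisc on the veth
pair it creates inside each namespace. A minimal equivalent by hand (assuming
the net0/veth0 names used by test.py; the percentage values below are only
examples, not defaults) would be:

    # 1% loss, 0.5% corruption and 0.2% duplication in the first namespace
    ip netns exec net0 tc qdisc add dev veth0 root netem \
        loss 1% corrupt 0.5% duplicate 0.2%
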
+ +EXAMPLE: + + # Create a suitable gcov enabled .config + tools/testing/selftests/net/rds/config.sh -g + + # Alternatly create a gcov disabled .config + tools/testing/selftests/net/rds/config.sh + + # build the kernel + vng --build --config tools/testing/selftests/net/config + + # launch the tests in a VM + vng -v --rwdir ./ --run . --user root --cpus 4 -- \ + "export PYTHONPATH=tools/testing/selftests/net/; tools/testing/selftests/net/rds/run.sh" + +An HTML coverage report will be output in tools/testing/selftests/net/rds/rds_logs/coverage/. diff --git a/tools/testing/selftests/net/rds/config.sh b/tools/testing/selftests/net/rds/config.sh new file mode 100755 index 000000000000..791c8dbe1095 --- /dev/null +++ b/tools/testing/selftests/net/rds/config.sh @@ -0,0 +1,53 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u +set -x + +unset KBUILD_OUTPUT + +GENERATE_GCOV_REPORT=0 +while getopts "g" opt; do + case ${opt} in + g) + GENERATE_GCOV_REPORT=1 + ;; + :) + echo "USAGE: config.sh [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}." + exit 1 + ;; + esac +done + +CONF_FILE="tools/testing/selftests/net/config" + +# no modules +scripts/config --file "$CONF_FILE" --disable CONFIG_MODULES + +# enable RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS_TCP + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + # instrument RDS and only RDS + scripts/config --file "$CONF_FILE" --enable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --enable GCOV_PROFILE_RDS +else + scripts/config --file "$CONF_FILE" --disable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_RDS +fi + +# need network namespaces to run tests with veth network interfaces +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_NS +scripts/config --file "$CONF_FILE" --enable CONFIG_VETH + +# simulate packet loss +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_SCH_NETEM + diff --git a/tools/testing/selftests/net/rds/run.sh b/tools/testing/selftests/net/rds/run.sh new file mode 100755 index 000000000000..8aee244f582a --- /dev/null +++ b/tools/testing/selftests/net/rds/run.sh @@ -0,0 +1,224 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u + +unset KBUILD_OUTPUT + +current_dir="$(realpath "$(dirname "$0")")" +build_dir="$current_dir" + +build_include="$current_dir/include.sh" +if test -f "$build_include"; then + # this include will define "$mk_build_dir" as the location the test was + # built. We will need this if the tests are installed in a location + # other than the kernel source + + source "$build_include" + build_dir="$mk_build_dir" +fi + +# This test requires kernel source and the *.gcda data therein +# Locate the top level of the kernel source, and the net/rds +# subfolder with the appropriate *.gcno object files +ksrc_dir="$(realpath "$build_dir"/../../../../../)" +kconfig="$ksrc_dir/.config" +obj_dir="$ksrc_dir/net/rds" + +GCOV_CMD=gcov + +#check to see if the host has the required packages to generate a gcov report +check_gcov_env() +{ + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find gcov. 
" + GENERATE_GCOV_REPORT=0 + return + fi + + # the gcov version must match the gcc version + GCC_VER=$(gcc -dumpfullversion) + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + #attempt to find a matching gcov version + GCOV_CMD=gcov-$(gcc -dumpversion) + + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + return + fi + + #recheck version number of found gcov executable + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \ + awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + else + echo "Warning: Mismatched gcc and gcov detected. Using $GCOV_CMD" + fi + fi +} + +# Check to see if the kconfig has the required configs to generate a coverage report +check_gcov_conf() +{ + if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if ! grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + + if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then + echo "To enable gcov reports, please run "\ + "\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel" + else + # if we have the required kernel configs, proceed to check the environment to + # ensure we have the required gcov packages + check_gcov_env + fi +} + +# Kselftest framework requirement - SKIP code is 4. +check_conf_enabled() { + if ! grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 enabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf_disabled() { + if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 disabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf() { + check_conf_enabled CONFIG_NET_SCH_NETEM + check_conf_enabled CONFIG_VETH + check_conf_enabled CONFIG_NET_NS + check_conf_enabled CONFIG_RDS_TCP + check_conf_enabled CONFIG_RDS + check_conf_disabled CONFIG_MODULES +} + +check_env() +{ + if ! test -d "$obj_dir"; then + echo "selftests: [SKIP] This test requires a kernel source tree" + exit 4 + fi + if ! test -e "$kconfig"; then + echo "selftests: [SKIP] This test requires a configured kernel source tree" + exit 4 + fi + if ! which strace > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without strace" + exit 4 + fi + if ! which tcpdump > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without tcpdump" + exit 4 + fi + + if ! 
which python3 > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without python3" + exit 4 + fi + + python_major=$(python3 -c "import sys; print(sys.version_info[0])") + python_minor=$(python3 -c "import sys; print(sys.version_info[1])") + if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then + echo "selftests: [SKIP] Could not run test without at least python3.9" + python3 -V + exit 4 + fi +} + +LOG_DIR="$current_dir"/rds_logs +PLOSS=0 +PCORRUPT=0 +PDUP=0 +GENERATE_GCOV_REPORT=1 +while getopts "d:l:c:u:" opt; do + case ${opt} in + d) + LOG_DIR=${OPTARG} + ;; + l) + PLOSS=${OPTARG} + ;; + c) + PCORRUPT=${OPTARG} + ;; + u) + PDUP=${OPTARG} + ;; + :) + echo "USAGE: run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]" \ + "[-u packet_duplcate] [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}." + exit 1 + ;; + esac +done + + +check_env +check_conf +check_gcov_conf + + +rm -fr "$LOG_DIR" +TRACE_FILE="${LOG_DIR}/rds-strace.txt" +COVR_DIR="${LOG_DIR}/coverage/" +mkdir -p "$LOG_DIR" +mkdir -p "$COVR_DIR" + +set +e +echo running RDS tests... +echo Traces will be logged to "$TRACE_FILE" +rm -f "$TRACE_FILE" +strace -T -tt -o "$TRACE_FILE" python3 "$(dirname "$0")/test.py" --timeout 400 -d "$LOG_DIR" \ + -l "$PLOSS" -c "$PCORRUPT" -u "$PDUP" + +test_rc=$? +dmesg > "${LOG_DIR}/dmesg.out" + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + echo saving coverage data... + (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \ + while read -r f + do + cat < "/sys/kernel/debug/gcov/$f" > "/$f" + done) + + echo running gcovr... + gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \ + -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/" +else + echo "Coverage report will be skipped" +fi + +if [ "$test_rc" -eq 0 ]; then + echo "PASS: Test completed successfully" +else + echo "FAIL: Test failed" +fi + +exit "$test_rc" diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py new file mode 100644 index 000000000000..e6bb109bcead --- /dev/null +++ b/tools/testing/selftests/net/rds/test.py @@ -0,0 +1,262 @@ +#! /usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import argparse +import ctypes +import errno +import hashlib +import os +import select +import signal +import socket +import subprocess +import sys +import atexit +from pwd import getpwuid +from os import stat +from lib.py import ip + + +libc = ctypes.cdll.LoadLibrary('libc.so.6') +setns = libc.setns + +net0 = 'net0' +net1 = 'net1' + +veth0 = 'veth0' +veth1 = 'veth1' + +# Helper function for creating a socket inside a network namespace. +# We need this because otherwise RDS will detect that the two TCP +# sockets are on the same interface and use the loop transport instead +# of the TCP transport. +def netns_socket(netns, *args): + u0, u1 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) + + child = os.fork() + if child == 0: + # change network namespace + with open(f'/var/run/netns/{netns}') as f: + try: + ret = setns(f.fileno(), 0) + except IOError as e: + print(e.errno) + print(e) + + # create socket in target namespace + s = socket.socket(*args) + + # send resulting socket to parent + socket.send_fds(u0, [], [s.fileno()]) + + sys.exit(0) + + # receive socket from child + _, s, _, _ = socket.recv_fds(u1, 0, 1) + os.waitpid(child, 0) + u0.close() + u1.close() + return socket.fromfd(s[0], *args) + +def signal_handler(sig, frame): + print('Test timed out') + sys.exit(1) + +#Parse out command line arguments. 
We take an optional +# timeout parameter and an optional log output folder +parser = argparse.ArgumentParser(description="init script args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("-d", "--logdir", action="store", + help="directory to store logs", default="/tmp") +parser.add_argument('--timeout', help="timeout to terminate hung test", + type=int, default=0) +parser.add_argument('-l', '--loss', help="Simulate tcp packet loss", + type=int, default=0) +parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption", + type=int, default=0) +parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication", + type=int, default=0) +args = parser.parse_args() +logdir=args.logdir +packet_loss=str(args.loss)+'%' +packet_corruption=str(args.corruption)+'%' +packet_duplicate=str(args.duplicate)+'%' + +ip(f"netns add {net0}") +ip(f"netns add {net1}") +ip(f"link add type veth") + +addrs = [ + # we technically don't need different port numbers, but this will + # help identify traffic in the network analyzer + ('10.0.0.1', 10000), + ('10.0.0.2', 20000), +] + +# move interfaces to separate namespaces so they can no longer be +# bound directly; this prevents rds from switching over from the tcp +# transport to the loop transport. +ip(f"link set {veth0} netns {net0} up") +ip(f"link set {veth1} netns {net1} up") + + + +# add addresses +ip(f"-n {net0} addr add {addrs[0][0]}/32 dev {veth0}") +ip(f"-n {net1} addr add {addrs[1][0]}/32 dev {veth1}") + +# add routes +ip(f"-n {net0} route add {addrs[1][0]}/32 dev {veth0}") +ip(f"-n {net1} route add {addrs[0][0]}/32 dev {veth1}") + +# sanity check that our two interfaces/addresses are correctly set up +# and communicating by doing a single ping +ip(f"netns exec {net0} ping -c 1 {addrs[1][0]}") + +# Start a packet capture on each network +for net in [net0, net1]: + tcpdump_pid = os.fork() + if tcpdump_pid == 0: + pcap = logdir+'/'+net+'.pcap' + subprocess.check_call(['touch', pcap]) + user = getpwuid(stat(pcap).st_uid).pw_name + ip(f"netns exec {net} /usr/sbin/tcpdump -Z {user} -i any -w {pcap}") + sys.exit(0) + +# simulate packet loss, duplication and corruption +for net, iface in [(net0, veth0), (net1, veth1)]: + ip(f"netns exec {net} /usr/sbin/tc qdisc add dev {iface} root netem \ + corrupt {packet_corruption} loss {packet_loss} duplicate \ + {packet_duplicate}") + +# add a timeout +if args.timeout > 0: + signal.alarm(args.timeout) + signal.signal(signal.SIGALRM, signal_handler) + +sockets = [ + netns_socket(net0, socket.AF_RDS, socket.SOCK_SEQPACKET), + netns_socket(net1, socket.AF_RDS, socket.SOCK_SEQPACKET), +] + +for s, addr in zip(sockets, addrs): + s.bind(addr) + s.setblocking(0) + +fileno_to_socket = { + s.fileno(): s for s in sockets +} + +addr_to_socket = { + addr: s for addr, s in zip(addrs, sockets) +} + +socket_to_addr = { + s: addr for addr, s in zip(addrs, sockets) +} + +send_hashes = {} +recv_hashes = {} + +ep = select.epoll() + +for s in sockets: + ep.register(s, select.EPOLLRDNORM) + +n = 50000 +nr_send = 0 +nr_recv = 0 + +while nr_send < n: + # Send as much as we can without blocking + print("sending...", nr_send, nr_recv) + while nr_send < n: + send_data = hashlib.sha256( + f'packet {nr_send}'.encode('utf-8')).hexdigest().encode('utf-8') + + # pseudo-random send/receive pattern + sender = sockets[nr_send % 2] + receiver = sockets[1 - (nr_send % 3) % 2] + + try: + sender.sendto(send_data, socket_to_addr[receiver]) + send_hashes.setdefault((sender.fileno(), receiver.fileno()), + 
hashlib.sha256()).update(f'<{send_data}>'.encode('utf-8')) + nr_send = nr_send + 1 + except BlockingIOError as e: + break + except OSError as e: + if e.errno in [errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE]: + break + raise + + # Receive as much as we can without blocking + print("receiving...", nr_send, nr_recv) + while nr_recv < nr_send: + for fileno, eventmask in ep.poll(): + receiver = fileno_to_socket[fileno] + + if eventmask & select.EPOLLRDNORM: + while True: + try: + recv_data, address = receiver.recvfrom(1024) + sender = addr_to_socket[address] + recv_hashes.setdefault((sender.fileno(), + receiver.fileno()), hashlib.sha256()).update( + f'<{recv_data}>'.encode('utf-8')) + nr_recv = nr_recv + 1 + except BlockingIOError as e: + break + + # exercise net/rds/tcp.c:rds_tcp_sysctl_reset() + for net in [net0, net1]: + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000") + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000") + +print("done", nr_send, nr_recv) + +# the Python socket module doesn't know these +RDS_INFO_FIRST = 10000 +RDS_INFO_LAST = 10017 + +nr_success = 0 +nr_error = 0 + +for s in sockets: + for optname in range(RDS_INFO_FIRST, RDS_INFO_LAST + 1): + # Sigh, the Python socket module doesn't allow us to pass + # buffer lengths greater than 1024 for some reason. RDS + # wants multiple pages. + try: + s.getsockopt(socket.SOL_RDS, optname, 1024) + nr_success = nr_success + 1 + except OSError as e: + nr_error = nr_error + 1 + if e.errno == errno.ENOSPC: + # ignore + pass + +print(f"getsockopt(): {nr_success}/{nr_error}") + +print("Stopping network packet captures") +subprocess.check_call(['killall', '-q', 'tcpdump']) + +# We're done sending and receiving stuff, now let's check if what +# we received is what we sent. 
+for (sender, receiver), send_hash in send_hashes.items(): + recv_hash = recv_hashes.get((sender, receiver)) + + if recv_hash is None: + print("FAIL: No data received") + sys.exit(1) + + if send_hash.hexdigest() != recv_hash.hexdigest(): + print("FAIL: Send/recv mismatch") + print("hash expected:", send_hash.hexdigest()) + print("hash received:", recv_hash.hexdigest()) + sys.exit(1) + + print(f"{sender}/{receiver}: ok") + +print("Success") +sys.exit(0) diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 9eb42570294d..16ac4df55fdb 100644 --- a/tools/testing/selftests/net/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c @@ -57,6 +57,8 @@ static struct sof_flag sof_flags[] = { SOF_FLAG(SOF_TIMESTAMPING_SOFTWARE), SOF_FLAG(SOF_TIMESTAMPING_RX_SOFTWARE), SOF_FLAG(SOF_TIMESTAMPING_RX_HARDWARE), + SOF_FLAG(SOF_TIMESTAMPING_OPT_RX_FILTER), + SOF_FLAG(SOF_TIMESTAMPING_RAW_HARDWARE), }; static struct socket_type socket_types[] = { @@ -98,6 +100,22 @@ static struct test_case test_cases[] = { {} }, { + { .so_timestamping = SOF_TIMESTAMPING_RAW_HARDWARE + | SOF_TIMESTAMPING_OPT_RX_FILTER }, + {} + }, + { + { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE + | SOF_TIMESTAMPING_OPT_RX_FILTER }, + {} + }, + { + { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE + | SOF_TIMESTAMPING_RX_SOFTWARE + | SOF_TIMESTAMPING_OPT_RX_FILTER }, + { .swtstamp = true } + }, + { { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE }, { .swtstamp = true } diff --git a/tools/testing/selftests/net/sk_so_peek_off.c b/tools/testing/selftests/net/sk_so_peek_off.c new file mode 100644 index 000000000000..d87dd8d8d491 --- /dev/null +++ b/tools/testing/selftests/net/sk_so_peek_off.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include "../kselftest.h" + +static char *afstr(int af, int proto) +{ + if (proto == IPPROTO_TCP) + return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6"; + else + return af == AF_INET ? "UDP/IPv4" : "UDP/IPv6"; +} + +int sk_peek_offset_probe(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM); + int optv = 0; + int ret = 0; + int s; + + s = socket(af, type, proto); + if (s < 0) { + ksft_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = 1; + else + printf("%s does not support SO_PEEK_OFF\n", afstr(af, proto)); + close(s); + } + return ret; +} + +static void sk_peek_offset_set(int s, int offset) +{ + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + ksft_perror("Failed to set SO_PEEK_OFF value\n"); +} + +static int sk_peek_offset_get(int s) +{ + int offset; + socklen_t len = sizeof(offset); + + if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) + ksft_perror("Failed to get SO_PEEK_OFF value\n"); + return offset; +} + +static int sk_peek_offset_test(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? 
SOCK_STREAM : SOCK_DGRAM); + union { + struct sockaddr sa; + struct sockaddr_in a4; + struct sockaddr_in6 a6; + } a; + int res = 0; + int s[2] = {0, 0}; + int recv_sock = 0; + int offset = 0; + ssize_t len; + char buf[2]; + + memset(&a, 0, sizeof(a)); + a.sa.sa_family = af; + + s[0] = recv_sock = socket(af, type, proto); + s[1] = socket(af, type, proto); + + if (s[0] < 0 || s[1] < 0) { + ksft_perror("Temporary socket creation failed\n"); + goto out; + } + if (bind(s[0], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket bind() failed\n"); + goto out; + } + if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { + ksft_perror("Temporary socket getsockname() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP && listen(s[0], 0) < 0) { + ksft_perror("Temporary socket listen() failed\n"); + goto out; + } + if (connect(s[1], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket connect() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP) { + recv_sock = accept(s[0], NULL, NULL); + if (recv_sock <= 0) { + ksft_perror("Temporary socket accept() failed\n"); + goto out; + } + } + + /* Some basic tests of getting/setting offset */ + offset = sk_peek_offset_get(recv_sock); + if (offset != -1) { + ksft_perror("Initial value of socket offset not -1\n"); + goto out; + } + sk_peek_offset_set(recv_sock, 0); + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Failed to set socket offset to 0\n"); + goto out; + } + + /* Transfer a message */ + if (send(s[1], (char *)("ab"), 2, 0) != 2) { + ksft_perror("Temporary probe socket send() failed\n"); + goto out; + } + /* Read first byte */ + len = recv(recv_sock, buf, 1, MSG_PEEK); + if (len != 1 || buf[0] != 'a') { + ksft_perror("Failed to read first byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 1) { + ksft_perror("Offset not forwarded correctly at first byte\n"); + goto out; + } + /* Try to read beyond last byte */ + len = recv(recv_sock, buf, 2, MSG_PEEK); + if (len != 1 || buf[0] != 'b') { + ksft_perror("Failed to read last byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 2) { + ksft_perror("Offset not forwarded correctly at last byte\n"); + goto out; + } + /* Flush message */ + len = recv(recv_sock, buf, 2, MSG_TRUNC); + if (len != 2) { + ksft_perror("Failed to flush message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Offset not reverted correctly after flush\n"); + goto out; + } + + printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af, proto)); + res = 1; +out: + if (proto == IPPROTO_TCP && recv_sock >= 0) + close(recv_sock); + if (s[1] >= 0) + close(s[1]); + if (s[0] >= 0) + close(s[0]); + return res; +} + +static int do_test(int proto) +{ + int res4, res6; + + res4 = sk_peek_offset_probe(AF_INET, proto); + res6 = sk_peek_offset_probe(AF_INET6, proto); + + if (!res4 && !res6) + return KSFT_SKIP; + + if (res4) + res4 = sk_peek_offset_test(AF_INET, proto); + + if (res6) + res6 = sk_peek_offset_test(AF_INET6, proto); + + if (!res4 || !res6) + return KSFT_FAIL; + + return KSFT_PASS; +} + +int main(void) +{ + int restcp, resudp; + + restcp = do_test(IPPROTO_TCP); + resudp = do_test(IPPROTO_UDP); + if (restcp == KSFT_FAIL || resudp == KSFT_FAIL) + return KSFT_FAIL; + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index bd88b90b902b..5b0205c70c39 100644 --- 
a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -31,7 +31,8 @@ CFLAGS += $(KHDR_INCLUDES) CFLAGS += -iquote ./lib/ -I ../../../../include/ # Library -LIBSRC := kconfig.c netlink.c proc.c repair.c setup.c sock.c utils.c +LIBSRC := ftrace.c ftrace-tcp.c kconfig.c netlink.c +LIBSRC += proc.c repair.c setup.c sock.c utils.c LIBOBJ := $(LIBSRC:%.c=$(LIBDIR)/%.o) EXTRA_CLEAN += $(LIBOBJ) $(LIB) diff --git a/tools/testing/selftests/net/tcp_ao/bench-lookups.c b/tools/testing/selftests/net/tcp_ao/bench-lookups.c index a1e6e007c291..6736484996a3 100644 --- a/tools/testing/selftests/net/tcp_ao/bench-lookups.c +++ b/tools/testing/selftests/net/tcp_ao/bench-lookups.c @@ -355,6 +355,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(30, server_fn, client_fn); + test_init(31, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config index d3277a9de987..3605e38711cb 100644 --- a/tools/testing/selftests/net/tcp_ao/config +++ b/tools/testing/selftests/net/tcp_ao/config @@ -7,4 +7,5 @@ CONFIG_NET_L3_MASTER_DEV=y CONFIG_NET_VRF=y CONFIG_TCP_AO=y CONFIG_TCP_MD5SIG=y +CONFIG_TRACEPOINTS=y CONFIG_VETH=m diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 185a2f6e5ff3..d418162d335f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -71,10 +71,12 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, } } + synchronize_threads(); /* before counter checks */ if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt2)) test_error("test_get_tcp_ao_counters()"); close(lsk); + if (pwd) test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); @@ -84,10 +86,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } @@ -180,6 +182,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? 
TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* before counter checks */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) { test_ok("%s: connect() was prevented", tst_name); @@ -212,30 +215,44 @@ out: static void *client_fn(void *arg) { - union tcp_addr wrong_addr, network_addr; + union tcp_addr wrong_addr, network_addr, addr_any = {}; unsigned int port = test_server_port; if (inet_pton(TEST_FAMILY, TEST_WRONG_IP, &wrong_addr) != 1) test_error("Can't convert ip address %s", TEST_WRONG_IP); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Non-AO server + AO client", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server + Non-AO client", port++, NULL, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong password", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong rcv id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_sk_expect(TCP_AO_SYNACK_NO_KEY, this_ip_dest, addr_any, + port, 0, 100, 100); try_connect("Wrong snd id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_WRONG_MACLEN, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Different maclen", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Server: Wrong addr", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); @@ -259,6 +276,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(21, server_fn, client_fn); + test_init(22, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index 81653b47f303..f1d8d29e393f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -67,14 +67,14 @@ static void *client_fn(void *arg) netstat_free(ns_after); if (nr_packets > (after_aogood - before_aogood)) { - test_fail("TCPAOGood counter mismatch: %zu > (%zu - %zu)", + test_fail("TCPAOGood counter mismatch: %zu > (%" PRIu64 " - %" PRIu64 ")", nr_packets, after_aogood, before_aogood); return NULL; } if (test_tcp_ao_counters_cmp("connect", &ao1, &ao2, TEST_CNT_GOOD)) return NULL; - test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %" PRIu64, + test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %zu", before_aogood, ao1.ao_info_pkt_good, ao1.key_cnts[0].pkt_good, after_aogood, ao2.ao_info_pkt_good, @@ -85,6 +85,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(1, server_fn, client_fn); + test_init(2, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/icmps-discard.c 
b/tools/testing/selftests/net/tcp_ao/icmps-discard.c index d69bcba3c929..a1614f0d8c44 100644 --- a/tools/testing/selftests/net/tcp_ao/icmps-discard.c +++ b/tools/testing/selftests/net/tcp_ao/icmps-discard.c @@ -444,6 +444,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(3, server_fn, client_fn); + test_init(4, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c index 24e62120b792..d4385b52c10b 100644 --- a/tools/testing/selftests/net/tcp_ao/key-management.c +++ b/tools/testing/selftests/net/tcp_ao/key-management.c @@ -965,7 +965,7 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys, synchronize_threads(); /* 5: counters */ } -static void try_unmatched_keys(int sk, int *rnext_index) +static void try_unmatched_keys(int sk, int *rnext_index, unsigned int port) { struct test_key *key; unsigned int i = 0; @@ -1013,6 +1013,9 @@ static void try_unmatched_keys(int sk, int *rnext_index) test_error("all keys on server match the client"); if (test_set_key(sk, -1, key->server_keyid)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + -1, key->server_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); *rnext_index = i; @@ -1054,6 +1057,10 @@ static void check_current_back(const char *tst_name, unsigned int port, return; if (test_set_key(sk, collection.keys[rotate_to_index].client_keyid, -1)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, + collection.keys[rotate_to_index].client_keyid, + collection.keys[current_index].client_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); /* There is a race here: between setting the current_key with @@ -1085,6 +1092,11 @@ static void roll_over_keys(const char *tst_name, unsigned int port, for (i = rnext_index + 1; rotations > 0; i++, rotations--) { if (i >= collection.nr_keys) i = 0; + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, + this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + i == 0 ? 
-1 : collection.keys[i - 1].server_keyid, + collection.keys[i].server_keyid, -1); if (test_set_key(sk, -1, collection.keys[i].server_keyid)) test_error("Can't change the Rnext key"); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) { @@ -1124,7 +1136,7 @@ static void try_client_match(const char *tst_name, unsigned int port, rnext_index, msg_len, nr_packets); if (sk < 0) return; - try_unmatched_keys(sk, &rnext_index); + try_unmatched_keys(sk, &rnext_index, port); end_client(tst_name, sk, nr_keys, current_index, rnext_index, NULL); } @@ -1181,6 +1193,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, server_fn, client_fn); + test_init(121, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index fbc7f6111815..db44e77428dd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -37,17 +37,58 @@ extern void __test_xfail(const char *buf); extern void __test_error(const char *buf); extern void __test_skip(const char *buf); -__attribute__((__format__(__printf__, 2, 3))) -static inline void __test_print(void (*fn)(const char *), const char *fmt, ...) +static inline char *test_snprintf(const char *fmt, va_list vargs) { -#define TEST_MSG_BUFFER_SIZE 4096 - char buf[TEST_MSG_BUFFER_SIZE]; - va_list arg; - - va_start(arg, fmt); - vsnprintf(buf, sizeof(buf), fmt, arg); - va_end(arg); - fn(buf); + char *ret = NULL; + size_t size = 0; + va_list tmp; + int n = 0; + + va_copy(tmp, vargs); + n = vsnprintf(ret, size, fmt, tmp); + if (n < 0) + return NULL; + + size = n + 1; + ret = malloc(size); + if (!ret) + return NULL; + + n = vsnprintf(ret, size, fmt, vargs); + if (n < 0 || n > size - 1) { + free(ret); + return NULL; + } + return ret; +} + +static __printf(1, 2) inline char *test_sprintf(const char *fmt, ...) +{ + va_list vargs; + char *ret; + + va_start(vargs, fmt); + ret = test_snprintf(fmt, vargs); + va_end(vargs); + + return ret; +} + +static __printf(2, 3) inline void __test_print(void (*fn)(const char *), + const char *fmt, ...) +{ + va_list vargs; + char *msg; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + + if (!msg) + return; + + fn(msg); + free(msg); } #define test_print(fmt, ...) \ @@ -103,6 +144,7 @@ enum test_needs_kconfig { KCONFIG_TCP_AO, /* required */ KCONFIG_TCP_MD5, /* optional, for TCP-MD5 features */ KCONFIG_NET_VRF, /* optional, for L3/VRF testing */ + KCONFIG_FTRACE, /* optional, for tracepoints checks */ __KCONFIG_LAST__ }; extern bool kernel_config_has(enum test_needs_kconfig k); @@ -142,6 +184,8 @@ static inline void test_init2(unsigned int ntests, __test_init(ntests, family, prefix, taddr1, taddr2, peer1, peer2); } extern void test_add_destructor(void (*d)(void)); +extern void test_init_ftrace(int nsfd1, int nsfd2); +extern int test_setup_tracing(void); /* To adjust optmem socket limit, approximately estimate a number, * that is bigger than sizeof(struct tcp_ao_key). 
@@ -216,12 +260,17 @@ static inline void test_init(unsigned int ntests, } extern void synchronize_threads(void); extern void switch_ns(int fd); +extern int switch_save_ns(int fd); +extern void switch_close_ns(int fd); extern __thread union tcp_addr this_ip_addr; extern __thread union tcp_addr this_ip_dest; extern int test_family; extern void randomize_buffer(void *buf, size_t buflen); +extern __printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...); + extern int open_netns(void); extern int unshare_open_netns(void); extern const char veth_name[]; @@ -602,4 +651,115 @@ static inline int test_add_repaired_key(int sk, return test_verify_socket_key(sk, &tmp); } +#define DEFAULT_FTRACE_BUFFER_KB 10000 +#define DEFAULT_TRACER_LINES_ARR 200 +struct test_ftracer; +extern uint64_t ns_cookie1, ns_cookie2; + +enum ftracer_op { + FTRACER_LINE_DISCARD = 0, + FTRACER_LINE_PRESERVE, + FTRACER_EXIT, +}; + +extern struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb); +extern int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter); +extern void destroy_ftracer(struct test_ftracer *tracer); +extern const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer); +extern const char **tracer_get_savedlines(struct test_ftracer *tracer); + +enum trace_events { + /* TCP_HASH_EVENT */ + TCP_HASH_BAD_HEADER = 0, + TCP_HASH_MD5_REQUIRED, + TCP_HASH_MD5_UNEXPECTED, + TCP_HASH_MD5_MISMATCH, + TCP_HASH_AO_REQUIRED, + /* TCP_AO_EVENT */ + TCP_AO_HANDSHAKE_FAILURE, + TCP_AO_WRONG_MACLEN, + TCP_AO_MISMATCH, + TCP_AO_KEY_NOT_FOUND, + TCP_AO_RNEXT_REQUEST, + /* TCP_AO_EVENT_SK */ + TCP_AO_SYNACK_NO_KEY, + /* TCP_AO_EVENT_SNE */ + TCP_AO_SND_SNE_UPDATE, + TCP_AO_RCV_SNE_UPDATE, + __MAX_TRACE_EVENTS +}; + +extern int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne); + +static inline void trace_hash_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + -1, -1, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + keyid, rnext, maclen, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sk_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, + int keyid, int rnext) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + keyid, rnext, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sne_expect(enum trace_events type, + union tcp_addr src, 
union tcp_addr dst, + int src_port, int dst_port, int sne) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + -1, -1, -1, sne); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +extern int setup_aolib_ftracer(void); + #endif /* _AOLIB_H_ */ diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c new file mode 100644 index 000000000000..24380c68fec6 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <inttypes.h> +#include <pthread.h> +#include "aolib.h" + +static const char *trace_event_names[__MAX_TRACE_EVENTS] = { + /* TCP_HASH_EVENT */ + "tcp_hash_bad_header", + "tcp_hash_md5_required", + "tcp_hash_md5_unexpected", + "tcp_hash_md5_mismatch", + "tcp_hash_ao_required", + /* TCP_AO_EVENT */ + "tcp_ao_handshake_failure", + "tcp_ao_wrong_maclen", + "tcp_ao_mismatch", + "tcp_ao_key_not_found", + "tcp_ao_rnext_request", + /* TCP_AO_EVENT_SK */ + "tcp_ao_synack_no_key", + /* TCP_AO_EVENT_SNE */ + "tcp_ao_snd_sne_update", + "tcp_ao_rcv_sne_update" +}; + +struct expected_trace_point { + /* required */ + enum trace_events type; + int family; + union tcp_addr src; + union tcp_addr dst; + + /* optional */ + int src_port; + int dst_port; + int L3index; + + int fin; + int syn; + int rst; + int psh; + int ack; + + int keyid; + int rnext; + int maclen; + int sne; + + size_t matched; +}; + +static struct expected_trace_point *exp_tps; +static size_t exp_tps_nr; +static size_t exp_tps_size; +static pthread_mutex_t exp_tps_mutex = PTHREAD_MUTEX_INITIALIZER; + +int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne) +{ + struct expected_trace_point new_tp = { + .type = type, + .family = family, + .src = src, + .dst = dst, + .src_port = src_port, + .dst_port = dst_port, + .L3index = L3index, + .fin = fin, + .syn = syn, + .rst = rst, + .psh = psh, + .ack = ack, + .keyid = keyid, + .rnext = rnext, + .maclen = maclen, + .sne = sne, + .matched = 0, + }; + int ret = 0; + + if (!kernel_config_has(KCONFIG_FTRACE)) + return 0; + + pthread_mutex_lock(&exp_tps_mutex); + if (exp_tps_nr == exp_tps_size) { + struct expected_trace_point *tmp; + + if (exp_tps_size == 0) + exp_tps_size = 10; + else + exp_tps_size = exp_tps_size * 1.6; + + tmp = reallocarray(exp_tps, exp_tps_size, sizeof(exp_tps[0])); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + exp_tps = tmp; + } + exp_tps[exp_tps_nr] = new_tp; + exp_tps_nr++; +out: + pthread_mutex_unlock(&exp_tps_mutex); + return ret; +} + +static void free_expected_events(void) +{ + /* We're from the process destructor - not taking the mutex */ + exp_tps_size = 0; + exp_tps = NULL; + free(exp_tps); +} + +struct trace_point { + int family; + union tcp_addr src; + union tcp_addr dst; + unsigned int src_port; + unsigned int dst_port; + int L3index; + unsigned int fin:1, + syn:1, + rst:1, + psh:1, + ack:1; + + unsigned int keyid; + unsigned int rnext; + unsigned int maclen; + + unsigned int sne; +}; + +static bool lookup_expected_event(int event_type, struct trace_point *e) +{ + size_t i; + + pthread_mutex_lock(&exp_tps_mutex); + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *p = &exp_tps[i]; + size_t sk_size; + + if (p->type != event_type) + continue; + 
if (p->family != e->family) + continue; + if (p->family == AF_INET) + sk_size = sizeof(p->src.a4); + else + sk_size = sizeof(p->src.a6); + if (memcmp(&p->src, &e->src, sk_size)) + continue; + if (memcmp(&p->dst, &e->dst, sk_size)) + continue; + if (p->src_port >= 0 && p->src_port != e->src_port) + continue; + if (p->dst_port >= 0 && p->dst_port != e->dst_port) + continue; + if (p->L3index >= 0 && p->L3index != e->L3index) + continue; + + if (p->fin >= 0 && p->fin != e->fin) + continue; + if (p->syn >= 0 && p->syn != e->syn) + continue; + if (p->rst >= 0 && p->rst != e->rst) + continue; + if (p->psh >= 0 && p->psh != e->psh) + continue; + if (p->ack >= 0 && p->ack != e->ack) + continue; + + if (p->keyid >= 0 && p->keyid != e->keyid) + continue; + if (p->rnext >= 0 && p->rnext != e->rnext) + continue; + if (p->maclen >= 0 && p->maclen != e->maclen) + continue; + if (p->sne >= 0 && p->sne != e->sne) + continue; + p->matched++; + pthread_mutex_unlock(&exp_tps_mutex); + return true; + } + pthread_mutex_unlock(&exp_tps_mutex); + return false; +} + +static int check_event_type(const char *line) +{ + size_t i; + + /* + * This should have been a set or hashmap, but it's a selftest, + * so... KISS. + */ + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!strncmp(trace_event_names[i], line, strlen(trace_event_names[i]))) + return i; + } + return -1; +} + +static bool event_has_flags(enum trace_events event) +{ + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case TCP_HASH_AO_REQUIRED: + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: + return true; + default: + return false; + } +} + +static int tracer_ip_split(int family, char *src, char **addr, char **port) +{ + char *p; + + if (family == AF_INET) { + /* fomat is <addr>:port, i.e.: 10.0.254.1:7015 */ + *addr = src; + p = strchr(src, ':'); + if (!p) { + test_print("Couldn't parse trace event addr:port %s", src); + return -EINVAL; + } + *p++ = '\0'; + *port = p; + return 0; + } + if (family != AF_INET6) + return -EAFNOSUPPORT; + + /* format is [<addr>]:port, i.e.: [2001:db8:254::1]:7013 */ + *addr = strchr(src, '['); + p = strchr(src, ']'); + + if (!p || !*addr) { + test_print("Couldn't parse trace event [addr]:port %s", src); + return -EINVAL; + } + + *addr = *addr + 1; /* '[' */ + *p++ = '\0'; /* ']' */ + if (*p != ':') { + test_print("Couldn't parse trace event :port %s", p); + return -EINVAL; + } + *p++ = '\0'; /* ':' */ + *port = p; + return 0; +} + +static int tracer_scan_address(int family, char *src, + union tcp_addr *dst, unsigned int *port) +{ + char *addr, *port_str; + int ret; + + ret = tracer_ip_split(family, src, &addr, &port_str); + if (ret) + return ret; + + if (inet_pton(family, addr, dst) != 1) { + test_print("Couldn't parse trace event addr %s", addr); + return -EINVAL; + } + errno = 0; + *port = (unsigned int)strtoul(port_str, NULL, 10); + if (errno != 0) { + test_print("Couldn't parse trace event port %s", port_str); + return -errno; + } + return 0; +} + +static int tracer_scan_event(const char *line, enum trace_events event, + struct trace_point *out) +{ + char *src = NULL, *dst = NULL, *family = NULL; + char fin, syn, rst, psh, ack; + int nr_matched, ret = 0; + uint64_t netns_cookie; + + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case 
TCP_HASH_AO_REQUIRED: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c]", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack); + if (nr_matched != 10) + test_print("Couldn't parse trace event, matched = %d/10", + nr_matched); + break; + } + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack, + &out->keyid, &out->rnext, &out->maclen); + if (nr_matched != 13) + test_print("Couldn't parse trace event, matched = %d/13", + nr_matched); + break; + } + case TCP_AO_SYNACK_NO_KEY: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms keyid=%u rnext=%u", + &netns_cookie, &family, + &src, &dst, &out->keyid, &out->rnext); + if (nr_matched != 6) + test_print("Couldn't parse trace event, matched = %d/6", + nr_matched); + break; + } + case TCP_AO_SND_SNE_UPDATE: + case TCP_AO_RCV_SNE_UPDATE: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms sne=%u", + &netns_cookie, &family, + &src, &dst, &out->sne); + if (nr_matched != 5) + test_print("Couldn't parse trace event, matched = %d/5", + nr_matched); + break; + } + default: + return -1; + } + + if (family) { + if (!strcmp(family, "AF_INET")) { + out->family = AF_INET; + } else if (!strcmp(family, "AF_INET6")) { + out->family = AF_INET6; + } else { + test_print("Couldn't parse trace event family %s", family); + ret = -EINVAL; + goto out_free; + } + } + + if (event_has_flags(event)) { + out->fin = (fin == 'F'); + out->syn = (syn == 'S'); + out->rst = (rst == 'R'); + out->psh = (psh == 'P'); + out->ack = (ack == '.'); + + if ((fin != 'F' && fin != ' ') || + (syn != 'S' && syn != ' ') || + (rst != 'R' && rst != ' ') || + (psh != 'P' && psh != ' ') || + (ack != '.' && ack != ' ')) { + test_print("Couldn't parse trace event flags %c%c%c%c%c", + fin, syn, rst, psh, ack); + ret = -EINVAL; + goto out_free; + } + } + + if (src && tracer_scan_address(out->family, src, &out->src, &out->src_port)) { + ret = -EINVAL; + goto out_free; + } + + if (dst && tracer_scan_address(out->family, dst, &out->dst, &out->dst_port)) { + ret = -EINVAL; + goto out_free; + } + + if (netns_cookie != ns_cookie1 && netns_cookie != ns_cookie2) { + test_print("Net namespace filter for trace event didn't work: %" PRIu64 " != %" PRIu64 " OR %" PRIu64, + netns_cookie, ns_cookie1, ns_cookie2); + ret = -EINVAL; + } + +out_free: + free(src); + free(dst); + free(family); + return ret; +} + +static enum ftracer_op aolib_tracer_process_event(const char *line) +{ + int event_type = check_event_type(line); + struct trace_point tmp = {}; + + if (event_type < 0) + return FTRACER_LINE_PRESERVE; + + if (tracer_scan_event(line, event_type, &tmp)) + return FTRACER_LINE_PRESERVE; + + return lookup_expected_event(event_type, &tmp) ? 
+ FTRACER_LINE_DISCARD : FTRACER_LINE_PRESERVE; +} + +static void dump_trace_event(struct expected_trace_point *e) +{ + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; + + if (!inet_ntop(e->family, &e->src, src, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + if (!inet_ntop(e->family, &e->dst, dst, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + test_print("trace event filter %s [%s:%d => %s:%d, L3index %d, flags: %s%s%s%s%s, keyid: %d, rnext: %d, maclen: %d, sne: %d] = %zu", + trace_event_names[e->type], + src, e->src_port, dst, e->dst_port, e->L3index, + (e->fin > 0) ? "F" : (e->fin == 0) ? "!F" : "", + (e->syn > 0) ? "S" : (e->syn == 0) ? "!S" : "", + (e->rst > 0) ? "R" : (e->rst == 0) ? "!R" : "", + (e->psh > 0) ? "P" : (e->psh == 0) ? "!P" : "", + (e->ack > 0) ? "." : (e->ack == 0) ? "!." : "", + e->keyid, e->rnext, e->maclen, e->sne, e->matched); +} + +static void print_match_stats(bool unexpected_events) +{ + size_t matches_per_type[__MAX_TRACE_EVENTS] = {}; + bool expected_but_none = false; + size_t i, total_matched = 0; + char *stat_line = NULL; + + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *e = &exp_tps[i]; + + total_matched += e->matched; + matches_per_type[e->type] += e->matched; + if (!e->matched) + expected_but_none = true; + } + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!matches_per_type[i]) + continue; + stat_line = test_sprintf("%s%s[%zu] ", stat_line ?: "", + trace_event_names[i], + matches_per_type[i]); + if (!stat_line) + test_error("test_sprintf()"); + } + + if (unexpected_events || expected_but_none) { + for (i = 0; i < exp_tps_nr; i++) + dump_trace_event(&exp_tps[i]); + } + + if (unexpected_events) + return; + + if (expected_but_none) + test_fail("Some trace events were expected, but didn't occur"); + else if (total_matched) + test_ok("Trace events matched expectations: %zu %s", + total_matched, stat_line); + else + test_ok("No unexpected trace events during the test run"); +} + +#define dump_events(fmt, ...) 
\ + __test_print(__test_msg, fmt, ##__VA_ARGS__) +static void check_free_events(struct test_ftracer *tracer) +{ + const char **lines; + size_t nr; + + if (!kernel_config_has(KCONFIG_FTRACE)) { + test_skip("kernel config doesn't have ftrace - no checks"); + return; + } + + nr = tracer_get_savedlines_nr(tracer); + lines = tracer_get_savedlines(tracer); + print_match_stats(!!nr); + if (!nr) + return; + + errno = 0; + test_xfail("Trace events [%zu] were not expected:", nr); + while (nr) + dump_events("\t%s", lines[--nr]); +} + +static int setup_tcp_trace_events(struct test_ftracer *tracer) +{ + char *filter; + size_t i; + int ret; + + filter = test_sprintf("net_cookie == %zu || net_cookie == %zu", + ns_cookie1, ns_cookie2); + if (!filter) + return -ENOMEM; + + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + char *event_name = test_sprintf("tcp/%s", trace_event_names[i]); + + if (!event_name) { + ret = -ENOMEM; + break; + } + ret = setup_trace_event(tracer, event_name, filter); + free(event_name); + if (ret) + break; + } + + free(filter); + return ret; +} + +static void aolib_tracer_destroy(struct test_ftracer *tracer) +{ + check_free_events(tracer); + free_expected_events(); +} + +static bool aolib_tracer_expecting_more(void) +{ + size_t i; + + for (i = 0; i < exp_tps_nr; i++) + if (!exp_tps[i].matched) + return true; + return false; +} + +int setup_aolib_ftracer(void) +{ + struct test_ftracer *f; + + f = create_ftracer("aolib", aolib_tracer_process_event, + aolib_tracer_destroy, aolib_tracer_expecting_more, + DEFAULT_FTRACE_BUFFER_KB, DEFAULT_TRACER_LINES_ARR); + if (!f) + return -1; + + return setup_tcp_trace_events(f); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c new file mode 100644 index 000000000000..e4d0b173bc94 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <inttypes.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/time.h> +#include <unistd.h> +#include "../../../../../include/linux/kernel.h" +#include "aolib.h" + +static char ftrace_path[] = "ksft-ftrace-XXXXXX"; +static bool ftrace_mounted; +uint64_t ns_cookie1, ns_cookie2; + +struct test_ftracer { + pthread_t tracer_thread; + int error; + char *instance_path; + FILE *trace_pipe; + + enum ftracer_op (*process_line)(const char *line); + void (*destructor)(struct test_ftracer *tracer); + bool (*expecting_more)(void); + + char **saved_lines; + size_t saved_lines_size; + size_t next_line_ind; + + pthread_cond_t met_all_expected; + pthread_mutex_t met_all_expected_lock; + + struct test_ftracer *next; +}; + +static struct test_ftracer *ftracers; +static pthread_mutex_t ftracers_lock = PTHREAD_MUTEX_INITIALIZER; + +static int mount_ftrace(void) +{ + if (!mkdtemp(ftrace_path)) + test_error("Can't create temp dir"); + + if (mount("tracefs", ftrace_path, "tracefs", 0, "rw")) + return -errno; + + ftrace_mounted = true; + + return 0; +} + +static void unmount_ftrace(void) +{ + if (ftrace_mounted && umount(ftrace_path)) + test_print("Failed on cleanup: can't unmount tracefs: %m"); + + if (rmdir(ftrace_path)) + test_error("Failed on cleanup: can't remove ftrace dir %s", + ftrace_path); +} + +struct opts_list_t { + char *opt_name; + struct opts_list_t *next; +}; + +static int disable_trace_options(const char *ftrace_path) +{ + struct opts_list_t *opts_list = NULL; + char *fopts, *line = NULL; + size_t buf_len = 0; + 
ssize_t line_len; + int ret = 0; + FILE *opts; + + fopts = test_sprintf("%s/%s", ftrace_path, "trace_options"); + if (!fopts) + return -ENOMEM; + + opts = fopen(fopts, "r+"); + if (!opts) { + ret = -errno; + goto out_free; + } + + while ((line_len = getline(&line, &buf_len, opts)) != -1) { + struct opts_list_t *tmp; + + if (!strncmp(line, "no", 2)) + continue; + + tmp = malloc(sizeof(*tmp)); + if (!tmp) { + ret = -ENOMEM; + goto out_free_opts_list; + } + tmp->next = opts_list; + tmp->opt_name = test_sprintf("no%s", line); + if (!tmp->opt_name) { + ret = -ENOMEM; + free(tmp); + goto out_free_opts_list; + } + opts_list = tmp; + } + + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + fseek(opts, 0, SEEK_SET); + fwrite(tmp->opt_name, 1, strlen(tmp->opt_name), opts); + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + +out_free_opts_list: + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + free(line); + fclose(opts); +out_free: + free(fopts); + return ret; +} + +static int setup_buffer_size(const char *ftrace_path, size_t sz) +{ + char *fbuf_size = test_sprintf("%s/buffer_size_kb", ftrace_path); + int ret; + + if (!fbuf_size) + return -1; + + ret = test_echo(fbuf_size, 0, "%zu", sz); + free(fbuf_size); + return ret; +} + +static int setup_ftrace_instance(struct test_ftracer *tracer, const char *name) +{ + char *tmp; + + tmp = test_sprintf("%s/instances/ksft-%s-XXXXXX", ftrace_path, name); + if (!tmp) + return -ENOMEM; + + tracer->instance_path = mkdtemp(tmp); + if (!tracer->instance_path) { + free(tmp); + return -errno; + } + + return 0; +} + +static void remove_ftrace_instance(struct test_ftracer *tracer) +{ + if (rmdir(tracer->instance_path)) + test_print("Failed on cleanup: can't remove ftrace instance %s", + tracer->instance_path); + free(tracer->instance_path); +} + +static void tracer_cleanup(void *arg) +{ + struct test_ftracer *tracer = arg; + + fclose(tracer->trace_pipe); +} + +static void tracer_set_error(struct test_ftracer *tracer, int error) +{ + if (!tracer->error) + tracer->error = error; +} + +const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer) +{ + return tracer->next_line_ind; +} + +const char **tracer_get_savedlines(struct test_ftracer *tracer) +{ + return (const char **)tracer->saved_lines; +} + +static void *tracer_thread_func(void *arg) +{ + struct test_ftracer *tracer = arg; + + pthread_cleanup_push(tracer_cleanup, arg); + + while (tracer->next_line_ind < tracer->saved_lines_size) { + char **lp = &tracer->saved_lines[tracer->next_line_ind]; + enum ftracer_op op; + size_t buf_len = 0; + ssize_t line_len; + + line_len = getline(lp, &buf_len, tracer->trace_pipe); + if (line_len == -1) + break; + + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + op = tracer->process_line(*lp); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + if (tracer->expecting_more) { + pthread_mutex_lock(&tracer->met_all_expected_lock); + if (!tracer->expecting_more()) + pthread_cond_signal(&tracer->met_all_expected); + pthread_mutex_unlock(&tracer->met_all_expected_lock); + } + + if (op == FTRACER_LINE_DISCARD) + continue; + if (op == FTRACER_EXIT) + break; + if (op != FTRACER_LINE_PRESERVE) + test_error("unexpected tracer command %d", op); + + tracer->next_line_ind++; + buf_len = 0; + } + test_print("too many lines in ftracer buffer %zu, exiting tracer", + tracer->next_line_ind); + + pthread_cleanup_pop(1); + return NULL; +} + +static int 
setup_trace_thread(struct test_ftracer *tracer) +{ + int ret = 0; + char *path; + + path = test_sprintf("%s/trace_pipe", tracer->instance_path); + if (!path) + return -ENOMEM; + + tracer->trace_pipe = fopen(path, "r"); + if (!tracer->trace_pipe) { + ret = -errno; + goto out_free; + } + + if (pthread_create(&tracer->tracer_thread, NULL, + tracer_thread_func, (void *)tracer)) { + ret = -errno; + fclose(tracer->trace_pipe); + } + +out_free: + free(path); + return ret; +} + +static void stop_trace_thread(struct test_ftracer *tracer) +{ + void *res; + + if (pthread_cancel(tracer->tracer_thread)) { + test_print("Can't stop tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (pthread_join(tracer->tracer_thread, &res)) { + test_print("Can't join tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (res != PTHREAD_CANCELED) { + test_print("Tracer thread wasn't canceled"); + tracer_set_error(tracer, -errno); + } + if (tracer->error) + test_fail("tracer errored by %s", strerror(tracer->error)); +} + +static void final_wait_for_events(struct test_ftracer *tracer, + unsigned timeout_sec) +{ + struct timespec timeout; + struct timeval now; + int ret = 0; + + if (!tracer->expecting_more) + return; + + pthread_mutex_lock(&tracer->met_all_expected_lock); + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + timeout_sec; + timeout.tv_nsec = now.tv_usec * 1000; + + while (tracer->expecting_more() && ret != ETIMEDOUT) + ret = pthread_cond_timedwait(&tracer->met_all_expected, + &tracer->met_all_expected_lock, &timeout); + pthread_mutex_unlock(&tracer->met_all_expected_lock); +} + +int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter) +{ + char *enable_path, *filter_path, *instance = tracer->instance_path; + int ret; + + enable_path = test_sprintf("%s/events/%s/enable", instance, event); + if (!enable_path) + return -ENOMEM; + + filter_path = test_sprintf("%s/events/%s/filter", instance, event); + if (!filter_path) { + ret = -ENOMEM; + goto out_free; + } + + ret = test_echo(filter_path, 0, "%s", filter); + if (!ret) + ret = test_echo(enable_path, 0, "1"); + +out_free: + free(filter_path); + free(enable_path); + return ret; +} + +struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb) +{ + struct test_ftracer *tracer; + int err; + + /* XXX: separate __create_ftracer() helper and do here + * if (!kernel_config_has(KCONFIG_FTRACE)) + * return NULL; + */ + + tracer = malloc(sizeof(*tracer)); + if (!tracer) { + test_print("malloc()"); + return NULL; + } + + memset(tracer, 0, sizeof(*tracer)); + + err = setup_ftrace_instance(tracer, name); + if (err) { + test_print("setup_ftrace_instance(): %d", err); + goto err_free; + } + + err = disable_trace_options(tracer->instance_path); + if (err) { + test_print("disable_trace_options(): %d", err); + goto err_remove; + } + + err = setup_buffer_size(tracer->instance_path, buffer_size_kb); + if (err) { + test_print("disable_trace_options(): %d", err); + goto err_remove; + } + + tracer->saved_lines = calloc(lines_buf_sz, sizeof(tracer->saved_lines[0])); + if (!tracer->saved_lines) { + test_print("calloc()"); + goto err_remove; + } + tracer->saved_lines_size = lines_buf_sz; + + tracer->process_line = process_line; + tracer->destructor = destructor; + tracer->expecting_more = expecting_more; + + err = 
pthread_cond_init(&tracer->met_all_expected, NULL); + if (err) { + test_print("pthread_cond_init(): %d", err); + goto err_free_lines; + } + + err = pthread_mutex_init(&tracer->met_all_expected_lock, NULL); + if (err) { + test_print("pthread_mutex_init(): %d", err); + goto err_cond_destroy; + } + + err = setup_trace_thread(tracer); + if (err) { + test_print("setup_trace_thread(): %d", err); + goto err_mutex_destroy; + } + + pthread_mutex_lock(&ftracers_lock); + tracer->next = ftracers; + ftracers = tracer; + pthread_mutex_unlock(&ftracers_lock); + + return tracer; + +err_mutex_destroy: + pthread_mutex_destroy(&tracer->met_all_expected_lock); +err_cond_destroy: + pthread_cond_destroy(&tracer->met_all_expected); +err_free_lines: + free(tracer->saved_lines); +err_remove: + remove_ftrace_instance(tracer); +err_free: + free(tracer); + return NULL; +} + +static void __destroy_ftracer(struct test_ftracer *tracer) +{ + size_t i; + + final_wait_for_events(tracer, TEST_TIMEOUT_SEC); + stop_trace_thread(tracer); + remove_ftrace_instance(tracer); + if (tracer->destructor) + tracer->destructor(tracer); + for (i = 0; i < tracer->saved_lines_size; i++) + free(tracer->saved_lines[i]); + pthread_cond_destroy(&tracer->met_all_expected); + pthread_mutex_destroy(&tracer->met_all_expected_lock); + free(tracer); +} + +void destroy_ftracer(struct test_ftracer *tracer) +{ + pthread_mutex_lock(&ftracers_lock); + if (tracer == ftracers) { + ftracers = tracer->next; + } else { + struct test_ftracer *f = ftracers; + + while (f->next != tracer) { + if (!f->next) + test_error("tracers list corruption or double free %p", tracer); + f = f->next; + } + f->next = tracer->next; + } + tracer->next = NULL; + pthread_mutex_unlock(&ftracers_lock); + __destroy_ftracer(tracer); +} + +static void destroy_all_ftracers(void) +{ + struct test_ftracer *f; + + pthread_mutex_lock(&ftracers_lock); + f = ftracers; + ftracers = NULL; + pthread_mutex_unlock(&ftracers_lock); + + while (f) { + struct test_ftracer *n = f->next; + + f->next = NULL; + __destroy_ftracer(f); + f = n; + } +} + +static void test_unset_tracing(void) +{ + destroy_all_ftracers(); + unmount_ftrace(); +} + +int test_setup_tracing(void) +{ + /* + * Just a basic protection - this should be called only once from + * lib/kconfig. Not thread safe, which is fine as it's early, before + * threads are created. 
+ */ + static int already_set; + int err; + + if (already_set) + return -1; + + /* Needs net-namespace cookies for filters */ + if (ns_cookie1 == ns_cookie2) { + test_print("net-namespace cookies: %" PRIu64 " == %" PRIu64 ", can't set up tracing", + ns_cookie1, ns_cookie2); + return -1; + } + + already_set = 1; + + test_add_destructor(test_unset_tracing); + + err = mount_ftrace(); + if (err) { + test_print("failed to mount_ftrace(): %d", err); + return err; + } + + return setup_aolib_ftracer(); +} + +static int get_ns_cookie(int nsfd, uint64_t *out) +{ + int old_ns = switch_save_ns(nsfd); + socklen_t size = sizeof(*out); + int sk; + + sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + test_print("socket(): %m"); + return -errno; + } + + if (getsockopt(sk, SOL_SOCKET, SO_NETNS_COOKIE, out, &size)) { + test_print("getsockopt(SO_NETNS_COOKIE): %m"); + close(sk); + return -errno; + } + + close(sk); + switch_close_ns(old_ns); + return 0; +} + +void test_init_ftrace(int nsfd1, int nsfd2) +{ + get_ns_cookie(nsfd1, &ns_cookie1); + get_ns_cookie(nsfd2, &ns_cookie2); + /* Populate kernel config state */ + kernel_config_has(KCONFIG_FTRACE); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c index f279ffc3843b..9f1c175846f8 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c +++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c @@ -6,7 +6,7 @@ #include "aolib.h" struct kconfig_t { - int _errno; /* the returned error if not supported */ + int _error; /* negative errno if not supported */ int (*check_kconfig)(int *error); }; @@ -62,7 +62,7 @@ static int has_tcp_ao(int *err) memcpy(&tmp.addr, &addr, sizeof(addr)); *err = 0; if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tmp, sizeof(tmp)) < 0) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT) ret = -errno; } @@ -87,7 +87,7 @@ static int has_tcp_md5(int *err) */ *err = 0; if (test_set_md5(sk, addr_any, 0, -1, DEFAULT_TEST_PASSWORD)) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT && errno == ENOMEM) { test_print("setsockopt(TCP_MD5SIG_EXT): %m"); ret = -errno; @@ -116,13 +116,21 @@ static int has_vrfs(int *err) return ret; } +static int has_ftrace(int *err) +{ + *err = test_setup_tracing(); + return 0; +} + +#define KCONFIG_UNKNOWN 1 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER; static struct kconfig_t kconfig[__KCONFIG_LAST__] = { - { -1, has_net_ns }, - { -1, has_veth }, - { -1, has_tcp_ao }, - { -1, has_tcp_md5 }, - { -1, has_vrfs }, + { KCONFIG_UNKNOWN, has_net_ns }, + { KCONFIG_UNKNOWN, has_veth }, + { KCONFIG_UNKNOWN, has_tcp_ao }, + { KCONFIG_UNKNOWN, has_tcp_md5 }, + { KCONFIG_UNKNOWN, has_vrfs }, + { KCONFIG_UNKNOWN, has_ftrace }, }; const char *tests_skip_reason[__KCONFIG_LAST__] = { @@ -131,6 +139,7 @@ const char *tests_skip_reason[__KCONFIG_LAST__] = { "Tests require TCP-AO support (CONFIG_TCP_AO)", "setsockopt(TCP_MD5SIG_EXT) is not supported (CONFIG_TCP_MD5)", "VRFs are not supported (CONFIG_NET_VRF)", + "Ftrace points are not supported (CONFIG_TRACEPOINTS)", }; bool kernel_config_has(enum test_needs_kconfig k) @@ -138,11 +147,11 @@ bool kernel_config_has(enum test_needs_kconfig k) bool ret; pthread_mutex_lock(&kconfig_lock); - if (kconfig[k]._errno == -1) { - if (kconfig[k].check_kconfig(&kconfig[k]._errno)) + if (kconfig[k]._error == KCONFIG_UNKNOWN) { + if (kconfig[k].check_kconfig(&kconfig[k]._error)) test_error("Failed to initialize kconfig %u", k); } - ret = kconfig[k]._errno == 0; + ret = 
kconfig[k]._error == 0; pthread_mutex_unlock(&kconfig_lock); return ret; } diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index e408b9243b2c..a27cc03c9fbd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -111,7 +111,7 @@ static void sig_int(int signo) int open_netns(void) { - const char *netns_path = "/proc/self/ns/net"; + const char *netns_path = "/proc/thread-self/ns/net"; int fd; fd = open(netns_path, O_RDONLY); @@ -142,6 +142,13 @@ int switch_save_ns(int new_ns) return ret; } +void switch_close_ns(int fd) +{ + if (setns(fd, CLONE_NEWNET)) + test_error("setns()"); + close(fd); +} + static int nsfd_outside = -1; static int nsfd_parent = -1; static int nsfd_child = -1; @@ -243,9 +250,9 @@ void __test_init(unsigned int ntests, int family, unsigned int prefix, test_print("rand seed %u", (unsigned int)seed); srand(seed); - ksft_print_header(); init_namespaces(); + test_init_ftrace(nsfd_parent, nsfd_child); if (add_veth(veth_name, nsfd_parent, nsfd_child)) test_error("Failed to add veth"); @@ -296,7 +303,7 @@ static bool is_optmem_namespaced(void) int old_ns = switch_save_ns(nsfd_child); optmem_ns = !access(optmem_file, F_OK); - switch_ns(old_ns); + switch_close_ns(old_ns); } return !!optmem_ns; } @@ -317,7 +324,7 @@ size_t test_get_optmem(void) test_error("can't read from %s", optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); return ret; } @@ -339,7 +346,7 @@ static void __test_set_optmem(size_t new, size_t *old) test_error("can't write %zu to %s", new, optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); } static void test_revert_optmem(void) diff --git a/tools/testing/selftests/net/tcp_ao/lib/sock.c b/tools/testing/selftests/net/tcp_ao/lib/sock.c index 15aeb0963058..0ffda966c677 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/sock.c +++ b/tools/testing/selftests/net/tcp_ao/lib/sock.c @@ -379,7 +379,6 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) key_dump[0].nkeys = nr_keys; key_dump[0].get_all = 1; - key_dump[0].get_all = 1; err = getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, key_dump, &key_dump_sz); if (err) { diff --git a/tools/testing/selftests/net/tcp_ao/lib/utils.c b/tools/testing/selftests/net/tcp_ao/lib/utils.c index 372daca525f5..bdf5522c9213 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/utils.c +++ b/tools/testing/selftests/net/tcp_ao/lib/utils.c @@ -21,6 +21,32 @@ void randomize_buffer(void *buf, size_t buflen) } } +__printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...) +{ + size_t len, written; + va_list vargs; + char *msg; + FILE *f; + + f = fopen(fname, append ? "a" : "w"); + if (!f) + return -errno; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + if (!msg) { + fclose(f); + return -1; + } + len = strlen(msg); + written = fwrite(msg, 1, len, f); + fclose(f); + free(msg); + return written == len ? 
0 : -1; +} + const struct sockaddr_in6 addr_any6 = { .sin6_family = AF_INET6, }; diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index 8fdc808df325..ecc6f1e3a414 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -64,6 +64,7 @@ static void try_server_run(const char *tst_name, unsigned int port, else test_ok("%s: server alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -71,10 +72,10 @@ static void try_server_run(const char *tst_name, unsigned int port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } @@ -82,7 +83,7 @@ static void try_server_run(const char *tst_name, unsigned int port, * Before close() as that will send FIN and move the peer in TCP_CLOSE * and that will prevent reading AO counters from the peer's socket. */ - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ out: close(sk); } @@ -176,6 +177,7 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, else test_ok("%s: post-migrate connection is alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -183,13 +185,13 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ close(sk); } @@ -206,22 +208,36 @@ static void *client_fn(void *arg) test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snt_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong send ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snd_sne += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, 
this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + /* not expecting server => client mismatches as only snd sne is broken */ test_sk_restore("TCP-AO with wrong send SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_sne += 1; + /* not expecting client => server mismatches as only rcv sne is broken */ + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_GOOD | TEST_CNT_BAD); @@ -231,6 +247,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(20, server_fn, client_fn); + test_init(21, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index a2fe88d35ac0..6364facaa63e 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -455,6 +455,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(14, server_fn, client_fn); + test_init(15, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index a5698b0a3718..3ecd2b58de6a 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -87,7 +87,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); close(sk); return; @@ -148,7 +148,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); close(sk); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); return; } @@ -163,17 +163,26 @@ static void *client_fn(void *arg) setup_lo_intf("lo"); tcp_self_connect("self-connect(same keyids)", port++, false, false); + + /* expecting rnext to change based on the first segment RNext != Current */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); tcp_self_connect("self-connect(different keyids)", port++, true, false); tcp_self_connect("self-connect(restore)", port, false, true); - port += 2; + port += 2; /* restore test restores over different port */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); + /* intentionally on restore they are added to the socket in different order */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port + 1, port + 1, 0, -1, -1, -1, -1, -1, 5, 7, -1); tcp_self_connect("self-connect(restore, different keyids)", port, true, true); - port += 2; + port += 2; /* restore test restores over different port */ return NULL; } int main(int argc, char *argv[]) { - test_init(4, client_fn, NULL); + test_init(5, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index ad4e77d6823e..8901a6785dc8 100644 --- 
a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -116,7 +116,15 @@ static void *server_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, client_new_port, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { if (bytes > 0) @@ -127,6 +135,7 @@ static void *server_fn(void *arg) test_ok("server alive"); } + synchronize_threads(); /* 6: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); @@ -134,15 +143,15 @@ static void *server_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); test_enable_repair(sk); @@ -206,12 +215,13 @@ static void *client_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, test_server_port + 1, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("post-migrate verify failed"); else test_ok("post-migrate connection alive"); + synchronize_threads(); /* 5: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); @@ -219,15 +229,15 @@ static void *client_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); @@ -240,6 +250,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(7, server_fn, client_fn); + test_init(8, server_fn, client_fn); return 0; } diff --git 
a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 517930f9721b..084db4ecdff6 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -30,8 +30,8 @@ static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, #define __cmp_ao(member) \ do { \ if (info->member != tmp.member) { \ - test_fail("%s: getsockopt(): " __stringify(member) " %zu != %zu", \ - tst, (size_t)info->member, (size_t)tmp.member); \ + test_fail("%s: getsockopt(): " __stringify(member) " %" PRIu64 " != %" PRIu64, \ + tst, (uint64_t)info->member, (uint64_t)tmp.member); \ return; \ } \ } while(0) @@ -830,6 +830,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, client_fn, NULL); + test_init(121, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index 6b59a652159f..f779e5892bc1 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -70,6 +70,7 @@ static void try_accept(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; err = test_wait_fd(lsk, timeout, 0); + synchronize_threads(); /* connect()/accept() timeouts */ if (err == -ETIMEDOUT) { if (!fault(TIMEOUT)) test_fail("timed out for accept()"); @@ -100,10 +101,10 @@ static void try_accept(const char *tst_name, unsigned int port, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } if (ao_addr) @@ -283,6 +284,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) test_ok("%s: connect() was prevented", tst_name); @@ -451,6 +453,7 @@ static void try_to_add(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? 
TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret <= 0) { test_error("%s: connect() returned %d", tst_name, ret); goto out; @@ -671,24 +674,38 @@ static void *client_fn(void *arg) try_connect("AO server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); try_connect("AO server (AO_REQUIRED): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("MD5 server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("MD5 server (INADDR_ANY): no sign client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("no sign server: AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("no sign server: MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("no sign server: no sign client", port++, NULL, 0, @@ -696,25 +713,37 @@ static void *client_fn(void *arg) try_connect("AO+MD5 server: AO client (matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 1, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, matching MD5)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, client3, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, non-matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: MD5 client (matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client (misconfig, matching AO)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client3, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client 
(misconfig, non-matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: no sign client (unmatched)", port++, NULL, 0, NULL, 0, 100, 100, 0, 0, 1, &client3); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching AO)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching MD5)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); @@ -736,6 +765,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(72, server_fn, client_fn); + test_init(73, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index ec60a16c9307..d626f22f9550 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -356,8 +356,12 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) } } - if (batch > 1) + if (batch > 1) { fprintf(stderr, "batched %d timestamps\n", batch); + } else if (!batch) { + fprintf(stderr, "Failed to report timestamps\n"); + test_failed = true; + } } static int recv_errmsg(int fd) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 11a1ebda564f..d5ffd8c9172e 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,8 +7,6 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.bpf.o" - # set global exit status, but never reset nonzero one. check_err() { @@ -38,7 +36,7 @@ cfg_veth() { ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp + ip netns exec "${PEER_NS}" ethtool -K veth1 gro on } run_one() { @@ -46,17 +44,19 @@ run_one() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} & + local PID1=$! wait_local_port_listen ${PEER_NS} 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -73,6 +73,7 @@ run_one_nat() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 if [[ ${tx_args} = *-4* ]]; then ipt_cmd=iptables @@ -93,16 +94,17 @@ run_one_nat() { # ... so that GRO will match the UDP_GRO enabled socket, but packets # will land on the 'plain' one ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 & - pid=$! - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \ - echo "ok" || \ - echo "failed"& + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - kill -INT $pid - wait $(jobs -p) + check_err $? + kill -INT ${PID1} + wait ${PID2} + check_err $? 
+ [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -111,20 +113,26 @@ run_one_2sock() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 & - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 12345 udp ./udpgso_bench_tx ${tx_args} -p 12345 + check_err $? wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + wait ${PID2} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -196,11 +204,6 @@ run_all() { return $ret } -if [ ! -f ${BPF_FILE} ]; then - echo "Missing ${BPF_FILE}. Run 'make' first" - exit -1 -fi - if [[ $# -eq 0 ]]; then run_all elif [[ $1 == "__subprocess" ]]; then diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 3e74cfa1a2bf..3f2fca02fec5 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -67,6 +67,7 @@ struct testcase { int gso_len; /* mss after applying gso */ int r_num_mss; /* recv(): number of calls of full mss */ int r_len_last; /* recv(): size of last non-mss dgram, if any */ + bool v6_ext_hdr; /* send() dgrams with IPv6 extension headers */ }; const struct in6_addr addr6 = { @@ -77,6 +78,8 @@ const struct in_addr addr4 = { __constant_htonl(0x0a000001), /* 10.0.0.1 */ }; +static const char ipv6_hopopts_pad1[8] = { 0 }; + struct testcase testcases_v4[] = { { /* no GSO: send a single byte */ @@ -256,6 +259,13 @@ struct testcase testcases_v6[] = { .r_num_mss = 2, }, { + /* send 2 1B segments with extension headers */ + .tlen = 2, + .gso_len = 1, + .r_num_mss = 2, + .v6_ext_hdr = true, + }, + { /* send 2B + 2B + 1B segments */ .tlen = 5, .gso_len = 2, @@ -396,11 +406,18 @@ static void run_one(struct testcase *test, int fdt, int fdr, int i, ret, val, mss; bool sent; - fprintf(stderr, "ipv%d tx:%d gso:%d %s\n", + fprintf(stderr, "ipv%d tx:%d gso:%d %s%s\n", addr->sa_family == AF_INET ? 4 : 6, test->tlen, test->gso_len, + test->v6_ext_hdr ? "ext-hdr " : "", test->tfail ? "(fail)" : ""); + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, + ipv6_hopopts_pad1, sizeof(ipv6_hopopts_pad1))) + error(1, errno, "setsockopt ipv6 hopopts"); + } + val = test->gso_len; if (cfg_do_setsockopt) { if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val))) @@ -412,6 +429,12 @@ static void run_one(struct testcase *test, int fdt, int fdr, error(1, 0, "send succeeded while expecting failure"); if (!sent && !test->tfail) error(1, 0, "send failed while expecting success"); + + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0)) + error(1, errno, "setsockopt ipv6 hopopts clear"); + } + if (!sent) return; diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh index f52aa5f7da52..3e751234ccfe 100755 --- a/tools/testing/selftests/net/unicast_extensions.sh +++ b/tools/testing/selftests/net/unicast_extensions.sh @@ -30,14 +30,7 @@ source lib.sh -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! 
which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" result=0 diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh index 152171fb1fc8..e9c2f71da207 100755 --- a/tools/testing/selftests/net/vrf_route_leaking.sh +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -59,7 +59,6 @@ # while it is forwarded between different vrfs. source lib.sh -PATH=$PWD:$PWD/tools/testing/selftests/net:$PATH VERBOSE=0 PAUSE_ON_FAIL=no DEFAULT_TTYPE=sym @@ -636,6 +635,8 @@ EOF # Some systems don't have a ping6 binary anymore command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) +check_gen_prog "nettest" + TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_local ipv4_tcp_local ipv4_udp_local ipv4_ping_ttl_asym ipv4_traceroute_asym" TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_local ipv6_tcp_local ipv6_udp_local diff --git a/tools/testing/selftests/net/xfrm_policy_add_speed.sh b/tools/testing/selftests/net/xfrm_policy_add_speed.sh new file mode 100755 index 000000000000..2fab29d3cb91 --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy_add_speed.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +source lib.sh + +timeout=4m +ret=0 +tmp=$(mktemp) +cleanup() { + cleanup_all_ns + rm -f "$tmp" +} + +trap cleanup EXIT + +maxpolicies=100000 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && maxpolicies=10000 + +do_dummies4() { + local dir="$1" + local max="$2" + + local policies + local pfx + pfx=30 + policies=0 + + ip netns exec "$ns" ip xfrm policy flush + + for i in $(seq 1 100);do + local s + local d + for j in $(seq 1 255);do + s=$((i+0)) + d=$((i+100)) + + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.0/30 dst 10.$d.$j.$a/$pfx dir $dir action block + done + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.$a/30 dst 10.$d.$j.0/$pfx dir $dir action block + done + done + done +} + +setup_ns ns + +do_bench() +{ + local max="$1" + + start=$(date +%s%3N) + do_dummies4 "out" "$max" > "$tmp" + if ! timeout "$timeout" ip netns exec "$ns" ip -batch "$tmp";then + echo "WARNING: policy insertion cancelled after $timeout" + ret=1 + fi + stop=$(date +%s%3N) + + result=$((stop-start)) + + policies=$(wc -l < "$tmp") + printf "Inserted %-06s policies in $result ms\n" $policies + + have=$(ip netns exec "$ns" ip xfrm policy show | grep "action block" | wc -l) + if [ "$have" -ne "$policies" ]; then + echo "WARNING: mismatch, have $have policies, expected $policies" + ret=1 + fi +} + +p=100 +while [ $p -le "$maxpolicies" ]; do + do_bench "$p" + p="${p}0" +done + +exit $ret diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 3fbabab46958..8de98ea7af80 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,19 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for nolibc tests -include ../../../scripts/Makefile.include -include ../../../scripts/utilities.mak -# We need this for the "cc-option" macro. 
-include ../../../build/Build.include +# we're in ".../tools/testing/selftests/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +endif + +include $(srctree)/tools/scripts/utilities.mak +# We need this for the "__cc-option" macro. +include $(srctree)/scripts/Makefile.compiler ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) $(error Only absolute O= parameters are supported) endif -endif - -# we're in ".../tools/testing/selftests/nolibc" -ifeq ($(srctree),) -srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +objtree := $(O) +else +objtree ?= $(srctree) endif ifeq ($(ARCH),) @@ -21,7 +23,7 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif -objtree ?= $(srctree) +cc-option = $(call __cc-option, $(CC),$(CLANG_CROSS_FLAGS),$(1),$(2)) # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain @@ -155,9 +157,22 @@ CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wex $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := +LIBGCC := -lgcc + +ifneq ($(LLVM),) +# Not needed for clang +LIBGCC := +endif + +# Modify CFLAGS based on LLVM= +include $(srctree)/tools/scripts/Makefile.include + +# GCC uses "s390", clang "systemz" +CLANG_CROSS_FLAGS := $(subst --target=s390-linux,--target=systemz-linux,$(CLANG_CROSS_FLAGS)) + REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ - if (f) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ + if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ printf("\nSee all results in %s\n", ARGV[1]); }' help: @@ -204,11 +219,11 @@ sysroot/$(ARCH)/include: ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) else nolibc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) endif libc-test: nolibc-test.c nolibc-test-linkage.c diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 093d0512f4c5..6fba7025c5e3 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -542,7 +542,7 @@ int expect_strzr(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? expr : "(null)"); if (expr) { ret = 1; result(llen, FAIL); @@ -561,7 +561,7 @@ int expect_strnz(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? 
expr : "(null)"); if (!expr) { ret = 1; result(llen, FAIL); @@ -686,9 +686,10 @@ static void constructor1(void) } __attribute__((constructor)) -static void constructor2(void) +static void constructor2(int argc, char **argv, char **envp) { - constructor_test_value *= 2; + if (argc && argv && envp) + constructor_test_value *= 2; } int run_startup(int min, int max) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 0446e6326a40..e7ecda4ae796 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -15,10 +15,11 @@ download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system -CFLAGS_EXTRA="-Werror" +werror=1 +llvm= archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" -TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") +TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -42,6 +43,7 @@ Options: -b [DIR] Build location (default: ${build_location}) -m [MODE] Test mode user/system (default: ${test_mode}) -e Disable -Werror + -l Build with LLVM/clang EOF } @@ -69,7 +71,10 @@ while true; do test_mode="$2" shift 2; continue ;; '-e') - CFLAGS_EXTRA="" + werror=0 + shift; continue ;; + '-l') + llvm=1 shift; continue ;; '-h') print_usage @@ -140,7 +145,10 @@ test_arch() { ct_abi=$(crosstool_abi "$1") cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-") build_dir="${build_location}/${arch}" - MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") + if [ "$werror" -ne 0 ]; then + CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" + fi + MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") mkdir -p "$build_dir" if [ "$test_mode" = "system" ] && [ ! 
-f "${build_dir}/.config" ]; then diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 47746b0c6acd..7c2a4349170a 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -16,11 +16,56 @@ #include <unistd.h> #include <sys/socket.h> #include <sys/stat.h> +#include <linux/ioctl.h> #include "pidfd.h" #include "../clone3/clone3_selftests.h" #include "../kselftest_harness.h" +#ifndef PIDFS_IOCTL_MAGIC +#define PIDFS_IOCTL_MAGIC 0xFF +#endif + +#ifndef PIDFD_GET_CGROUP_NAMESPACE +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#endif + +#ifndef PIDFD_GET_IPC_NAMESPACE +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#endif + +#ifndef PIDFD_GET_MNT_NAMESPACE +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#endif + +#ifndef PIDFD_GET_NET_NAMESPACE +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#endif + +#ifndef PIDFD_GET_PID_NAMESPACE +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#endif + +#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#endif + +#ifndef PIDFD_GET_TIME_NAMESPACE +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#endif + +#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#endif + +#ifndef PIDFD_GET_USER_NAMESPACE +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#endif + +#ifndef PIDFD_GET_UTS_NAMESPACE +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) +#endif + enum { PIDFD_NS_USER, PIDFD_NS_MNT, @@ -31,22 +76,25 @@ enum { PIDFD_NS_CGROUP, PIDFD_NS_PIDCLD, PIDFD_NS_TIME, + PIDFD_NS_TIMECLD, PIDFD_NS_MAX }; const struct ns_info { const char *name; int flag; + unsigned int pidfd_ioctl; } ns_info[] = { - [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, }, - [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, }, - [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, }, - [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, }, - [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, }, - [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, - [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, - [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, - [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, }, + [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, PIDFD_GET_USER_NAMESPACE, }, + [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, PIDFD_GET_MNT_NAMESPACE, }, + [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, PIDFD_GET_PID_NAMESPACE, }, + [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, PIDFD_GET_UTS_NAMESPACE, }, + [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, PIDFD_GET_IPC_NAMESPACE, }, + [PIDFD_NS_NET] = { "net", CLONE_NEWNET, PIDFD_GET_NET_NAMESPACE, }, + [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, PIDFD_GET_CGROUP_NAMESPACE, }, + [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, PIDFD_GET_TIME_NAMESPACE, }, + [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE, }, + [PIDFD_NS_TIMECLD] = { "time_for_children", 0, PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE, }, }; FIXTURE(current_nsset) @@ -54,6 +102,7 @@ FIXTURE(current_nsset) pid_t pid; int pidfd; int nsfds[PIDFD_NS_MAX]; + int child_pidfd_derived_nsfds[PIDFD_NS_MAX]; pid_t child_pid_exited; int child_pidfd_exited; @@ -61,10 +110,12 @@ FIXTURE(current_nsset) pid_t child_pid1; int child_pidfd1; int child_nsfds1[PIDFD_NS_MAX]; + int child_pidfd_derived_nsfds1[PIDFD_NS_MAX]; pid_t child_pid2; int child_pidfd2; int child_nsfds2[PIDFD_NS_MAX]; + int 
child_pidfd_derived_nsfds2[PIDFD_NS_MAX]; }; static int sys_waitid(int which, pid_t pid, int options) @@ -128,9 +179,12 @@ FIXTURE_SETUP(current_nsset) char c; for (i = 0; i < PIDFD_NS_MAX; i++) { - self->nsfds[i] = -EBADF; - self->child_nsfds1[i] = -EBADF; - self->child_nsfds2[i] = -EBADF; + self->nsfds[i] = -EBADF; + self->child_nsfds1[i] = -EBADF; + self->child_nsfds2[i] = -EBADF; + self->child_pidfd_derived_nsfds[i] = -EBADF; + self->child_pidfd_derived_nsfds1[i] = -EBADF; + self->child_pidfd_derived_nsfds2[i] = -EBADF; } proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC); @@ -139,6 +193,11 @@ FIXTURE_SETUP(current_nsset) } self->pid = getpid(); + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GT(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + for (i = 0; i < PIDFD_NS_MAX; i++) { const struct ns_info *info = &ns_info[i]; self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); @@ -148,20 +207,27 @@ FIXTURE_SETUP(current_nsset) info->name, self->pid); } } - } - self->pidfd = sys_pidfd_open(self->pid, 0); - EXPECT_GT(self->pidfd, 0) { - TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + self->child_pidfd_derived_nsfds[i] = ioctl(self->pidfd, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->pid); + } + } } /* Create task that exits right away. */ - self->child_pid_exited = create_child(&self->child_pidfd_exited, - CLONE_NEWUSER | CLONE_NEWNET); + self->child_pid_exited = create_child(&self->child_pidfd_exited, 0); EXPECT_GE(self->child_pid_exited, 0); - if (self->child_pid_exited == 0) + if (self->child_pid_exited == 0) { + if (self->nsfds[PIDFD_NS_USER] >= 0 && unshare(CLONE_NEWUSER) < 0) + _exit(EXIT_FAILURE); + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) + _exit(EXIT_FAILURE); _exit(EXIT_SUCCESS); + } ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); @@ -174,18 +240,43 @@ FIXTURE_SETUP(current_nsset) EXPECT_EQ(ret, 0); /* Create tasks that will be stopped. 
*/ - self->child_pid1 = create_child(&self->child_pidfd1, - CLONE_NEWUSER | CLONE_NEWNS | - CLONE_NEWCGROUP | CLONE_NEWIPC | - CLONE_NEWUTS | CLONE_NEWPID | - CLONE_NEWNET); + if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER | CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_USER] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER); + else + self->child_pid1 = create_child(&self->child_pidfd1, 0); EXPECT_GE(self->child_pid1, 0); if (self->child_pid1 == 0) { close(ipc_sockets[0]); - if (!switch_timens()) + if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) { + TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid); _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) { + TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) { + TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) { + TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) { + TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) { + TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } if (write_nointr(ipc_sockets[1], "1", 1) < 0) _exit(EXIT_FAILURE); @@ -203,18 +294,43 @@ FIXTURE_SETUP(current_nsset) ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); EXPECT_EQ(ret, 0); - self->child_pid2 = create_child(&self->child_pidfd2, - CLONE_NEWUSER | CLONE_NEWNS | - CLONE_NEWCGROUP | CLONE_NEWIPC | - CLONE_NEWUTS | CLONE_NEWPID | - CLONE_NEWNET); + if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_USER] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER); + else + self->child_pid2 = create_child(&self->child_pidfd2, 0); EXPECT_GE(self->child_pid2, 0); if (self->child_pid2 == 0) { close(ipc_sockets[0]); - if (!switch_timens()) + if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) { + TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) { + TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid); _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) { + TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) { + TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) { + TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid); + 
_exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) { + TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } if (write_nointr(ipc_sockets[1], "1", 1) < 0) _exit(EXIT_FAILURE); @@ -267,6 +383,22 @@ FIXTURE_SETUP(current_nsset) info->name, self->child_pid1); } } + + self->child_pidfd_derived_nsfds1[i] = ioctl(self->child_pidfd1, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds1[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->child_pid1); + } + } + + self->child_pidfd_derived_nsfds2[i] = ioctl(self->child_pidfd2, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds2[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->child_pid2); + } + } } close(proc_fd); @@ -288,6 +420,12 @@ FIXTURE_TEARDOWN(current_nsset) close(self->child_nsfds1[i]); if (self->child_nsfds2[i] >= 0) close(self->child_nsfds2[i]); + if (self->child_pidfd_derived_nsfds[i] >= 0) + close(self->child_pidfd_derived_nsfds[i]); + if (self->child_pidfd_derived_nsfds1[i] >= 0) + close(self->child_pidfd_derived_nsfds1[i]); + if (self->child_pidfd_derived_nsfds2[i] >= 0) + close(self->child_pidfd_derived_nsfds2[i]); } if (self->child_pidfd1 >= 0) @@ -446,6 +584,42 @@ TEST_F(current_nsset, nsfd_incremental_setns) } } +TEST_F(current_nsset, pidfd_derived_nsfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_pidfd_derived_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_pidfd_derived_nsfds1[i], info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_pidfd_derived_nsfds1[i]); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->child_pidfd_derived_nsfds[i]; + else + nsfd = self->child_pidfd_derived_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_pidfd_derived_nsfds1[i]); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, self->child_pidfd_derived_nsfds1[i]); + } +} + TEST_F(current_nsset, pidfd_one_shot_setns) { unsigned flags = 0; @@ -542,6 +716,28 @@ TEST_F(current_nsset, no_foul_play) info->name, self->child_pid2, self->child_nsfds2[i]); } + + /* + * Can't setns to a user namespace outside of our hierarchy since we + * don't have caps in there and didn't create it. That means that under + * no circumstances should we be able to setns to any of the other + * ones since they aren't owned by our user namespace. 
+ */ + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_pidfd_derived_nsfds2[i] < 0 || !info->flag) + continue; + + ASSERT_NE(setns(self->child_pidfd_derived_nsfds2[i], info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_pidfd_derived_nsfds2[i]); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_pidfd_derived_nsfds2[i]); + } } TEST(setns_einval) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index b33cd8753689..ad79784e552d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \ + "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 990d24696fd3..0447c4a00cc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2)) -HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) -if test "$HALF_ALLOTED_CPUS" -lt 1 +SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) +if test "$SCALE_ALLOTED_CPUS" -lt 1 then - HALF_ALLOTED_CPUS=1 + SCALE_ALLOTED_CPUS=1 fi VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16)) if test "$VERBOSE_BATCH_CPUS" -lt 2 @@ -90,6 +90,7 @@ usage () { echo " --do-scftorture / --do-no-scftorture / --no-scftorture" echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep" echo " --duration [ <minutes> | <hours>h | <days>d ]" + echo " --guest-cpu-limit N" echo " --kcsan-kmake-arg kernel-make-arguments" exit 1 } @@ -203,6 +204,21 @@ do duration_base=$(($ts*mult)) shift ;; + --guest-cpu-limit|--guest-cpu-lim) + checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--' + if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2)) + then + SCALE_ALLOTED_CPUS="$2" + VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))" + if (("$VERBOSE_BATCH_CPUS" < 2)) + then + VERBOSE_BATCH_CPUS=0 + fi + else + echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))." + fi + shift + ;; --kcsan-kmake-arg|--kcsan-kmake-args) checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -425,9 +441,9 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((3+HALF_ALLOTED_CPUS/16)) - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make + scfmem=$((3+SCALE_ALLOTED_CPUS/16)) + torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi if test "$do_rt" = "yes" @@ -471,8 +487,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -520,8 +536,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" diff --git 
a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index 0e92d85313aa..217597e84905 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,7 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le new file mode 100644 index 000000000000..133da04247ee --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le @@ -0,0 +1 @@ +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot index 979edbf4c820..55ce305b2a3d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot @@ -2,3 +2,4 @@ nohz_full=2-9 rcutorture.stall_cpu=14 rcutorture.stall_cpu_holdoff=90 rcutorture.fwd_progress=0 +rcutree.nohz_full_patience_delay=1000 diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY new file mode 100644 index 000000000000..759343980b80 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY @@ -0,0 +1,20 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 742782438ca3..94cfdba5308d 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,12 +290,12 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - unsigned int eax, ebx, ecx, edx; - /* AMD always supports non-contiguous CBM. */ if (get_vendor() == ARCH_AMD) return true; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, ebx, ecx, edx; /* Intel support for non-contiguous CBM needs to be discovered. 
*/ if (!strcmp(test->resource, "L3")) __cpuid_count(0x10, 1, eax, ebx, ecx, edx); @@ -305,6 +305,9 @@ static bool arch_supports_noncont_cat(const struct resctrl_test *test) return false; return ((ecx >> 3) & 1); +#endif /* end arch */ + + return false; } static int noncont_cat_run_test(const struct resctrl_test *test, diff --git a/tools/testing/selftests/riscv/mm/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/mmap_bottomup.c index 7f7d3eb8b9c9..f9ccae50349b 100644 --- a/tools/testing/selftests/riscv/mm/mmap_bottomup.c +++ b/tools/testing/selftests/riscv/mm/mmap_bottomup.c @@ -7,8 +7,6 @@ TEST(infinite_rlimit) { EXPECT_EQ(BOTTOM_UP, memory_layout()); - - TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_default.c b/tools/testing/selftests/riscv/mm/mmap_default.c index 2ba3ec990006..3f53b6ecc326 100644 --- a/tools/testing/selftests/riscv/mm/mmap_default.c +++ b/tools/testing/selftests/riscv/mm/mmap_default.c @@ -7,8 +7,6 @@ TEST(default_rlimit) { EXPECT_EQ(TOP_DOWN, memory_layout()); - - TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h index 3b29ca3bb3d4..75918d15919f 100644 --- a/tools/testing/selftests/riscv/mm/mmap_test.h +++ b/tools/testing/selftests/riscv/mm/mmap_test.h @@ -10,76 +10,9 @@ #define TOP_DOWN 0 #define BOTTOM_UP 1 -#if __riscv_xlen == 64 -uint64_t random_addresses[] = { - 0x19764f0d73b3a9f0, 0x016049584cecef59, 0x3580bdd3562f4acd, - 0x1164219f20b17da0, 0x07d97fcb40ff2373, 0x76ec528921272ee7, - 0x4dd48c38a3de3f70, 0x2e11415055f6997d, 0x14b43334ac476c02, - 0x375a60795aff19f6, 0x47f3051725b8ee1a, 0x4e697cf240494a9f, - 0x456b59b5c2f9e9d1, 0x101724379d63cb96, 0x7fe9ad31619528c1, - 0x2f417247c495c2ea, 0x329a5a5b82943a5e, 0x06d7a9d6adcd3827, - 0x327b0b9ee37f62d5, 0x17c7b1851dfd9b76, 0x006ebb6456ec2cd9, - 0x00836cd14146a134, 0x00e5c4dcde7126db, 0x004c29feadf75753, - 0x00d8b20149ed930c, 0x00d71574c269387a, 0x0006ebe4a82acb7a, - 0x0016135df51f471b, 0x00758bdb55455160, 0x00d0bdd949b13b32, - 0x00ecea01e7c5f54b, 0x00e37b071b9948b1, 0x0011fdd00ff57ab3, - 0x00e407294b52f5ea, 0x00567748c200ed20, 0x000d073084651046, - 0x00ac896f4365463c, 0x00eb0d49a0b26216, 0x0066a2564a982a31, - 0x002e0d20237784ae, 0x0000554ff8a77a76, 0x00006ce07a54c012, - 0x000009570516d799, 0x00000954ca15b84d, 0x0000684f0d453379, - 0x00002ae5816302b5, 0x0000042403fb54bf, 0x00004bad7392bf30, - 0x00003e73bfa4b5e3, 0x00005442c29978e0, 0x00002803f11286b6, - 0x000073875d745fc6, 0x00007cede9cb8240, 0x000027df84cc6a4f, - 0x00006d7e0e74242a, 0x00004afd0b836e02, 0x000047d0e837cd82, - 0x00003b42405efeda, 0x00001531bafa4c95, 0x00007172cae34ac4, -}; -#else -uint32_t random_addresses[] = { - 0x8dc302e0, 0x929ab1e0, 0xb47683ba, 0xea519c73, 0xa19f1c90, 0xc49ba213, - 0x8f57c625, 0xadfe5137, 0x874d4d95, 0xaa20f09d, 0xcf21ebfc, 0xda7737f1, - 0xcedf392a, 0x83026c14, 0xccedca52, 0xc6ccf826, 0xe0cd9415, 0x997472ca, - 0xa21a44c1, 0xe82196f5, 0xa23fd66b, 0xc28d5590, 0xd009cdce, 0xcf0be646, - 0x8fc8c7ff, 0xe2a85984, 0xa3d3236b, 0x89a0619d, 0xc03db924, 0xb5d4cc1b, - 0xb96ee04c, 0xd191da48, 0xb432a000, 0xaa2bebbc, 0xa2fcb289, 0xb0cca89b, - 0xb0c18d6a, 0x88f58deb, 0xa4d42d1c, 0xe4d74e86, 0x99902b09, 0x8f786d31, - 0xbec5e381, 0x9a727e65, 0xa9a65040, 0xa880d789, 0x8f1b335e, 0xfc821c1e, - 0x97e34be4, 0xbbef84ed, 0xf447d197, 0xfd7ceee2, 0xe632348d, 0xee4590f4, - 0x958992a5, 0xd57e05d6, 0xfd240970, 0xc5b0dcff, 0xd96da2c2, 0xa7ae041d, -}; -#endif - -// Only works on 64 bit -#if __riscv_xlen == 64 #define PROT (PROT_READ | 
PROT_WRITE) #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) -/* mmap must return a value that doesn't use more bits than the hint address. */ -static inline unsigned long get_max_value(unsigned long input) -{ - unsigned long max_bit = (1UL << (((sizeof(unsigned long) * 8) - 1 - - __builtin_clzl(input)))); - - return max_bit + (max_bit - 1); -} - -#define TEST_MMAPS \ - ({ \ - void *mmap_addr; \ - for (int i = 0; i < ARRAY_SIZE(random_addresses); i++) { \ - mmap_addr = mmap((void *)random_addresses[i], \ - 5 * sizeof(int), PROT, FLAGS, 0, 0); \ - EXPECT_NE(MAP_FAILED, mmap_addr); \ - EXPECT_GE((void *)get_max_value(random_addresses[i]), \ - mmap_addr); \ - mmap_addr = mmap((void *)random_addresses[i], \ - 5 * sizeof(int), PROT, FLAGS, 0, 0); \ - EXPECT_NE(MAP_FAILED, mmap_addr); \ - EXPECT_GE((void *)get_max_value(random_addresses[i]), \ - mmap_addr); \ - } \ - }) -#endif /* __riscv_xlen == 64 */ - static inline int memory_layout(void) { void *value1 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0); diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c index 63ce02d1d5cc..9647b14b47c5 100644 --- a/tools/testing/selftests/rtc/rtctest.c +++ b/tools/testing/selftests/rtc/rtctest.c @@ -410,13 +410,6 @@ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) { ASSERT_EQ(new, secs); } -static void __attribute__((constructor)) -__constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { switch (argc) { diff --git a/tools/testing/selftests/rust/config b/tools/testing/selftests/rust/config index b4002acd40bc..5f942b5c8c17 100644 --- a/tools/testing/selftests/rust/config +++ b/tools/testing/selftests/rust/config @@ -1,5 +1,6 @@ +# CONFIG_GCC_PLUGINS is not set CONFIG_RUST=y CONFIG_SAMPLES=y CONFIG_SAMPLES_RUST=y CONFIG_SAMPLE_RUST_MINIMAL=m -CONFIG_SAMPLE_RUST_PRINT=m
\ No newline at end of file +CONFIG_SAMPLE_RUST_PRINT=m diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e3f97f90d8db..8c3a73461475 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -60,7 +60,9 @@ #define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) #endif +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif #ifndef PR_SET_PTRACER # define PR_SET_PTRACER 0x59616d61 diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py index ee349187636f..4f255cec0c22 100755 --- a/tools/testing/selftests/tc-testing/tdc.py +++ b/tools/testing/selftests/tc-testing/tdc.py @@ -143,7 +143,6 @@ class PluginMgr: except Exception as ee: print('exception {} in call to pre_case for {} plugin'. format(ee, pgn_inst.__class__)) - print('test_ordinal is {}'.format(test_ordinal)) print('testid is {}'.format(caseinfo['id'])) raise diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 4421cd562c24..18e794a46c23 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -30,9 +30,6 @@ #include <time.h> #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - - int change_skew_test(int ppm) { struct timex tx; diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 07c81c0093c0..16bd49492efa 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -6,10 +6,13 @@ * * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com> */ - +#define _GNU_SOURCE #include <sys/time.h> +#include <sys/types.h> #include <stdio.h> #include <signal.h> +#include <stdint.h> +#include <string.h> #include <unistd.h> #include <time.h> #include <pthread.h> @@ -18,6 +21,21 @@ #define DELAY 2 #define USECS_PER_SEC 1000000 +#define NSECS_PER_SEC 1000000000 + +static void __fatal_error(const char *test, const char *name, const char *what) +{ + char buf[64]; + + strerror_r(errno, buf, sizeof(buf)); + + if (name && strlen(name)) + ksft_exit_fail_msg("%s %s %s %s\n", test, name, what, buf); + else + ksft_exit_fail_msg("%s %s %s\n", test, what, buf); +} + +#define fatal_error(name, what) __fatal_error(__func__, name, what) static volatile int done; @@ -74,24 +92,13 @@ static int check_diff(struct timeval start, struct timeval end) return 0; } -static int check_itimer(int which) +static void check_itimer(int which, const char *name) { - const char *name; - int err; struct timeval start, end; struct itimerval val = { .it_value.tv_sec = DELAY, }; - if (which == ITIMER_VIRTUAL) - name = "ITIMER_VIRTUAL"; - else if (which == ITIMER_PROF) - name = "ITIMER_PROF"; - else if (which == ITIMER_REAL) - name = "ITIMER_REAL"; - else - return -1; - done = 0; if (which == ITIMER_VIRTUAL) @@ -101,17 +108,11 @@ static int check_itimer(int which) else if (which == ITIMER_REAL) signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); - err = setitimer(which, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (setitimer(which, &val, NULL) < 0) + fatal_error(name, "setitimer()"); if (which == ITIMER_VIRTUAL) user_loop(); @@ -120,68 +121,41 @@ static int check_itimer(int which) else 
if (which == ITIMER_REAL) idle_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, "%s\n", name); - - return 0; } -static int check_timer_create(int which) +static void check_timer_create(int which, const char *name) { - const char *type; - int err; - timer_t id; struct timeval start, end; struct itimerspec val = { .it_value.tv_sec = DELAY, }; - - if (which == CLOCK_THREAD_CPUTIME_ID) { - type = "thread"; - } else if (which == CLOCK_PROCESS_CPUTIME_ID) { - type = "process"; - } else { - ksft_print_msg("Unknown timer_create() type %d\n", which); - return -1; - } + timer_t id; done = 0; - err = timer_create(which, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (timer_create(which, NULL, &id) < 0) + fatal_error(name, "timer_create()"); - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (signal(SIGALRM, sig_handler) == SIG_ERR) + fatal_error(name, "signal()"); + + if (gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); + + if (timer_settime(id, 0, &val, NULL) < 0) + fatal_error(name, "timer_settime()"); user_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, - "timer_create() per %s\n", type); - - return 0; + "timer_create() per %s\n", name); } static pthread_t ctd_thread; @@ -209,15 +183,14 @@ static void *ctd_thread_func(void *arg) ctd_count = 100; if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) - return "Can't create timer\n"; + fatal_error(NULL, "timer_create()"); if (timer_settime(id, 0, &val, NULL)) - return "Can't set timer\n"; - + fatal_error(NULL, "timer_settime()"); while (ctd_count > 0 && !ctd_failed) ; if (timer_delete(id)) - return "Can't delete timer\n"; + fatal_error(NULL, "timer_delete()"); return NULL; } @@ -225,19 +198,16 @@ static void *ctd_thread_func(void *arg) /* * Test that only the running thread receives the timer signal. */ -static int check_timer_distribution(void) +static void check_timer_distribution(void) { - const char *errmsg; + if (signal(SIGALRM, ctd_sighandler) == SIG_ERR) + fatal_error(NULL, "signal()"); - signal(SIGALRM, ctd_sighandler); - - errmsg = "Can't create thread\n"; if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) - goto err; + fatal_error(NULL, "pthread_create()"); - errmsg = "Can't join thread\n"; - if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) - goto err; + if (pthread_join(ctd_thread, NULL)) + fatal_error(NULL, "pthread_join()"); if (!ctd_failed) ksft_test_result_pass("check signal distribution\n"); @@ -245,31 +215,399 @@ static int check_timer_distribution(void) ksft_test_result_fail("check signal distribution\n"); else ksft_test_result_skip("check signal distribution (old kernel)\n"); - return 0; -err: - ksft_print_msg("%s", errmsg); - return -1; +} + +struct tmrsig { + int signals; + int overruns; +}; + +static void siginfo_handler(int sig, siginfo_t *si, void *uc) +{ + struct tmrsig *tsig = si ? 
si->si_ptr : NULL; + + if (tsig) { + tsig->signals++; + tsig->overruns += si->si_overrun; + } +} + +static void *ignore_thread(void *arg) +{ + unsigned int *tid = arg; + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + *tid = gettid(); + sleep(100); + + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + return NULL; +} + +static void check_sig_ign(int thread) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + unsigned int tid = 0; + struct sigaction sa; + struct sigevent sev; + pthread_t pthread; + timer_t timerid; + sigset_t set; + + if (thread) { + if (pthread_create(&pthread, NULL, ignore_thread, &tid)) + fatal_error(NULL, "pthread_create()"); + sleep(1); + } + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (thread) { + sev.sigev_notify = SIGEV_THREAD_ID; + sev._sigev_un._tid = tid; + } + + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + sleep(1); + + /* Set the signal to be ignored */ + if (signal(SIGUSR1, SIG_IGN) == SIG_ERR) + fatal_error(NULL, "signal(SIG_IGN)"); + + sleep(1); + + if (thread) { + /* Stop the thread first. 
No signal should be delivered to it */ + if (pthread_cancel(pthread)) + fatal_error(NULL, "pthread_cancel()"); + if (pthread_join(pthread, NULL)) + fatal_error(NULL, "pthread_join()"); + } + + /* Restore the handler */ + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + sleep(1); + + /* Unblock it, which should deliver the signal in the !thread case*/ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + if (!thread) { + ksft_test_result(tsig.signals == 1 && tsig.overruns == 29, + "check_sig_ign SIGEV_SIGNAL\n"); + } else { + ksft_test_result(tsig.signals == 0 && tsig.overruns == 0, + "check_sig_ign SIGEV_THREAD_ID\n"); + } +} + +static void check_rearm(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + /* Reprogram the timer to single shot */ + its.it_value.tv_sec = 10; + its.it_value.tv_nsec = 0; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 0; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + ksft_test_result(!tsig.signals, "check_rearm\n"); +} + +static void check_delete(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, 
"sigprocmask(SIG_UNBLOCK)"); + + ksft_test_result(!tsig.signals, "check_delete\n"); +} + +static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) +{ + int64_t diff; + + diff = NSECS_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); + diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); + return diff; +} + +static void check_sigev_none(int which, const char *name) +{ + struct timespec start, now; + struct itimerspec its; + struct sigevent sev; + timer_t timerid; + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_NONE; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(its.it_value.tv_sec || its.it_value.tv_nsec, + "check_sigev_none %s\n", name); +} + +static void check_gettime(int which, const char *name) +{ + struct itimerspec its, prev; + struct timespec start, now; + struct sigevent sev; + timer_t timerid; + int wraps = 0; + sigset_t set; + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (timer_gettime(timerid, &prev)) + fatal_error(name, "timer_gettime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + if (its.it_value.tv_nsec > prev.it_value.tv_nsec) + wraps++; + prev = its; + + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(wraps > 1, "check_gettime %s\n", name); +} + +static void check_overrun(int which, const char *name) +{ + struct timespec start, now; + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(name, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire 
in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + /* Unblock it, which should deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(tsig.signals == 1 && tsig.overruns == 9, + "check_overrun %s\n", name); } int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(6); + ksft_set_plan(18); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); - if (check_itimer(ITIMER_VIRTUAL) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_PROF) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_REAL) < 0) - ksft_exit_fail(); - - if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - ksft_exit_fail(); + check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); + check_itimer(ITIMER_PROF, "ITIMER_PROF"); + check_itimer(ITIMER_REAL, "ITIMER_REAL"); + check_timer_create(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); /* * It's unfortunately hard to reliably test a timer expiration @@ -280,11 +618,21 @@ int main(int argc, char **argv) * to ensure true parallelism. So test only one thread until we * find a better solution. */ - if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - ksft_exit_fail(); - - if (check_timer_distribution() < 0) - ksft_exit_fail(); + check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_timer_distribution(); + + check_sig_ign(0); + check_sig_ign(1); + check_rearm(); + check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); ksft_finished(); } diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index c8e6bffe4e0a..83450145fe65 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -36,8 +36,6 @@ #include <sys/wait.h> #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - int main(int argc, char **argv) { struct timex tx; diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 76b38e41d9c7..d5564bbf0e50 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -38,10 +38,10 @@ struct timespec global_list[LISTSIZE]; int listcount = 0; -void checklist(struct timespec *list, int size) +void checklist(const struct timespec *list, int size) { int i, j; - struct timespec *a, *b; + const struct timespec *a, *b; /* scan the list */ for (i = 0; i < size-1; i++) { diff --git 
a/tools/testing/selftests/tpm2/test_async.sh b/tools/testing/selftests/tpm2/test_async.sh index 43bf5bd772fd..cf5a9c826097 100755 --- a/tools/testing/selftests/tpm2/test_async.sh +++ b/tools/testing/selftests/tpm2/test_async.sh @@ -7,4 +7,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.AsyncTest +python3 -m unittest -v tpm2_tests.AsyncTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 58af963e5b55..20fa70f970a9 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SmokeTest +python3 -m unittest -v tpm2_tests.SmokeTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index 04c47b13fe8a..93894cbc89a8 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SpaceTest +python3 -m unittest -v tpm2_tests.SpaceTest 2>&1 diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py new file mode 100755 index 000000000000..9ab4aaf45fb8 --- /dev/null +++ b/tools/testing/selftests/turbostat/added_perf_counters.py @@ -0,0 +1,178 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +class PerfCounterInfo: + def __init__(self, subsys, event): + self.subsys = subsys + self.event = event + + def get_perf_event_name(self): + return f'{self.subsys}/{self.event}/' + + def get_turbostat_perf_id(self, counter_scope, counter_type, column_name): + return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}' + +PERF_COUNTERS_CANDIDATES = [ + PerfCounterInfo('msr', 'mperf'), + PerfCounterInfo('msr', 'aperf'), + PerfCounterInfo('msr', 'tsc'), + PerfCounterInfo('cstate_core', 'c1-residency'), + PerfCounterInfo('cstate_core', 'c6-residency'), + PerfCounterInfo('cstate_core', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c2-residency'), + PerfCounterInfo('cstate_pkg', 'c3-residency'), + PerfCounterInfo('cstate_pkg', 'c6-residency'), + PerfCounterInfo('cstate_pkg', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c8-residency'), + PerfCounterInfo('cstate_pkg', 'c9-residency'), + PerfCounterInfo('cstate_pkg', 'c10-residency'), +] +present_perf_counters = [] + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + if b'<not supported>' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + return True + + for counter in PERF_COUNTERS_CANDIDATES: + if has_perf_counter_access(counter.get_perf_event_name()): + present_perf_counters.append(counter) + + if len(present_perf_counters) == 0: + print('SKIP: Could not read any perf counter.') + return False + + if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES): + print(f'WARN: Could 
not access all of the counters - some will be left untested') + + return True + +if not check_perf_access(): + exit(0) + +turbostat_counter_source_opts = [''] + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC'] + +expected_columns = [b'CPU'] +counters_argv = [] +for counter in present_perf_counters: + if counter.subsys == 'cstate_core': + counter_scope = 'core' + elif counter.subsys == 'cstate_pkg': + counter_scope = 'package' + else: + counter_scope = 'cpu' + + counter_type = 'delta' + column_name = counter.event + + cparams = counter.get_turbostat_perf_id( + counter_scope = counter_scope, + counter_type = counter_type, + column_name = column_name + ) + expected_columns.append(column_name.encode()) + counters_argv.extend(['--add', cparams]) + +expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns + +def gen_user_friendly_cmdline(argv_): + argv = argv_[:] + ret = '' + + while len(argv) != 0: + arg = argv.pop(0) + arg_next = '' + + if arg in ('-i', '--show', '--add'): + arg_next = argv.pop(0) if len(argv) > 0 else '' + + ret += f'{arg} {arg_next} \\\n\t' + + # Remove the last separator and return + return ret[:-4] + +# +# Run turbostat for some time and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] +turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv + +def check_columns_or_fail(expected_columns: list, actual_columns: list): + if len(actual_columns) != len(expected_columns): + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + + failed = False + for expected_column in expected_columns: + if expected_column not in actual_columns: + print(f'turbostat column check failed: missing column {expected_column.decode()}') + failed = True + + if failed: + exit(1) + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns, actual_columns) +print('OK') + +# +# Same, but with --debug +# +# We explicitly specify '--show CPU' to make sure turbostat +# don't show a bunch of default counters instead. +# +turbostat_argv.append('--debug') + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... 
', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns_debug, actual_columns) +print('OK') diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py new file mode 100755 index 000000000000..6289cc47d5f0 --- /dev/null +++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py @@ -0,0 +1,157 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +# CDLL calls dlopen underneath. +# Calling it with None (null), we get handle to the our own image (python interpreter). +# We hope to find sched_getcpu() inside ;] +# This is a bit ugly, but helps shipping working software, so.. +try: + import ctypes + + this_image = ctypes.CDLL(None) + BASE_CPU = this_image.sched_getcpu() +except: + BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline. + +MSR_IA32_MPERF = 0x000000e7 +MSR_IA32_APERF = 0x000000e8 + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + if b'<not supported>' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + return True + + if not has_perf_counter_access('msr/mperf/'): + return False + if not has_perf_counter_access('msr/aperf/'): + return False + if not has_perf_counter_access('msr/smi/'): + return False + + return True + +def check_msr_access(): + try: + file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb') + except: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8: + return False + + return True + +has_perf_access = check_perf_access() +has_msr_access = check_msr_access() + +turbostat_counter_source_opts = [''] + +if has_msr_access: + turbostat_counter_source_opts.append('--no-perf') +else: + print('SKIP: doesn\'t have MSR access, skipping run with --no-perf') + +if has_perf_access: + turbostat_counter_source_opts.append('--no-msr') +else: + print('SKIP: doesn\'t have perf access, skipping run with --no-msr') + +if not has_msr_access and not has_perf_access: + print('SKIP: No MSR nor perf access detected. 
Skipping the tests entirely') + exit(0) + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC' + +SMI_APERF_MPERF_DEPENDENT_BICS = [ + 'SMI', + 'Avg_MHz', + 'Busy%', + 'Bzy_MHz', +] +if has_perf_access: + SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC') + +for bic in SMI_APERF_MPERF_DEPENDENT_BICS: + for counter_source_opt in turbostat_counter_source_opts: + + # Ugly special case, but it is what it is.. + if counter_source_opt == '--no-perf' and bic == 'IPC': + continue + + expected_columns = bic.encode() + expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode() + + # + # Run turbostat for some time and send SIGINT + # + timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] + turbostat_argv = [turbostat, '-i', '0.50', '--show', bic] + + if counter_source_opt: + turbostat_argv.append(counter_source_opt) + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + print('OK') + + # + # Same, but with --debug + # + turbostat_argv.append('--debug') + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) + print('OK') diff --git a/tools/testing/selftests/user/Makefile b/tools/testing/selftests/user/Makefile deleted file mode 100644 index 640a40f9b72b..000000000000 --- a/tools/testing/selftests/user/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile for user memory selftests - -# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" -all: - -TEST_PROGS := test_user_copy.sh - -include ../lib.mk diff --git a/tools/testing/selftests/user/config b/tools/testing/selftests/user/config deleted file mode 100644 index 784ed8416324..000000000000 --- a/tools/testing/selftests/user/config +++ /dev/null @@ -1 +0,0 @@ -CONFIG_TEST_USER_COPY=m diff --git a/tools/testing/selftests/user/test_user_copy.sh b/tools/testing/selftests/user/test_user_copy.sh deleted file mode 100755 index f9b31a57439b..000000000000 --- a/tools/testing/selftests/user/test_user_copy.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Runs copy_to/from_user infrastructure using test_user_copy kernel module - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -if ! 
/sbin/modprobe -q -n test_user_copy; then - echo "user: module test_user_copy is not found [SKIP]" - exit $ksft_skip -fi -if /sbin/modprobe -q test_user_copy; then - /sbin/modprobe -q -r test_user_copy - echo "user_copy: ok" -else - echo "user_copy: [FAIL]" - exit 1 -fi diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index a8dc51af5a9c..30d5c8f0e5c7 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -6,3 +6,5 @@ vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu vdso_standalone_test_x86 +vdso_test_getrandom +vdso_test_chacha diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 98d8ba2afa00..3de8e7e052ae 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu @@ -10,6 +11,12 @@ ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif TEST_GEN_PROGS += vdso_test_correctness +ifeq ($(uname_M),x86_64) +TEST_GEN_PROGS += vdso_test_getrandom +ifneq ($(SODIUM),) +TEST_GEN_PROGS += vdso_test_chacha +endif +endif CFLAGS := -std=gnu99 @@ -28,3 +35,14 @@ $(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind- $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl + +$(OUTPUT)/vdso_test_getrandom: parse_vdso.c +$(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ + -isystem $(top_srcdir)/include/uapi + +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ + -isystem $(top_srcdir)/arch/$(ARCH)/include \ + -isystem $(top_srcdir)/include \ + -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ + -Wa,--noexecstack $(SODIUM) diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c new file mode 100644 index 000000000000..e38f44e5f803 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
+ */ + +#include <sodium/crypto_stream_chacha20.h> +#include <sys/random.h> +#include <string.h> +#include <stdint.h> +#include "../kselftest.h" + +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); + +int main(int argc, char *argv[]) +{ + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; + static const uint8_t nonce[8] = { 0 }; + uint32_t counter[2]; + uint8_t key[32]; + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; + + ksft_print_header(); + ksft_set_plan(1); + + for (unsigned int trial = 0; trial < TRIALS; ++trial) { + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { + printf("getrandom() failed!\n"); + return KSFT_SKIP; + } + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); + for (unsigned int split = 0; split < BLOCKS; ++split) { + memset(output2, 'X', sizeof(output2)); + memset(counter, 0, sizeof(counter)); + if (split) + __arch_chacha20_blocks_nostack(output2, key, counter, split); + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); + if (memcmp(output1, output2, sizeof(output1))) + return KSFT_FAIL; + } + } + ksft_test_result_pass("chacha: PASS\n"); + return KSFT_PASS; +} diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c new file mode 100644 index 000000000000..05122425a873 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <signal.h> +#include <sys/auxv.h> +#include <sys/mman.h> +#include <sys/random.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <linux/random.h> + +#include "../kselftest.h" +#include "parse_vdso.h" + +#ifndef timespecsub +#define timespecsub(tsp, usp, vsp) \ + do { \ + (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ + if ((vsp)->tv_nsec < 0) { \ + (vsp)->tv_sec--; \ + (vsp)->tv_nsec += 1000000000L; \ + } \ + } while (0) +#endif + +static struct { + pthread_mutex_t lock; + void **states; + size_t len, cap; +} grnd_allocator = { + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +static struct { + ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t); + pthread_key_t key; + pthread_once_t initialized; + struct vgetrandom_opaque_params params; +} grnd_ctx = { + .initialized = PTHREAD_ONCE_INIT +}; + +static void *vgetrandom_get_state(void) +{ + void *state = NULL; + + pthread_mutex_lock(&grnd_allocator.lock); + if (!grnd_allocator.len) { + size_t page_size = getpagesize(); + size_t new_cap; + size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. 
*/ + void *new_block, *new_states; + + alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); + num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size); + new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0); + if (new_block == MAP_FAILED) + goto out; + + new_cap = grnd_allocator.cap + num; + new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states)); + if (!new_states) + goto unmap; + grnd_allocator.cap = new_cap; + grnd_allocator.states = new_states; + + for (size_t i = 0; i < num; ++i) { + if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size) + new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1))); + grnd_allocator.states[i] = new_block; + new_block += grnd_ctx.params.size_of_opaque_state; + } + grnd_allocator.len = num; + goto success; + + unmap: + munmap(new_block, alloc_size); + goto out; + } +success: + state = grnd_allocator.states[--grnd_allocator.len]; + +out: + pthread_mutex_unlock(&grnd_allocator.lock); + return state; +} + +static void vgetrandom_put_state(void *state) +{ + if (!state) + return; + pthread_mutex_lock(&grnd_allocator.lock); + grnd_allocator.states[grnd_allocator.len++] = state; + pthread_mutex_unlock(&grnd_allocator.lock); +} + +static void vgetrandom_init(void) +{ + if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0) + return; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + exit(KSFT_SKIP); + } + vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); + grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + if (!grnd_ctx.fn) { + printf("__vdso_getrandom is missing!\n"); + exit(KSFT_FAIL); + } + if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { + printf("failed to fetch vgetrandom params!\n"); + exit(KSFT_FAIL); + } +} + +static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) +{ + void *state; + + pthread_once(&grnd_ctx.initialized, vgetrandom_init); + state = pthread_getspecific(grnd_ctx.key); + if (!state) { + state = vgetrandom_get_state(); + if (pthread_setspecific(grnd_ctx.key, state) != 0) { + vgetrandom_put_state(state); + state = NULL; + } + if (!state) { + printf("vgetrandom_get_state failed!\n"); + exit(KSFT_FAIL); + } + } + return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state); +} + +enum { TRIALS = 25000000, THREADS = 256 }; + +static void *test_vdso_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = vgetrandom(&val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void *test_libc_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = getrandom(&val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void *test_syscall_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = syscall(__NR_getrandom, &val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void bench_single(void) +{ + struct timespec start, end, diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + test_vdso_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" vdso: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); + + 
clock_gettime(CLOCK_MONOTONIC, &start); + test_libc_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" libc: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + test_syscall_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf("syscall: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); +} + +static void bench_multi(void) +{ + struct timespec start, end, diff; + pthread_t threads[THREADS]; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_vdso_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" vdso: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_libc_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" libc: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_syscall_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" syscall: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); +} + +static void fill(void) +{ + uint8_t weird_size[323929]; + for (;;) + vgetrandom(weird_size, sizeof(weird_size), 0); +} + +static void kselftest(void) +{ + uint8_t weird_size[1263]; + + ksft_print_header(); + ksft_set_plan(1); + + for (size_t i = 0; i < 1000; ++i) { + ssize_t ret = vgetrandom(weird_size, sizeof(weird_size), 0); + if (ret != sizeof(weird_size)) + exit(KSFT_FAIL); + } + + ksft_test_result_pass("getrandom: PASS\n"); + exit(KSFT_PASS); +} + +static void usage(const char *argv0) +{ + fprintf(stderr, "Usage: %s [bench-single|bench-multi|fill]\n", argv0); +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) { + kselftest(); + return 0; + } + + if (argc != 2) { + usage(argv[0]); + return 1; + } + if (!strcmp(argv[1], "bench-single")) + bench_single(); + else if (!strcmp(argv[1], "bench-multi")) + bench_multi(); + else if (!strcmp(argv[1], "fill")) + fill(); + else { + usage(argv[0]); + return 1; + } + return 0; +} diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5c8757a25998..d51249f14e2f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -77,7 +77,7 @@ all_32: $(BINARIES_32) all_64: $(BINARIES_64) -EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) +EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) srso $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $< $(EXTRA_FILES) -lrt -ldl -lm diff --git a/tools/testing/selftests/x86/srso.c b/tools/testing/selftests/x86/srso.c new file mode 100644 index 000000000000..394ec8bdeb00 --- /dev/null +++ b/tools/testing/selftests/x86/srso.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/perf_event.h> +#include 
<cpuid.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <unistd.h> + +int main(void) +{ + struct perf_event_attr ret_attr, mret_attr; + long long count_rets, count_rets_mispred; + int rrets_fd, mrrets_fd; + unsigned int cpuid1_eax, b, c, d; + + __cpuid(1, cpuid1_eax, b, c, d); + + if (cpuid1_eax < 0x00800f00 || + cpuid1_eax > 0x00afffff) { + fprintf(stderr, "This needs to run on a Zen[1-4] machine (CPUID(1).EAX: 0x%x). Exiting...\n", cpuid1_eax); + exit(EXIT_FAILURE); + } + + memset(&ret_attr, 0, sizeof(struct perf_event_attr)); + memset(&mret_attr, 0, sizeof(struct perf_event_attr)); + + ret_attr.type = mret_attr.type = PERF_TYPE_RAW; + ret_attr.size = mret_attr.size = sizeof(struct perf_event_attr); + ret_attr.config = 0xc8; + mret_attr.config = 0xc9; + ret_attr.disabled = mret_attr.disabled = 1; + ret_attr.exclude_user = mret_attr.exclude_user = 1; + ret_attr.exclude_hv = mret_attr.exclude_hv = 1; + + rrets_fd = syscall(SYS_perf_event_open, &ret_attr, 0, -1, -1, 0); + if (rrets_fd == -1) { + perror("opening retired RETs fd"); + exit(EXIT_FAILURE); + } + + mrrets_fd = syscall(SYS_perf_event_open, &mret_attr, 0, -1, -1, 0); + if (mrrets_fd == -1) { + perror("opening retired mispredicted RETs fd"); + exit(EXIT_FAILURE); + } + + ioctl(rrets_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_RESET, 0); + + ioctl(rrets_fd, PERF_EVENT_IOC_ENABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_ENABLE, 0); + + printf("Sleeping for 10 seconds\n"); + sleep(10); + + ioctl(rrets_fd, PERF_EVENT_IOC_DISABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_DISABLE, 0); + + read(rrets_fd, &count_rets, sizeof(long long)); + read(mrrets_fd, &count_rets_mispred, sizeof(long long)); + + printf("RETs: (%lld retired <-> %lld mispredicted)\n", + count_rets, count_rets_mispred); + printf("SRSO Safe-RET mitigation works correctly if both counts are almost equal.\n"); + + return 0; +} diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 554b290fefdc..a3d448a075e3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po } /* Connect to <cid, port> and return the file descriptor. */ -static int vsock_connect(unsigned int cid, unsigned int port, int type) +int vsock_connect(unsigned int cid, unsigned int port, int type) { union { struct sockaddr sa; @@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type) /* Listen on <cid, port> and return the first incoming connection. The remote * address is stored to clientaddrp. clientaddrp may be NULL. 
*/ -static int vsock_accept(unsigned int cid, unsigned int port, - struct sockaddr_vm *clientaddrp, int type) +int vsock_accept(unsigned int cid, unsigned int port, + struct sockaddr_vm *clientaddrp, int type) { union { struct sockaddr sa; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index e95e62485959..fff22d4a14c0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -39,6 +39,9 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); unsigned int parse_port(const char *str); +int vsock_connect(unsigned int cid, unsigned int port, int type); +int vsock_accept(unsigned int cid, unsigned int port, + struct sockaddr_vm *clientaddrp, int type); int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index f851f8961247..8d38dbf8f41f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -20,6 +20,8 @@ #include <sys/mman.h> #include <poll.h> #include <signal.h> +#include <sys/ioctl.h> +#include <linux/sockios.h> #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define MSG_BUF_IOCTL_LEN 64 +static void test_unsent_bytes_server(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int client_fd; + + client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type); + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf)); + control_writeln("RECEIVED"); + + close(client_fd); +} + +static void test_unsent_bytes_client(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int ret, fd, sock_bytes_unsent; + + fd = vsock_connect(opts->peer_cid, opts->peer_port, type); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < sizeof(buf); i++) + buf[i] = rand() & 0xFF; + + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); + control_expectln("RECEIVED"); + + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); + } else { + perror("ioctl"); + exit(EXIT_FAILURE); + } + } else if (ret == 0 && sock_bytes_unsent != 0) { + fprintf(stderr, + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", + sock_bytes_unsent); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_STREAM); +} + +static void test_stream_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_STREAM); +} + +static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_SEQPACKET); +} + +static void test_seqpacket_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_SEQPACKET); +} + #define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) /* This define is the same as in 'include/linux/virtio_vsock.h': * it is used to decide when to send credit update message during @@ -1523,6 +1598,16 @@ static struct test_case test_cases[] = { .run_client = test_stream_rcvlowat_def_cred_upd_client, .run_server = test_stream_cred_upd_on_low_rx_bytes, }, + { + .name = "SOCK_STREAM 
ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_stream_unsent_bytes_client, + .run_server = test_stream_unsent_bytes_server, + }, + { + .name = "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_seqpacket_unsent_bytes_client, + .run_server = test_seqpacket_unsent_bytes_server, + }, {}, }; diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config index b25e531a1f95..0fe6b50f029b 100644 --- a/tools/tracing/latency/Makefile.config +++ b/tools/tracing/latency/Makefile.config @@ -3,8 +3,9 @@ STOP_ERROR := define lib_setup - $(eval EXTLIBS += -l$(1)) $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) + $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)")) + $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)")) endef $(call feature_check,libtraceevent) diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 0b7ecfb30d19..5f8c286712d4 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -7,7 +7,8 @@ LIBTRACEFS_MIN_VERSION = 1.6 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) - $(eval EXTLIBS += -l$(1)) + $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)")) + $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)")) endef $(call feature_check,libtraceevent) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index f594a44df840..2f756628613d 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -651,8 +651,10 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) return NULL; tool->data = osnoise_alloc_top(nr_cpus); - if (!tool->data) - goto out_err; + if (!tool->data) { + osnoise_destroy_tool(tool); + return NULL; + } tool->params = params; @@ -660,11 +662,6 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) osnoise_top_handler, NULL); return tool; - -out_err: - osnoise_free_top(tool->data); - osnoise_destroy_tool(tool); - return NULL; } static int stop_tracing; diff --git a/tools/verification/rv/Makefile.config b/tools/verification/rv/Makefile.config index 6d4ba77847b6..066302230eb2 100644 --- a/tools/verification/rv/Makefile.config +++ b/tools/verification/rv/Makefile.config @@ -7,7 +7,8 @@ LIBTRACEFS_MIN_VERSION = 1.3 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) - $(eval EXTLIBS += -l$(1)) + $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)")) + $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)")) endef $(call feature_check,libtraceevent) |
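
The three Makefile.config hunks above (tools/tracing/latency, tools/tracing/rtla, tools/verification/rv) make the same change: instead of hard-coding -l$(1) into EXTLIBS, lib_setup now asks pkg-config for the library's link flags, putting the -L search paths into LDFLAGS and the -l names into EXTLIBS, so the tools link correctly when libtraceevent/libtracefs are installed under a non-default prefix. A minimal sketch of the resulting pattern, assuming PKG_CONFIG, LDFLAGS and EXTLIBS are defined elsewhere as in those files (the example call and the expanded paths below are illustrative, not taken from the tree):

	PKG_CONFIG ?= pkg-config

	define lib_setup
		$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
		$(eval LDFLAGS      += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
		$(eval EXTLIBS      += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)"))
	endef

	# For example, $(call lib_setup,tracefs) might expand to something like:
	#   LIB_INCLUDES += -I/usr/include/libtracefs     (from --cflags)
	#   LDFLAGS      += -L/opt/tracefs/lib            (from --libs-only-L, only for non-default prefixes)
	#   EXTLIBS      += -ltracefs                     (from --libs-only-l)
	# whereas the previous definition appended only -ltracefs and dropped any -L path.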