-rw-r--r--  CREDITS | 13
-rw-r--r--  Documentation/dev-tools/kunit/usage.rst | 19
-rw-r--r--  MAINTAINERS | 14
-rw-r--r--  arch/Kconfig | 1
-rw-r--r--  arch/x86/include/asm/kmsan.h | 17
-rw-r--r--  drivers/crypto/caam/caamalg_qi2.c | 7
-rw-r--r--  drivers/crypto/caam/caamhash.c | 7
-rw-r--r--  drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c | 1
-rw-r--r--  drivers/regulator/max5970-regulator.c | 2
-rw-r--r--  drivers/regulator/pwm-regulator.c | 43
-rw-r--r--  drivers/regulator/ti-abb-regulator.c | 22
-rw-r--r--  drivers/scsi/initio.c | 3
-rw-r--r--  drivers/scsi/isci/request.c | 2
-rw-r--r--  drivers/scsi/scsi_error.c | 8
-rw-r--r--  drivers/scsi/scsi_lib.c | 2
-rw-r--r--  drivers/scsi/scsi_priv.h | 2
-rw-r--r--  drivers/scsi/storvsc_drv.c | 12
-rw-r--r--  drivers/scsi/virtio_scsi.c | 2
-rw-r--r--  drivers/soc/apple/mailbox.c | 6
-rw-r--r--  drivers/spi/spi-sh-msiof.c | 16
-rw-r--r--  fs/erofs/compress.h | 5
-rw-r--r--  fs/erofs/decompressor.c | 5
-rw-r--r--  fs/erofs/decompressor_deflate.c | 19
-rw-r--r--  fs/erofs/decompressor_lzma.c | 17
-rw-r--r--  fs/erofs/fscache.c | 2
-rw-r--r--  fs/erofs/inode.c | 2
-rw-r--r--  fs/erofs/utils.c | 2
-rw-r--r--  fs/erofs/zdata.c | 98
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 8
-rw-r--r--  fs/tracefs/event_inode.c | 38
-rw-r--r--  fs/tracefs/internal.h | 1
-rw-r--r--  include/linux/lsm_hook_defs.h | 4
-rw-r--r--  include/linux/mman.h | 1
-rw-r--r--  include/linux/mmzone.h | 6
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  lib/kunit/device.c | 4
-rw-r--r--  lib/kunit/executor.c | 4
-rw-r--r--  lib/kunit/kunit-test.c | 2
-rw-r--r--  lib/kunit/test.c | 14
-rw-r--r--  lib/stackdepot.c | 373
-rw-r--r--  mm/huge_memory.c | 18
-rw-r--r--  mm/memcontrol.c | 29
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 2
-rw-r--r--  mm/mmap.c | 6
-rw-r--r--  mm/page-writeback.c | 2
-rw-r--r--  mm/readahead.c | 4
-rw-r--r--  mm/userfaultfd.c | 15
-rw-r--r--  security/security.c | 45
-rw-r--r--  tools/testing/selftests/livepatch/functions.sh | 37
-rwxr-xr-x  tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 2
-rw-r--r--  tools/testing/selftests/mm/ksm_tests.c | 2
-rw-r--r--  tools/testing/selftests/mm/map_hugetlb.c | 7
-rw-r--r--  tools/testing/selftests/mm/mremap_test.c | 27
-rwxr-xr-x  tools/testing/selftests/mm/va_high_addr_switch.sh | 6
-rwxr-xr-x  tools/testing/selftests/mm/write_hugetlb_memory.sh | 2
-rw-r--r--  tools/testing/selftests/rseq/basic_percpu_ops_test.c | 14
-rw-r--r--  tools/testing/selftests/rseq/param_test.c | 22
-rw-r--r--  tools/testing/selftests/seccomp/seccomp_benchmark.c | 104
61 files changed, 774 insertions, 386 deletions
diff --git a/CREDITS b/CREDITS
index 5797e8f7e92b..df8d6946739f 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2161,6 +2161,19 @@ N: Mike Kravetz
D: Maintenance and development of the hugetlb subsystem
+N: Seth Jennings
+D: Creation and maintenance of zswap
+
+N: Dan Streetman
+D: Maintenance and development of zswap
+D: Creation and maintenance of the zpool API
+
+N: Vitaly Wool
+D: Maintenance and development of zswap
+
N: Andreas S. Krebs
D: CYPRESS CY82C693 chipset IDE, Digital's PC-Alpha 164SX boards
diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index a9efab50eed8..22955d56b379 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -671,8 +671,23 @@ Testing Static Functions
------------------------
If we do not want to expose functions or variables for testing, one option is to
-conditionally ``#include`` the test file at the end of your .c file. For
-example:
+conditionally export the used symbol. For example:
+
+.. code-block:: c
+
+ /* In my_file.c */
+
+ VISIBLE_IF_KUNIT int do_interesting_thing();
+ EXPORT_SYMBOL_IF_KUNIT(do_interesting_thing);
+
+ /* In my_file.h */
+
+ #if IS_ENABLED(CONFIG_KUNIT)
+ int do_interesting_thing(void);
+ #endif
+
+Alternatively, you could conditionally ``#include`` the test file at the end of
+your .c file. For example:
.. code-block:: c
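As background for the VISIBLE_IF_KUNIT / EXPORT_SYMBOL_IF_KUNIT pair used in the
new example: the sketch below approximates how include/kunit/visibility.h is
expected to define them. Check the tree for the authoritative version; in
particular the namespace name is quoted from memory and should be treated as an
assumption.

        /* Approximate sketch of include/kunit/visibility.h */
        #if IS_ENABLED(CONFIG_KUNIT)
        /* Keep the symbol non-static so test code can link against it ... */
        #define VISIBLE_IF_KUNIT
        /* ... and export it into a namespace reserved for KUnit tests. */
        #define EXPORT_SYMBOL_IF_KUNIT(symbol) \
                EXPORT_SYMBOL_NS(symbol, EXPORTED_FOR_KUNIT_TESTING)
        #else
        /* Without KUnit the symbol stays private to its translation unit. */
        #define VISIBLE_IF_KUNIT static
        #define EXPORT_SYMBOL_IF_KUNIT(symbol)
        #endif

A test module consuming such a symbol would additionally need
MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING).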
diff --git a/MAINTAINERS b/MAINTAINERS
index 8999497011a2..61117c3afa80 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10283,7 +10283,7 @@ F: drivers/scsi/ibmvscsi/ibmvscsi*
F: include/scsi/viosrp.h
IBM Power Virtual SCSI Device Target Driver
-M: Michael Cyr <[email protected]>
+M: Tyrel Datwyler <[email protected]>
S: Supported
@@ -11725,6 +11725,7 @@ F: fs/smb/server/
KERNEL UNIT TESTING FRAMEWORK (KUnit)
M: Brendan Higgins <[email protected]>
M: David Gow <[email protected]>
+R: Rae Moar <[email protected]>
S: Maintained
@@ -12903,6 +12904,8 @@ M: Alejandro Colomar <[email protected]>
S: Maintained
W: http://www.kernel.org/doc/man-pages
+T: git git://git.kernel.org/pub/scm/docs/man-pages/man-pages.git
+T: git git://www.alejandro-colomar.es/src/alx/linux/man-pages/man-pages.git
MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP)
M: Jeremy Kerr <[email protected]>
@@ -24341,13 +24344,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git
F: Documentation/filesystems/zonefs.rst
F: fs/zonefs/
-ZPOOL COMPRESSED PAGE STORAGE API
-M: Dan Streetman <[email protected]>
-S: Maintained
-F: include/linux/zpool.h
-F: mm/zpool.c
-
ZR36067 VIDEO FOR LINUX DRIVER
M: Corentin Labbe <[email protected]>
@@ -24399,7 +24395,9 @@ M: Nhat Pham <[email protected]>
S: Maintained
F: Documentation/admin-guide/mm/zswap.rst
+F: include/linux/zpool.h
F: include/linux/zswap.h
+F: mm/zpool.c
F: mm/zswap.c
THE REST
diff --git a/arch/Kconfig b/arch/Kconfig
index c91917b50873..a5af0edd3eb8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -673,6 +673,7 @@ config SHADOW_CALL_STACK
bool "Shadow Call Stack"
depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
depends on DYNAMIC_FTRACE_WITH_ARGS || DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER
+ depends on MMU
help
This option enables the compiler's Shadow Call Stack, which
uses a shadow stack to protect function return addresses from
diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h
index 8fa6ac0e2d76..d91b37f5b4bb 100644
--- a/arch/x86/include/asm/kmsan.h
+++ b/arch/x86/include/asm/kmsan.h
@@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr)
{
unsigned long x = (unsigned long)addr;
unsigned long y = x - __START_KERNEL_map;
+ bool ret;
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
@@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr)
return false;
}
- return pfn_valid(x >> PAGE_SHIFT);
+ /*
+ * pfn_valid() relies on RCU, and may call into the scheduler on exiting
+ * the critical section. However, this would result in recursion with
+ * KMSAN. Therefore, disable preemption here, and re-enable preemption
+ * below while suppressing reschedules to avoid recursion.
+ *
+ * Note, this sacrifices occasionally breaking scheduling guarantees.
+ * Although, a kernel compiled with KMSAN has already given up on any
+ * performance guarantees due to being heavily instrumented.
+ */
+ preempt_disable();
+ ret = pfn_valid(x >> PAGE_SHIFT);
+ preempt_enable_no_resched();
+
+ return ret;
}
#endif /* !MODULE */
diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c
index a148ff1f0872..a4f6884416a0 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -4545,6 +4545,7 @@ struct caam_hash_alg {
struct list_head entry;
struct device *dev;
int alg_type;
+ bool is_hmac;
struct ahash_alg ahash_alg;
};
@@ -4571,7 +4572,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
ctx->dev = caam_hash->dev;
- if (alg->setkey) {
+ if (caam_hash->is_hmac) {
ctx->adata.key_dma = dma_map_single_attrs(ctx->dev, ctx->key,
ARRAY_SIZE(ctx->key),
DMA_TO_DEVICE,
@@ -4611,7 +4612,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
* For keyed hash algorithms shared descriptors
* will be created later in setkey() callback
*/
- return alg->setkey ? 0 : ahash_set_sh_desc(ahash);
+ return caam_hash->is_hmac ? 0 : ahash_set_sh_desc(ahash);
}
static void caam_hash_cra_exit(struct crypto_tfm *tfm)
@@ -4646,12 +4647,14 @@ static struct caam_hash_alg *caam_hash_alloc(struct device *dev,
template->hmac_name);
snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
template->hmac_driver_name);
+ t_alg->is_hmac = true;
} else {
snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s",
template->name);
snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
template->driver_name);
t_alg->ahash_alg.setkey = NULL;
+ t_alg->is_hmac = false;
}
alg->cra_module = THIS_MODULE;
alg->cra_init = caam_hash_cra_init;
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 290c8500c247..fdd724228c2f 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -1753,6 +1753,7 @@ static struct caam_hash_template driver_hash[] = {
struct caam_hash_alg {
struct list_head entry;
int alg_type;
+ bool is_hmac;
struct ahash_engine_alg ahash_alg;
};
@@ -1804,7 +1805,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
} else {
if (priv->era >= 6) {
ctx->dir = DMA_BIDIRECTIONAL;
- ctx->key_dir = alg->setkey ? DMA_TO_DEVICE : DMA_NONE;
+ ctx->key_dir = caam_hash->is_hmac ? DMA_TO_DEVICE : DMA_NONE;
} else {
ctx->dir = DMA_TO_DEVICE;
ctx->key_dir = DMA_NONE;
@@ -1862,7 +1863,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
* For keyed hash algorithms shared descriptors
* will be created later in setkey() callback
*/
- return alg->setkey ? 0 : ahash_set_sh_desc(ahash);
+ return caam_hash->is_hmac ? 0 : ahash_set_sh_desc(ahash);
}
static void caam_hash_cra_exit(struct crypto_tfm *tfm)
@@ -1915,12 +1916,14 @@ caam_hash_alloc(struct caam_hash_template *template,
template->hmac_name);
snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
template->hmac_driver_name);
+ t_alg->is_hmac = true;
} else {
snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s",
template->name);
snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
template->driver_name);
halg->setkey = NULL;
+ t_alg->is_hmac = false;
}
alg->cra_module = THIS_MODULE;
alg->cra_init = caam_hash_cra_init;
diff --git a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
index 479062aa5e6b..94a0ebb03d8c 100644
--- a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
+++ b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
@@ -463,6 +463,7 @@ void adf_init_hw_data_4xxx(struct adf_hw_device_data *hw_data, u32 dev_id)
hw_data->fw_name = ADF_402XX_FW;
hw_data->fw_mmp_name = ADF_402XX_MMP;
hw_data->uof_get_name = uof_get_name_402xx;
+ hw_data->get_ena_thd_mask = get_ena_thd_mask;
break;
case ADF_401XX_PCI_DEVICE_ID:
hw_data->fw_name = ADF_4XXX_FW;
diff --git a/drivers/regulator/max5970-regulator.c b/drivers/regulator/max5970-regulator.c
index bc88a40a88d4..830a1c4cd705 100644
--- a/drivers/regulator/max5970-regulator.c
+++ b/drivers/regulator/max5970-regulator.c
@@ -392,7 +392,7 @@ static int max597x_regmap_read_clear(struct regmap *map, unsigned int reg,
return ret;
if (*val)
- return regmap_write(map, reg, *val);
+ return regmap_write(map, reg, 0);
return 0;
}
diff --git a/drivers/regulator/pwm-regulator.c b/drivers/regulator/pwm-regulator.c
index 698c420e0869..60cfcd741c2a 100644
--- a/drivers/regulator/pwm-regulator.c
+++ b/drivers/regulator/pwm-regulator.c
@@ -157,7 +157,17 @@ static int pwm_regulator_get_voltage(struct regulator_dev *rdev)
pwm_get_state(drvdata->pwm, &pstate);
+ if (!pstate.enabled) {
+ if (pstate.polarity == PWM_POLARITY_INVERSED)
+ pstate.duty_cycle = pstate.period;
+ else
+ pstate.duty_cycle = 0;
+ }
+
voltage = pwm_get_relative_duty_cycle(&pstate, duty_unit);
+ if (voltage < min(max_uV_duty, min_uV_duty) ||
+ voltage > max(max_uV_duty, min_uV_duty))
+ return -ENOTRECOVERABLE;
/*
* The dutycycle for min_uV might be greater than the one for max_uV.
@@ -313,6 +323,32 @@ static int pwm_regulator_init_continuous(struct platform_device *pdev,
return 0;
}
+static int pwm_regulator_init_boot_on(struct platform_device *pdev,
+ struct pwm_regulator_data *drvdata,
+ const struct regulator_init_data *init_data)
+{
+ struct pwm_state pstate;
+
+ if (!init_data->constraints.boot_on || drvdata->enb_gpio)
+ return 0;
+
+ pwm_get_state(drvdata->pwm, &pstate);
+ if (pstate.enabled)
+ return 0;
+
+ /*
+ * Update the duty cycle so the output does not change
+ * when the regulator core enables the regulator (and
+ * thus the PWM channel).
+ */
+ if (pstate.polarity == PWM_POLARITY_INVERSED)
+ pstate.duty_cycle = pstate.period;
+ else
+ pstate.duty_cycle = 0;
+
+ return pwm_apply_might_sleep(drvdata->pwm, &pstate);
+}
+
static int pwm_regulator_probe(struct platform_device *pdev)
{
const struct regulator_init_data *init_data;
@@ -372,6 +408,13 @@ static int pwm_regulator_probe(struct platform_device *pdev)
if (ret)
return ret;
+ ret = pwm_regulator_init_boot_on(pdev, drvdata, init_data);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to apply boot_on settings: %d\n",
+ ret);
+ return ret;
+ }
+
regulator = devm_regulator_register(&pdev->dev,
&drvdata->desc, &config);
if (IS_ERR(regulator)) {
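A note on the duty-cycle handling added above: a disabled PWM drives its
inactive level regardless of the programmed duty cycle, so both new paths
normalise the state to 0% (or 100% for inversed polarity) before converting it.
The conversion helper is roughly the following (a sketch from memory; see
include/linux/pwm.h for the real definition):

        static inline unsigned int
        pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale)
        {
                if (!state->period)
                        return 0;

                /* Scale duty_cycle/period into the caller-provided unit. */
                return DIV_ROUND_CLOSEST_ULL((u64)state->duty_cycle * scale,
                                             state->period);
        }

The scaled value is what pwm_regulator_get_voltage() then maps onto the
min_uV_duty..max_uV_duty range, which is why an out-of-range result is reported
as -ENOTRECOVERABLE.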
diff --git a/drivers/regulator/ti-abb-regulator.c b/drivers/regulator/ti-abb-regulator.c
index f48214e2c3b4..04133510e5af 100644
--- a/drivers/regulator/ti-abb-regulator.c
+++ b/drivers/regulator/ti-abb-regulator.c
@@ -726,9 +726,25 @@ static int ti_abb_probe(struct platform_device *pdev)
return PTR_ERR(abb->setup_reg);
}
- abb->int_base = devm_platform_ioremap_resource_byname(pdev, "int-address");
- if (IS_ERR(abb->int_base))
- return PTR_ERR(abb->int_base);
+ pname = "int-address";
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, pname);
+ if (!res) {
+ dev_err(dev, "Missing '%s' IO resource\n", pname);
+ return -ENODEV;
+ }
+ /*
+ * The MPU interrupt status register (PRM_IRQSTATUS_MPU) is
+ * shared between regulator-abb-{ivahd,dspeve,gpu} driver
+ * instances. Therefore use devm_ioremap() rather than
+ * devm_platform_ioremap_resource_byname() to avoid busy
+ * resource region conflicts.
+ */
+ abb->int_base = devm_ioremap(dev, res->start,
+ resource_size(res));
+ if (!abb->int_base) {
+ dev_err(dev, "Unable to map '%s'\n", pname);
+ return -ENOMEM;
+ }
/* Map Optional resources */
pname = "efuse-address";
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index 2a50fda3a628..625fd547ee60 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -371,7 +371,6 @@ static u16 initio_se2_rd(unsigned long base, u8 addr)
*/
static void initio_se2_wr(unsigned long base, u8 addr, u16 val)
{
- u8 rb;
u8 instr;
int i;
@@ -400,7 +399,7 @@ static void initio_se2_wr(unsigned long base, u8 addr, u16 val)
udelay(30);
outb(SE2CS, base + TUL_NVRAM); /* -CLK */
udelay(30);
- if ((rb = inb(base + TUL_NVRAM)) & SE2DI)
+ if (inb(base + TUL_NVRAM) & SE2DI)
break; /* write complete */
}
outb(0, base + TUL_NVRAM); /* -CS */
diff --git a/drivers/scsi/isci/request.c b/drivers/scsi/isci/request.c
index 71f711cb0628..355a0bc0828e 100644
--- a/drivers/scsi/isci/request.c
+++ b/drivers/scsi/isci/request.c
@@ -3387,7 +3387,7 @@ static enum sci_status isci_io_request_build(struct isci_host *ihost,
return SCI_FAILURE;
}
- return SCI_SUCCESS;
+ return status;
}
static struct isci_request *isci_request_from_tag(struct isci_host *ihost, u16 tag)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 79da4b1c1df0..4f455884fdc4 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -61,11 +61,11 @@ static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *,
struct scsi_cmnd *);
-void scsi_eh_wakeup(struct Scsi_Host *shost)
+void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy)
{
lockdep_assert_held(shost->host_lock);
- if (scsi_host_busy(shost) == shost->host_failed) {
+ if (busy == shost->host_failed) {
trace_scsi_eh_wakeup(shost);
wake_up_process(shost->ehandler);
SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
@@ -88,7 +88,7 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
shost->host_eh_scheduled++;
- scsi_eh_wakeup(shost);
+ scsi_eh_wakeup(shost, scsi_host_busy(shost));
}
spin_unlock_irqrestore(shost->host_lock, flags);
@@ -286,7 +286,7 @@ static void scsi_eh_inc_host_failed(struct rcu_head *head)
spin_lock_irqsave(shost->host_lock, flags);
shost->host_failed++;
- scsi_eh_wakeup(shost);
+ scsi_eh_wakeup(shost, scsi_host_busy(shost));
spin_unlock_irqrestore(shost->host_lock, flags);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index cf3864f72093..1fb80eae9a63 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -280,7 +280,7 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
if (unlikely(scsi_host_in_recovery(shost))) {
spin_lock_irqsave(shost->host_lock, flags);
if (shost->host_failed || shost->host_eh_scheduled)
- scsi_eh_wakeup(shost);
+ scsi_eh_wakeup(shost, scsi_host_busy(shost));
spin_unlock_irqrestore(shost->host_lock, flags);
}
rcu_read_unlock();
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 3f0dfb97db6b..1fbfe1b52c9f 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -92,7 +92,7 @@ extern void scmd_eh_abort_handler(struct work_struct *work);
extern enum blk_eh_timer_return scsi_timeout(struct request *req);
extern int scsi_error_handler(void *host);
extern enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *cmd);
-extern void scsi_eh_wakeup(struct Scsi_Host *shost);
+extern void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy);
extern void scsi_eh_scmd_add(struct scsi_cmnd *);
void scsi_eh_ready_devs(struct Scsi_Host *shost,
struct list_head *work_q,
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index a95936b18f69..7ceb982040a5 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -330,6 +330,7 @@ enum storvsc_request_type {
*/
static int storvsc_ringbuffer_size = (128 * 1024);
+static int aligned_ringbuffer_size;
static u32 max_outstanding_req_per_channel;
static int storvsc_change_queue_depth(struct scsi_device *sdev, int queue_depth);
@@ -687,8 +688,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
new_sc->next_request_id_callback = storvsc_next_request_id;
ret = vmbus_open(new_sc,
- storvsc_ringbuffer_size,
- storvsc_ringbuffer_size,
+ aligned_ringbuffer_size,
+ aligned_ringbuffer_size,
(void *)&props,
sizeof(struct vmstorage_channel_properties),
storvsc_on_channel_callback, new_sc);
@@ -1973,7 +1974,7 @@ static int storvsc_probe(struct hv_device *device,
dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1);
stor_device->port_number = host->host_no;
- ret = storvsc_connect_to_vsp(device, storvsc_ringbuffer_size, is_fc);
+ ret = storvsc_connect_to_vsp(device, aligned_ringbuffer_size, is_fc);
if (ret)
goto err_out1;
@@ -2164,7 +2165,7 @@ static int storvsc_resume(struct hv_device *hv_dev)
{
int ret;
- ret = storvsc_connect_to_vsp(hv_dev, storvsc_ringbuffer_size,
+ ret = storvsc_connect_to_vsp(hv_dev, aligned_ringbuffer_size,
hv_dev_is_fc(hv_dev));
return ret;
}
@@ -2198,8 +2199,9 @@ static int __init storvsc_drv_init(void)
* the ring buffer indices) by the max request size (which is
* vmbus_channel_packet_multipage_buffer + struct vstor_packet + u64)
*/
+ aligned_ringbuffer_size = VMBUS_RING_SIZE(storvsc_ringbuffer_size);
max_outstanding_req_per_channel =
- ((storvsc_ringbuffer_size - PAGE_SIZE) /
+ ((aligned_ringbuffer_size - PAGE_SIZE) /
ALIGN(MAX_MULTIPAGE_BUFFER_PACKET +
sizeof(struct vstor_packet) + sizeof(u64),
sizeof(u64)));
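The idea behind this storvsc change: vmbus_open() needs ring sizes that account
for the ring-buffer header and page alignment, so the raw module parameter is
converted once via VMBUS_RING_SIZE() and the aligned value is reused everywhere,
including the max_outstanding_req_per_channel math above. As an assumption about
its shape (the macro lives in include/linux/hyperv.h; the sketch uses a renamed
EXAMPLE_ macro to make clear it is illustrative):

        /* Illustrative approximation of VMBUS_RING_SIZE() */
        #define EXAMPLE_VMBUS_RING_SIZE(payload_sz) \
                PAGE_ALIGN((payload_sz) + sizeof(struct hv_ring_buffer))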
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 4cf20be668a6..617eb892f4ad 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -188,8 +188,6 @@ static void virtscsi_vq_done(struct virtio_scsi *vscsi,
while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
fn(vscsi, buf);
- if (unlikely(virtqueue_is_broken(vq)))
- break;
} while (!virtqueue_enable_cb(vq));
spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
}
diff --git a/drivers/soc/apple/mailbox.c b/drivers/soc/apple/mailbox.c
index 780199bf351e..49a0955e82d6 100644
--- a/drivers/soc/apple/mailbox.c
+++ b/drivers/soc/apple/mailbox.c
@@ -296,14 +296,14 @@ struct apple_mbox *apple_mbox_get(struct device *dev, int index)
of_node_put(args.np);
if (!pdev)
- return ERR_PTR(EPROBE_DEFER);
+ return ERR_PTR(-EPROBE_DEFER);
mbox = platform_get_drvdata(pdev);
if (!mbox)
- return ERR_PTR(EPROBE_DEFER);
+ return ERR_PTR(-EPROBE_DEFER);
if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_CONSUMER))
- return ERR_PTR(ENODEV);
+ return ERR_PTR(-ENODEV);
return mbox;
}
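The mailbox fix restores the kernel's error-pointer convention, which only
recognises negative errno values. A minimal illustration using the standard
ERR_PTR/IS_ERR/PTR_ERR helpers:

        void *p = ERR_PTR(-EPROBE_DEFER);

        if (IS_ERR(p))                  /* true only for -MAX_ERRNO..-1 */
                return PTR_ERR(p);      /* -> -EPROBE_DEFER */

        /*
         * ERR_PTR(EPROBE_DEFER) with a positive value is not in the
         * error-pointer range, so IS_ERR() returns false and callers would
         * treat the bogus pointer as a valid mailbox.
         */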
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index cfc3b1ddbd22..6f12e4fb2e2e 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -136,14 +136,14 @@ struct sh_msiof_spi_priv {
/* SIFCTR */
#define SIFCTR_TFWM_MASK GENMASK(31, 29) /* Transmit FIFO Watermark */
-#define SIFCTR_TFWM_64 (0 << 29) /* Transfer Request when 64 empty stages */
-#define SIFCTR_TFWM_32 (1 << 29) /* Transfer Request when 32 empty stages */
-#define SIFCTR_TFWM_24 (2 << 29) /* Transfer Request when 24 empty stages */
-#define SIFCTR_TFWM_16 (3 << 29) /* Transfer Request when 16 empty stages */
-#define SIFCTR_TFWM_12 (4 << 29) /* Transfer Request when 12 empty stages */
-#define SIFCTR_TFWM_8 (5 << 29) /* Transfer Request when 8 empty stages */
-#define SIFCTR_TFWM_4 (6 << 29) /* Transfer Request when 4 empty stages */
-#define SIFCTR_TFWM_1 (7 << 29) /* Transfer Request when 1 empty stage */
+#define SIFCTR_TFWM_64 (0UL << 29) /* Transfer Request when 64 empty stages */
+#define SIFCTR_TFWM_32 (1UL << 29) /* Transfer Request when 32 empty stages */
+#define SIFCTR_TFWM_24 (2UL << 29) /* Transfer Request when 24 empty stages */
+#define SIFCTR_TFWM_16 (3UL << 29) /* Transfer Request when 16 empty stages */
+#define SIFCTR_TFWM_12 (4UL << 29) /* Transfer Request when 12 empty stages */
+#define SIFCTR_TFWM_8 (5UL << 29) /* Transfer Request when 8 empty stages */
+#define SIFCTR_TFWM_4 (6UL << 29) /* Transfer Request when 4 empty stages */
+#define SIFCTR_TFWM_1 (7UL << 29) /* Transfer Request when 1 empty stage */
#define SIFCTR_TFUA_MASK GENMASK(26, 20) /* Transmit FIFO Usable Area */
#define SIFCTR_TFUA_SHIFT 20
#define SIFCTR_TFUA(i) ((i) << SIFCTR_TFUA_SHIFT)
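Why the UL suffix matters here: with plain int constants, 7 << 29 overflows into
the sign bit (undefined behaviour), and the negative intermediate sign-extends
once the constant meets 64-bit or unsigned long arithmetic such as the GENMASK()
masks above. A small userspace-style illustration (a hypothetical standalone
program, not driver code):

        #include <stdio.h>

        int main(void)
        {
                /* Signed overflow; in practice yields a negative value. */
                printf("%lx\n", (unsigned long)(7 << 29)); /* ffffffffe0000000 on 64-bit */
                printf("%lx\n", 7UL << 29);                /* e0000000 */
                return 0;
        }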
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7cc5841577b2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 072ef6a66823..d4cee95af14c 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = erofs_allocpage(pagepool, rq->gfp);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 4a64a9c91dd3..b98872058abe 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb,
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -158,8 +158,12 @@ again:
strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
outsz -= strm->z.avail_out;
if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ kout = NULL;
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -211,8 +215,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -230,7 +237,7 @@ again:
break;
}
}
-
+failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
if (kout)
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 2dd14f99c1dc..6ca357d83cfa 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -148,7 +148,7 @@ again:
}
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -215,8 +215,11 @@ again:
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -258,8 +261,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -277,6 +283,7 @@ again:
break;
}
}
+failed:
if (no < nrpages_out && strm->buf.out)
kunmap(rq->out[no]);
if (ni < nrpages_in)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index bc12030393b2..5ff90026fd43 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -459,7 +459,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
inode->i_blkbits = EROFS_SB(sb)->blkszbits;
inode->i_private = ctx;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3d616dea55dc..36e638e8b53a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
} else {
const unsigned int gotten = sb->s_blocksize - *ofs;
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
err = -ENOMEM;
goto err_out;
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 5dea308764b4..e146d09151af 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
+ NULL, grp, GFP_KERNEL);
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 692c0c39be63..ff0aa72b0db3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -82,6 +82,9 @@ struct z_erofs_pcluster {
/* L: indicate several pageofs_outs or not */
bool multibases;
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
+
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
};
@@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
pcl->pclustersize = size;
@@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (i_blocksize(fe->inode) != PAGE_SIZE)
- return;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ while (!z_erofs_cache_release_folio(folio, 0))
cond_resched();
}
@@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb)
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct page *page, bool ra)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -1014,6 +1013,7 @@ repeat:
err = z_erofs_pcluster_begin(fe);
if (err)
goto out;
+ fe->pcl->besteffort |= !ra;
}
/*
@@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
+ .gfp = pcl->besteffort ?
+ GFP_KERNEL | __GFP_NOFAIL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
@@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
pcl->length = 0;
pcl->partial = true;
pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
- struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page, *oldpage;
+ struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- oldpage = READ_ONCE(zbv->page);
- if (!oldpage)
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
goto out_allocpage;
- justfound = (unsigned long)oldpage & 1UL;
- page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
-
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1455,6 @@ repeat:
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
- WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@@ -1459,9 +1465,9 @@ repeat:
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
- if (zbv->offset < 0)
- bvec->bv_offset = round_up(-zbv->offset, bs);
- bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@@ -1471,7 +1477,6 @@ repeat:
/* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1508,15 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1694,7 @@ submit_bio_retry:
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;
@@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page))
unlock_page(page);
else
- (void)z_erofs_do_read_page(f, page);
+ (void)z_erofs_do_read_page(f, page, !!rac);
put_page(page);
}
@@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
@@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ea5b8e57d904..671664fed307 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -340,7 +340,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
folio_unlock(folio);
- if (!folio_test_has_hwpoisoned(folio))
+ if (!folio_test_hwpoison(folio))
want = nr;
else {
/*
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 8eec84c651bf..cb3cda1390ad 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
* leafno - the number of the leaf to be updated.
* newval - the new value for the leaf.
*
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error
+ * RETURN VALUES: none
*/
static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
@@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
* get the buddy size (number of words covered) of
* the new value.
*/
-
- if ((newval - tp->dmt_budmin) > BUDMIN)
- return -EIO;
-
budsz = BUDSIZE(newval, tp->dmt_budmin);
/* try to join.
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 6b211522a13e..1c3dd0ad4660 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -281,44 +281,6 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
inode->i_gid = attr->gid;
}
-static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
-{
- struct eventfs_inode *ei_child;
-
- /* at most we have events/system/event */
- if (WARN_ON_ONCE(level > 3))
- return;
-
- ei->attr.gid = gid;
-
- if (ei->entry_attrs) {
- for (int i = 0; i < ei->nr_entries; i++) {
- ei->entry_attrs[i].gid = gid;
- }
- }
-
- /*
- * Only eventfs_inode with dentries are updated, make sure
- * all eventfs_inodes are updated. If one of the children
- * do not have a dentry, this function must traverse it.
- */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- if (!ei_child->dentry)
- update_gid(ei_child, gid, level + 1);
- }
-}
-
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
-{
- struct eventfs_inode *ei = dentry->d_fsdata;
- int idx;
-
- idx = srcu_read_lock(&eventfs_srcu);
- update_gid(ei, gid, 0);
- srcu_read_unlock(&eventfs_srcu, idx);
-}
-
/**
* create_file - create a file in the tracefs filesystem
* @name: the name of the file to create.
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 45397df9bb65..91c2bf0b91d9 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -82,7 +82,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 185924c56378..76458b6d53da 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -315,9 +315,9 @@ LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how)
LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb)
-LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock,
sockptr_t optval, sockptr_t optlen, unsigned int len)
-LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock,
struct sk_buff *skb, u32 *secid)
LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority)
LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 40d94411d492..dc7048824be8 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags)
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
+ _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
arch_calc_vm_flag_bits(flags);
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4ed33b127821..a497f189d988 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn)
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
ms = __pfn_to_section(pfn);
- rcu_read_lock();
+ rcu_read_lock_sched();
if (!valid_section(ms)) {
- rcu_read_unlock();
+ rcu_read_unlock_sched();
return 0;
}
/*
@@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn)
* the entire section-sized span.
*/
ret = early_section(ms) || pfn_section_valid(ms, pfn);
- rcu_read_unlock();
+ rcu_read_unlock_sched();
return ret;
}
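This pairs with the kmsan_virt_addr_valid() hunk earlier in the diff:
rcu_read_lock_sched() is, aside from lockdep bookkeeping, a preempt_disable(),
so a caller that must not re-enter the scheduler can provide the read-side
critical section itself. A hedged sketch of that usage (pfn_valid_nosched() is
a hypothetical helper name):

        static inline bool pfn_valid_nosched(unsigned long pfn)
        {
                bool ret;

                preempt_disable();              /* acts as the RCU-sched read side */
                ret = pfn_valid(pfn);
                preempt_enable_no_resched();    /* skip the reschedule check */

                return ret;
        }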
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 485bb0389b48..929e98c62965 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -537,7 +537,7 @@ retry:
}
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
+ ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 46439e3bcec4..b33c3861fbbb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- if (tracing_alloc_snapshot_instance(file->tr) != 0)
- return 0;
+ int ret = tracing_alloc_snapshot_instance(file->tr);
+
+ if (ret < 0)
+ return ret;
return register_trigger(glob, data, file);
}
diff --git a/lib/kunit/device.c b/lib/kunit/device.c
index f5371287b375..074c6dd2e36a 100644
--- a/lib/kunit/device.c
+++ b/lib/kunit/device.c
@@ -45,8 +45,8 @@ int kunit_bus_init(void)
int error;
kunit_bus_device = root_device_register("kunit");
- if (!kunit_bus_device)
- return -ENOMEM;
+ if (IS_ERR(kunit_bus_device))
+ return PTR_ERR(kunit_bus_device);
error = bus_register(&kunit_bus_type);
if (error)
diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c
index 717b9599036b..689fff2b2b10 100644
--- a/lib/kunit/executor.c
+++ b/lib/kunit/executor.c
@@ -146,6 +146,10 @@ void kunit_free_suite_set(struct kunit_suite_set suite_set)
kfree(suite_set.start);
}
+/*
+ * Filter and reallocate test suites. Must return the filtered test suites set
+ * allocated at a valid virtual address or NULL in case of error.
+ */
struct kunit_suite_set
kunit_filter_suites(const struct kunit_suite_set *suite_set,
const char *filter_glob,
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index c4259d910356..f7980ef236a3 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -720,7 +720,7 @@ static void kunit_device_cleanup_test(struct kunit *test)
long action_was_run = 0;
test_device = kunit_device_register(test, "my_device");
- KUNIT_ASSERT_NOT_NULL(test, test_device);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_device);
/* Add an action to verify cleanup. */
devm_add_action(test_device, test_dev_action, &action_was_run);
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index f95d2093a0aa..31a5a992e646 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -17,6 +17,7 @@
#include <linux/panic.h>
#include <linux/sched/debug.h>
#include <linux/sched.h>
+#include <linux/mm.h>
#include "debugfs.h"
#include "device-impl.h"
@@ -801,12 +802,19 @@ static void kunit_module_exit(struct module *mod)
};
const char *action = kunit_action();
+ /*
+ * Check if the start address is a valid virtual address to detect
+ * if the module load sequence has failed and the suite set has not
+ * been initialized and filtered.
+ */
+ if (!suite_set.start || !virt_addr_valid(suite_set.start))
+ return;
+
if (!action)
__kunit_test_suites_exit(mod->kunit_suites,
mod->num_kunit_suites);
- if (suite_set.start)
- kunit_free_suite_set(suite_set);
+ kunit_free_suite_set(suite_set);
}
static int kunit_module_notify(struct notifier_block *nb, unsigned long val,
@@ -816,12 +824,12 @@ static int kunit_module_notify(struct notifier_block *nb, unsigned long val,
switch (val) {
case MODULE_STATE_LIVE:
+ kunit_module_init(mod);
break;
case MODULE_STATE_GOING:
kunit_module_exit(mod);
break;
case MODULE_STATE_COMING:
- kunit_module_init(mod);
break;
case MODULE_STATE_UNFORMED:
break;
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index a0be5d05c7f0..5caa1f566553 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) "stackdepot: " fmt
+#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
@@ -21,8 +22,9 @@
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
-#include <linux/percpu.h>
#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -67,12 +69,28 @@ union handle_parts {
};
struct stack_record {
- struct list_head list; /* Links in hash table or freelist */
+ struct list_head hash_list; /* Links in the hash table */
u32 hash; /* Hash in hash table */
u32 size; /* Number of stored frames */
- union handle_parts handle;
+ union handle_parts handle; /* Constant after initialization */
refcount_t count;
- unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
+ union {
+ unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
+ struct {
+ /*
+ * An important invariant of the implementation is to
+ * only place a stack record onto the freelist iff its
+ * refcount is zero. Because stack records with a zero
+ * refcount are never considered as valid, it is safe to
+ * union @entries and freelist management state below.
+ * Conversely, as soon as an entry is off the freelist
+ * and its refcount becomes non-zero, the below must not
+ * be accessed until being placed back on the freelist.
+ */
+ struct list_head free_list; /* Links in the freelist */
+ unsigned long rcu_state; /* RCU cookie */
+ };
+ };
};
#define DEPOT_STACK_RECORD_SIZE \
@@ -112,8 +130,25 @@ static LIST_HEAD(free_stacks);
* yet allocated or if the limit on the number of pools is reached.
*/
static bool new_pool_required = true;
-/* Lock that protects the variables above. */
-static DEFINE_RWLOCK(pool_rwlock);
+/* The lock must be held when performing pool or freelist modifications. */
+static DEFINE_RAW_SPINLOCK(pool_lock);
+
+/* Statistics counters for debugfs. */
+enum depot_counter_id {
+ DEPOT_COUNTER_ALLOCS,
+ DEPOT_COUNTER_FREES,
+ DEPOT_COUNTER_INUSE,
+ DEPOT_COUNTER_FREELIST_SIZE,
+ DEPOT_COUNTER_COUNT,
+};
+static long counters[DEPOT_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [DEPOT_COUNTER_ALLOCS] = "allocations",
+ [DEPOT_COUNTER_FREES] = "frees",
+ [DEPOT_COUNTER_INUSE] = "in_use",
+ [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size",
+};
+static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
static int __init disable_stack_depot(char *str)
{
@@ -258,14 +293,15 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(stack_depot_init);
-/* Initializes a stack depol pool. */
+/*
+ * Initializes new stack depot @pool, release all its entries to the freelist,
+ * and update the list of pools.
+ */
static void depot_init_pool(void *pool)
{
int offset;
- lockdep_assert_held_write(&pool_rwlock);
-
- WARN_ON(!list_empty(&free_stacks));
+ lockdep_assert_held(&pool_lock);
/* Initialize handles and link stack records into the freelist. */
for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
@@ -276,18 +312,36 @@ static void depot_init_pool(void *pool)
stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
stack->handle.extra = 0;
- list_add(&stack->list, &free_stacks);
+ /*
+ * Stack traces of size 0 are never saved, and we can simply use
+ * the size field as an indicator if this is a new unused stack
+ * record in the freelist.
+ */
+ stack->size = 0;
+
+ INIT_LIST_HEAD(&stack->hash_list);
+ /*
+ * Add to the freelist front to prioritize never-used entries:
+ * required in case there are entries in the freelist, but their
+ * RCU cookie still belongs to the current RCU grace period
+ * (there can still be concurrent readers).
+ */
+ list_add(&stack->free_list, &free_stacks);
+ counters[DEPOT_COUNTER_FREELIST_SIZE]++;
}
/* Save reference to the pool to be used by depot_fetch_stack(). */
stack_pools[pools_num] = pool;
- pools_num++;
+
+ /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
+ WRITE_ONCE(pools_num, pools_num + 1);
+ ASSERT_EXCLUSIVE_WRITER(pools_num);
}
/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
- lockdep_assert_held_write(&pool_rwlock);
+ lockdep_assert_held(&pool_lock);
/*
* If a new pool is already saved or the maximum number of
@@ -310,17 +364,16 @@ static void depot_keep_new_pool(void **prealloc)
* number of pools is reached. In either case, take note that
* keeping another pool is not required.
*/
- new_pool_required = false;
+ WRITE_ONCE(new_pool_required, false);
}
-/* Updates references to the current and the next stack depot pools. */
-static bool depot_update_pools(void **prealloc)
+/*
+ * Try to initialize a new stack depot pool from either a previous or the
+ * current pre-allocation, and release all its entries to the freelist.
+ */
+static bool depot_try_init_pool(void **prealloc)
{
- lockdep_assert_held_write(&pool_rwlock);
-
- /* Check if we still have objects in the freelist. */
- if (!list_empty(&free_stacks))
- goto out_keep_prealloc;
+ lockdep_assert_held(&pool_lock);
/* Check if we have a new pool saved and use it. */
if (new_pool) {
@@ -329,10 +382,9 @@ static bool depot_update_pools(void **prealloc)
/* Take note that we might need a new new_pool. */
if (pools_num < DEPOT_MAX_POOLS)
- new_pool_required = true;
+ WRITE_ONCE(new_pool_required, true);
- /* Try keeping the preallocated memory for new_pool. */
- goto out_keep_prealloc;
+ return true;
}
/* Bail out if we reached the pool limit. */
@@ -349,12 +401,32 @@ static bool depot_update_pools(void **prealloc)
}
return false;
+}
+
+/* Try to find next free usable entry. */
+static struct stack_record *depot_pop_free(void)
+{
+ struct stack_record *stack;
+
+ lockdep_assert_held(&pool_lock);
+
+ if (list_empty(&free_stacks))
+ return NULL;
+
+ /*
+ * We maintain the invariant that the elements in front are least
+ * recently used, and are therefore more likely to be associated with an
+ * RCU grace period in the past. Consequently it is sufficient to only
+ * check the first entry.
+ */
+ stack = list_first_entry(&free_stacks, struct stack_record, free_list);
+ if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
+ return NULL;
+
+ list_del(&stack->free_list);
+ counters[DEPOT_COUNTER_FREELIST_SIZE]--;
-out_keep_prealloc:
- /* Keep the preallocated memory for a new pool if required. */
- if (*prealloc)
- depot_keep_new_pool(prealloc);
- return true;
+ return stack;
}
/* Allocates a new stack in a stack depot pool. */
@@ -363,19 +435,22 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
struct stack_record *stack;
- lockdep_assert_held_write(&pool_rwlock);
+ lockdep_assert_held(&pool_lock);
- /* Update current and new pools if required and possible. */
- if (!depot_update_pools(prealloc))
+ /* This should already be checked by public API entry points. */
+ if (WARN_ON_ONCE(!size))
return NULL;
/* Check if we have a stack record to save the stack trace. */
- if (list_empty(&free_stacks))
- return NULL;
-
- /* Get and unlink the first entry from the freelist. */
- stack = list_first_entry(&free_stacks, struct stack_record, list);
- list_del(&stack->list);
+ stack = depot_pop_free();
+ if (!stack) {
+ /* No usable entries on the freelist - try to refill the freelist. */
+ if (!depot_try_init_pool(prealloc))
+ return NULL;
+ stack = depot_pop_free();
+ if (WARN_ON(!stack))
+ return NULL;
+ }
/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
@@ -394,38 +469,80 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
*/
kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
+ counters[DEPOT_COUNTER_ALLOCS]++;
+ counters[DEPOT_COUNTER_INUSE]++;
return stack;
}
static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
+ const int pools_num_cached = READ_ONCE(pools_num);
union handle_parts parts = { .handle = handle };
void *pool;
size_t offset = parts.offset << DEPOT_STACK_ALIGN;
struct stack_record *stack;
- lockdep_assert_held(&pool_rwlock);
+ lockdep_assert_not_held(&pool_lock);
- if (parts.pool_index > pools_num) {
+ if (parts.pool_index > pools_num_cached) {
WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
- parts.pool_index, pools_num, handle);
+ parts.pool_index, pools_num_cached, handle);
return NULL;
}
pool = stack_pools[parts.pool_index];
- if (!pool)
+ if (WARN_ON(!pool))
return NULL;
stack = pool + offset;
+ if (WARN_ON(!refcount_read(&stack->count)))
+ return NULL;
+
return stack;
}
/* Links stack into the freelist. */
static void depot_free_stack(struct stack_record *stack)
{
- lockdep_assert_held_write(&pool_rwlock);
+ unsigned long flags;
+
+ lockdep_assert_not_held(&pool_lock);
- list_add(&stack->list, &free_stacks);
+ raw_spin_lock_irqsave(&pool_lock, flags);
+ printk_deferred_enter();
+
+ /*
+ * Remove the entry from the hash list. Concurrent list traversal may
+ * still observe the entry, but since the refcount is zero, this entry
+ * will no longer be considered as valid.
+ */
+ list_del_rcu(&stack->hash_list);
+
+ /*
+ * Due to being used from constrained contexts such as the allocators,
+ * NMI, or even RCU itself, stack depot cannot rely on primitives that
+ * would sleep (such as synchronize_rcu()) or recursively call into
+ * stack depot again (such as call_rcu()).
+ *
+ * Instead, get an RCU cookie, so that we can ensure this entry isn't
+ * moved onto another list until the next grace period, and concurrent
+ * RCU list traversal remains safe.
+ */
+ stack->rcu_state = get_state_synchronize_rcu();
+
+ /*
+ * Add the entry to the freelist tail, so that older entries are
+ * considered first - their RCU cookie is more likely to no longer be
+ * associated with the current grace period.
+ */
+ list_add_tail(&stack->free_list, &free_stacks);
+
+ counters[DEPOT_COUNTER_FREELIST_SIZE]++;
+ counters[DEPOT_COUNTER_FREES]++;
+ counters[DEPOT_COUNTER_INUSE]--;
+
+ printk_deferred_exit();
+ raw_spin_unlock_irqrestore(&pool_lock, flags);
}
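Because stack depot runs in contexts where synchronize_rcu() or call_rcu()
would be unsafe, freed records carry an RCU grace-period cookie and are only
reused once that cookie has expired. A reduced sketch of the pattern
(struct record, record_free() and record_pop_free() are illustrative names;
get_state_synchronize_rcu() and poll_state_synchronize_rcu() are the real RCU
APIs):

        struct record {
                struct list_head free_list;
                unsigned long rcu_state;        /* grace-period cookie */
        };

        static void record_free(struct record *r, struct list_head *freelist)
        {
                r->rcu_state = get_state_synchronize_rcu(); /* snapshot "now" */
                list_add_tail(&r->free_list, freelist);     /* oldest entries first */
        }

        static struct record *record_pop_free(struct list_head *freelist)
        {
                struct record *r;

                if (list_empty(freelist))
                        return NULL;

                r = list_first_entry(freelist, struct record, free_list);
                /* Reuse only once concurrent RCU readers are guaranteed gone. */
                if (!poll_state_synchronize_rcu(r->rcu_state))
                        return NULL;

                list_del(&r->free_list);
                return r;
        }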
/* Calculates the hash for a stack. */
@@ -453,22 +570,52 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
- unsigned long *entries, int size,
- u32 hash)
+ unsigned long *entries, int size,
+ u32 hash, depot_flags_t flags)
{
- struct list_head *pos;
- struct stack_record *found;
+ struct stack_record *stack, *ret = NULL;
+
+ /*
+ * Stack depot may be used from instrumentation that instruments RCU or
+ * tracing itself; use variant that does not call into RCU and cannot be
+ * traced.
+ *
+ * Note: Such use cases must take care when using refcounting to evict
+ * unused entries, because the stack record free-then-reuse code paths
+ * do call into RCU.
+ */
+ rcu_read_lock_sched_notrace();
- lockdep_assert_held(&pool_rwlock);
+ list_for_each_entry_rcu(stack, bucket, hash_list) {
+ if (stack->hash != hash || stack->size != size)
+ continue;
+
+ /*
+ * This may race with depot_free_stack() accessing the freelist
+ * management state unioned with @entries. The refcount is zero
+ * in that case and the below refcount_inc_not_zero() will fail.
+ */
+ if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
+ continue;
+
+ /*
+ * Try to increment refcount. If this succeeds, the stack record
+ * is valid and has not yet been freed.
+ *
+ * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
+ * to then call stack_depot_put() later, and we can assume that
+ * a stack record is never placed back on the freelist.
+ */
+ if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
+ continue;
- list_for_each(pos, bucket) {
- found = list_entry(pos, struct stack_record, list);
- if (found->hash == hash &&
- found->size == size &&
- !stackdepot_memcmp(entries, found->entries, size))
- return found;
+ ret = stack;
+ break;
}
- return NULL;
+
+ rcu_read_unlock_sched_notrace();
+
+ return ret;
}
depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
@@ -482,7 +629,6 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
struct page *page = NULL;
void *prealloc = NULL;
bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
- bool need_alloc = false;
unsigned long flags;
u32 hash;
@@ -505,31 +651,16 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
hash = hash_stack(entries, nr_entries);
bucket = &stack_table[hash & stack_hash_mask];
- read_lock_irqsave(&pool_rwlock, flags);
- printk_deferred_enter();
-
- /* Fast path: look the stack trace up without full locking. */
- found = find_stack(bucket, entries, nr_entries, hash);
- if (found) {
- if (depot_flags & STACK_DEPOT_FLAG_GET)
- refcount_inc(&found->count);
- printk_deferred_exit();
- read_unlock_irqrestore(&pool_rwlock, flags);
+ /* Fast path: look the stack trace up without locking. */
+ found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
+ if (found)
goto exit;
- }
-
- /* Take note if another stack pool needs to be allocated. */
- if (new_pool_required)
- need_alloc = true;
-
- printk_deferred_exit();
- read_unlock_irqrestore(&pool_rwlock, flags);
/*
* Allocate memory for a new pool if required now:
* we won't be able to do that under the lock.
*/
- if (unlikely(can_alloc && need_alloc)) {
+ if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
/*
* Zero out zone modifiers, as we don't have specific zone
* requirements. Keep the flags related to allocation in atomic
@@ -543,31 +674,36 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
prealloc = page_address(page);
}
- write_lock_irqsave(&pool_rwlock, flags);
+ raw_spin_lock_irqsave(&pool_lock, flags);
printk_deferred_enter();
- found = find_stack(bucket, entries, nr_entries, hash);
+ /* Try to find again, to avoid concurrently inserting duplicates. */
+ found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
if (!found) {
struct stack_record *new =
depot_alloc_stack(entries, nr_entries, hash, &prealloc);
if (new) {
- list_add(&new->list, bucket);
+ /*
+ * This releases the stack record into the bucket and
+ * makes it visible to readers in find_stack().
+ */
+ list_add_rcu(&new->hash_list, bucket);
found = new;
}
- } else {
- if (depot_flags & STACK_DEPOT_FLAG_GET)
- refcount_inc(&found->count);
+ }
+
+ if (prealloc) {
/*
- * Stack depot already contains this stack trace, but let's
- * keep the preallocated memory for future.
+ * Either stack depot already contains this stack trace, or
+ * depot_alloc_stack() did not consume the preallocated memory.
+ * Try to keep the preallocated memory for future use.
*/
- if (prealloc)
- depot_keep_new_pool(&prealloc);
+ depot_keep_new_pool(&prealloc);
}
printk_deferred_exit();
- write_unlock_irqrestore(&pool_rwlock, flags);
+ raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
if (prealloc) {
/* Stack depot didn't use this memory, free it. */
@@ -592,7 +728,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries)
{
struct stack_record *stack;
- unsigned long flags;
*entries = NULL;
/*
@@ -604,13 +739,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
if (!handle || stack_depot_disabled)
return 0;
- read_lock_irqsave(&pool_rwlock, flags);
- printk_deferred_enter();
-
stack = depot_fetch_stack(handle);
-
- printk_deferred_exit();
- read_unlock_irqrestore(&pool_rwlock, flags);
+ /*
+ * Should never be NULL, otherwise this is a use-after-put (or just a
+ * corrupt handle).
+ */
+ if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
+ return 0;
*entries = stack->entries;
return stack->size;
@@ -620,29 +755,20 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch);
void stack_depot_put(depot_stack_handle_t handle)
{
struct stack_record *stack;
- unsigned long flags;
if (!handle || stack_depot_disabled)
return;
- write_lock_irqsave(&pool_rwlock, flags);
- printk_deferred_enter();
-
stack = depot_fetch_stack(handle);
- if (WARN_ON(!stack))
- goto out;
-
- if (refcount_dec_and_test(&stack->count)) {
- /* Unlink stack from the hash table. */
- list_del(&stack->list);
+ /*
+ * Should always be able to find the stack record, otherwise this is an
+ * unbalanced put attempt (or corrupt handle).
+ */
+ if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
+ return;
- /* Free stack. */
+ if (refcount_dec_and_test(&stack->count))
depot_free_stack(stack);
- }
-
-out:
- printk_deferred_exit();
- write_unlock_irqrestore(&pool_rwlock, flags);
}
EXPORT_SYMBOL_GPL(stack_depot_put);
@@ -690,3 +816,30 @@ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+ /*
+ * data race ok: These are just statistics counters, and approximate
+ * statistics are ok for debugging.
+ */
+ seq_printf(seq, "pools: %d\n", data_race(pools_num));
+ for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
+ seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+static int depot_debugfs_init(void)
+{
+ struct dentry *dir;
+
+ if (stack_depot_disabled)
+ return 0;
+
+ dir = debugfs_create_dir("stackdepot", NULL);
+ debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
+ return 0;
+}
+late_initcall(depot_debugfs_init);
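Taken together, the new pairing is STACK_DEPOT_FLAG_GET on save and stack_depot_put() on release, with lookups now lock-free. A brief usage sketch under that assumption; the helper names below are illustrative only:

#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

#define STACK_DEPTH 16

static depot_stack_handle_t record_caller_stack(gfp_t gfp)
{
	unsigned long entries[STACK_DEPTH];
	unsigned int nr = stack_trace_save(entries, STACK_DEPTH, 1);

	/* GET pins the record; CAN_ALLOC lets the depot grow a new pool. */
	return stack_depot_save_flags(entries, nr, gfp,
				      STACK_DEPOT_FLAG_CAN_ALLOC |
				      STACK_DEPOT_FLAG_GET);
}

static void report_and_release(depot_stack_handle_t handle)
{
	unsigned long *entries;
	unsigned int nr = stack_depot_fetch(handle, &entries);

	stack_trace_print(entries, nr, 0);
	stack_depot_put(handle);	/* drops the reference taken above */
}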
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94ef5c02b459..94c958f7ebb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,6 +37,7 @@
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
+#include <linux/compat.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
{
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad, ret;
+ unsigned long len_pad, ret, off_sub;
+
+ if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
+ return 0;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (ret == addr)
return addr;
- ret += (off - ret) & (size - 1);
+ off_sub = (off - ret) & (size - 1);
+
+ if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
+ !off_sub)
+ return ret + size;
+
+ ret += off_sub;
return ret;
}
@@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(old_pmd);
folio = page_folio(page);
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
folio_set_referenced(folio);
folio_remove_rmap_pmd(folio, page, vma);
@@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
}
if (pmd_dirty(pmdval))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
else if (anon_exclusive)
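For the padding arithmetic in __thp_get_unmapped_area() above, a worked example with made-up numbers may help; the new topdown branch appears to pick the upper aligned candidate (ret + size) so the leftover padding stays below the mapping, preserving the top-down preference. All values are illustrative:

static unsigned long thp_pad_example(void)
{
	unsigned long size = 0x200000UL;          /* PMD_SIZE, 2 MiB */
	unsigned long off  = 0x1ff000UL;          /* file offset of the mapping */
	unsigned long ret  = 0x7f0000000000UL;    /* PMD-aligned area found */
	unsigned long off_sub = (off - ret) & (size - 1);   /* 0x1ff000 */

	/* 0x7f00001ff000: (ret - off) is now 2 MiB aligned, so file-backed
	 * faults can install PMD mappings directly. */
	return ret + off_sub;
}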
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e4c8735e7c85..46d8d02114cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2623,8 +2623,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
}
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Reclaims memory over the high limit. Called directly from
+ * try_charge() (context permitting), as well as from the userland
+ * return path where reclaim is always able to block.
*/
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
@@ -2644,6 +2645,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
retry_reclaim:
/*
+ * Bail if the task is already exiting. Unlike memory.max,
+ * memory.high enforcement isn't as strict, and there is no
+ * OOM killer involved, which means the excess could already
+ * be much bigger (and still growing) than it could for
+ * memory.max; the dying task could get stuck in fruitless
+ * reclaim for a long time, which isn't desirable.
+ */
+ if (task_is_dying())
+ goto out;
+
+ /*
* The allocating task should reclaim at least the batch size, but for
* subsequent retries we only want to do what's necessary to prevent oom
* or breaching resource isolation.
@@ -2693,6 +2705,9 @@ retry_reclaim:
}
/*
+ * Reclaim didn't manage to push usage below the limit; slow
+ * this allocating task down.
+ *
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2887,11 +2902,17 @@ done_restock:
}
} while ((memcg = parent_mem_cgroup(memcg)));
+ /*
+ * Reclaim is set up above to be called from the userland
+ * return path. But also attempt synchronous reclaim to avoid
+ * excessive overrun while the task is still inside the
+ * kernel. If this is successful, the return path will see it
+ * when it rechecks the overage and simply bails out.
+ */
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
- gfpflags_allow_blocking(gfp_mask)) {
+ gfpflags_allow_blocking(gfp_mask))
mem_cgroup_handle_over_high(gfp_mask);
- }
return 0;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f9b61f4a668..636280d04008 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
int count = page_count(p) - 1;
if (extra_pins)
- count -= 1;
+ count -= folio_nr_pages(page_folio(p));
if (count > 0) {
pr_err("%#lx: %s still referenced by %d users\n",
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..89bcae0b224d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
delay_rmap = 0;
if (!folio_test_anon(folio)) {
if (pte_dirty(ptent)) {
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
delay_rmap = 1;
force_flush = 1;
diff --git a/mm/mmap.c b/mm/mmap.c
index b78e83d351d2..d89770eaab6b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1825,15 +1825,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap() will clear pgoff, so match alignment.
*/
- pgoff = 0;
get_area = shmem_get_unmapped_area;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
get_area = thp_get_unmapped_area;
}
+ /* Always treat pgoff as zero for anonymous memory. */
+ if (!file)
+ pgoff = 0;
+
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..02147b61712b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
*/
dtc->wb_thresh = __wb_calc_thresh(dtc);
dtc->wb_bg_thresh = dtc->thresh ?
- div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+ div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
diff --git a/mm/readahead.c b/mm/readahead.c
index 23620c57c122..2648ec4f0494 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- mark = round_up(mark, 1UL << order);
+ mark = round_down(mark, 1UL << order);
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
@@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- expected = round_up(ra->start + ra->size - ra->async_size,
+ expected = round_down(ra->start + ra->size - ra->async_size,
1UL << order);
if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
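The round_up() to round_down() switch matters because the async-readahead mark must land on the start index of the folio that actually covers it. A small illustration, with made-up numbers:

#include <linux/math.h>

/* order-2 folios (4 pages each); the async mark falls at page index 5 */
static void mark_example(void)
{
	unsigned long mark = 5, order = 2;
	unsigned long up   = round_up(mark, 1UL << order);    /* 8 */
	unsigned long down = round_down(mark, 1UL << order);  /* 4 */

	/*
	 * The folio allocated for indices 4..7 covers the mark; rounding
	 * down (4) matches its start index so PG_readahead is set on it,
	 * while rounding up (8) points past this batch entirely.
	 */
}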
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 20e3b0d9cf7e..75fcf1f783bc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -472,6 +473,15 @@ retry:
goto out;
}
mmap_read_lock(dst_mm);
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later.
+ */
+ if (mmap_changing && atomic_read(mmap_changing)) {
+ err = -EAGAIN;
+ break;
+ }
dst_vma = NULL;
goto retry;
@@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -622,8 +633,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
+ len, mmap_changing, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
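From userspace, the new -EAGAIN path surfaces through the UFFDIO_* ioctls when mappings change under a non-cooperative event. A rough retry sketch, assuming uffd and the copy parameters are set up elsewhere; a real monitor would also drain pending userfault events before retrying:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <errno.h>

static int uffd_copy_retry(int uffd, unsigned long dst, unsigned long src,
			   unsigned long len)
{
	struct uffdio_copy copy = {
		.dst = dst, .src = src, .len = len, .mode = 0,
	};

	while (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
		if (errno != EAGAIN)
			return -1;
		if (copy.copy > 0) {
			/* Partial progress: advance and retry the rest. */
			copy.dst += copy.copy;
			copy.src += copy.copy;
			copy.len -= copy.copy;
		}
		copy.copy = 0;
	}
	return 0;
}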
diff --git a/security/security.c b/security/security.c
index 0144a98d3712..3aaad75c9ce8 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4255,7 +4255,19 @@ EXPORT_SYMBOL(security_inode_setsecctx);
*/
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
- return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen);
+ struct security_hook_list *hp;
+ int rc;
+
+ /*
+ * Only one module will provide a security context.
+ */
+ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecctx, list) {
+ rc = hp->hook.inode_getsecctx(inode, ctx, ctxlen);
+ if (rc != LSM_RET_DEFAULT(inode_getsecctx))
+ return rc;
+ }
+
+ return LSM_RET_DEFAULT(inode_getsecctx);
}
EXPORT_SYMBOL(security_inode_getsecctx);
@@ -4612,8 +4624,20 @@ EXPORT_SYMBOL(security_sock_rcv_skb);
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
sockptr_t optlen, unsigned int len)
{
- return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock,
- optval, optlen, len);
+ struct security_hook_list *hp;
+ int rc;
+
+ /*
+ * Only one module will provide a security context.
+ */
+ hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream,
+ list) {
+ rc = hp->hook.socket_getpeersec_stream(sock, optval, optlen,
+ len);
+ if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream))
+ return rc;
+ }
+ return LSM_RET_DEFAULT(socket_getpeersec_stream);
}
/**
@@ -4633,8 +4657,19 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
int security_socket_getpeersec_dgram(struct socket *sock,
struct sk_buff *skb, u32 *secid)
{
- return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock,
- skb, secid);
+ struct security_hook_list *hp;
+ int rc;
+
+ /*
+ * Only one module will provide a security context.
+ */
+ hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_dgram,
+ list) {
+ rc = hp->hook.socket_getpeersec_dgram(sock, skb, secid);
+ if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram))
+ return rc;
+ }
+ return LSM_RET_DEFAULT(socket_getpeersec_dgram);
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);
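With these loops, exactly one LSM is expected to answer; anything else falls through to the hook's default (-EOPNOTSUPP or -ENOPROTOOPT). A hedged caller-side sketch for the inode hook, treating the default as "no label available"; the helper name is made up:

static int emit_inode_label(struct inode *inode)
{
	void *ctx;
	u32 ctxlen;
	int err;

	err = security_inode_getsecctx(inode, &ctx, &ctxlen);
	if (err == -EOPNOTSUPP)
		return 0;	/* no LSM provides a security context */
	if (err)
		return err;

	/* ... hand ctx/ctxlen to the consumer here ... */

	security_release_secctx(ctx, ctxlen);
	return 0;
}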
diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index c8416c54b463..b1fd7362c2fe 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -42,17 +42,6 @@ function die() {
exit 1
}
-# save existing dmesg so we can detect new content
-function save_dmesg() {
- SAVED_DMESG=$(mktemp --tmpdir -t klp-dmesg-XXXXXX)
- dmesg > "$SAVED_DMESG"
-}
-
-# cleanup temporary dmesg file from save_dmesg()
-function cleanup_dmesg_file() {
- rm -f "$SAVED_DMESG"
-}
-
function push_config() {
DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \
awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}')
@@ -99,7 +88,6 @@ function set_ftrace_enabled() {
function cleanup() {
pop_config
- cleanup_dmesg_file
}
# setup_config - save the current config and set a script exit trap that
@@ -280,7 +268,15 @@ function set_pre_patch_ret {
function start_test {
local test="$1"
- save_dmesg
+ # Dump something unique into the dmesg log, then stash the entry
+ # in LAST_DMESG. The check_result() function will use it to
+ # find new kernel messages since the test started.
+ local last_dmesg_msg="livepatch kselftest timestamp: $(date --rfc-3339=ns)"
+ log "$last_dmesg_msg"
+ loop_until 'dmesg | grep -q "$last_dmesg_msg"' ||
+ die "buffer busy? can't find canary dmesg message: $last_dmesg_msg"
+ LAST_DMESG=$(dmesg | grep "$last_dmesg_msg")
+
echo -n "TEST: $test ... "
log "===== TEST: $test ====="
}
@@ -291,23 +287,24 @@ function check_result {
local expect="$*"
local result
- # Note: when comparing dmesg output, the kernel log timestamps
- # help differentiate repeated testing runs. Remove them with a
- # post-comparison sed filter.
-
- result=$(dmesg | comm --nocheck-order -13 "$SAVED_DMESG" - | \
+ # Test results include any new dmesg entry since LAST_DMESG, then:
+ # - include lines matching keywords
+ # - exclude lines matching keywords
+ # - filter out dmesg timestamp prefixes
+ result=$(dmesg | awk -v last_dmesg="$LAST_DMESG" 'p; $0 == last_dmesg { p=1 }' | \
grep -e 'livepatch:' -e 'test_klp' | \
grep -v '\(tainting\|taints\) kernel' | \
sed 's/^\[[ 0-9.]*\] //')
if [[ "$expect" == "$result" ]] ; then
echo "ok"
+ elif [[ "$result" == "" ]] ; then
+ echo -e "not ok\n\nbuffer overrun? can't find canary dmesg entry: $LAST_DMESG\n"
+ die "livepatch kselftest(s) failed"
else
echo -e "not ok\n\n$(diff -upr --label expected --label result <(echo "$expect") <(echo "$result"))\n"
die "livepatch kselftest(s) failed"
fi
-
- cleanup_dmesg_file
}
# check_sysfs_rights(modname, rel_path, expected_rights) - check sysfs
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index 0899019a7fcb..e14bdd4455f2 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Kselftest framework requirement - SKIP code is 4.
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index 380b691d3eb9..b748c48908d9 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -566,7 +566,7 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot,
if (map_ptr_orig == MAP_FAILED)
err(2, "initial mmap");
- if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
+ if (madvise(map_ptr, len, MADV_HUGEPAGE))
err(2, "MADV_HUGEPAGE");
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
index 193281560b61..86e8f2048a40 100644
--- a/tools/testing/selftests/mm/map_hugetlb.c
+++ b/tools/testing/selftests/mm/map_hugetlb.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
+#include "vm_util.h"
#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)
@@ -58,10 +59,16 @@ int main(int argc, char **argv)
{
void *addr;
int ret;
+ size_t hugepage_size;
size_t length = LENGTH;
int flags = FLAGS;
int shift = 0;
+ hugepage_size = default_huge_page_size();
+ /* munmap will fail if the length is not page aligned */
+ if (hugepage_size > length)
+ length = hugepage_size;
+
if (argc > 1)
length = atol(argv[1]) << 20;
if (argc > 2) {
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 1d4c1589c305..2f8b991f78cb 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -360,7 +360,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
char pattern_seed)
{
void *addr, *src_addr, *dest_addr, *dest_preamble_addr;
- unsigned long long i;
+ int d;
+ unsigned long long t;
struct timespec t_start = {0, 0}, t_end = {0, 0};
long long start_ns, end_ns, align_mask, ret, offset;
unsigned long long threshold;
@@ -378,8 +379,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
/* Set byte pattern for source block. */
srand(pattern_seed);
- for (i = 0; i < threshold; i++)
- memset((char *) src_addr + i, (char) rand(), 1);
+ for (t = 0; t < threshold; t++)
+ memset((char *) src_addr + t, (char) rand(), 1);
/* Mask to zero out lower bits of address for alignment */
align_mask = ~(c.dest_alignment - 1);
@@ -420,8 +421,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
/* Set byte pattern for the dest preamble block. */
srand(pattern_seed);
- for (i = 0; i < c.dest_preamble_size; i++)
- memset((char *) dest_preamble_addr + i, (char) rand(), 1);
+ for (d = 0; d < c.dest_preamble_size; d++)
+ memset((char *) dest_preamble_addr + d, (char) rand(), 1);
}
clock_gettime(CLOCK_MONOTONIC, &t_start);
@@ -437,14 +438,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
/* Verify byte pattern after remapping */
srand(pattern_seed);
- for (i = 0; i < threshold; i++) {
+ for (t = 0; t < threshold; t++) {
char c = (char) rand();
- if (((char *) dest_addr)[i] != c) {
+ if (((char *) dest_addr)[t] != c) {
ksft_print_msg("Data after remap doesn't match at offset %llu\n",
- i);
+ t);
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
- ((char *) dest_addr)[i] & 0xff);
+ ((char *) dest_addr)[t] & 0xff);
ret = -1;
goto clean_up_dest;
}
@@ -453,14 +454,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
/* Verify the dest preamble byte pattern after remapping */
if (c.dest_preamble_size) {
srand(pattern_seed);
- for (i = 0; i < c.dest_preamble_size; i++) {
+ for (d = 0; d < c.dest_preamble_size; d++) {
char c = (char) rand();
- if (((char *) dest_preamble_addr)[i] != c) {
+ if (((char *) dest_preamble_addr)[d] != c) {
ksft_print_msg("Preamble data after remap doesn't match at offset %d\n",
- i);
+ d);
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
- ((char *) dest_preamble_addr)[i] & 0xff);
+ ((char *) dest_preamble_addr)[d] & 0xff);
ret = -1;
goto clean_up_dest;
}
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index 45cae7cab27e..a0a75f302904 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -29,9 +29,15 @@ check_supported_x86_64()
# See man 1 gzip under '-f'.
local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+ local cpu_supports_pl5=$(awk '/^flags/ {if (/la57/) {print 0;}
+ else {print 1}; exit}' /proc/cpuinfo 2>/dev/null)
+
if [[ "${pg_table_levels}" -lt 5 ]]; then
echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
exit $ksft_skip
+ elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then
+ echo "$0: CPU does not have the necessary la57 flag to support page table level 5"
+ exit $ksft_skip
fi
}
diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh
index 70a02301f4c2..3d2d2eb9d6ff 100755
--- a/tools/testing/selftests/mm/write_hugetlb_memory.sh
+++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
set -e
diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
index 887542961968..2348d2c20d0a 100644
--- a/tools/testing/selftests/rseq/basic_percpu_ops_test.c
+++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
@@ -24,6 +24,11 @@ bool rseq_validate_cpu_id(void)
{
return rseq_mm_cid_available();
}
+static
+bool rseq_use_cpu_index(void)
+{
+ return false; /* Use mm_cid */
+}
#else
# define RSEQ_PERCPU RSEQ_PERCPU_CPU_ID
static
@@ -36,6 +41,11 @@ bool rseq_validate_cpu_id(void)
{
return rseq_current_cpu_raw() >= 0;
}
+static
+bool rseq_use_cpu_index(void)
+{
+ return true; /* Use cpu_id as index. */
+}
#endif
struct percpu_lock_entry {
@@ -274,7 +284,7 @@ void test_percpu_list(void)
/* Generate list entries for every usable cpu. */
sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
for (i = 0; i < CPU_SETSIZE; i++) {
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
for (j = 1; j <= 100; j++) {
struct percpu_list_node *node;
@@ -299,7 +309,7 @@ void test_percpu_list(void)
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_list_node *node;
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
while ((node = __percpu_list_pop(&list, i))) {
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 20403d58345c..2f37961240ca 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -288,6 +288,11 @@ bool rseq_validate_cpu_id(void)
{
return rseq_mm_cid_available();
}
+static
+bool rseq_use_cpu_index(void)
+{
+ return false; /* Use mm_cid */
+}
# ifdef TEST_MEMBARRIER
/*
* Membarrier does not currently support targeting a mm_cid, so
@@ -312,6 +317,11 @@ bool rseq_validate_cpu_id(void)
{
return rseq_current_cpu_raw() >= 0;
}
+static
+bool rseq_use_cpu_index(void)
+{
+ return true; /* Use cpu_id as index. */
+}
# ifdef TEST_MEMBARRIER
static
int rseq_membarrier_expedited(int cpu)
@@ -715,7 +725,7 @@ void test_percpu_list(void)
/* Generate list entries for every usable cpu. */
sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
for (i = 0; i < CPU_SETSIZE; i++) {
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
for (j = 1; j <= 100; j++) {
struct percpu_list_node *node;
@@ -752,7 +762,7 @@ void test_percpu_list(void)
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_list_node *node;
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
while ((node = __percpu_list_pop(&list, i))) {
@@ -902,7 +912,7 @@ void test_percpu_buffer(void)
/* Generate list entries for every usable cpu. */
sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
for (i = 0; i < CPU_SETSIZE; i++) {
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
/* Worse-case is every item in same CPU. */
buffer.c[i].array =
@@ -952,7 +962,7 @@ void test_percpu_buffer(void)
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_buffer_node *node;
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
while ((node = __percpu_buffer_pop(&buffer, i))) {
@@ -1113,7 +1123,7 @@ void test_percpu_memcpy_buffer(void)
/* Generate list entries for every usable cpu. */
sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
for (i = 0; i < CPU_SETSIZE; i++) {
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
/* Worse-case is every item in same CPU. */
buffer.c[i].array =
@@ -1160,7 +1170,7 @@ void test_percpu_memcpy_buffer(void)
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_memcpy_buffer_node item;
- if (!CPU_ISSET(i, &allowed_cpus))
+ if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
continue;
while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
index 5b5c9d558dee..97b86980b768 100644
--- a/tools/testing/selftests/seccomp/seccomp_benchmark.c
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -38,10 +38,10 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples)
i *= 1000000000ULL;
i += finish.tv_nsec - start.tv_nsec;
- printf("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n",
- finish.tv_sec, finish.tv_nsec,
- start.tv_sec, start.tv_nsec,
- i, (double)i / 1000000000.0);
+ ksft_print_msg("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n",
+ finish.tv_sec, finish.tv_nsec,
+ start.tv_sec, start.tv_nsec,
+ i, (double)i / 1000000000.0);
return i;
}
@@ -53,7 +53,7 @@ unsigned long long calibrate(void)
pid_t pid, ret;
int seconds = 15;
- printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds);
+ ksft_print_msg("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds);
samples = 0;
pid = getpid();
@@ -98,24 +98,36 @@ bool le(int i_one, int i_two)
}
long compare(const char *name_one, const char *name_eval, const char *name_two,
- unsigned long long one, bool (*eval)(int, int), unsigned long long two)
+ unsigned long long one, bool (*eval)(int, int), unsigned long long two,
+ bool skip)
{
bool good;
- printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
- (long long)one, name_eval, (long long)two);
+ if (skip) {
+ ksft_test_result_skip("%s %s %s\n", name_one, name_eval,
+ name_two);
+ return 0;
+ }
+
+ ksft_print_msg("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
+ (long long)one, name_eval, (long long)two);
if (one > INT_MAX) {
- printf("Miscalculation! Measurement went negative: %lld\n", (long long)one);
- return 1;
+ ksft_print_msg("Miscalculation! Measurement went negative: %lld\n", (long long)one);
+ good = false;
+ goto out;
}
if (two > INT_MAX) {
- printf("Miscalculation! Measurement went negative: %lld\n", (long long)two);
- return 1;
+ ksft_print_msg("Miscalculation! Measurement went negative: %lld\n", (long long)two);
+ good = false;
+ goto out;
}
good = eval(one, two);
printf("%s\n", good ? "✔️" : "❌");
+out:
+ ksft_test_result(good, "%s %s %s\n", name_one, name_eval, name_two);
+
return good ? 0 : 1;
}
@@ -142,15 +154,22 @@ int main(int argc, char *argv[])
unsigned long long samples, calc;
unsigned long long native, filter1, filter2, bitmap1, bitmap2;
unsigned long long entry, per_filter1, per_filter2;
+ bool skip = false;
setbuf(stdout, NULL);
- printf("Running on:\n");
+ ksft_print_header();
+ ksft_set_plan(7);
+
+ ksft_print_msg("Running on:\n");
+ ksft_print_msg("");
system("uname -a");
- printf("Current BPF sysctl settings:\n");
+ ksft_print_msg("Current BPF sysctl settings:\n");
/* Avoid using "sysctl" which may not be installed. */
+ ksft_print_msg("");
system("grep -H . /proc/sys/net/core/bpf_jit_enable");
+ ksft_print_msg("");
system("grep -H . /proc/sys/net/core/bpf_jit_harden");
if (argc > 1)
@@ -158,11 +177,11 @@ int main(int argc, char *argv[])
else
samples = calibrate();
- printf("Benchmarking %llu syscalls...\n", samples);
+ ksft_print_msg("Benchmarking %llu syscalls...\n", samples);
/* Native call */
native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid native: %llu ns\n", native);
+ ksft_print_msg("getpid native: %llu ns\n", native);
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
assert(ret == 0);
@@ -172,35 +191,37 @@ int main(int argc, char *argv[])
assert(ret == 0);
bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
+ ksft_print_msg("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
/* Second filter resulting in a bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
+ ksft_print_msg("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
/* Third filter, can no longer be converted to bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
assert(ret == 0);
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
+ ksft_print_msg("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
/* Fourth filter, can not be converted to bitmap because of filter 3 */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
+ ksft_print_msg("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
/* Estimations */
#define ESTIMATE(fmt, var, what) do { \
var = (what); \
- printf("Estimated " fmt ": %llu ns\n", var); \
- if (var > INT_MAX) \
- goto more_samples; \
+ ksft_print_msg("Estimated " fmt ": %llu ns\n", var); \
+ if (var > INT_MAX) { \
+ skip = true; \
+ ret |= 1; \
+ } \
} while (0)
ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
@@ -218,31 +239,34 @@ int main(int argc, char *argv[])
ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
(filter2 - native - entry) / 4);
- printf("Expectations:\n");
- ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
- bits = compare("native", "≤", "1 filter", native, le, filter1);
+ ksft_print_msg("Expectations:\n");
+ ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1,
+ skip);
+ bits = compare("native", "≤", "1 filter", native, le, filter1,
+ skip);
if (bits)
- goto more_samples;
+ skip = true;
ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
- per_filter1, approx, per_filter2);
+ per_filter1, approx, per_filter2, skip);
bits = compare("1 bitmapped", "≈", "2 bitmapped",
- bitmap1 - native, approx, bitmap2 - native);
+ bitmap1 - native, approx, bitmap2 - native, skip);
if (bits) {
- printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
- goto out;
+ ksft_print_msg("Skipping constant action bitmap expectations: they appear unsupported.\n");
+ skip = true;
}
- ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
- ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
+ ret |= compare("entry", "≈", "1 bitmapped", entry, approx,
+ bitmap1 - native, skip);
+ ret |= compare("entry", "≈", "2 bitmapped", entry, approx,
+ bitmap2 - native, skip);
ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
- entry + (per_filter1 * 4) + native, approx, filter2);
- if (ret == 0)
- goto out;
+ entry + (per_filter1 * 4) + native, approx, filter2,
+ skip);
-more_samples:
- printf("Saw unexpected benchmark result. Try running again with more samples?\n");
-out:
- return 0;
+ if (ret)
+ ksft_print_msg("Saw unexpected benchmark result. Try running again with more samples?\n");
+
+ ksft_finished();
}
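The benchmark now reports through the kselftest TAP helpers instead of bare printf(). A minimal, self-contained sketch of that reporting pattern, assuming the file lives in a tools/testing/selftests/<suite>/ directory and an illustrative plan of two results:

#include "../kselftest.h"

int main(void)
{
	ksft_print_header();
	ksft_set_plan(2);

	ksft_print_msg("informational lines are TAP comments\n");
	ksft_test_result(2 + 2 == 4, "arithmetic holds\n");
	ksft_test_result_skip("skipped on this machine\n");

	ksft_finished();	/* exits with the pass/fail/skip summary */
}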