diff options
61 files changed, 774 insertions, 386 deletions
@@ -2161,6 +2161,19 @@ N: Mike Kravetz D: Maintenance and development of the hugetlb subsystem +N: Seth Jennings +D: Creation and maintenance of zswap + +N: Dan Streetman +D: Maintenance and development of zswap +D: Creation and maintenance of the zpool API + +N: Vitaly Wool +D: Maintenance and development of zswap + N: Andreas S. Krebs D: CYPRESS CY82C693 chipset IDE, Digital's PC-Alpha 164SX boards diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index a9efab50eed8..22955d56b379 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -671,8 +671,23 @@ Testing Static Functions ------------------------ If we do not want to expose functions or variables for testing, one option is to -conditionally ``#include`` the test file at the end of your .c file. For -example: +conditionally export the used symbol. For example: + +.. code-block:: c + + /* In my_file.c */ + + VISIBLE_IF_KUNIT int do_interesting_thing(); + EXPORT_SYMBOL_IF_KUNIT(do_interesting_thing); + + /* In my_file.h */ + + #if IS_ENABLED(CONFIG_KUNIT) + int do_interesting_thing(void); + #endif + +Alternatively, you could conditionally ``#include`` the test file at the end of +your .c file. For example: .. code-block:: c diff --git a/MAINTAINERS b/MAINTAINERS index 8999497011a2..61117c3afa80 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10283,7 +10283,7 @@ F: drivers/scsi/ibmvscsi/ibmvscsi* F: include/scsi/viosrp.h IBM Power Virtual SCSI Device Target Driver -M: Michael Cyr <[email protected]> +M: Tyrel Datwyler <[email protected]> S: Supported @@ -11725,6 +11725,7 @@ F: fs/smb/server/ KERNEL UNIT TESTING FRAMEWORK (KUnit) M: Brendan Higgins <[email protected]> M: David Gow <[email protected]> +R: Rae Moar <[email protected]> S: Maintained @@ -12903,6 +12904,8 @@ M: Alejandro Colomar <[email protected]> S: Maintained W: http://www.kernel.org/doc/man-pages +T: git git://git.kernel.org/pub/scm/docs/man-pages/man-pages.git +T: git git://www.alejandro-colomar.es/src/alx/linux/man-pages/man-pages.git MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP) M: Jeremy Kerr <[email protected]> @@ -24341,13 +24344,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git F: Documentation/filesystems/zonefs.rst F: fs/zonefs/ -ZPOOL COMPRESSED PAGE STORAGE API -M: Dan Streetman <[email protected]> -S: Maintained -F: include/linux/zpool.h -F: mm/zpool.c - ZR36067 VIDEO FOR LINUX DRIVER M: Corentin Labbe <[email protected]> @@ -24399,7 +24395,9 @@ M: Nhat Pham <[email protected]> S: Maintained F: Documentation/admin-guide/mm/zswap.rst +F: include/linux/zpool.h F: include/linux/zswap.h +F: mm/zpool.c F: mm/zswap.c THE REST diff --git a/arch/Kconfig b/arch/Kconfig index c91917b50873..a5af0edd3eb8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -673,6 +673,7 @@ config SHADOW_CALL_STACK bool "Shadow Call Stack" depends on ARCH_SUPPORTS_SHADOW_CALL_STACK depends on DYNAMIC_FTRACE_WITH_ARGS || DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER + depends on MMU help This option enables the compiler's Shadow Call Stack, which uses a shadow stack to protect function return addresses from diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index 8fa6ac0e2d76..d91b37f5b4bb 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr) { unsigned long x = (unsigned long)addr; unsigned long y = x - __START_KERNEL_map; + bool ret; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { @@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr) return false; } - return pfn_valid(x >> PAGE_SHIFT); + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; } #endif /* !MODULE */ diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index a148ff1f0872..a4f6884416a0 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -4545,6 +4545,7 @@ struct caam_hash_alg { struct list_head entry; struct device *dev; int alg_type; + bool is_hmac; struct ahash_alg ahash_alg; }; @@ -4571,7 +4572,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) ctx->dev = caam_hash->dev; - if (alg->setkey) { + if (caam_hash->is_hmac) { ctx->adata.key_dma = dma_map_single_attrs(ctx->dev, ctx->key, ARRAY_SIZE(ctx->key), DMA_TO_DEVICE, @@ -4611,7 +4612,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) * For keyed hash algorithms shared descriptors * will be created later in setkey() callback */ - return alg->setkey ? 0 : ahash_set_sh_desc(ahash); + return caam_hash->is_hmac ? 0 : ahash_set_sh_desc(ahash); } static void caam_hash_cra_exit(struct crypto_tfm *tfm) @@ -4646,12 +4647,14 @@ static struct caam_hash_alg *caam_hash_alloc(struct device *dev, template->hmac_name); snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", template->hmac_driver_name); + t_alg->is_hmac = true; } else { snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s", template->name); snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", template->driver_name); t_alg->ahash_alg.setkey = NULL; + t_alg->is_hmac = false; } alg->cra_module = THIS_MODULE; alg->cra_init = caam_hash_cra_init; diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 290c8500c247..fdd724228c2f 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -1753,6 +1753,7 @@ static struct caam_hash_template driver_hash[] = { struct caam_hash_alg { struct list_head entry; int alg_type; + bool is_hmac; struct ahash_engine_alg ahash_alg; }; @@ -1804,7 +1805,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) } else { if (priv->era >= 6) { ctx->dir = DMA_BIDIRECTIONAL; - ctx->key_dir = alg->setkey ? DMA_TO_DEVICE : DMA_NONE; + ctx->key_dir = caam_hash->is_hmac ? DMA_TO_DEVICE : DMA_NONE; } else { ctx->dir = DMA_TO_DEVICE; ctx->key_dir = DMA_NONE; @@ -1862,7 +1863,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) * For keyed hash algorithms shared descriptors * will be created later in setkey() callback */ - return alg->setkey ? 0 : ahash_set_sh_desc(ahash); + return caam_hash->is_hmac ? 0 : ahash_set_sh_desc(ahash); } static void caam_hash_cra_exit(struct crypto_tfm *tfm) @@ -1915,12 +1916,14 @@ caam_hash_alloc(struct caam_hash_template *template, template->hmac_name); snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", template->hmac_driver_name); + t_alg->is_hmac = true; } else { snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s", template->name); snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", template->driver_name); halg->setkey = NULL; + t_alg->is_hmac = false; } alg->cra_module = THIS_MODULE; alg->cra_init = caam_hash_cra_init; diff --git a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c index 479062aa5e6b..94a0ebb03d8c 100644 --- a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c +++ b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c @@ -463,6 +463,7 @@ void adf_init_hw_data_4xxx(struct adf_hw_device_data *hw_data, u32 dev_id) hw_data->fw_name = ADF_402XX_FW; hw_data->fw_mmp_name = ADF_402XX_MMP; hw_data->uof_get_name = uof_get_name_402xx; + hw_data->get_ena_thd_mask = get_ena_thd_mask; break; case ADF_401XX_PCI_DEVICE_ID: hw_data->fw_name = ADF_4XXX_FW; diff --git a/drivers/regulator/max5970-regulator.c b/drivers/regulator/max5970-regulator.c index bc88a40a88d4..830a1c4cd705 100644 --- a/drivers/regulator/max5970-regulator.c +++ b/drivers/regulator/max5970-regulator.c @@ -392,7 +392,7 @@ static int max597x_regmap_read_clear(struct regmap *map, unsigned int reg, return ret; if (*val) - return regmap_write(map, reg, *val); + return regmap_write(map, reg, 0); return 0; } diff --git a/drivers/regulator/pwm-regulator.c b/drivers/regulator/pwm-regulator.c index 698c420e0869..60cfcd741c2a 100644 --- a/drivers/regulator/pwm-regulator.c +++ b/drivers/regulator/pwm-regulator.c @@ -157,7 +157,17 @@ static int pwm_regulator_get_voltage(struct regulator_dev *rdev) pwm_get_state(drvdata->pwm, &pstate); + if (!pstate.enabled) { + if (pstate.polarity == PWM_POLARITY_INVERSED) + pstate.duty_cycle = pstate.period; + else + pstate.duty_cycle = 0; + } + voltage = pwm_get_relative_duty_cycle(&pstate, duty_unit); + if (voltage < min(max_uV_duty, min_uV_duty) || + voltage > max(max_uV_duty, min_uV_duty)) + return -ENOTRECOVERABLE; /* * The dutycycle for min_uV might be greater than the one for max_uV. @@ -313,6 +323,32 @@ static int pwm_regulator_init_continuous(struct platform_device *pdev, return 0; } +static int pwm_regulator_init_boot_on(struct platform_device *pdev, + struct pwm_regulator_data *drvdata, + const struct regulator_init_data *init_data) +{ + struct pwm_state pstate; + + if (!init_data->constraints.boot_on || drvdata->enb_gpio) + return 0; + + pwm_get_state(drvdata->pwm, &pstate); + if (pstate.enabled) + return 0; + + /* + * Update the duty cycle so the output does not change + * when the regulator core enables the regulator (and + * thus the PWM channel). + */ + if (pstate.polarity == PWM_POLARITY_INVERSED) + pstate.duty_cycle = pstate.period; + else + pstate.duty_cycle = 0; + + return pwm_apply_might_sleep(drvdata->pwm, &pstate); +} + static int pwm_regulator_probe(struct platform_device *pdev) { const struct regulator_init_data *init_data; @@ -372,6 +408,13 @@ static int pwm_regulator_probe(struct platform_device *pdev) if (ret) return ret; + ret = pwm_regulator_init_boot_on(pdev, drvdata, init_data); + if (ret) { + dev_err(&pdev->dev, "Failed to apply boot_on settings: %d\n", + ret); + return ret; + } + regulator = devm_regulator_register(&pdev->dev, &drvdata->desc, &config); if (IS_ERR(regulator)) { diff --git a/drivers/regulator/ti-abb-regulator.c b/drivers/regulator/ti-abb-regulator.c index f48214e2c3b4..04133510e5af 100644 --- a/drivers/regulator/ti-abb-regulator.c +++ b/drivers/regulator/ti-abb-regulator.c @@ -726,9 +726,25 @@ static int ti_abb_probe(struct platform_device *pdev) return PTR_ERR(abb->setup_reg); } - abb->int_base = devm_platform_ioremap_resource_byname(pdev, "int-address"); - if (IS_ERR(abb->int_base)) - return PTR_ERR(abb->int_base); + pname = "int-address"; + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, pname); + if (!res) { + dev_err(dev, "Missing '%s' IO resource\n", pname); + return -ENODEV; + } + /* + * The MPU interrupt status register (PRM_IRQSTATUS_MPU) is + * shared between regulator-abb-{ivahd,dspeve,gpu} driver + * instances. Therefore use devm_ioremap() rather than + * devm_platform_ioremap_resource_byname() to avoid busy + * resource region conflicts. + */ + abb->int_base = devm_ioremap(dev, res->start, + resource_size(res)); + if (!abb->int_base) { + dev_err(dev, "Unable to map '%s'\n", pname); + return -ENOMEM; + } /* Map Optional resources */ pname = "efuse-address"; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 2a50fda3a628..625fd547ee60 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -371,7 +371,6 @@ static u16 initio_se2_rd(unsigned long base, u8 addr) */ static void initio_se2_wr(unsigned long base, u8 addr, u16 val) { - u8 rb; u8 instr; int i; @@ -400,7 +399,7 @@ static void initio_se2_wr(unsigned long base, u8 addr, u16 val) udelay(30); outb(SE2CS, base + TUL_NVRAM); /* -CLK */ udelay(30); - if ((rb = inb(base + TUL_NVRAM)) & SE2DI) + if (inb(base + TUL_NVRAM) & SE2DI) break; /* write complete */ } outb(0, base + TUL_NVRAM); /* -CS */ diff --git a/drivers/scsi/isci/request.c b/drivers/scsi/isci/request.c index 71f711cb0628..355a0bc0828e 100644 --- a/drivers/scsi/isci/request.c +++ b/drivers/scsi/isci/request.c @@ -3387,7 +3387,7 @@ static enum sci_status isci_io_request_build(struct isci_host *ihost, return SCI_FAILURE; } - return SCI_SUCCESS; + return status; } static struct isci_request *isci_request_from_tag(struct isci_host *ihost, u16 tag) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 79da4b1c1df0..4f455884fdc4 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -61,11 +61,11 @@ static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *, struct scsi_cmnd *); -void scsi_eh_wakeup(struct Scsi_Host *shost) +void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy) { lockdep_assert_held(shost->host_lock); - if (scsi_host_busy(shost) == shost->host_failed) { + if (busy == shost->host_failed) { trace_scsi_eh_wakeup(shost); wake_up_process(shost->ehandler); SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost, @@ -88,7 +88,7 @@ void scsi_schedule_eh(struct Scsi_Host *shost) if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 || scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) { shost->host_eh_scheduled++; - scsi_eh_wakeup(shost); + scsi_eh_wakeup(shost, scsi_host_busy(shost)); } spin_unlock_irqrestore(shost->host_lock, flags); @@ -286,7 +286,7 @@ static void scsi_eh_inc_host_failed(struct rcu_head *head) spin_lock_irqsave(shost->host_lock, flags); shost->host_failed++; - scsi_eh_wakeup(shost); + scsi_eh_wakeup(shost, scsi_host_busy(shost)); spin_unlock_irqrestore(shost->host_lock, flags); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index cf3864f72093..1fb80eae9a63 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -280,7 +280,7 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) if (unlikely(scsi_host_in_recovery(shost))) { spin_lock_irqsave(shost->host_lock, flags); if (shost->host_failed || shost->host_eh_scheduled) - scsi_eh_wakeup(shost); + scsi_eh_wakeup(shost, scsi_host_busy(shost)); spin_unlock_irqrestore(shost->host_lock, flags); } rcu_read_unlock(); diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 3f0dfb97db6b..1fbfe1b52c9f 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -92,7 +92,7 @@ extern void scmd_eh_abort_handler(struct work_struct *work); extern enum blk_eh_timer_return scsi_timeout(struct request *req); extern int scsi_error_handler(void *host); extern enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *cmd); -extern void scsi_eh_wakeup(struct Scsi_Host *shost); +extern void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy); extern void scsi_eh_scmd_add(struct scsi_cmnd *); void scsi_eh_ready_devs(struct Scsi_Host *shost, struct list_head *work_q, diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index a95936b18f69..7ceb982040a5 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -330,6 +330,7 @@ enum storvsc_request_type { */ static int storvsc_ringbuffer_size = (128 * 1024); +static int aligned_ringbuffer_size; static u32 max_outstanding_req_per_channel; static int storvsc_change_queue_depth(struct scsi_device *sdev, int queue_depth); @@ -687,8 +688,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) new_sc->next_request_id_callback = storvsc_next_request_id; ret = vmbus_open(new_sc, - storvsc_ringbuffer_size, - storvsc_ringbuffer_size, + aligned_ringbuffer_size, + aligned_ringbuffer_size, (void *)&props, sizeof(struct vmstorage_channel_properties), storvsc_on_channel_callback, new_sc); @@ -1973,7 +1974,7 @@ static int storvsc_probe(struct hv_device *device, dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1); stor_device->port_number = host->host_no; - ret = storvsc_connect_to_vsp(device, storvsc_ringbuffer_size, is_fc); + ret = storvsc_connect_to_vsp(device, aligned_ringbuffer_size, is_fc); if (ret) goto err_out1; @@ -2164,7 +2165,7 @@ static int storvsc_resume(struct hv_device *hv_dev) { int ret; - ret = storvsc_connect_to_vsp(hv_dev, storvsc_ringbuffer_size, + ret = storvsc_connect_to_vsp(hv_dev, aligned_ringbuffer_size, hv_dev_is_fc(hv_dev)); return ret; } @@ -2198,8 +2199,9 @@ static int __init storvsc_drv_init(void) * the ring buffer indices) by the max request size (which is * vmbus_channel_packet_multipage_buffer + struct vstor_packet + u64) */ + aligned_ringbuffer_size = VMBUS_RING_SIZE(storvsc_ringbuffer_size); max_outstanding_req_per_channel = - ((storvsc_ringbuffer_size - PAGE_SIZE) / + ((aligned_ringbuffer_size - PAGE_SIZE) / ALIGN(MAX_MULTIPAGE_BUFFER_PACKET + sizeof(struct vstor_packet) + sizeof(u64), sizeof(u64))); diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 4cf20be668a6..617eb892f4ad 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -188,8 +188,6 @@ static void virtscsi_vq_done(struct virtio_scsi *vscsi, while ((buf = virtqueue_get_buf(vq, &len)) != NULL) fn(vscsi, buf); - if (unlikely(virtqueue_is_broken(vq))) - break; } while (!virtqueue_enable_cb(vq)); spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } diff --git a/drivers/soc/apple/mailbox.c b/drivers/soc/apple/mailbox.c index 780199bf351e..49a0955e82d6 100644 --- a/drivers/soc/apple/mailbox.c +++ b/drivers/soc/apple/mailbox.c @@ -296,14 +296,14 @@ struct apple_mbox *apple_mbox_get(struct device *dev, int index) of_node_put(args.np); if (!pdev) - return ERR_PTR(EPROBE_DEFER); + return ERR_PTR(-EPROBE_DEFER); mbox = platform_get_drvdata(pdev); if (!mbox) - return ERR_PTR(EPROBE_DEFER); + return ERR_PTR(-EPROBE_DEFER); if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_CONSUMER)) - return ERR_PTR(ENODEV); + return ERR_PTR(-ENODEV); return mbox; } diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index cfc3b1ddbd22..6f12e4fb2e2e 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -136,14 +136,14 @@ struct sh_msiof_spi_priv { /* SIFCTR */ #define SIFCTR_TFWM_MASK GENMASK(31, 29) /* Transmit FIFO Watermark */ -#define SIFCTR_TFWM_64 (0 << 29) /* Transfer Request when 64 empty stages */ -#define SIFCTR_TFWM_32 (1 << 29) /* Transfer Request when 32 empty stages */ -#define SIFCTR_TFWM_24 (2 << 29) /* Transfer Request when 24 empty stages */ -#define SIFCTR_TFWM_16 (3 << 29) /* Transfer Request when 16 empty stages */ -#define SIFCTR_TFWM_12 (4 << 29) /* Transfer Request when 12 empty stages */ -#define SIFCTR_TFWM_8 (5 << 29) /* Transfer Request when 8 empty stages */ -#define SIFCTR_TFWM_4 (6 << 29) /* Transfer Request when 4 empty stages */ -#define SIFCTR_TFWM_1 (7 << 29) /* Transfer Request when 1 empty stage */ +#define SIFCTR_TFWM_64 (0UL << 29) /* Transfer Request when 64 empty stages */ +#define SIFCTR_TFWM_32 (1UL << 29) /* Transfer Request when 32 empty stages */ +#define SIFCTR_TFWM_24 (2UL << 29) /* Transfer Request when 24 empty stages */ +#define SIFCTR_TFWM_16 (3UL << 29) /* Transfer Request when 16 empty stages */ +#define SIFCTR_TFWM_12 (4UL << 29) /* Transfer Request when 12 empty stages */ +#define SIFCTR_TFWM_8 (5UL << 29) /* Transfer Request when 8 empty stages */ +#define SIFCTR_TFWM_4 (6UL << 29) /* Transfer Request when 4 empty stages */ +#define SIFCTR_TFWM_1 (7UL << 29) /* Transfer Request when 1 empty stage */ #define SIFCTR_TFUA_MASK GENMASK(26, 20) /* Transmit FIFO Usable Area */ #define SIFCTR_TFUA_SHIFT 20 #define SIFCTR_TFUA(i) ((i) << SIFCTR_TFUA_SHIFT) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..7cc5841577b2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 072ef6a66823..d4cee95af14c 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = erofs_allocpage(pagepool, rq->gfp); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 4a64a9c91dd3..b98872058abe 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb, } int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -158,8 +158,12 @@ again: strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); outsz -= strm->z.avail_out; if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -211,8 +215,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -230,7 +237,7 @@ again: break; } } - +failed: if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; if (kout) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 2dd14f99c1dc..6ca357d83cfa 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -148,7 +148,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -215,8 +215,11 @@ again: PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -258,8 +261,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -277,6 +283,7 @@ again: break; } } +failed: if (no < nrpages_out && strm->buf.out) kunmap(rq->out[no]); if (ni < nrpages_in) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index bc12030393b2..5ff90026fd43 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -459,7 +459,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3d616dea55dc..36e638e8b53a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, } else { const unsigned int gotten = sb->s_blocksize - *ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5dea308764b4..e146d09151af 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); + NULL, grp, GFP_KERNEL); if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 692c0c39be63..ff0aa72b0db3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,6 +82,9 @@ struct z_erofs_pcluster { /* L: indicate several pageofs_outs or not */ bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; @@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_KERNEL); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclustersize = size; @@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (i_blocksize(fe->inode) != PAGE_SIZE) - return; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); - if (page) { t = (void *)((unsigned long)page | 1); newpage = NULL; @@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); t = (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->obj.lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = t; + spin_unlock(&pcl->obj.lockref.lock); continue; + } + spin_unlock(&pcl->obj.lockref.lock); if (page) put_page(page); @@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb) set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; return 0; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->obj.lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->obj.lockref.lock); return 0; + } + spin_unlock(&pcl->obj.lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, bool ra) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1014,6 +1013,7 @@ repeat: err = z_erofs_pcluster_begin(fe); if (err) goto out; + fe->pcl->besteffort |= !ra; } /* @@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, + .gfp = pcl->besteffort ? + GFP_KERNEL | __GFP_NOFAIL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ @@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *page, *oldpage; + struct page *page; int justfound, bs = i_blocksize(f->inode); /* Except for inplace pages, the entire page can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - oldpage = READ_ONCE(zbv->page); - if (!oldpage) + spin_lock(&pcl->obj.lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + page = zbv.page; + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); + if (!page) goto out_allocpage; - justfound = (unsigned long)oldpage & 1UL; - page = (struct page *)((unsigned long)oldpage & ~1UL); bvec->bv_page = page; - DBG_BUGON(z_erofs_is_shortlived_page(page)); /* * Handle preallocated cached pages. We tried to allocate such pages @@ -1448,7 +1455,6 @@ repeat: */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { set_page_private(page, 0); - WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } @@ -1459,9 +1465,9 @@ repeat: * therefore it is impossible for `mapping` to be NULL. */ if (mapping && mapping != mc) { - if (zbv->offset < 0) - bvec->bv_offset = round_up(-zbv->offset, bs); - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } @@ -1471,7 +1477,6 @@ repeat: /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(zbv->page, page); /* * The cached page is still available but without a valid * `->private` pcluster hint. Let's reconnect them. @@ -1503,11 +1508,15 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + spin_lock(&pcl->obj.lockref.lock); + if (pcl->compressed_bvecs[nr].page) { erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); bvec->bv_page = page; out_tocache: if (!tocache || bs != PAGE_SIZE || @@ -1685,6 +1694,7 @@ submit_bio_retry: if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) goto submit_bio_retry; @@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) unlock_page(page); else - (void)z_erofs_do_read_page(f, page); + (void)z_erofs_do_read_page(f, page, !!rac); put_page(page); } @@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); @@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..671664fed307 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -340,7 +340,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { folio_unlock(folio); - if (!folio_test_has_hwpoisoned(folio)) + if (!folio_test_hwpoison(folio)) want = nr; else { /* diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bf..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 6b211522a13e..1c3dd0ad4660 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -281,44 +281,6 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, inode->i_gid = attr->gid; } -static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level) -{ - struct eventfs_inode *ei_child; - - /* at most we have events/system/event */ - if (WARN_ON_ONCE(level > 3)) - return; - - ei->attr.gid = gid; - - if (ei->entry_attrs) { - for (int i = 0; i < ei->nr_entries; i++) { - ei->entry_attrs[i].gid = gid; - } - } - - /* - * Only eventfs_inode with dentries are updated, make sure - * all eventfs_inodes are updated. If one of the children - * do not have a dentry, this function must traverse it. - */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - if (!ei_child->dentry) - update_gid(ei_child, gid, level + 1); - } -} - -void eventfs_update_gid(struct dentry *dentry, kgid_t gid) -{ - struct eventfs_inode *ei = dentry->d_fsdata; - int idx; - - idx = srcu_read_lock(&eventfs_srcu); - update_gid(ei, gid, 0); - srcu_read_unlock(&eventfs_srcu, idx); -} - /** * create_file - create a file in the tracefs filesystem * @name: the name of the file to create. diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 45397df9bb65..91c2bf0b91d9 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -82,7 +82,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_update_gid(struct dentry *dentry, kgid_t gid); void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 185924c56378..76458b6d53da 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -315,9 +315,9 @@ LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) -LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, +LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) -LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, +LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) diff --git a/include/linux/mman.h b/include/linux/mman.h index 40d94411d492..dc7048824be8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | + _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | arch_calc_vm_flag_bits(flags); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4ed33b127821..a497f189d988 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); - rcu_read_lock(); + rcu_read_lock_sched(); if (!valid_section(ms)) { - rcu_read_unlock(); + rcu_read_unlock_sched(); return 0; } /* @@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn) * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); - rcu_read_unlock(); + rcu_read_unlock_sched(); return ret; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 485bb0389b48..929e98c62965 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -537,7 +537,7 @@ retry: } } - ret = __replace_page(vma, vaddr, old_page, new_page); + ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); if (new_page) put_page(new_page); put_old: diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 46439e3bcec4..b33c3861fbbb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - if (tracing_alloc_snapshot_instance(file->tr) != 0) - return 0; + int ret = tracing_alloc_snapshot_instance(file->tr); + + if (ret < 0) + return ret; return register_trigger(glob, data, file); } diff --git a/lib/kunit/device.c b/lib/kunit/device.c index f5371287b375..074c6dd2e36a 100644 --- a/lib/kunit/device.c +++ b/lib/kunit/device.c @@ -45,8 +45,8 @@ int kunit_bus_init(void) int error; kunit_bus_device = root_device_register("kunit"); - if (!kunit_bus_device) - return -ENOMEM; + if (IS_ERR(kunit_bus_device)) + return PTR_ERR(kunit_bus_device); error = bus_register(&kunit_bus_type); if (error) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 717b9599036b..689fff2b2b10 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -146,6 +146,10 @@ void kunit_free_suite_set(struct kunit_suite_set suite_set) kfree(suite_set.start); } +/* + * Filter and reallocate test suites. Must return the filtered test suites set + * allocated at a valid virtual address or NULL in case of error. + */ struct kunit_suite_set kunit_filter_suites(const struct kunit_suite_set *suite_set, const char *filter_glob, diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index c4259d910356..f7980ef236a3 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -720,7 +720,7 @@ static void kunit_device_cleanup_test(struct kunit *test) long action_was_run = 0; test_device = kunit_device_register(test, "my_device"); - KUNIT_ASSERT_NOT_NULL(test, test_device); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_device); /* Add an action to verify cleanup. */ devm_add_action(test_device, test_dev_action, &action_was_run); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index f95d2093a0aa..31a5a992e646 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -17,6 +17,7 @@ #include <linux/panic.h> #include <linux/sched/debug.h> #include <linux/sched.h> +#include <linux/mm.h> #include "debugfs.h" #include "device-impl.h" @@ -801,12 +802,19 @@ static void kunit_module_exit(struct module *mod) }; const char *action = kunit_action(); + /* + * Check if the start address is a valid virtual address to detect + * if the module load sequence has failed and the suite set has not + * been initialized and filtered. + */ + if (!suite_set.start || !virt_addr_valid(suite_set.start)) + return; + if (!action) __kunit_test_suites_exit(mod->kunit_suites, mod->num_kunit_suites); - if (suite_set.start) - kunit_free_suite_set(suite_set); + kunit_free_suite_set(suite_set); } static int kunit_module_notify(struct notifier_block *nb, unsigned long val, @@ -816,12 +824,12 @@ static int kunit_module_notify(struct notifier_block *nb, unsigned long val, switch (val) { case MODULE_STATE_LIVE: + kunit_module_init(mod); break; case MODULE_STATE_GOING: kunit_module_exit(mod); break; case MODULE_STATE_COMING: - kunit_module_init(mod); break; case MODULE_STATE_UNFORMED: break; diff --git a/lib/stackdepot.c b/lib/stackdepot.c index a0be5d05c7f0..5caa1f566553 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) "stackdepot: " fmt +#include <linux/debugfs.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <linux/kernel.h> @@ -21,8 +22,9 @@ #include <linux/list.h> #include <linux/mm.h> #include <linux/mutex.h> -#include <linux/percpu.h> #include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -67,12 +69,28 @@ union handle_parts { }; struct stack_record { - struct list_head list; /* Links in hash table or freelist */ + struct list_head hash_list; /* Links in the hash table */ u32 hash; /* Hash in hash table */ u32 size; /* Number of stored frames */ - union handle_parts handle; + union handle_parts handle; /* Constant after initialization */ refcount_t count; - unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + union { + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + struct { + /* + * An important invariant of the implementation is to + * only place a stack record onto the freelist iff its + * refcount is zero. Because stack records with a zero + * refcount are never considered as valid, it is safe to + * union @entries and freelist management state below. + * Conversely, as soon as an entry is off the freelist + * and its refcount becomes non-zero, the below must not + * be accessed until being placed back on the freelist. + */ + struct list_head free_list; /* Links in the freelist */ + unsigned long rcu_state; /* RCU cookie */ + }; + }; }; #define DEPOT_STACK_RECORD_SIZE \ @@ -112,8 +130,25 @@ static LIST_HEAD(free_stacks); * yet allocated or if the limit on the number of pools is reached. */ static bool new_pool_required = true; -/* Lock that protects the variables above. */ -static DEFINE_RWLOCK(pool_rwlock); +/* The lock must be held when performing pool or freelist modifications. */ +static DEFINE_RAW_SPINLOCK(pool_lock); + +/* Statistics counters for debugfs. */ +enum depot_counter_id { + DEPOT_COUNTER_ALLOCS, + DEPOT_COUNTER_FREES, + DEPOT_COUNTER_INUSE, + DEPOT_COUNTER_FREELIST_SIZE, + DEPOT_COUNTER_COUNT, +}; +static long counters[DEPOT_COUNTER_COUNT]; +static const char *const counter_names[] = { + [DEPOT_COUNTER_ALLOCS] = "allocations", + [DEPOT_COUNTER_FREES] = "frees", + [DEPOT_COUNTER_INUSE] = "in_use", + [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size", +}; +static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT); static int __init disable_stack_depot(char *str) { @@ -258,14 +293,15 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -/* Initializes a stack depol pool. */ +/* + * Initializes new stack depot @pool, release all its entries to the freelist, + * and update the list of pools. + */ static void depot_init_pool(void *pool) { int offset; - lockdep_assert_held_write(&pool_rwlock); - - WARN_ON(!list_empty(&free_stacks)); + lockdep_assert_held(&pool_lock); /* Initialize handles and link stack records into the freelist. */ for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; @@ -276,18 +312,36 @@ static void depot_init_pool(void *pool) stack->handle.offset = offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; - list_add(&stack->list, &free_stacks); + /* + * Stack traces of size 0 are never saved, and we can simply use + * the size field as an indicator if this is a new unused stack + * record in the freelist. + */ + stack->size = 0; + + INIT_LIST_HEAD(&stack->hash_list); + /* + * Add to the freelist front to prioritize never-used entries: + * required in case there are entries in the freelist, but their + * RCU cookie still belongs to the current RCU grace period + * (there can still be concurrent readers). + */ + list_add(&stack->free_list, &free_stacks); + counters[DEPOT_COUNTER_FREELIST_SIZE]++; } /* Save reference to the pool to be used by depot_fetch_stack(). */ stack_pools[pools_num] = pool; - pools_num++; + + /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */ + WRITE_ONCE(pools_num, pools_num + 1); + ASSERT_EXCLUSIVE_WRITER(pools_num); } /* Keeps the preallocated memory to be used for a new stack depot pool. */ static void depot_keep_new_pool(void **prealloc) { - lockdep_assert_held_write(&pool_rwlock); + lockdep_assert_held(&pool_lock); /* * If a new pool is already saved or the maximum number of @@ -310,17 +364,16 @@ static void depot_keep_new_pool(void **prealloc) * number of pools is reached. In either case, take note that * keeping another pool is not required. */ - new_pool_required = false; + WRITE_ONCE(new_pool_required, false); } -/* Updates references to the current and the next stack depot pools. */ -static bool depot_update_pools(void **prealloc) +/* + * Try to initialize a new stack depot pool from either a previous or the + * current pre-allocation, and release all its entries to the freelist. + */ +static bool depot_try_init_pool(void **prealloc) { - lockdep_assert_held_write(&pool_rwlock); - - /* Check if we still have objects in the freelist. */ - if (!list_empty(&free_stacks)) - goto out_keep_prealloc; + lockdep_assert_held(&pool_lock); /* Check if we have a new pool saved and use it. */ if (new_pool) { @@ -329,10 +382,9 @@ static bool depot_update_pools(void **prealloc) /* Take note that we might need a new new_pool. */ if (pools_num < DEPOT_MAX_POOLS) - new_pool_required = true; + WRITE_ONCE(new_pool_required, true); - /* Try keeping the preallocated memory for new_pool. */ - goto out_keep_prealloc; + return true; } /* Bail out if we reached the pool limit. */ @@ -349,12 +401,32 @@ static bool depot_update_pools(void **prealloc) } return false; +} + +/* Try to find next free usable entry. */ +static struct stack_record *depot_pop_free(void) +{ + struct stack_record *stack; + + lockdep_assert_held(&pool_lock); + + if (list_empty(&free_stacks)) + return NULL; + + /* + * We maintain the invariant that the elements in front are least + * recently used, and are therefore more likely to be associated with an + * RCU grace period in the past. Consequently it is sufficient to only + * check the first entry. + */ + stack = list_first_entry(&free_stacks, struct stack_record, free_list); + if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state)) + return NULL; + + list_del(&stack->free_list); + counters[DEPOT_COUNTER_FREELIST_SIZE]--; -out_keep_prealloc: - /* Keep the preallocated memory for a new pool if required. */ - if (*prealloc) - depot_keep_new_pool(prealloc); - return true; + return stack; } /* Allocates a new stack in a stack depot pool. */ @@ -363,19 +435,22 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; - lockdep_assert_held_write(&pool_rwlock); + lockdep_assert_held(&pool_lock); - /* Update current and new pools if required and possible. */ - if (!depot_update_pools(prealloc)) + /* This should already be checked by public API entry points. */ + if (WARN_ON_ONCE(!size)) return NULL; /* Check if we have a stack record to save the stack trace. */ - if (list_empty(&free_stacks)) - return NULL; - - /* Get and unlink the first entry from the freelist. */ - stack = list_first_entry(&free_stacks, struct stack_record, list); - list_del(&stack->list); + stack = depot_pop_free(); + if (!stack) { + /* No usable entries on the freelist - try to refill the freelist. */ + if (!depot_try_init_pool(prealloc)) + return NULL; + stack = depot_pop_free(); + if (WARN_ON(!stack)) + return NULL; + } /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ if (size > CONFIG_STACKDEPOT_MAX_FRAMES) @@ -394,38 +469,80 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) */ kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE); + counters[DEPOT_COUNTER_ALLOCS]++; + counters[DEPOT_COUNTER_INUSE]++; return stack; } static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) { + const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; - lockdep_assert_held(&pool_rwlock); + lockdep_assert_not_held(&pool_lock); - if (parts.pool_index > pools_num) { + if (parts.pool_index > pools_num_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pools_num, handle); + parts.pool_index, pools_num_cached, handle); return NULL; } pool = stack_pools[parts.pool_index]; - if (!pool) + if (WARN_ON(!pool)) return NULL; stack = pool + offset; + if (WARN_ON(!refcount_read(&stack->count))) + return NULL; + return stack; } /* Links stack into the freelist. */ static void depot_free_stack(struct stack_record *stack) { - lockdep_assert_held_write(&pool_rwlock); + unsigned long flags; + + lockdep_assert_not_held(&pool_lock); - list_add(&stack->list, &free_stacks); + raw_spin_lock_irqsave(&pool_lock, flags); + printk_deferred_enter(); + + /* + * Remove the entry from the hash list. Concurrent list traversal may + * still observe the entry, but since the refcount is zero, this entry + * will no longer be considered as valid. + */ + list_del_rcu(&stack->hash_list); + + /* + * Due to being used from constrained contexts such as the allocators, + * NMI, or even RCU itself, stack depot cannot rely on primitives that + * would sleep (such as synchronize_rcu()) or recursively call into + * stack depot again (such as call_rcu()). + * + * Instead, get an RCU cookie, so that we can ensure this entry isn't + * moved onto another list until the next grace period, and concurrent + * RCU list traversal remains safe. + */ + stack->rcu_state = get_state_synchronize_rcu(); + + /* + * Add the entry to the freelist tail, so that older entries are + * considered first - their RCU cookie is more likely to no longer be + * associated with the current grace period. + */ + list_add_tail(&stack->free_list, &free_stacks); + + counters[DEPOT_COUNTER_FREELIST_SIZE]++; + counters[DEPOT_COUNTER_FREES]++; + counters[DEPOT_COUNTER_INUSE]--; + + printk_deferred_exit(); + raw_spin_unlock_irqrestore(&pool_lock, flags); } /* Calculates the hash for a stack. */ @@ -453,22 +570,52 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, /* Finds a stack in a bucket of the hash table. */ static inline struct stack_record *find_stack(struct list_head *bucket, - unsigned long *entries, int size, - u32 hash) + unsigned long *entries, int size, + u32 hash, depot_flags_t flags) { - struct list_head *pos; - struct stack_record *found; + struct stack_record *stack, *ret = NULL; + + /* + * Stack depot may be used from instrumentation that instruments RCU or + * tracing itself; use variant that does not call into RCU and cannot be + * traced. + * + * Note: Such use cases must take care when using refcounting to evict + * unused entries, because the stack record free-then-reuse code paths + * do call into RCU. + */ + rcu_read_lock_sched_notrace(); - lockdep_assert_held(&pool_rwlock); + list_for_each_entry_rcu(stack, bucket, hash_list) { + if (stack->hash != hash || stack->size != size) + continue; + + /* + * This may race with depot_free_stack() accessing the freelist + * management state unioned with @entries. The refcount is zero + * in that case and the below refcount_inc_not_zero() will fail. + */ + if (data_race(stackdepot_memcmp(entries, stack->entries, size))) + continue; + + /* + * Try to increment refcount. If this succeeds, the stack record + * is valid and has not yet been freed. + * + * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior + * to then call stack_depot_put() later, and we can assume that + * a stack record is never placed back on the freelist. + */ + if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count)) + continue; - list_for_each(pos, bucket) { - found = list_entry(pos, struct stack_record, list); - if (found->hash == hash && - found->size == size && - !stackdepot_memcmp(entries, found->entries, size)) - return found; + ret = stack; + break; } - return NULL; + + rcu_read_unlock_sched_notrace(); + + return ret; } depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, @@ -482,7 +629,6 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, struct page *page = NULL; void *prealloc = NULL; bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; - bool need_alloc = false; unsigned long flags; u32 hash; @@ -505,31 +651,16 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, hash = hash_stack(entries, nr_entries); bucket = &stack_table[hash & stack_hash_mask]; - read_lock_irqsave(&pool_rwlock, flags); - printk_deferred_enter(); - - /* Fast path: look the stack trace up without full locking. */ - found = find_stack(bucket, entries, nr_entries, hash); - if (found) { - if (depot_flags & STACK_DEPOT_FLAG_GET) - refcount_inc(&found->count); - printk_deferred_exit(); - read_unlock_irqrestore(&pool_rwlock, flags); + /* Fast path: look the stack trace up without locking. */ + found = find_stack(bucket, entries, nr_entries, hash, depot_flags); + if (found) goto exit; - } - - /* Take note if another stack pool needs to be allocated. */ - if (new_pool_required) - need_alloc = true; - - printk_deferred_exit(); - read_unlock_irqrestore(&pool_rwlock, flags); /* * Allocate memory for a new pool if required now: * we won't be able to do that under the lock. */ - if (unlikely(can_alloc && need_alloc)) { + if (unlikely(can_alloc && READ_ONCE(new_pool_required))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -543,31 +674,36 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, prealloc = page_address(page); } - write_lock_irqsave(&pool_rwlock, flags); + raw_spin_lock_irqsave(&pool_lock, flags); printk_deferred_enter(); - found = find_stack(bucket, entries, nr_entries, hash); + /* Try to find again, to avoid concurrently inserting duplicates. */ + found = find_stack(bucket, entries, nr_entries, hash, depot_flags); if (!found) { struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { - list_add(&new->list, bucket); + /* + * This releases the stack record into the bucket and + * makes it visible to readers in find_stack(). + */ + list_add_rcu(&new->hash_list, bucket); found = new; } - } else { - if (depot_flags & STACK_DEPOT_FLAG_GET) - refcount_inc(&found->count); + } + + if (prealloc) { /* - * Stack depot already contains this stack trace, but let's - * keep the preallocated memory for future. + * Either stack depot already contains this stack trace, or + * depot_alloc_stack() did not consume the preallocated memory. + * Try to keep the preallocated memory for future. */ - if (prealloc) - depot_keep_new_pool(&prealloc); + depot_keep_new_pool(&prealloc); } printk_deferred_exit(); - write_unlock_irqrestore(&pool_rwlock, flags); + raw_spin_unlock_irqrestore(&pool_lock, flags); exit: if (prealloc) { /* Stack depot didn't use this memory, free it. */ @@ -592,7 +728,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { struct stack_record *stack; - unsigned long flags; *entries = NULL; /* @@ -604,13 +739,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle || stack_depot_disabled) return 0; - read_lock_irqsave(&pool_rwlock, flags); - printk_deferred_enter(); - stack = depot_fetch_stack(handle); - - printk_deferred_exit(); - read_unlock_irqrestore(&pool_rwlock, flags); + /* + * Should never be NULL, otherwise this is a use-after-put (or just a + * corrupt handle). + */ + if (WARN(!stack, "corrupt handle or use after stack_depot_put()")) + return 0; *entries = stack->entries; return stack->size; @@ -620,29 +755,20 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); void stack_depot_put(depot_stack_handle_t handle) { struct stack_record *stack; - unsigned long flags; if (!handle || stack_depot_disabled) return; - write_lock_irqsave(&pool_rwlock, flags); - printk_deferred_enter(); - stack = depot_fetch_stack(handle); - if (WARN_ON(!stack)) - goto out; - - if (refcount_dec_and_test(&stack->count)) { - /* Unlink stack from the hash table. */ - list_del(&stack->list); + /* + * Should always be able to find the stack record, otherwise this is an + * unbalanced put attempt (or corrupt handle). + */ + if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()")) + return; - /* Free stack. */ + if (refcount_dec_and_test(&stack->count)) depot_free_stack(stack); - } - -out: - printk_deferred_exit(); - write_unlock_irqrestore(&pool_rwlock, flags); } EXPORT_SYMBOL_GPL(stack_depot_put); @@ -690,3 +816,30 @@ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) return parts.extra; } EXPORT_SYMBOL(stack_depot_get_extra_bits); + +static int stats_show(struct seq_file *seq, void *v) +{ + /* + * data race ok: These are just statistics counters, and approximate + * statistics are ok for debugging. + */ + seq_printf(seq, "pools: %d\n", data_race(pools_num)); + for (int i = 0; i < DEPOT_COUNTER_COUNT; i++) + seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i])); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(stats); + +static int depot_debugfs_init(void) +{ + struct dentry *dir; + + if (stack_depot_disabled) + return 0; + + dir = debugfs_create_dir("stackdepot", NULL); + debugfs_create_file("stats", 0444, dir, NULL, &stats_fops); + return 0; +} +late_initcall(depot_debugfs_init); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 94ef5c02b459..94c958f7ebb5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,7 @@ #include <linux/page_owner.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> +#include <linux/compat.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, { loff_t off_end = off + len; loff_t off_align = round_up(off, size); - unsigned long len_pad, ret; + unsigned long len_pad, ret, off_sub; + + if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + return 0; if (off_end <= off_align || (off_end - off_align) < size) return 0; @@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (ret == addr) return addr; - ret += (off - ret) & (size - 1); + off_sub = (off - ret) & (size - 1); + + if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && + !off_sub) + return ret + size; + + ret += off_sub; return ret; } @@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(old_pmd); folio = page_folio(page); if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) - folio_set_dirty(folio); + folio_mark_dirty(folio); if (!folio_test_referenced(folio) && pmd_young(old_pmd)) folio_set_referenced(folio); folio_remove_rmap_pmd(folio, page, vma); @@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, } if (pmd_dirty(pmdval)) - folio_set_dirty(folio); + folio_mark_dirty(folio); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); else if (anon_exclusive) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4c8735e7c85..46d8d02114cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2623,8 +2623,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, } /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Reclaims memory over the high limit. Called directly from + * try_charge() (context permitting), as well as from the userland + * return path where reclaim is always able to block. */ void mem_cgroup_handle_over_high(gfp_t gfp_mask) { @@ -2644,6 +2645,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) retry_reclaim: /* + * Bail if the task is already exiting. Unlike memory.max, + * memory.high enforcement isn't as strict, and there is no + * OOM killer involved, which means the excess could already + * be much bigger (and still growing) than it could for + * memory.max; the dying task could get stuck in fruitless + * reclaim for a long time, which isn't desirable. + */ + if (task_is_dying()) + goto out; + + /* * The allocating task should reclaim at least the batch size, but for * subsequent retries we only want to do what's necessary to prevent oom * or breaching resource isolation. @@ -2693,6 +2705,9 @@ retry_reclaim: } /* + * Reclaim didn't manage to push usage below the limit, slow + * this allocating task down. + * * If we exit early, we're guaranteed to die (since * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. @@ -2887,11 +2902,17 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + /* + * Reclaim is set up above to be called from the userland + * return path. But also attempt synchronous reclaim to avoid + * excessive overrun while the task is still inside the + * kernel. If this is successful, the return path will see it + * when it rechecks the overage and simply bail out. + */ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) { + gfpflags_allow_blocking(gfp_mask)) mem_cgroup_handle_over_high(gfp_mask); - } return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f9b61f4a668..636280d04008 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p, int count = page_count(p) - 1; if (extra_pins) - count -= 1; + count -= folio_nr_pages(page_folio(p)); if (count > 0) { pr_err("%#lx: %s still referenced by %d users\n", diff --git a/mm/memory.c b/mm/memory.c index 7e1f4849463a..89bcae0b224d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, delay_rmap = 0; if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { - folio_set_dirty(folio); + folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = 1; force_flush = 1; diff --git a/mm/mmap.c b/mm/mmap.c index b78e83d351d2..d89770eaab6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1825,15 +1825,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. - * do_mmap() will clear pgoff, so match alignment. */ - pgoff = 0; get_area = shmem_get_unmapped_area; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { /* Ensures that larger anonymous mappings are THP aligned. */ get_area = thp_get_unmapped_area; } + /* Always treat pgoff as zero for anonymous memory. */ + if (!file) + pgoff = 0; + addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cd4e4ae77c40..02147b61712b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? - div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need diff --git a/mm/readahead.c b/mm/readahead.c index 23620c57c122..2648ec4f0494 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (!folio) return -ENOMEM; - mark = round_up(mark, 1UL << order); + mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); @@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl, * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - expected = round_up(ra->start + ra->size - ra->async_size, + expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 20e3b0d9cf7e..75fcf1f783bc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( unsigned long dst_start, unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -472,6 +473,15 @@ retry: goto out; } mmap_read_lock(dst_mm); + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + if (mmap_changing && atomic_read(mmap_changing)) { + err = -EAGAIN; + break; + } dst_vma = NULL; goto retry; @@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ @@ -622,8 +633,8 @@ retry: * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, - src_start, len, flags); + return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, + len, mmap_changing, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; diff --git a/security/security.c b/security/security.c index 0144a98d3712..3aaad75c9ce8 100644 --- a/security/security.c +++ b/security/security.c @@ -4255,7 +4255,19 @@ EXPORT_SYMBOL(security_inode_setsecctx); */ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { - return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.inode_getsecctx, list) { + rc = hp->hook.inode_getsecctx(inode, ctx, ctxlen); + if (rc != LSM_RET_DEFAULT(inode_getsecctx)) + return rc; + } + + return LSM_RET_DEFAULT(inode_getsecctx); } EXPORT_SYMBOL(security_inode_getsecctx); @@ -4612,8 +4624,20 @@ EXPORT_SYMBOL(security_sock_rcv_skb); int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { - return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock, - optval, optlen, len); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream, + list) { + rc = hp->hook.socket_getpeersec_stream(sock, optval, optlen, + len); + if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream)) + return rc; + } + return LSM_RET_DEFAULT(socket_getpeersec_stream); } /** @@ -4633,8 +4657,19 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { - return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock, - skb, secid); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_dgram, + list) { + rc = hp->hook.socket_getpeersec_dgram(sock, skb, secid); + if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram)) + return rc; + } + return LSM_RET_DEFAULT(socket_getpeersec_dgram); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index c8416c54b463..b1fd7362c2fe 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -42,17 +42,6 @@ function die() { exit 1 } -# save existing dmesg so we can detect new content -function save_dmesg() { - SAVED_DMESG=$(mktemp --tmpdir -t klp-dmesg-XXXXXX) - dmesg > "$SAVED_DMESG" -} - -# cleanup temporary dmesg file from save_dmesg() -function cleanup_dmesg_file() { - rm -f "$SAVED_DMESG" -} - function push_config() { DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \ awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}') @@ -99,7 +88,6 @@ function set_ftrace_enabled() { function cleanup() { pop_config - cleanup_dmesg_file } # setup_config - save the current config and set a script exit trap that @@ -280,7 +268,15 @@ function set_pre_patch_ret { function start_test { local test="$1" - save_dmesg + # Dump something unique into the dmesg log, then stash the entry + # in LAST_DMESG. The check_result() function will use it to + # find new kernel messages since the test started. + local last_dmesg_msg="livepatch kselftest timestamp: $(date --rfc-3339=ns)" + log "$last_dmesg_msg" + loop_until 'dmesg | grep -q "$last_dmesg_msg"' || + die "buffer busy? can't find canary dmesg message: $last_dmesg_msg" + LAST_DMESG=$(dmesg | grep "$last_dmesg_msg") + echo -n "TEST: $test ... " log "===== TEST: $test =====" } @@ -291,23 +287,24 @@ function check_result { local expect="$*" local result - # Note: when comparing dmesg output, the kernel log timestamps - # help differentiate repeated testing runs. Remove them with a - # post-comparison sed filter. - - result=$(dmesg | comm --nocheck-order -13 "$SAVED_DMESG" - | \ + # Test results include any new dmesg entry since LAST_DMESG, then: + # - include lines matching keywords + # - exclude lines matching keywords + # - filter out dmesg timestamp prefixes + result=$(dmesg | awk -v last_dmesg="$LAST_DMESG" 'p; $0 == last_dmesg { p=1 }' | \ grep -e 'livepatch:' -e 'test_klp' | \ grep -v '\(tainting\|taints\) kernel' | \ sed 's/^\[[ 0-9.]*\] //') if [[ "$expect" == "$result" ]] ; then echo "ok" + elif [[ "$result" == "" ]] ; then + echo -e "not ok\n\nbuffer overrun? can't find canary dmesg entry: $LAST_DMESG\n" + die "livepatch kselftest(s) failed" else echo -e "not ok\n\n$(diff -upr --label expected --label result <(echo "$expect") <(echo "$result"))\n" die "livepatch kselftest(s) failed" fi - - cleanup_dmesg_file } # check_sysfs_rights(modname, rel_path, expected_rights) - check sysfs diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 0899019a7fcb..e14bdd4455f2 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Kselftest framework requirement - SKIP code is 4. diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index 380b691d3eb9..b748c48908d9 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -566,7 +566,7 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, if (map_ptr_orig == MAP_FAILED) err(2, "initial mmap"); - if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + if (madvise(map_ptr, len, MADV_HUGEPAGE)) err(2, "MADV_HUGEPAGE"); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index 193281560b61..86e8f2048a40 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -15,6 +15,7 @@ #include <unistd.h> #include <sys/mman.h> #include <fcntl.h> +#include "vm_util.h" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -58,10 +59,16 @@ int main(int argc, char **argv) { void *addr; int ret; + size_t hugepage_size; size_t length = LENGTH; int flags = FLAGS; int shift = 0; + hugepage_size = default_huge_page_size(); + /* munmap with fail if the length is not page aligned */ + if (hugepage_size > length) + length = hugepage_size; + if (argc > 1) length = atol(argv[1]) << 20; if (argc > 2) { diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 1d4c1589c305..2f8b991f78cb 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -360,7 +360,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, char pattern_seed) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; - unsigned long long i; + int d; + unsigned long long t; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; @@ -378,8 +379,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Set byte pattern for source block. */ srand(pattern_seed); - for (i = 0; i < threshold; i++) - memset((char *) src_addr + i, (char) rand(), 1); + for (t = 0; t < threshold; t++) + memset((char *) src_addr + t, (char) rand(), 1); /* Mask to zero out lower bits of address for alignment */ align_mask = ~(c.dest_alignment - 1); @@ -420,8 +421,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Set byte pattern for the dest preamble block. */ srand(pattern_seed); - for (i = 0; i < c.dest_preamble_size; i++) - memset((char *) dest_preamble_addr + i, (char) rand(), 1); + for (d = 0; d < c.dest_preamble_size; d++) + memset((char *) dest_preamble_addr + d, (char) rand(), 1); } clock_gettime(CLOCK_MONOTONIC, &t_start); @@ -437,14 +438,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Verify byte pattern after remapping */ srand(pattern_seed); - for (i = 0; i < threshold; i++) { + for (t = 0; t < threshold; t++) { char c = (char) rand(); - if (((char *) dest_addr)[i] != c) { + if (((char *) dest_addr)[t] != c) { ksft_print_msg("Data after remap doesn't match at offset %llu\n", - i); + t); ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_addr)[i] & 0xff); + ((char *) dest_addr)[t] & 0xff); ret = -1; goto clean_up_dest; } @@ -453,14 +454,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Verify the dest preamble byte pattern after remapping */ if (c.dest_preamble_size) { srand(pattern_seed); - for (i = 0; i < c.dest_preamble_size; i++) { + for (d = 0; d < c.dest_preamble_size; d++) { char c = (char) rand(); - if (((char *) dest_preamble_addr)[i] != c) { + if (((char *) dest_preamble_addr)[d] != c) { ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", - i); + d); ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_preamble_addr)[i] & 0xff); + ((char *) dest_preamble_addr)[d] & 0xff); ret = -1; goto clean_up_dest; } diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index 45cae7cab27e..a0a75f302904 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -29,9 +29,15 @@ check_supported_x86_64() # See man 1 gzip under '-f'. local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + local cpu_supports_pl5=$(awk '/^flags/ {if (/la57/) {print 0;} + else {print 1}; exit}' /proc/cpuinfo 2>/dev/null) + if [[ "${pg_table_levels}" -lt 5 ]]; then echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" exit $ksft_skip + elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then + echo "$0: CPU does not have the necessary la57 flag to support page table level 5" + exit $ksft_skip fi } diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh index 70a02301f4c2..3d2d2eb9d6ff 100755 --- a/tools/testing/selftests/mm/write_hugetlb_memory.sh +++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 set -e diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c index 887542961968..2348d2c20d0a 100644 --- a/tools/testing/selftests/rseq/basic_percpu_ops_test.c +++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c @@ -24,6 +24,11 @@ bool rseq_validate_cpu_id(void) { return rseq_mm_cid_available(); } +static +bool rseq_use_cpu_index(void) +{ + return false; /* Use mm_cid */ +} #else # define RSEQ_PERCPU RSEQ_PERCPU_CPU_ID static @@ -36,6 +41,11 @@ bool rseq_validate_cpu_id(void) { return rseq_current_cpu_raw() >= 0; } +static +bool rseq_use_cpu_index(void) +{ + return true; /* Use cpu_id as index. */ +} #endif struct percpu_lock_entry { @@ -274,7 +284,7 @@ void test_percpu_list(void) /* Generate list entries for every usable cpu. */ sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); for (i = 0; i < CPU_SETSIZE; i++) { - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; for (j = 1; j <= 100; j++) { struct percpu_list_node *node; @@ -299,7 +309,7 @@ void test_percpu_list(void) for (i = 0; i < CPU_SETSIZE; i++) { struct percpu_list_node *node; - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; while ((node = __percpu_list_pop(&list, i))) { diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 20403d58345c..2f37961240ca 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -288,6 +288,11 @@ bool rseq_validate_cpu_id(void) { return rseq_mm_cid_available(); } +static +bool rseq_use_cpu_index(void) +{ + return false; /* Use mm_cid */ +} # ifdef TEST_MEMBARRIER /* * Membarrier does not currently support targeting a mm_cid, so @@ -312,6 +317,11 @@ bool rseq_validate_cpu_id(void) { return rseq_current_cpu_raw() >= 0; } +static +bool rseq_use_cpu_index(void) +{ + return true; /* Use cpu_id as index. */ +} # ifdef TEST_MEMBARRIER static int rseq_membarrier_expedited(int cpu) @@ -715,7 +725,7 @@ void test_percpu_list(void) /* Generate list entries for every usable cpu. */ sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); for (i = 0; i < CPU_SETSIZE; i++) { - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; for (j = 1; j <= 100; j++) { struct percpu_list_node *node; @@ -752,7 +762,7 @@ void test_percpu_list(void) for (i = 0; i < CPU_SETSIZE; i++) { struct percpu_list_node *node; - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; while ((node = __percpu_list_pop(&list, i))) { @@ -902,7 +912,7 @@ void test_percpu_buffer(void) /* Generate list entries for every usable cpu. */ sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); for (i = 0; i < CPU_SETSIZE; i++) { - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; /* Worse-case is every item in same CPU. */ buffer.c[i].array = @@ -952,7 +962,7 @@ void test_percpu_buffer(void) for (i = 0; i < CPU_SETSIZE; i++) { struct percpu_buffer_node *node; - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; while ((node = __percpu_buffer_pop(&buffer, i))) { @@ -1113,7 +1123,7 @@ void test_percpu_memcpy_buffer(void) /* Generate list entries for every usable cpu. */ sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); for (i = 0; i < CPU_SETSIZE; i++) { - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; /* Worse-case is every item in same CPU. */ buffer.c[i].array = @@ -1160,7 +1170,7 @@ void test_percpu_memcpy_buffer(void) for (i = 0; i < CPU_SETSIZE; i++) { struct percpu_memcpy_buffer_node item; - if (!CPU_ISSET(i, &allowed_cpus)) + if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus)) continue; while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) { diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 5b5c9d558dee..97b86980b768 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -38,10 +38,10 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples) i *= 1000000000ULL; i += finish.tv_nsec - start.tv_nsec; - printf("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n", - finish.tv_sec, finish.tv_nsec, - start.tv_sec, start.tv_nsec, - i, (double)i / 1000000000.0); + ksft_print_msg("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n", + finish.tv_sec, finish.tv_nsec, + start.tv_sec, start.tv_nsec, + i, (double)i / 1000000000.0); return i; } @@ -53,7 +53,7 @@ unsigned long long calibrate(void) pid_t pid, ret; int seconds = 15; - printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds); + ksft_print_msg("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds); samples = 0; pid = getpid(); @@ -98,24 +98,36 @@ bool le(int i_one, int i_two) } long compare(const char *name_one, const char *name_eval, const char *name_two, - unsigned long long one, bool (*eval)(int, int), unsigned long long two) + unsigned long long one, bool (*eval)(int, int), unsigned long long two, + bool skip) { bool good; - printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, - (long long)one, name_eval, (long long)two); + if (skip) { + ksft_test_result_skip("%s %s %s\n", name_one, name_eval, + name_two); + return 0; + } + + ksft_print_msg("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, + (long long)one, name_eval, (long long)two); if (one > INT_MAX) { - printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); - return 1; + ksft_print_msg("Miscalculation! Measurement went negative: %lld\n", (long long)one); + good = false; + goto out; } if (two > INT_MAX) { - printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); - return 1; + ksft_print_msg("Miscalculation! Measurement went negative: %lld\n", (long long)two); + good = false; + goto out; } good = eval(one, two); printf("%s\n", good ? "✔️" : "❌"); +out: + ksft_test_result(good, "%s %s %s\n", name_one, name_eval, name_two); + return good ? 0 : 1; } @@ -142,15 +154,22 @@ int main(int argc, char *argv[]) unsigned long long samples, calc; unsigned long long native, filter1, filter2, bitmap1, bitmap2; unsigned long long entry, per_filter1, per_filter2; + bool skip = false; setbuf(stdout, NULL); - printf("Running on:\n"); + ksft_print_header(); + ksft_set_plan(7); + + ksft_print_msg("Running on:\n"); + ksft_print_msg(""); system("uname -a"); - printf("Current BPF sysctl settings:\n"); + ksft_print_msg("Current BPF sysctl settings:\n"); /* Avoid using "sysctl" which may not be installed. */ + ksft_print_msg(""); system("grep -H . /proc/sys/net/core/bpf_jit_enable"); + ksft_print_msg(""); system("grep -H . /proc/sys/net/core/bpf_jit_harden"); if (argc > 1) @@ -158,11 +177,11 @@ int main(int argc, char *argv[]) else samples = calibrate(); - printf("Benchmarking %llu syscalls...\n", samples); + ksft_print_msg("Benchmarking %llu syscalls...\n", samples); /* Native call */ native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid native: %llu ns\n", native); + ksft_print_msg("getpid native: %llu ns\n", native); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); assert(ret == 0); @@ -172,35 +191,37 @@ int main(int argc, char *argv[]) assert(ret == 0); bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); + ksft_print_msg("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); /* Second filter resulting in a bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); + ksft_print_msg("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); /* Third filter, can no longer be converted to bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); + ksft_print_msg("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); /* Fourth filter, can not be converted to bitmap because of filter 3 */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); + ksft_print_msg("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); /* Estimations */ #define ESTIMATE(fmt, var, what) do { \ var = (what); \ - printf("Estimated " fmt ": %llu ns\n", var); \ - if (var > INT_MAX) \ - goto more_samples; \ + ksft_print_msg("Estimated " fmt ": %llu ns\n", var); \ + if (var > INT_MAX) { \ + skip = true; \ + ret |= 1; \ + } \ } while (0) ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, @@ -218,31 +239,34 @@ int main(int argc, char *argv[]) ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, (filter2 - native - entry) / 4); - printf("Expectations:\n"); - ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); - bits = compare("native", "≤", "1 filter", native, le, filter1); + ksft_print_msg("Expectations:\n"); + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1, + skip); + bits = compare("native", "≤", "1 filter", native, le, filter1, + skip); if (bits) - goto more_samples; + skip = true; ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", - per_filter1, approx, per_filter2); + per_filter1, approx, per_filter2, skip); bits = compare("1 bitmapped", "≈", "2 bitmapped", - bitmap1 - native, approx, bitmap2 - native); + bitmap1 - native, approx, bitmap2 - native, skip); if (bits) { - printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); - goto out; + ksft_print_msg("Skipping constant action bitmap expectations: they appear unsupported.\n"); + skip = true; } - ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); - ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, + bitmap1 - native, skip); + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, + bitmap2 - native, skip); ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", - entry + (per_filter1 * 4) + native, approx, filter2); - if (ret == 0) - goto out; + entry + (per_filter1 * 4) + native, approx, filter2, + skip); -more_samples: - printf("Saw unexpected benchmark result. Try running again with more samples?\n"); -out: - return 0; + if (ret) + ksft_print_msg("Saw unexpected benchmark result. Try running again with more samples?\n"); + + ksft_finished(); } |