98 files changed, 2386 insertions, 2103 deletions
@@ -97,6 +97,11 @@ Baolin Wang <[email protected]> <[email protected]> Baolin Wang <[email protected]> <[email protected]> Baolin Wang <[email protected]> <[email protected]> Baolin Wang <[email protected]> <[email protected]> +Barry Song <[email protected]> <[email protected]> +Barry Song <[email protected]> <[email protected]> +Barry Song <[email protected]> <[email protected]> +Barry Song <[email protected]> <[email protected]> +Barry Song <[email protected]> <[email protected]> Bart Van Assche <[email protected]> <[email protected]> Bart Van Assche <[email protected]> <[email protected]> Bartosz Golaszewski <[email protected]> <[email protected]> @@ -304,6 +309,7 @@ Johan Hovold <[email protected]> <[email protected]> Johan Hovold <[email protected]> <[email protected]> John Crispin <[email protected]> <[email protected]> John Fastabend <[email protected]> <[email protected]> +John Garry <[email protected]> <[email protected]> John Keeping <[email protected]> <[email protected]> John Moon <[email protected]> <[email protected]> John Paul Adrian Glaubitz <[email protected]> diff --git a/MAINTAINERS b/MAINTAINERS index b81b2be60b77..eeea867ea23b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12051,6 +12051,7 @@ M: Mimi Zohar <[email protected]> S: Supported +W: https://kernsec.org/wiki/index.php/Linux_Kernel_Integrity F: security/integrity/platform_certs KFENCE @@ -22449,7 +22450,7 @@ M: Jarkko Sakkinen <[email protected]> R: Jason Gunthorpe <[email protected]> S: Maintained -W: https://kernsec.org/wiki/index.php/Linux_Kernel_Integrity +W: https://gitlab.com/jarkkojs/linux-tpmdd-test Q: https://patchwork.kernel.org/project/linux-integrity/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git F: Documentation/devicetree/bindings/tpm/ diff --git a/block/blk-settings.c b/block/blk-settings.c index d2731843f2fc..9d6033e01f2e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -188,7 +188,10 @@ static int blk_validate_limits(struct queue_limits *lim) * bvec and lower layer bio splitting is supposed to handle the two * correctly. */ - if (!lim->virt_boundary_mask) { + if (lim->virt_boundary_mask) { + if (!lim->max_segment_size) + lim->max_segment_size = UINT_MAX; + } else { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index d09c7d728365..9dad67ea2597 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -1193,6 +1193,8 @@ static int edge_detector_update(struct line *line, struct gpio_v2_line_config *lc, unsigned int line_idx, u64 edflags) { + u64 eflags; + int ret; u64 active_edflags = READ_ONCE(line->edflags); unsigned int debounce_period_us = gpio_v2_line_config_debounce_period(lc, line_idx); @@ -1204,6 +1206,18 @@ static int edge_detector_update(struct line *line, /* sw debounced and still will be...*/ if (debounce_period_us && READ_ONCE(line->sw_debounced)) { line_set_debounce_period(line, debounce_period_us); + /* + * ensure event fifo is initialised if edge detection + * is now enabled. 
+ */ + eflags = edflags & GPIO_V2_LINE_EDGE_FLAGS; + if (eflags && !kfifo_initialized(&line->req->events)) { + ret = kfifo_alloc(&line->req->events, + line->req->event_buffer_size, + GFP_KERNEL); + if (ret) + return ret; + } return 0; } @@ -2351,7 +2365,7 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, dflags = READ_ONCE(desc->flags); - scoped_guard(srcu, &desc->srcu) { + scoped_guard(srcu, &desc->gdev->desc_srcu) { label = gpiod_get_label(desc); if (label && test_bit(FLAG_REQUESTED, &dflags)) strscpy(info->consumer, label, @@ -2799,11 +2813,11 @@ static int gpio_chrdev_release(struct inode *inode, struct file *file) struct gpio_chardev_data *cdev = file->private_data; struct gpio_device *gdev = cdev->gdev; - bitmap_free(cdev->watched_lines); blocking_notifier_chain_unregister(&gdev->device_notifier, &cdev->device_unregistered_nb); blocking_notifier_chain_unregister(&gdev->line_state_notifier, &cdev->lineinfo_changed_nb); + bitmap_free(cdev->watched_lines); gpio_device_put(gdev); kfree(cdev); diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 94903fc1c145..fa50db0c3605 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -101,6 +101,7 @@ static bool gpiolib_initialized; const char *gpiod_get_label(struct gpio_desc *desc) { + struct gpio_desc_label *label; unsigned long flags; flags = READ_ONCE(desc->flags); @@ -108,23 +109,36 @@ const char *gpiod_get_label(struct gpio_desc *desc) !test_bit(FLAG_REQUESTED, &flags)) return "interrupt"; - return test_bit(FLAG_REQUESTED, &flags) ? - srcu_dereference(desc->label, &desc->srcu) : NULL; + if (!test_bit(FLAG_REQUESTED, &flags)) + return NULL; + + label = srcu_dereference_check(desc->label, &desc->gdev->desc_srcu, + srcu_read_lock_held(&desc->gdev->desc_srcu)); + + return label->str; +} + +static void desc_free_label(struct rcu_head *rh) +{ + kfree(container_of(rh, struct gpio_desc_label, rh)); } static int desc_set_label(struct gpio_desc *desc, const char *label) { - const char *new = NULL, *old; + struct gpio_desc_label *new = NULL, *old; if (label) { - new = kstrdup_const(label, GFP_KERNEL); + new = kzalloc(struct_size(new, str, strlen(label) + 1), + GFP_KERNEL); if (!new) return -ENOMEM; + + strcpy(new->str, label); } old = rcu_replace_pointer(desc->label, new, 1); - synchronize_srcu(&desc->srcu); - kfree_const(old); + if (old) + call_srcu(&desc->gdev->desc_srcu, &old->rh, desc_free_label); return 0; } @@ -695,10 +709,10 @@ EXPORT_SYMBOL_GPL(gpiochip_line_is_valid); static void gpiodev_release(struct device *dev) { struct gpio_device *gdev = to_gpio_device(dev); - unsigned int i; - for (i = 0; i < gdev->ngpio; i++) - cleanup_srcu_struct(&gdev->descs[i].srcu); + /* Call pending kfree()s for descriptor labels. 
*/ + synchronize_srcu(&gdev->desc_srcu); + cleanup_srcu_struct(&gdev->desc_srcu); ida_free(&gpio_ida, gdev->id); kfree_const(gdev->label); @@ -975,6 +989,10 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, if (ret) goto err_remove_from_list; + ret = init_srcu_struct(&gdev->desc_srcu); + if (ret) + goto err_cleanup_gdev_srcu; + #ifdef CONFIG_PINCTRL INIT_LIST_HEAD(&gdev->pin_ranges); #endif @@ -982,23 +1000,19 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, if (gc->names) { ret = gpiochip_set_desc_names(gc); if (ret) - goto err_cleanup_gdev_srcu; + goto err_cleanup_desc_srcu; } ret = gpiochip_set_names(gc); if (ret) - goto err_cleanup_gdev_srcu; + goto err_cleanup_desc_srcu; ret = gpiochip_init_valid_mask(gc); if (ret) - goto err_cleanup_gdev_srcu; + goto err_cleanup_desc_srcu; for (desc_index = 0; desc_index < gc->ngpio; desc_index++) { struct gpio_desc *desc = &gdev->descs[desc_index]; - ret = init_srcu_struct(&desc->srcu); - if (ret) - goto err_cleanup_desc_srcu; - if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->get_direction(gc, desc_index)); @@ -1010,7 +1024,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, ret = of_gpiochip_add(gc); if (ret) - goto err_cleanup_desc_srcu; + goto err_free_valid_mask; ret = gpiochip_add_pin_ranges(gc); if (ret) @@ -1057,10 +1071,10 @@ err_free_hogs: gpiochip_remove_pin_ranges(gc); err_remove_of_chip: of_gpiochip_remove(gc); -err_cleanup_desc_srcu: - while (desc_index--) - cleanup_srcu_struct(&gdev->descs[desc_index].srcu); +err_free_valid_mask: gpiochip_free_valid_mask(gc); +err_cleanup_desc_srcu: + cleanup_srcu_struct(&gdev->desc_srcu); err_cleanup_gdev_srcu: cleanup_srcu_struct(&gdev->srcu); err_remove_from_list: @@ -2390,7 +2404,7 @@ char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset) if (!test_bit(FLAG_REQUESTED, &desc->flags)) return NULL; - guard(srcu)(&desc->srcu); + guard(srcu)(&desc->gdev->desc_srcu); label = kstrdup(gpiod_get_label(desc), GFP_KERNEL); if (!label) @@ -4781,7 +4795,7 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) } for_each_gpio_desc(gc, desc) { - guard(srcu)(&desc->srcu); + guard(srcu)(&desc->gdev->desc_srcu); if (test_bit(FLAG_REQUESTED, &desc->flags)) { gpiod_get_direction(desc); is_out = test_bit(FLAG_IS_OUT, &desc->flags); diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index f67d5991ab1c..8e0e211ebf08 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -31,6 +31,7 @@ * @chip: pointer to the corresponding gpiochip, holding static * data for this device * @descs: array of ngpio descriptors. + * @desc_srcu: ensures consistent state of GPIO descriptors exposed to users * @ngpio: the number of GPIO lines on this GPIO device, equal to the size * of the @descs array. 
* @can_sleep: indicate whether the GPIO chip driver's callbacks can sleep @@ -61,6 +62,7 @@ struct gpio_device { struct module *owner; struct gpio_chip __rcu *chip; struct gpio_desc *descs; + struct srcu_struct desc_srcu; int base; u16 ngpio; bool can_sleep; @@ -137,6 +139,11 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory); void gpiod_line_state_notify(struct gpio_desc *desc, unsigned long action); +struct gpio_desc_label { + struct rcu_head rh; + char str[]; +}; + /** * struct gpio_desc - Opaque descriptor for a GPIO * @@ -145,7 +152,6 @@ void gpiod_line_state_notify(struct gpio_desc *desc, unsigned long action); * @label: Name of the consumer * @name: Line name * @hog: Pointer to the device node that hogs this line (if any) - * @srcu: SRCU struct protecting the label pointer. * * These are obtained using gpiod_get() and are preferable to the old * integer-based handles. @@ -177,13 +183,12 @@ struct gpio_desc { #define FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ /* Connection label */ - const char __rcu *label; + struct gpio_desc_label __rcu *label; /* Name of the GPIO */ const char *name; #ifdef CONFIG_OF_DYNAMIC struct device_node *hog; #endif - struct srcu_struct srcu; }; #define gpiod_not_found(desc) (IS_ERR(desc) && PTR_ERR(desc) == -ENOENT) @@ -251,7 +256,7 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc) #define gpiod_err(desc, fmt, ...) \ do { \ - scoped_guard(srcu, &desc->srcu) { \ + scoped_guard(srcu, &desc->gdev->desc_srcu) { \ pr_err("gpio-%d (%s): " fmt, desc_to_gpio(desc), \ gpiod_get_label(desc) ? : "?", ##__VA_ARGS__); \ } \ @@ -259,7 +264,7 @@ do { \ #define gpiod_warn(desc, fmt, ...) \ do { \ - scoped_guard(srcu, &desc->srcu) { \ + scoped_guard(srcu, &desc->gdev->desc_srcu) { \ pr_warn("gpio-%d (%s): " fmt, desc_to_gpio(desc), \ gpiod_get_label(desc) ? : "?", ##__VA_ARGS__); \ } \ @@ -267,7 +272,7 @@ do { \ #define gpiod_dbg(desc, fmt, ...) \ do { \ - scoped_guard(srcu, &desc->srcu) { \ + scoped_guard(srcu, &desc->gdev->desc_srcu) { \ pr_debug("gpio-%d (%s): " fmt, desc_to_gpio(desc), \ gpiod_get_label(desc) ? 
: "?", ##__VA_ARGS__); \ } \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 109fe557a02b..29c197c00018 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -427,7 +427,7 @@ bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, amdgpu_res_first(res, 0, res->size, &cursor); while (cursor.remaining) { - if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size) + if ((cursor.start + cursor.size) > adev->gmc.visible_vram_size) return false; amdgpu_res_next(&cursor, cursor.size); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 55aa74cbc532..1e6cc0bfc432 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1139,7 +1139,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, goto err_unlock; } offset = dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { err = -ENOMEM; goto err_unlock; } @@ -2307,7 +2307,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, return -EINVAL; } offset = pdd->dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n"); return -ENOMEM; } @@ -3349,6 +3349,9 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; + if (PAGE_SIZE > 4096) + return -EINVAL; + address = dev->adev->rmmio_remap.bus_addr; vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index c51f131eaa2f..bc9eb847ecfe 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1997,9 +1997,8 @@ int kfd_topology_add_device(struct kfd_node *gpu) HSA_CAP_ASIC_REVISION_MASK); dev->node_props.location_id = pci_dev_id(gpu->adev->pdev); - /* On multi-partition nodes, node id = location_id[31:28] */ - if (gpu->kfd->num_nodes > 1) - dev->node_props.location_id |= (dev->gpu->node_id << 28); + if (KFD_GC_VERSION(dev->gpu->kfd) == IP_VERSION(9, 4, 3)) + dev->node_props.location_id |= dev->gpu->node_id; dev->node_props.domain = pci_domain_nr(gpu->adev->pdev->bus); dev->node_props.max_engine_clk_fcompute = diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 941e96f100f4..cb31a699c662 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1219,8 +1219,10 @@ static bool is_dsc_need_re_compute( if (dc_link->type != dc_connection_mst_branch) return false; - if (!(dc_link->dpcd_caps.dsc_caps.dsc_basic_caps.fields.dsc_support.DSC_SUPPORT || - dc_link->dpcd_caps.dsc_caps.dsc_basic_caps.fields.dsc_support.DSC_PASSTHROUGH_SUPPORT)) + /* add a check for older MST DSC with no virtual DPCDs */ + if (needs_dsc_aux_workaround(dc_link) && + (!(dc_link->dpcd_caps.dsc_caps.dsc_basic_caps.fields.dsc_support.DSC_SUPPORT || + dc_link->dpcd_caps.dsc_caps.dsc_basic_caps.fields.dsc_support.DSC_PASSTHROUGH_SUPPORT))) return false; for (i = 0; i < MAX_PIPES; i++) @@ -1240,7 +1242,15 @@ static bool is_dsc_need_re_compute( continue; aconnector = (struct amdgpu_dm_connector *) stream->dm_stream_context; - if (!aconnector) + if (!aconnector || 
!aconnector->dsc_aux) + continue; + + /* + * check if cached virtual MST DSC caps are available and DSC is supported + * as per specifications in their Virtual DPCD registers. + */ + if (!(aconnector->dc_sink->dsc_caps.dsc_dec_caps.is_dsc_supported || + aconnector->dc_link->dpcd_caps.dsc_caps.dsc_basic_caps.fields.dsc_support.DSC_PASSTHROUGH_SUPPORT)) continue; stream_on_link[new_stream_on_link_num] = aconnector; diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index 21e0eef3269b..53e40d3c48d4 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -195,9 +195,9 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .dcn_downspread_percent = 0.5, .gpuvm_min_page_size_bytes = 4096, .hostvm_min_page_size_bytes = 4096, - .do_urgent_latency_adjustment = 0, + .do_urgent_latency_adjustment = 1, .urgent_latency_adjustment_fabric_clock_component_us = 0, - .urgent_latency_adjustment_fabric_clock_reference_mhz = 0, + .urgent_latency_adjustment_fabric_clock_reference_mhz = 3000, }; void dcn35_build_wm_range_table_fpu(struct clk_mgr *clk_mgr) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index a5560b3fc39b..9067ca78f851 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -638,22 +638,43 @@ void dcn35_power_down_on_boot(struct dc *dc) bool dcn35_apply_idle_power_optimizations(struct dc *dc, bool enable) { - struct dc_link *edp_links[MAX_NUM_EDP]; - int i, edp_num; if (dc->debug.dmcub_emulation) return true; if (enable) { - dc_get_edp_links(dc, edp_links, &edp_num); - if (edp_num == 0 || edp_num > 1) - return false; + uint32_t num_active_edp = 0; + int i; for (i = 0; i < dc->current_state->stream_count; ++i) { struct dc_stream_state *stream = dc->current_state->streams[i]; + struct dc_link *link = stream->link; + bool is_psr = link && !link->panel_config.psr.disable_psr && + (link->psr_settings.psr_version == DC_PSR_VERSION_1 || + link->psr_settings.psr_version == DC_PSR_VERSION_SU_1); + bool is_replay = link && link->replay_settings.replay_feature_enabled; + + /* Ignore streams that disabled. */ + if (stream->dpms_off) + continue; + + /* Active external displays block idle optimizations. */ + if (!dc_is_embedded_signal(stream->signal)) + return false; + + /* If not PWRSEQ0 can't enter idle optimizations */ + if (link && link->link_index != 0) + return false; - if (!stream->dpms_off && !dc_is_embedded_signal(stream->signal)) + /* Check for panel power features required for idle optimizations. */ + if (!is_psr && !is_replay) return false; + + num_active_edp += 1; } + + /* If more than one active eDP then disallow. 
*/ + if (num_active_edp > 1) + return false; } // TODO: review other cases when idle optimization is allowed diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c index 949131bd1ecb..4abfcd32747d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c @@ -226,7 +226,7 @@ static int smu_v13_0_4_system_features_control(struct smu_context *smu, bool en) struct amdgpu_device *adev = smu->adev; int ret = 0; - if (!en && !adev->in_s0ix) { + if (!en && adev->in_s4) { /* Adds a GFX reset as workaround just before sending the * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering * an invalid state. diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index b0516505f7ae..4d2df7f64dc5 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -2940,7 +2940,7 @@ int drm_mode_getconnector(struct drm_device *dev, void *data, dev->mode_config.max_width, dev->mode_config.max_height); else - drm_dbg_kms(dev, "User-space requested a forced probe on [CONNECTOR:%d:%s] but is not the DRM master, demoting to read-only probe", + drm_dbg_kms(dev, "User-space requested a forced probe on [CONNECTOR:%d:%s] but is not the DRM master, demoting to read-only probe\n", connector->base.id, connector->name); } diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index 07e0c73204f3..ed81e1466c4b 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -76,19 +76,6 @@ struct intel_audio_funcs { struct intel_crtc_state *crtc_state); }; -/* DP N/M table */ -#define LC_810M 810000 -#define LC_540M 540000 -#define LC_270M 270000 -#define LC_162M 162000 - -struct dp_aud_n_m { - int sample_rate; - int clock; - u16 m; - u16 n; -}; - struct hdmi_aud_ncts { int sample_rate; int clock; @@ -96,60 +83,6 @@ struct hdmi_aud_ncts { int cts; }; -/* Values according to DP 1.4 Table 2-104 */ -static const struct dp_aud_n_m dp_aud_n_m[] = { - { 32000, LC_162M, 1024, 10125 }, - { 44100, LC_162M, 784, 5625 }, - { 48000, LC_162M, 512, 3375 }, - { 64000, LC_162M, 2048, 10125 }, - { 88200, LC_162M, 1568, 5625 }, - { 96000, LC_162M, 1024, 3375 }, - { 128000, LC_162M, 4096, 10125 }, - { 176400, LC_162M, 3136, 5625 }, - { 192000, LC_162M, 2048, 3375 }, - { 32000, LC_270M, 1024, 16875 }, - { 44100, LC_270M, 784, 9375 }, - { 48000, LC_270M, 512, 5625 }, - { 64000, LC_270M, 2048, 16875 }, - { 88200, LC_270M, 1568, 9375 }, - { 96000, LC_270M, 1024, 5625 }, - { 128000, LC_270M, 4096, 16875 }, - { 176400, LC_270M, 3136, 9375 }, - { 192000, LC_270M, 2048, 5625 }, - { 32000, LC_540M, 1024, 33750 }, - { 44100, LC_540M, 784, 18750 }, - { 48000, LC_540M, 512, 11250 }, - { 64000, LC_540M, 2048, 33750 }, - { 88200, LC_540M, 1568, 18750 }, - { 96000, LC_540M, 1024, 11250 }, - { 128000, LC_540M, 4096, 33750 }, - { 176400, LC_540M, 3136, 18750 }, - { 192000, LC_540M, 2048, 11250 }, - { 32000, LC_810M, 1024, 50625 }, - { 44100, LC_810M, 784, 28125 }, - { 48000, LC_810M, 512, 16875 }, - { 64000, LC_810M, 2048, 50625 }, - { 88200, LC_810M, 1568, 28125 }, - { 96000, LC_810M, 1024, 16875 }, - { 128000, LC_810M, 4096, 50625 }, - { 176400, LC_810M, 3136, 28125 }, - { 192000, LC_810M, 2048, 16875 }, -}; - -static const struct dp_aud_n_m * -audio_config_dp_get_n_m(const struct intel_crtc_state *crtc_state, int rate) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(dp_aud_n_m); i++) { - if (rate 
== dp_aud_n_m[i].sample_rate && - crtc_state->port_clock == dp_aud_n_m[i].clock) - return &dp_aud_n_m[i]; - } - - return NULL; -} - static const struct { int clock; u32 config; @@ -387,47 +320,17 @@ hsw_dp_audio_config_update(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { struct drm_i915_private *i915 = to_i915(encoder->base.dev); - struct i915_audio_component *acomp = i915->display.audio.component; enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; - enum port port = encoder->port; - const struct dp_aud_n_m *nm; - int rate; - u32 tmp; - - rate = acomp ? acomp->aud_sample_rate[port] : 0; - nm = audio_config_dp_get_n_m(crtc_state, rate); - if (nm) - drm_dbg_kms(&i915->drm, "using Maud %u, Naud %u\n", nm->m, - nm->n); - else - drm_dbg_kms(&i915->drm, "using automatic Maud, Naud\n"); - - tmp = intel_de_read(i915, HSW_AUD_CFG(cpu_transcoder)); - tmp &= ~AUD_CONFIG_N_VALUE_INDEX; - tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK; - tmp &= ~AUD_CONFIG_N_PROG_ENABLE; - tmp |= AUD_CONFIG_N_VALUE_INDEX; - if (nm) { - tmp &= ~AUD_CONFIG_N_MASK; - tmp |= AUD_CONFIG_N(nm->n); - tmp |= AUD_CONFIG_N_PROG_ENABLE; - } - - intel_de_write(i915, HSW_AUD_CFG(cpu_transcoder), tmp); - - tmp = intel_de_read(i915, HSW_AUD_M_CTS_ENABLE(cpu_transcoder)); - tmp &= ~AUD_CONFIG_M_MASK; - tmp &= ~AUD_M_CTS_M_VALUE_INDEX; - tmp &= ~AUD_M_CTS_M_PROG_ENABLE; - - if (nm) { - tmp |= nm->m; - tmp |= AUD_M_CTS_M_VALUE_INDEX; - tmp |= AUD_M_CTS_M_PROG_ENABLE; - } + /* Enable time stamps. Let HW calculate Maud/Naud values */ + intel_de_rmw(i915, HSW_AUD_CFG(cpu_transcoder), + AUD_CONFIG_N_VALUE_INDEX | + AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK | + AUD_CONFIG_UPPER_N_MASK | + AUD_CONFIG_LOWER_N_MASK | + AUD_CONFIG_N_PROG_ENABLE, + AUD_CONFIG_N_VALUE_INDEX); - intel_de_write(i915, HSW_AUD_M_CTS_ENABLE(cpu_transcoder), tmp); } static void diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index 52bd3576835b..7d1e443f97b9 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -1042,22 +1042,11 @@ parse_lfp_backlight(struct drm_i915_private *i915, panel->vbt.backlight.type = INTEL_BACKLIGHT_DISPLAY_DDI; panel->vbt.backlight.controller = 0; if (i915->display.vbt.version >= 191) { - size_t exp_size; + const struct lfp_backlight_control_method *method; - if (i915->display.vbt.version >= 236) - exp_size = sizeof(struct bdb_lfp_backlight_data); - else if (i915->display.vbt.version >= 234) - exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_234; - else - exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_191; - - if (get_blocksize(backlight_data) >= exp_size) { - const struct lfp_backlight_control_method *method; - - method = &backlight_data->backlight_control[panel_type]; - panel->vbt.backlight.type = method->type; - panel->vbt.backlight.controller = method->controller; - } + method = &backlight_data->backlight_control[panel_type]; + panel->vbt.backlight.type = method->type; + panel->vbt.backlight.controller = method->controller; } panel->vbt.backlight.pwm_freq_hz = entry->pwm_freq_hz; diff --git a/drivers/gpu/drm/i915/display/intel_vbt_defs.h b/drivers/gpu/drm/i915/display/intel_vbt_defs.h index a9f44abfc9fc..b50cd0dcabda 100644 --- a/drivers/gpu/drm/i915/display/intel_vbt_defs.h +++ b/drivers/gpu/drm/i915/display/intel_vbt_defs.h @@ -897,11 +897,6 @@ struct lfp_brightness_level { u16 reserved; } __packed; -#define EXP_BDB_LFP_BL_DATA_SIZE_REV_191 \ - offsetof(struct bdb_lfp_backlight_data, brightness_level) -#define 
EXP_BDB_LFP_BL_DATA_SIZE_REV_234 \ - offsetof(struct bdb_lfp_backlight_data, brightness_precision_bits) - struct bdb_lfp_backlight_data { u8 entry_size; struct lfp_backlight_data_entry data[16]; diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c index 044219c5960a..99b71bb7da0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c @@ -8,14 +8,14 @@ #include "intel_gt_ccs_mode.h" #include "intel_gt_regs.h" -void intel_gt_apply_ccs_mode(struct intel_gt *gt) +unsigned int intel_gt_apply_ccs_mode(struct intel_gt *gt) { int cslice; u32 mode = 0; int first_ccs = __ffs(CCS_MASK(gt)); if (!IS_DG2(gt->i915)) - return; + return 0; /* Build the value for the fixed CCS load balancing */ for (cslice = 0; cslice < I915_MAX_CCS; cslice++) { @@ -35,5 +35,5 @@ void intel_gt_apply_ccs_mode(struct intel_gt *gt) XEHP_CCS_MODE_CSLICE_MASK); } - intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode); + return mode; } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h index 9e5549caeb26..55547f2ff426 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h @@ -8,6 +8,6 @@ struct intel_gt; -void intel_gt_apply_ccs_mode(struct intel_gt *gt); +unsigned int intel_gt_apply_ccs_mode(struct intel_gt *gt); #endif /* __INTEL_GT_CCS_MODE_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 6ec3582c9735..85c860ea9d7c 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -2859,6 +2859,7 @@ add_render_compute_tuning_settings(struct intel_gt *gt, static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal) { struct intel_gt *gt = engine->gt; + u32 mode; if (!IS_DG2(gt->i915)) return; @@ -2875,7 +2876,8 @@ static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_li * After having disabled automatic load balancing we need to * assign all slices to a single CCS. 
We will call it CCS mode 1 */ - intel_gt_apply_ccs_mode(gt); + mode = intel_gt_apply_ccs_mode(gt); + wa_masked_en(wal, XEHP_CCS_MODE, mode); } /* diff --git a/drivers/gpu/drm/meson/meson_dw_hdmi.c b/drivers/gpu/drm/meson/meson_dw_hdmi.c index 5a9538bc0e26..5565f7777529 100644 --- a/drivers/gpu/drm/meson/meson_dw_hdmi.c +++ b/drivers/gpu/drm/meson/meson_dw_hdmi.c @@ -106,6 +106,8 @@ #define HHI_HDMI_CLK_CNTL 0x1cc /* 0x73 */ #define HHI_HDMI_PHY_CNTL0 0x3a0 /* 0xe8 */ #define HHI_HDMI_PHY_CNTL1 0x3a4 /* 0xe9 */ +#define PHY_CNTL1_INIT 0x03900000 +#define PHY_INVERT BIT(17) #define HHI_HDMI_PHY_CNTL2 0x3a8 /* 0xea */ #define HHI_HDMI_PHY_CNTL3 0x3ac /* 0xeb */ #define HHI_HDMI_PHY_CNTL4 0x3b0 /* 0xec */ @@ -130,6 +132,8 @@ struct meson_dw_hdmi_data { unsigned int addr); void (*dwc_write)(struct meson_dw_hdmi *dw_hdmi, unsigned int addr, unsigned int data); + u32 cntl0_init; + u32 cntl1_init; }; struct meson_dw_hdmi { @@ -384,26 +388,6 @@ static int dw_hdmi_phy_init(struct dw_hdmi *hdmi, void *data, dw_hdmi_bus_fmt_is_420(hdmi)) mode_is_420 = true; - /* Enable clocks */ - regmap_update_bits(priv->hhi, HHI_HDMI_CLK_CNTL, 0xffff, 0x100); - - /* Bring HDMITX MEM output of power down */ - regmap_update_bits(priv->hhi, HHI_MEM_PD_REG0, 0xff << 8, 0); - - /* Bring out of reset */ - dw_hdmi->data->top_write(dw_hdmi, HDMITX_TOP_SW_RESET, 0); - - /* Enable internal pixclk, tmds_clk, spdif_clk, i2s_clk, cecclk */ - dw_hdmi_top_write_bits(dw_hdmi, HDMITX_TOP_CLK_CNTL, - 0x3, 0x3); - - /* Enable cec_clk and hdcp22_tmdsclk_en */ - dw_hdmi_top_write_bits(dw_hdmi, HDMITX_TOP_CLK_CNTL, - 0x3 << 4, 0x3 << 4); - - /* Enable normal output to PHY */ - dw_hdmi->data->top_write(dw_hdmi, HDMITX_TOP_BIST_CNTL, BIT(12)); - /* TMDS pattern setup */ if (mode->clock > 340000 && !mode_is_420) { dw_hdmi->data->top_write(dw_hdmi, HDMITX_TOP_TMDS_CLK_PTTN_01, @@ -425,20 +409,6 @@ static int dw_hdmi_phy_init(struct dw_hdmi *hdmi, void *data, /* Setup PHY parameters */ meson_hdmi_phy_setup_mode(dw_hdmi, mode, mode_is_420); - /* Setup PHY */ - regmap_update_bits(priv->hhi, HHI_HDMI_PHY_CNTL1, - 0xffff << 16, 0x0390 << 16); - - /* BIT_INVERT */ - if (dw_hdmi_is_compatible(dw_hdmi, "amlogic,meson-gxl-dw-hdmi") || - dw_hdmi_is_compatible(dw_hdmi, "amlogic,meson-gxm-dw-hdmi") || - dw_hdmi_is_compatible(dw_hdmi, "amlogic,meson-g12a-dw-hdmi")) - regmap_update_bits(priv->hhi, HHI_HDMI_PHY_CNTL1, - BIT(17), 0); - else - regmap_update_bits(priv->hhi, HHI_HDMI_PHY_CNTL1, - BIT(17), BIT(17)); - /* Disable clock, fifo, fifo_wr */ regmap_update_bits(priv->hhi, HHI_HDMI_PHY_CNTL1, 0xf, 0); @@ -492,7 +462,9 @@ static void dw_hdmi_phy_disable(struct dw_hdmi *hdmi, DRM_DEBUG_DRIVER("\n"); - regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL0, 0); + /* Fallback to init mode */ + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL1, dw_hdmi->data->cntl1_init); + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL0, dw_hdmi->data->cntl0_init); } static enum drm_connector_status dw_hdmi_read_hpd(struct dw_hdmi *hdmi, @@ -610,11 +582,22 @@ static const struct regmap_config meson_dw_hdmi_regmap_config = { .fast_io = true, }; -static const struct meson_dw_hdmi_data meson_dw_hdmi_gx_data = { +static const struct meson_dw_hdmi_data meson_dw_hdmi_gxbb_data = { .top_read = dw_hdmi_top_read, .top_write = dw_hdmi_top_write, .dwc_read = dw_hdmi_dwc_read, .dwc_write = dw_hdmi_dwc_write, + .cntl0_init = 0x0, + .cntl1_init = PHY_CNTL1_INIT | PHY_INVERT, +}; + +static const struct meson_dw_hdmi_data meson_dw_hdmi_gxl_data = { + .top_read = dw_hdmi_top_read, + .top_write = 
dw_hdmi_top_write, + .dwc_read = dw_hdmi_dwc_read, + .dwc_write = dw_hdmi_dwc_write, + .cntl0_init = 0x0, + .cntl1_init = PHY_CNTL1_INIT, }; static const struct meson_dw_hdmi_data meson_dw_hdmi_g12a_data = { @@ -622,6 +605,8 @@ static const struct meson_dw_hdmi_data meson_dw_hdmi_g12a_data = { .top_write = dw_hdmi_g12a_top_write, .dwc_read = dw_hdmi_g12a_dwc_read, .dwc_write = dw_hdmi_g12a_dwc_write, + .cntl0_init = 0x000b4242, /* Bandgap */ + .cntl1_init = PHY_CNTL1_INIT, }; static void meson_dw_hdmi_init(struct meson_dw_hdmi *meson_dw_hdmi) @@ -656,6 +641,13 @@ static void meson_dw_hdmi_init(struct meson_dw_hdmi *meson_dw_hdmi) meson_dw_hdmi->data->top_write(meson_dw_hdmi, HDMITX_TOP_CLK_CNTL, 0xff); + /* Enable normal output to PHY */ + meson_dw_hdmi->data->top_write(meson_dw_hdmi, HDMITX_TOP_BIST_CNTL, BIT(12)); + + /* Setup PHY */ + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL1, meson_dw_hdmi->data->cntl1_init); + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL0, meson_dw_hdmi->data->cntl0_init); + /* Enable HDMI-TX Interrupt */ meson_dw_hdmi->data->top_write(meson_dw_hdmi, HDMITX_TOP_INTR_STAT_CLR, HDMITX_TOP_INTR_CORE); @@ -865,11 +857,11 @@ static const struct dev_pm_ops meson_dw_hdmi_pm_ops = { static const struct of_device_id meson_dw_hdmi_of_table[] = { { .compatible = "amlogic,meson-gxbb-dw-hdmi", - .data = &meson_dw_hdmi_gx_data }, + .data = &meson_dw_hdmi_gxbb_data }, { .compatible = "amlogic,meson-gxl-dw-hdmi", - .data = &meson_dw_hdmi_gx_data }, + .data = &meson_dw_hdmi_gxl_data }, { .compatible = "amlogic,meson-gxm-dw-hdmi", - .data = &meson_dw_hdmi_gx_data }, + .data = &meson_dw_hdmi_gxl_data }, { .compatible = "amlogic,meson-g12a-dw-hdmi", .data = &meson_dw_hdmi_g12a_data }, { } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c index 141b0a513bf5..adc60b25f8e6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c @@ -205,9 +205,7 @@ nvkm_firmware_dtor(struct nvkm_firmware *fw) break; case NVKM_FIRMWARE_IMG_DMA: nvkm_memory_unref(&memory); - dma_unmap_single(fw->device->dev, fw->phys, sg_dma_len(&fw->mem.sgl), - DMA_TO_DEVICE); - kfree(fw->img); + dma_free_coherent(fw->device->dev, sg_dma_len(&fw->mem.sgl), fw->img, fw->phys); break; case NVKM_FIRMWARE_IMG_SGT: nvkm_memory_unref(&memory); @@ -237,17 +235,14 @@ nvkm_firmware_ctor(const struct nvkm_firmware_func *func, const char *name, fw->img = kmemdup(src, fw->len, GFP_KERNEL); break; case NVKM_FIRMWARE_IMG_DMA: { - len = ALIGN(fw->len, PAGE_SIZE); + dma_addr_t addr; - fw->img = kmalloc(len, GFP_KERNEL); - if (!fw->img) - return -ENOMEM; + len = ALIGN(fw->len, PAGE_SIZE); - memcpy(fw->img, src, fw->len); - fw->phys = dma_map_single(fw->device->dev, fw->img, len, DMA_TO_DEVICE); - if (dma_mapping_error(fw->device->dev, fw->phys)) { - kfree(fw->img); - return -EFAULT; + fw->img = dma_alloc_coherent(fw->device->dev, len, &addr, GFP_KERNEL); + if (fw->img) { + memcpy(fw->img, src, fw->len); + fw->phys = addr; } sg_init_one(&fw->mem.sgl, fw->img, len); diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index 6ad4c1a90a78..5339f8a49004 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -100,7 +100,7 @@ struct __guc_ads_blob { struct guc_engine_usage engine_usage; struct guc_um_init_params um_init_params; /* From here on, location is dynamic! Refer to above diagram. 
*/ - struct guc_mmio_reg regset[0]; + struct guc_mmio_reg regset[]; } __packed; #define ads_blob_read(ads_, field_) \ diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 7f32547f94b2..8bbfa45798e2 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -120,6 +120,7 @@ static void guc_ct_fini(struct drm_device *drm, void *arg) { struct xe_guc_ct *ct = arg; + destroy_workqueue(ct->g2h_wq); xa_destroy(&ct->fence_lookup); } @@ -145,13 +146,20 @@ int xe_guc_ct_init(struct xe_guc_ct *ct) xe_assert(xe, !(guc_ct_size() % PAGE_SIZE)); - drmm_mutex_init(&xe->drm, &ct->lock); + ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0); + if (!ct->g2h_wq) + return -ENOMEM; + spin_lock_init(&ct->fast_lock); xa_init(&ct->fence_lookup); INIT_WORK(&ct->g2h_worker, g2h_worker_func); init_waitqueue_head(&ct->wq); init_waitqueue_head(&ct->g2h_fence_wq); + err = drmm_mutex_init(&xe->drm, &ct->lock); + if (err) + return err; + primelockdep(ct); bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(), diff --git a/drivers/gpu/drm/xe/xe_guc_ct.h b/drivers/gpu/drm/xe/xe_guc_ct.h index 5083e099064f..105bb8e99a8d 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.h +++ b/drivers/gpu/drm/xe/xe_guc_ct.h @@ -34,7 +34,7 @@ static inline void xe_guc_ct_irq_handler(struct xe_guc_ct *ct) return; wake_up_all(&ct->wq); - queue_work(system_unbound_wq, &ct->g2h_worker); + queue_work(ct->g2h_wq, &ct->g2h_worker); xe_guc_ct_fast_path(ct); } diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h index d29144c9f20b..fede4c6e93cb 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct_types.h +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h @@ -120,6 +120,8 @@ struct xe_guc_ct { wait_queue_head_t wq; /** @g2h_fence_wq: wait queue used for G2H fencing */ wait_queue_head_t g2h_fence_wq; + /** @g2h_wq: used to process G2H */ + struct workqueue_struct *g2h_wq; /** @msg: Message buffer */ u32 msg[GUC_CTB_MSG_MAX_LEN]; /** @fast_msg: Message buffer */ diff --git a/drivers/hwmon/corsair-cpro.c b/drivers/hwmon/corsair-cpro.c index a284a02839fb..3e63666a61bd 100644 --- a/drivers/hwmon/corsair-cpro.c +++ b/drivers/hwmon/corsair-cpro.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/types.h> #define USB_VENDOR_ID_CORSAIR 0x1b1c @@ -77,8 +78,11 @@ struct ccp_device { struct hid_device *hdev; struct device *hwmon_dev; + /* For reinitializing the completion below */ + spinlock_t wait_input_report_lock; struct completion wait_input_report; struct mutex mutex; /* whenever buffer is used, lock before send_usb_cmd */ + u8 *cmd_buffer; u8 *buffer; int target[6]; DECLARE_BITMAP(temp_cnct, NUM_TEMP_SENSORS); @@ -111,15 +115,23 @@ static int send_usb_cmd(struct ccp_device *ccp, u8 command, u8 byte1, u8 byte2, unsigned long t; int ret; - memset(ccp->buffer, 0x00, OUT_BUFFER_SIZE); - ccp->buffer[0] = command; - ccp->buffer[1] = byte1; - ccp->buffer[2] = byte2; - ccp->buffer[3] = byte3; - + memset(ccp->cmd_buffer, 0x00, OUT_BUFFER_SIZE); + ccp->cmd_buffer[0] = command; + ccp->cmd_buffer[1] = byte1; + ccp->cmd_buffer[2] = byte2; + ccp->cmd_buffer[3] = byte3; + + /* + * Disable raw event parsing for a moment to safely reinitialize the + * completion. Reinit is done because hidraw could have triggered + * the raw event parsing and marked the ccp->wait_input_report + * completion as done. 
+ */ + spin_lock_bh(&ccp->wait_input_report_lock); reinit_completion(&ccp->wait_input_report); + spin_unlock_bh(&ccp->wait_input_report_lock); - ret = hid_hw_output_report(ccp->hdev, ccp->buffer, OUT_BUFFER_SIZE); + ret = hid_hw_output_report(ccp->hdev, ccp->cmd_buffer, OUT_BUFFER_SIZE); if (ret < 0) return ret; @@ -135,11 +147,12 @@ static int ccp_raw_event(struct hid_device *hdev, struct hid_report *report, u8 struct ccp_device *ccp = hid_get_drvdata(hdev); /* only copy buffer when requested */ - if (completion_done(&ccp->wait_input_report)) - return 0; - - memcpy(ccp->buffer, data, min(IN_BUFFER_SIZE, size)); - complete(&ccp->wait_input_report); + spin_lock(&ccp->wait_input_report_lock); + if (!completion_done(&ccp->wait_input_report)) { + memcpy(ccp->buffer, data, min(IN_BUFFER_SIZE, size)); + complete_all(&ccp->wait_input_report); + } + spin_unlock(&ccp->wait_input_report_lock); return 0; } @@ -492,7 +505,11 @@ static int ccp_probe(struct hid_device *hdev, const struct hid_device_id *id) if (!ccp) return -ENOMEM; - ccp->buffer = devm_kmalloc(&hdev->dev, OUT_BUFFER_SIZE, GFP_KERNEL); + ccp->cmd_buffer = devm_kmalloc(&hdev->dev, OUT_BUFFER_SIZE, GFP_KERNEL); + if (!ccp->cmd_buffer) + return -ENOMEM; + + ccp->buffer = devm_kmalloc(&hdev->dev, IN_BUFFER_SIZE, GFP_KERNEL); if (!ccp->buffer) return -ENOMEM; @@ -510,7 +527,9 @@ static int ccp_probe(struct hid_device *hdev, const struct hid_device_id *id) ccp->hdev = hdev; hid_set_drvdata(hdev, ccp); + mutex_init(&ccp->mutex); + spin_lock_init(&ccp->wait_input_report_lock); init_completion(&ccp->wait_input_report); hid_device_io_start(hdev); diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c index 8d9d422450e5..d817c719b90b 100644 --- a/drivers/hwmon/pmbus/ucd9000.c +++ b/drivers/hwmon/pmbus/ucd9000.c @@ -80,11 +80,11 @@ struct ucd9000_debugfs_entry { * It has been observed that the UCD90320 randomly fails register access when * doing another access right on the back of a register write. To mitigate this * make sure that there is a minimum delay between a write access and the - * following access. The 250us is based on experimental data. At a delay of - * 200us the issue seems to go away. Add a bit of extra margin to allow for + * following access. The 500 is based on experimental data. At a delay of + * 350us the issue seems to go away. Add a bit of extra margin to allow for * system to system differences. 
*/ -#define UCD90320_WAIT_DELAY_US 250 +#define UCD90320_WAIT_DELAY_US 500 static inline void ucd90320_wait(const struct ucd9000_data *data) { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e692217fcb28..fb727f5b0b82 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2754,6 +2754,10 @@ static int amd_iommu_def_domain_type(struct device *dev) if (!dev_data) return 0; + /* Always use DMA domain for untrusted device */ + if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) + return IOMMU_DOMAIN_DMA; + /* * Do not identity map IOMMUv2 capable devices when: * - memory encryption is active, because some of those devices diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c index 87bf522b9d2e..957d988b6d83 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c @@ -221,11 +221,9 @@ static irqreturn_t nvidia_smmu_context_fault(int irq, void *dev) unsigned int inst; irqreturn_t ret = IRQ_NONE; struct arm_smmu_device *smmu; - struct iommu_domain *domain = dev; - struct arm_smmu_domain *smmu_domain; + struct arm_smmu_domain *smmu_domain = dev; struct nvidia_smmu *nvidia; - smmu_domain = container_of(domain, struct arm_smmu_domain, domain); smmu = smmu_domain->smmu; nvidia = to_nvidia_smmu(smmu); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 3dfd5ae99ae0..499a8bb7cac7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -423,13 +423,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * For iopoll, complete it directly. + * For iopoll, complete it directly. Note that using the uring_cmd + * helper for this is safe only because we check blk_rq_is_poll(). + * As that returns false if we're NOT on a polled queue, then it's + * safe to use the polled completion helper. + * * Otherwise, move the completion to task work. */ - if (blk_rq_is_poll(req)) - nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); - else + if (blk_rq_is_poll(req)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b564c5f1450c..05532c281177 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,11 @@ enum nvme_quirks { * Disables simple suspend/resume path. */ NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND = (1 << 20), + + /* + * MSI (but not MSI-X) interrupts are broken and never fire. 
+ */ + NVME_QUIRK_BROKEN_MSI = (1 << 21), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e393f6947ce4..710043086dff 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2224,6 +2224,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .priv = dev, }; unsigned int irq_queues, poll_queues; + unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY; /* * Poll queues don't need interrupts, but we need at least one I/O queue @@ -2247,8 +2248,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) irq_queues = 1; if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) irq_queues += (nr_io_queues - poll_queues); - return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) + flags &= ~PCI_IRQ_MSI; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags, + &affd); } static unsigned int nvme_max_io_queues(struct nvme_dev *dev) @@ -2477,6 +2480,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + unsigned int flags = PCI_IRQ_ALL_TYPES; if (pci_enable_device_mem(pdev)) return result; @@ -2493,7 +2497,9 @@ static int nvme_pci_enable(struct nvme_dev *dev) * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll * adjust this later. */ - result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) + flags &= ~PCI_IRQ_MSI; + result = pci_alloc_irq_vectors(pdev, 1, 1, flags); if (result < 0) goto disable; @@ -3390,6 +3396,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | NVME_QUIRK_DISABLE_WRITE_ZEROES| NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */ + .driver_data = NVME_QUIRK_BROKEN_MSI }, { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index fb518b00f71f..4f08362aee8b 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -480,7 +480,7 @@ out_free_response: nvme_auth_free_key(transformed_key); out_free_tfm: crypto_free_shash(shash_tfm); - return 0; + return ret; } int nvmet_auth_ctrl_exponential(struct nvmet_req *req, diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 89a001101a7f..7fda69395c1e 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -757,10 +757,9 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid) { struct config_item *ns_item; - char name[4] = {}; + char name[12]; - if (sprintf(name, "%u", nsid) <= 0) - return false; + snprintf(name, sizeof(name), "%u", nsid); mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex); ns_item = config_group_find_item(&subsys->namespaces_group, name); mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index e06013c5dace..2fde22323622 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1686,7 +1686,8 @@ static int __init nvmet_init(void) if (!buffered_io_wq) goto out_free_zbd_work_queue; - nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0); + nvmet_wq = alloc_workqueue("nvmet-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 
if (!nvmet_wq) goto out_free_buffered_work_queue; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 5b8c63e74639..6e1b4140cde0 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -474,12 +474,8 @@ nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) return 0; out_free: - while (--i >= 0) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } + while (--i >= 0) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); kfree(queue->rsps); out: return ret; @@ -490,12 +486,8 @@ static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) struct nvmet_rdma_device *ndev = queue->dev; int i, nr_rsps = queue->recv_queue_size * 2; - for (i = 0; i < nr_rsps; i++) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } + for (i = 0; i < nr_rsps; i++) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); kfree(queue->rsps); } diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index dabac9772741..2c33653ffdea 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1911,19 +1911,24 @@ static struct regulator *create_regulator(struct regulator_dev *rdev, } } - if (err != -EEXIST) + if (err != -EEXIST) { regulator->debugfs = debugfs_create_dir(supply_name, rdev->debugfs); - if (IS_ERR(regulator->debugfs)) - rdev_dbg(rdev, "Failed to create debugfs directory\n"); + if (IS_ERR(regulator->debugfs)) { + rdev_dbg(rdev, "Failed to create debugfs directory\n"); + regulator->debugfs = NULL; + } + } - debugfs_create_u32("uA_load", 0444, regulator->debugfs, - ®ulator->uA_load); - debugfs_create_u32("min_uV", 0444, regulator->debugfs, - ®ulator->voltage[PM_SUSPEND_ON].min_uV); - debugfs_create_u32("max_uV", 0444, regulator->debugfs, - ®ulator->voltage[PM_SUSPEND_ON].max_uV); - debugfs_create_file("constraint_flags", 0444, regulator->debugfs, - regulator, &constraint_flags_fops); + if (regulator->debugfs) { + debugfs_create_u32("uA_load", 0444, regulator->debugfs, + ®ulator->uA_load); + debugfs_create_u32("min_uV", 0444, regulator->debugfs, + ®ulator->voltage[PM_SUSPEND_ON].min_uV); + debugfs_create_u32("max_uV", 0444, regulator->debugfs, + ®ulator->voltage[PM_SUSPEND_ON].max_uV); + debugfs_create_file("constraint_flags", 0444, regulator->debugfs, + regulator, &constraint_flags_fops); + } /* * Check now if the regulator is an always on regulator - if diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c index 2d54844c4226..3079de41ef3f 100644 --- a/drivers/regulator/rtq2208-regulator.c +++ b/drivers/regulator/rtq2208-regulator.c @@ -48,7 +48,7 @@ /* Value */ #define RTQ2208_RAMP_VALUE_MIN_uV 500 -#define RTQ2208_RAMP_VALUE_MAX_uV 64000 +#define RTQ2208_RAMP_VALUE_MAX_uV 16000 #define RTQ2208_BUCK_MASK(uv_irq, ov_irq) (1 << ((uv_irq) % 8) | 1 << ((ov_irq) % 8)) @@ -142,12 +142,11 @@ static int rtq2208_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay) * Because the relation of seleltion and value is like that * * seletion: value - * 000: 64mv - * 001: 32mv + * 010: 16mv * ... 
* 111: 0.5mv * - * For example, if I would like to select 64mv, the fls(ramp_delay) - 1 will be 0b111, + * For example, if I would like to select 16mv, the fls(ramp_delay) - 1 will be 0b010, * and I need to use 0b111 - sel to do the shifting */ diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index 03d125a71fd9..09f16471c537 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -283,6 +283,7 @@ static int mchp_coreqspi_setup_clock(struct mchp_coreqspi *qspi, struct spi_devi } control = readl_relaxed(qspi->regs + REG_CONTROL); + control &= ~CONTROL_CLKRATE_MASK; control |= baud_rate_val << CONTROL_CLKRATE_SHIFT; writel_relaxed(control, qspi->regs + REG_CONTROL); control = readl_relaxed(qspi->regs + REG_CONTROL); diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index e4e7ddb7524a..4a68abcdcc35 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1016,10 +1016,8 @@ end_irq: static irqreturn_t stm32fx_spi_irq_thread(int irq, void *dev_id) { struct spi_controller *ctrl = dev_id; - struct stm32_spi *spi = spi_controller_get_devdata(ctrl); spi_finalize_current_transfer(ctrl); - stm32fx_spi_disable(spi); return IRQ_HANDLED; } @@ -1187,6 +1185,8 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, ~clrb) | setb, spi->base + spi->cfg->regs->cpol.reg); + stm32_spi_enable(spi); + spin_unlock_irqrestore(&spi->lock, flags); return 0; @@ -1204,7 +1204,6 @@ static void stm32fx_spi_dma_tx_cb(void *data) if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) { spi_finalize_current_transfer(spi->ctrl); - stm32fx_spi_disable(spi); } } @@ -1219,7 +1218,6 @@ static void stm32_spi_dma_rx_cb(void *data) struct stm32_spi *spi = data; spi_finalize_current_transfer(spi->ctrl); - spi->cfg->disable(spi); } /** @@ -1307,8 +1305,6 @@ static int stm32fx_spi_transfer_one_irq(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32FX_SPI_CR2, cr2); - stm32_spi_enable(spi); - /* starting data transfer when buffer is loaded */ if (spi->tx_buf) spi->cfg->write_tx(spi); @@ -1345,8 +1341,6 @@ static int stm32h7_spi_transfer_one_irq(struct stm32_spi *spi) spin_lock_irqsave(&spi->lock, flags); - stm32_spi_enable(spi); - /* Be sure to have data in fifo before starting data transfer */ if (spi->tx_buf) stm32h7_spi_write_txfifo(spi); @@ -1378,8 +1372,6 @@ static void stm32fx_spi_transfer_one_dma_start(struct stm32_spi *spi) */ stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_ERRIE); } - - stm32_spi_enable(spi); } /** @@ -1413,8 +1405,6 @@ static void stm32h7_spi_transfer_one_dma_start(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32H7_SPI_IER, ier); - stm32_spi_enable(spi); - if (STM32_SPI_HOST_MODE(spi)) stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 23fbab954c20..102f48668c35 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1817,15 +1817,13 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, } static void make_uffd_wp_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) + unsigned long addr, pte_t *pte, pte_t ptent) { - pte_t ptent = ptep_get(pte); - if (pte_present(ptent)) { pte_t old_pte; old_pte = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_mkuffd_wp(ptent); + ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_mkuffd_wp(ptent); @@ -2175,9 +2173,12 @@ 
static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { - if (pte_uffd_wp(ptep_get(pte))) + pte_t ptent = ptep_get(pte); + + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) continue; - make_uffd_wp_pte(vma, addr, pte); + make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = addr + PAGE_SIZE; @@ -2190,8 +2191,10 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, p->arg.return_mask == PAGE_IS_WRITTEN) { for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { unsigned long next = addr + PAGE_SIZE; + pte_t ptent = ptep_get(pte); - if (pte_uffd_wp(ptep_get(pte))) + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) continue; ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, p, addr, &next); @@ -2199,7 +2202,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; - make_uffd_wp_pte(vma, addr, pte); + make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; @@ -2208,8 +2211,9 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, } for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(pte); unsigned long categories = p->cur_vma_category | - pagemap_page_category(p, vma, addr, ptep_get(pte)); + pagemap_page_category(p, vma, addr, ptent); unsigned long next = addr + PAGE_SIZE; if (!pagemap_scan_is_interesting_page(categories, p)) @@ -2224,7 +2228,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, if (~categories & PAGE_IS_WRITTEN) continue; - make_uffd_wp_pte(vma, addr, pte); + make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 60dcfafdc11a..292f5fd50104 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -895,6 +895,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; continue; } + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, vma->vm_end, new_flags, diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index a29ba6ef1e27..422343ace39c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -278,6 +278,17 @@ struct ftrace_likely_data { # define __no_kcsan #endif +#ifdef __SANITIZE_MEMORY__ +/* + * Similarly to KASAN and KCSAN, KMSAN loses function attributes of inlined + * functions, therefore disabling KMSAN checks also requires disabling inlining. + * + * __no_sanitize_or_inline effectively prevents KMSAN from reporting errors + * within the function and marks all its outputs as initialized. 
+ */ +# define __no_sanitize_or_inline __no_kmsan_checks notrace __maybe_unused +#endif + #ifndef __no_sanitize_or_inline #define __no_sanitize_or_inline __always_inline #endif diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 68ed6697fece..e123d5e17b52 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -11,7 +11,6 @@ void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) @@ -45,11 +44,6 @@ static inline const char *io_uring_get_opcode(u8 opcode) { return ""; } -static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} static inline bool io_is_uring_fops(struct file *file) { return false; diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e453a997c060..447fbfd32215 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -26,12 +26,25 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); + +/* + * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd + * and the corresponding io_uring request. + * + * Note: the caller should never hard code @issue_flags and is only allowed + * to pass the mask provided by the core io_uring code. + */ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); + void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned), unsigned flags); +/* + * Note: the caller should never hard code @issue_flags and only use the + * mask provided by the core io_uring code. + */ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); @@ -56,6 +69,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } #endif +/* + * Polled completions must ensure they are coming from a poll queue, and + * hence are completed inside the usual poll handling loops. 
+ */ +static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, + ssize_t ret, ssize_t res2) +{ + lockdep_assert(in_task()); + io_uring_cmd_done(ioucmd, ret, res2, 0); +} + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h new file mode 100644 index 000000000000..b58f39fed4d5 --- /dev/null +++ b/include/linux/io_uring/net.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_NET_H +#define _LINUX_IO_URING_NET_H + +struct io_uring_cmd; + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); + +#else +static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ac333ea81d31..7a6b190c7da7 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -205,6 +205,7 @@ struct io_submit_state { bool plug_started; bool need_plug; + bool cq_flush; unsigned short submit_nr; unsigned int cqes_count; struct blk_plug plug; @@ -219,7 +220,7 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct io_wq_work_node list; + void **entries; unsigned int nr_cached; unsigned int max_cached; size_t elem_size; @@ -299,6 +300,8 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + struct io_alloc_cache rw_cache; + struct io_alloc_cache uring_cache; /* * Any cancelable uring_cmd is added to this list in @@ -341,14 +344,8 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; - struct io_uring_cqe completion_cqes[16]; - spinlock_t completion_lock; - /* IRQ completion list, under ->completion_lock */ - unsigned int locked_free_nr; - struct io_wq_work_list locked_free_list; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct io_hash_table cancel_table; @@ -371,9 +368,6 @@ struct io_ring_ctx { struct list_head io_buffers_cache; - /* deferred free list, protected by ->uring_lock */ - struct hlist_head io_buf_list; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; @@ -438,8 +432,6 @@ struct io_ring_ctx { }; struct io_tw_state { - /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ - bool locked; }; enum { @@ -480,6 +472,7 @@ enum { REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, + REQ_F_BUFFERS_COMMIT_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -558,6 +551,8 @@ enum { REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), /* don't recycle provided buffers for this request */ REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), + /* buffer ring head needs incrementing on put */ + REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7bd10201a02b..994bf7af0efe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -72,6 +72,7 @@ struct io_uring_sqe { __u32 waitid_flags; __u32 futex_flags; __u32 install_fd_flags; + __u32 nop_flags; }; __u64 
user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -115,7 +116,7 @@ struct io_uring_sqe { */ #define IORING_FILE_INDEX_ALLOC (~0U) -enum { +enum io_uring_sqe_flags_bit { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, IOSQE_IO_LINK_BIT, @@ -351,11 +352,20 @@ enum io_uring_op { * 0 is reported if zerocopy was actually possible. * IORING_NOTIF_USAGE_ZC_COPIED if data was copied * (at least partially). + * + * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send or + * recv will grab as many buffers from the buffer + * group ID given and send them all. The completion + * result will be the number of buffers send, with + * the starting buffer ID in cqe->flags as per + * usual for provided buffer usage. The buffers + * will be contigious from the starting buffer ID. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) +#define IORING_RECVSEND_BUNDLE (1U << 4) /* * cqe.res for IORING_CQE_F_NOTIF if @@ -370,11 +380,13 @@ enum io_uring_op { * accept flags stored in sqe->ioprio */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +#define IORING_ACCEPT_DONTWAIT (1U << 1) +#define IORING_ACCEPT_POLL_FIRST (1U << 2) /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ -enum { +enum io_uring_msg_ring_flags { IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ IORING_MSG_SEND_FD, /* send a registered fd to another ring */ }; @@ -397,6 +409,13 @@ enum { #define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) /* + * IORING_OP_NOP flags (sqe->nop_flags) + * + * IORING_NOP_INJECT_RESULT Inject result from sqe->result + */ +#define IORING_NOP_INJECT_RESULT (1U << 0) + +/* * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { @@ -425,9 +444,7 @@ struct io_uring_cqe { #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) -enum { - IORING_CQE_BUFFER_SHIFT = 16, -}; +#define IORING_CQE_BUFFER_SHIFT 16 /* * Magic offsets for the application to mmap the data it needs @@ -522,11 +539,12 @@ struct io_uring_params { #define IORING_FEAT_CQE_SKIP (1U << 11) #define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_REG_REG_RING (1U << 13) +#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) /* * io_uring_register(2) opcodes and arguments */ -enum { +enum io_uring_register_op { IORING_REGISTER_BUFFERS = 0, IORING_UNREGISTER_BUFFERS = 1, IORING_REGISTER_FILES = 2, @@ -583,7 +601,7 @@ enum { }; /* io-wq worker categories */ -enum { +enum io_wq_type { IO_WQ_BOUND, IO_WQ_UNBOUND, }; @@ -688,7 +706,7 @@ struct io_uring_buf_ring { * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * to get a virtual mapping for the ring. 
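A userspace sketch of the IORING_RECVSEND_BUNDLE flag documented above, using liburing; error handling is omitted, and ring, sockfd and the provided-buffer group bgid are assumed to be set up already (buffers queued via a registered buf_ring), so this only shows where the new flag goes:

#include <liburing.h>

static int send_bundle(struct io_uring *ring, int sockfd, unsigned short bgid)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	unsigned short bid;

	io_uring_prep_send(sqe, sockfd, NULL, 0, 0);	/* data comes from the buffer group */
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;
	sqe->ioprio |= IORING_RECVSEND_BUNDLE;		/* bundle as many queued buffers as possible */

	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	/* starting buffer ID, as usual for provided buffers */
	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	io_uring_cqe_seen(ring, cqe);
	return bid;
}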
*/ -enum { +enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_MMAP = 1, }; @@ -719,7 +737,7 @@ struct io_uring_napi { /* * io_uring_restriction->opcode values */ -enum { +enum io_uring_register_restriction_op { /* Allow an io_uring_register(2) opcode */ IORING_RESTRICTION_REGISTER_OP = 0, @@ -775,7 +793,7 @@ struct io_uring_recvmsg_out { /* * Argument for IORING_OP_URING_CMD when file is a socket */ -enum { +enum io_uring_socket_op { SOCKET_URING_OP_SIOCINQ = 0, SOCKET_URING_OP_SIOCOUTQ, SOCKET_URING_OP_GETSOCKOPT, diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e1d4e03799c..fc1b23c524e8 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,13 +2,14 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o truncate.o +obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ + tctx.o filetable.o rw.o net.o poll.o \ + uring_cmd.o openclose.o sqpoll.o \ + xattr.o nop.o fs.o splice.o sync.o \ + msg_ring.o advise.o openclose.o \ + epoll.o statx.o timeout.o fdinfo.o \ + cancel.o waitid.o register.o \ + truncate.o memmap.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index bf2fb26a6539..b7a38a2069cf 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -4,63 +4,58 @@ /* * Don't allow the cache to grow beyond this size. */ -#define IO_ALLOC_CACHE_MAX 512 - -struct io_cache_entry { - struct io_wq_work_node node; -}; +#define IO_ALLOC_CACHE_MAX 128 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, - struct io_cache_entry *entry) + void *entry) { if (cache->nr_cached < cache->max_cached) { - cache->nr_cached++; - wq_stack_add_head(&entry->node, &cache->list); - kasan_mempool_poison_object(entry); + if (!kasan_mempool_poison_object(entry)) + return false; + cache->entries[cache->nr_cached++] = entry; return true; } return false; } -static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) -{ - return !cache->list.next; -} - -static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (cache->list.next) { - struct io_cache_entry *entry; + if (cache->nr_cached) { + void *entry = cache->entries[--cache->nr_cached]; - entry = container_of(cache->list.next, struct io_cache_entry, node); kasan_mempool_unpoison_object(entry, cache->elem_size); - cache->list.next = cache->list.next->next; - cache->nr_cached--; return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, +/* returns false if the cache was initialized properly */ +static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) { - cache->list.next = NULL; - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (cache->entries) { + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + return false; + } + return true; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(struct io_cache_entry *)) + void (*free)(const void *)) { - while (1) { - struct io_cache_entry *entry = 
io_alloc_cache_get(cache); + void *entry; + + if (!cache->entries) + return; - if (!entry) - break; + while ((entry = io_alloc_cache_get(cache)) != NULL) free(entry); - } - cache->nr_cached = 0; + + kvfree(cache->entries); + cache->entries = NULL; } #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index acfcdd7f059a..a6e58a20efdd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); + ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 8d444dd1b0a7..b1e0e0d85349 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) { - struct io_ring_ctx *ctx = f->private_data; + struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6e86e6188dbe..997c56d32ee6 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -84,12 +84,12 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, return ret; file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); + } else { + io_file_bitmap_set(&ctx->file_table, slot_index); } *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); return 0; } diff --git a/io_uring/futex.c b/io_uring/futex.c index 792a03df58de..914848f46beb 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -9,7 +9,7 @@ #include "../kernel/futex/futex.h" #include "io_uring.h" -#include "rsrc.h" +#include "alloc_cache.h" #include "futex.h" struct io_futex { @@ -27,27 +27,21 @@ struct io_futex { }; struct io_futex_data { - union { - struct futex_q q; - struct io_cache_entry cache; - }; + struct futex_q q; struct io_kiocb *req; }; -void io_futex_cache_init(struct io_ring_ctx *ctx) -{ - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); -} +#define IO_FUTEX_ALLOC_CACHE_MAX 32 -static void io_futex_cache_entry_free(struct io_cache_entry *entry) +bool io_futex_cache_init(struct io_ring_ctx *ctx) { - kfree(container_of(entry, struct io_futex_data, cache)); + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, + sizeof(struct io_futex_data)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); + io_alloc_cache_free(&ctx->futex_cache, kfree); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); __io_futex_complete(req, ts); } @@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct 
wake_q_head *wake_q, struct futex_q *q) static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { - struct io_cache_entry *entry; + struct io_futex_data *ifd; - entry = io_alloc_cache_get(&ctx->futex_cache); - if (entry) - return container_of(entry, struct io_futex_data, cache); + ifd = io_alloc_cache_get(&ctx->futex_cache); + if (ifd) + return ifd; return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } diff --git a/io_uring/futex.h b/io_uring/futex.h index 0847e9e8a127..b8bb09873d57 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -void io_futex_cache_init(struct io_ring_ctx *ctx); +bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); #else static inline int io_futex_cancel(struct io_ring_ctx *ctx, @@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, { return false; } -static inline void io_futex_cache_init(struct io_ring_ctx *ctx) +static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) { + return false; } static inline void io_futex_cache_free(struct io_ring_ctx *ctx) { diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 522196dfb0ff..d1c47a9d9215 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -25,10 +25,10 @@ #define WORKER_IDLE_TIMEOUT (5 * HZ) enum { - IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ + IO_WORKER_F_UP = 0, /* up and active */ + IO_WORKER_F_RUNNING = 1, /* account as running */ + IO_WORKER_F_FREE = 2, /* worker on free list */ + IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -44,21 +44,20 @@ enum { */ struct io_worker { refcount_t ref; - unsigned flags; + int create_index; + unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; struct io_wq_work *cur_work; - struct io_wq_work *next_work; raw_spinlock_t lock; struct completion ref_done; unsigned long create_state; struct callback_head create_work; - int create_index; union { struct rcu_head rcu; @@ -165,7 +164,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); } static void io_worker_ref_put(struct io_wq *wq) @@ -225,7 +224,7 @@ static void io_worker_exit(struct io_worker *worker) wait_for_completion(&worker->ref_done); raw_spin_lock(&wq->lock); - if (worker->flags & IO_WORKER_F_FREE) + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); raw_spin_unlock(&wq->lock); @@ -410,7 +409,7 @@ static void io_wq_dec_running(struct io_worker *worker) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; if (!atomic_dec_and_test(&acct->nr_running)) @@ -430,8 +429,8 @@ static void io_wq_dec_running(struct io_worker *worker) */ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { - if (worker->flags & IO_WORKER_F_FREE) { - 
worker->flags &= ~IO_WORKER_F_FREE; + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { + clear_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); raw_spin_unlock(&wq->lock); @@ -444,8 +443,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) { - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; + if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { + set_bit(IO_WORKER_F_FREE, &worker->flags); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -539,7 +538,6 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_lock(&worker->lock); worker->cur_work = work; - worker->next_work = NULL; raw_spin_unlock(&worker->lock); } @@ -564,10 +562,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * clear the stalled flag. */ work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wq, worker); - /* * Make sure cancelation can find this, even before * it becomes the active work. That avoids a window @@ -576,11 +571,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * current work item for this worker. */ raw_spin_lock(&worker->lock); - worker->next_work = work; + worker->cur_work = work; raw_spin_unlock(&worker->lock); - } else { - break; } + + raw_spin_unlock(&acct->lock); + + if (!work) + break; + + __io_worker_busy(wq, worker); + io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -631,7 +632,8 @@ static int io_wq_worker(void *data) bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + set_mask_bits(&worker->flags, 0, + BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); @@ -695,11 +697,11 @@ void io_wq_worker_running(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (worker->flags & IO_WORKER_F_RUNNING) + if (test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags |= IO_WORKER_F_RUNNING; + set_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_inc_running(worker); } @@ -713,12 +715,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) + if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags &= ~IO_WORKER_F_RUNNING; + clear_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_dec_running(worker); } @@ -732,7 +734,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, raw_spin_lock(&wq->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); list_add_tail_rcu(&worker->all_list, &wq->all_list); - worker->flags |= IO_WORKER_F_FREE; + set_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -838,7 +840,7 @@ fail: init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; + set_bit(IO_WORKER_F_BOUND, &worker->flags); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { @@ -924,8 +926,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void 
io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); + unsigned long work_flags = work->flags; struct io_cb_cancel_data match; - unsigned work_flags = work->flags; bool do_create; /* @@ -1005,8 +1007,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * may dereference the passed in work. */ raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) + if (__io_wq_worker_cancel(worker, match, worker->cur_work)) match->nr_running++; raw_spin_unlock(&worker->lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c170a2b8d2cf..2675cffbd9a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -63,7 +63,6 @@ #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> -#include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> @@ -95,6 +94,8 @@ #include "waitid.h" #include "futex.h" #include "napi.h" +#include "uring_cmd.h" +#include "memmap.h" #include "timeout.h" #include "poll.h" @@ -170,17 +171,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {}, }; #endif -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) - __io_submit_flush_completions(ctx); -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -253,14 +246,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - struct io_tw_state ts = { .locked = true, }; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); - if (WARN_ON_ONCE(!ts.locked)) - return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -284,6 +275,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -312,14 +304,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_HLIST_HEAD(&ctx->io_buf_list); - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); - io_futex_cache_init(ctx); + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw)); + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct uring_cache)); + ret |= io_futex_cache_init(ctx); + if (ret) + goto err; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 
mutex_init(&ctx->uring_lock); @@ -337,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); @@ -349,6 +345,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); xa_destroy(&ctx->io_bl_xa); @@ -379,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); + io_kbuf_drop(req); spin_unlock(&req->ctx->completion_lock); } @@ -498,7 +500,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) +static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -666,28 +668,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) io_commit_cqring_flush(ctx); } -static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) -{ - struct io_overflow_cqe *ocqe; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->cq_overflow_list, &list); - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - spin_unlock(&ctx->completion_lock); - - while (!list_empty(&list)) { - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); - list_del(&ocqe->list); - kfree(ocqe); - } -} - -static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { size_t cqe_size = sizeof(struct io_uring_cqe); - if (__io_cqring_events(ctx) == ctx->cq_entries) + lockdep_assert_held(&ctx->uring_lock); + + /* don't abort if we're dying, entries must get freed */ + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) @@ -698,11 +686,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!io_get_cqe_overflow(ctx, &cqe, true)) - break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - memcpy(cqe, &ocqe->cqe, cqe_size); + + if (!dying) { + if (!io_get_cqe_overflow(ctx, &cqe, true)) + break; + memcpy(cqe, &ocqe->cqe, cqe_size); + } list_del(&ocqe->list); kfree(ocqe); } @@ -714,20 +705,17 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_do_overflow_flush(struct 
io_ring_ctx *ctx) { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - io_cqring_do_overflow_flush(ctx); + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx, false); + mutex_unlock(&ctx->uring_lock); } /* can be called by any task */ @@ -817,7 +805,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -void io_req_cqe_overflow(struct io_kiocb *req) +static void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, @@ -890,151 +878,71 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static void __io_flush_post_cqes(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_submit_state *state = &ctx->submit_state; - unsigned int i; - - lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; - - if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - } - } - } - state->cqes_count = 0; -} - -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) + if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); -} - /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - u64 user_data = req->cqe.user_data; - struct io_uring_cqe *cqe; + bool posted; lockdep_assert(!io_wq_current_is_worker()); - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); - lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { - __io_cq_lock(ctx); - __io_flush_post_cqes(ctx); - /* no need to flush - flush is deferred */ - __io_cq_unlock_post(ctx); - } - - /* For defered completions this is not as strict as it is otherwise, - * however it's main job is to prevent unbounded posted completions, - * and in that it works just as well. 
- */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - return false; - - cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; - cqe->user_data = user_data; - cqe->res = res; - cqe->flags = cflags; - return true; + __io_cq_lock(ctx); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + ctx->submit_state.cq_flush = true; + __io_cq_unlock_post(ctx); + return posted; } -static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) +static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *rsrc_node = NULL; + + /* + * All execution paths but io-wq use the deferred completions by + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. + */ + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) + return; + + /* + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires + * the submitter task context, IOPOLL protects with uring_lock. + */ + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return; + } io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } + io_cq_unlock_post(ctx); /* - * If we're the last reference to this request, add to our locked - * free_list cache. + * We don't free the request here because we know it's called from + * io-wq only, which holds a reference, so it cannot be the last put. */ - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_put_kbuf_comp(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - io_put_file(req); - - rsrc_node = req->rsrc_node; - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. - */ - io_put_task_remote(req->task); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } - io_cq_unlock_post(ctx); - - if (rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(ctx, rsrc_node); - io_ring_submit_unlock(ctx, issue_flags); - } -} - -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (ctx->task_complete && ctx->submitter_task != current) { - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req, issue_flags); - } else { - mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); - mutex_unlock(&ctx->uring_lock); - } + req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) @@ -1065,15 +973,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 
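For context on the "all paths but io-wq defer completions" assumption above, a sketch of how an opcode handler typically signals completion without touching the CQ itself; io_mynop() is made up, but io_req_set_res() and IOU_OK are the existing helpers:

static int io_mynop(struct io_kiocb *req, unsigned int issue_flags)
{
	/*
	 * Record the result only; the CQE is either batched by
	 * __io_submit_flush_completions() on the IO_URING_F_COMPLETE_DEFER
	 * path, or posted via io_req_complete_post()/task_work when the
	 * request ran from io-wq.
	 */
	io_req_set_res(req, 0, 0);
	return IOU_OK;
}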
@@ -1085,18 +984,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } + int ret; ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -1112,8 +1000,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; + while (ret--) { + struct io_kiocb *req = reqs[ret]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); @@ -1163,11 +1051,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ts->locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - ts->locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -1191,8 +1077,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, if (req->ctx != ctx) { ctx_flush_and_put(ctx, &ts); ctx = req->ctx; - /* if not contended, grab and improve batching */ - ts.locked = mutex_trylock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, @@ -1453,11 +1338,9 @@ again: if (io_run_local_work_continue(ctx, ret, min_events)) goto again; - if (ts->locked) { - io_submit_flush_completions(ctx); - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; - } + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; trace_io_uring_local_work_run(ctx, ret, loops); return ret; @@ -1466,17 +1349,11 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true, }; - int ret; + struct io_tw_state ts = {}; if (llist_empty(&ctx->work_llist)) return 0; - - ret = __io_run_local_work(ctx, &ts, min_events); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!ts.locked)) - mutex_lock(&ctx->uring_lock); - return ret; + return __io_run_local_work(ctx, &ts, min_events); } static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) @@ -1484,11 +1361,9 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) struct io_tw_state ts = {}; int ret; - ts.locked = mutex_trylock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &ts, min_events); - if (ts.locked) - mutex_unlock(&ctx->uring_lock); - + mutex_unlock(&ctx->uring_lock); return ret; } @@ -1505,7 +1380,7 @@ void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, ts); + io_queue_iowq(req); else io_queue_sqe(req); } @@ -1550,7 +1425,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } @@ -1560,10 +1435,9 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); - - io_req_put_rsrc_locked(req, ctx); - + io_put_rsrc_node(ctx, req->rsrc_node); io_put_task(req->task); + node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); @@ -1576,9 +1450,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node; __io_cq_lock(ctx); - /* must come first to preserve CQE ordering in failure cases */ - if (state->cqes_count) - __io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1600,6 +1471,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -1642,13 +1514,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) unsigned int nr_events = 0; unsigned long check_cq; + lockdep_assert_held(&ctx->uring_lock); + if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. 
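The apoll recycling above relies on the reworked allocation cache, which now hands out and takes back bare pointers; a sketch of the general get/put pattern, where ctx->my_cache and struct my_obj are made-up stand-ins:

struct my_obj *obj = io_alloc_cache_get(&ctx->my_cache);

if (!obj)
	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
if (!obj)
	return -ENOMEM;

/* ... use obj ... */

/* recycle into the cache; fall back to kfree() once it holds max_cached entries */
if (!io_alloc_cache_put(&ctx->my_cache, obj))
	kfree(obj);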
@@ -1711,10 +1585,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (ts->locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(req); } /* @@ -1785,8 +1656,10 @@ io_req_flags_t io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); - req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1794,25 +1667,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!cdef->prep_async) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (!def->manual_alloc) { - if (io_alloc_async_data(req)) - return -EAGAIN; - } - return cdef->prep_async(req); -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -2093,7 +1947,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); + io_queue_iowq(req); break; case IO_APOLL_OK: break; @@ -2130,17 +1984,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) { - io_req_defer_failed(req, ret); - return; - } - if (unlikely(req->ctx->drain_active)) io_drain_req(req); else - io_queue_iowq(req, NULL); + io_queue_iowq(req); } } @@ -2346,10 +2193,6 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; @@ -2597,8 +2440,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx, min_events); io_run_task_work(); - io_cqring_overflow_flush(ctx); - /* if user messes with these they will just get an early return */ + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); if (__io_cqring_events_user(ctx) >= min_events) return 0; @@ -2698,89 +2542,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -void io_mem_free(void *ptr) -{ - if (!ptr) - return; - - folio_put(virt_to_folio(ptr)); -} - -static void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - int i; - - if (!page_array) - return; - - for (i = 0; i < npages; i++) - unpin_user_page(page_array[i]); - kvfree(page_array); - *pages = NULL; -} - -static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - int ret, i, pinned; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > USHRT_MAX) - return ERR_PTR(-EINVAL); - page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!page_array) - return ERR_PTR(-ENOMEM); - - - pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - page_array); - if (pinned != nr_pages) { - ret = (pinned < 0) ? pinned : -EFAULT; - goto free_pages; - } - - page_addr = page_address(page_array[0]); - for (i = 0; i < nr_pages; i++) { - ret = -EINVAL; - - /* - * Can't support mapping user allocated ring memory on 32-bit - * archs where it could potentially reside in highmem. Just - * fail those with -EINVAL, just like we did on kernels that - * didn't support this feature. - */ - if (PageHighMem(page_array[i])) - goto free_pages; - - /* - * No support for discontig pages for now, should either be a - * single normal page, or a huge page. Later on we can add - * support for remapping discontig pages, for now we will - * just fail them with EINVAL. - */ - if (page_address(page_array[i]) != page_addr) - goto free_pages; - page_addr += PAGE_SIZE; - } - - *pages = page_array; - *npages = nr_pages; - return page_to_virt(page_array[0]); - -free_pages: - io_pages_free(&page_array, pinned > 0 ? 
pinned : 0); - return ERR_PTR(ret); -} - static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { @@ -2798,30 +2559,23 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, + true); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, + true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; + vunmap(ctx->rings); + vunmap(ctx->sq_sqes); } ctx->rings = NULL; ctx->sq_sqes = NULL; } -void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - void *ret; - - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret) - return ret; - return ERR_PTR(-ENOMEM); -} - static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { @@ -2867,7 +2621,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); @@ -2879,11 +2632,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_rsrc_node_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct io_rsrc_node, cache)); -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2898,8 +2646,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); @@ -2915,13 +2665,12 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); - io_kbuf_mmap_list_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3145,17 +2894,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); - if (ctx->rings) - io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* - * If we failed setting up the ctx, we might not have any rings - * and therefore did not submit any requests - */ - if (ctx->rings) - io_kill_timeouts(ctx, NULL, true); - flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); @@ -3241,37 +2981,6 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) -{ - struct hlist_node *tmp; - struct 
io_kiocb *req; - bool ret = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, - hash_node) { - struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, - struct io_uring_cmd); - struct file *file = req->file; - - if (!cancel_all && req->task != task) - continue; - - if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); - ret = true; - } - } - io_submit_flush_completions(ctx); - - return ret; -} - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) @@ -3326,6 +3035,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); return ret; } @@ -3424,137 +3135,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->sq_sqes; - break; - case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - io_put_bl(ctx, bl); - break; - } - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); -} - -static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - /* - * Do not allow to map to user-provided address to avoid breaking the - * aliasing rules. Userspace is not able to guess the offset address of - * kernel kmalloc()ed memory area. - */ - if (addr) - return -EINVAL; - - ptr = io_uring_validate_mmap_request(filp, pgoff, len); - if (IS_ERR(ptr)) - return -ENOMEM; - - /* - * Some architectures have strong cache aliasing requirements. - * For such architectures we need a coherent mapping which aliases - * kernel memory *and* userspace memory. To achieve that: - * - use a NULL file pointer to reference physical memory, and - * - use the kernel virtual address of the shared io_uring context - * (instead of the userspace-provided address, which has to be 0UL - * anyway). 
- * - use the same pgoff which the get_unmapped_area() uses to - * calculate the page colouring. - * For architectures without such aliasing requirements, the - * architecture will return any suitable mapping because addr is 0. - */ - filp = NULL; - flags |= MAP_SHARED; - pgoff = 0; /* has been translated to ptr above */ -#ifdef SHM_COLOUR - addr = (uintptr_t) ptr; - pgoff = addr >> PAGE_SHIFT; -#else - addr = 0UL; -#endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { @@ -3647,8 +3227,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; @@ -3737,11 +3315,9 @@ out: static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, + .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#else - .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS @@ -3770,7 +3346,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_mem_alloc(size); + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); @@ -3795,7 +3371,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_mem_alloc(size); + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); @@ -3994,7 +3570,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | + IORING_FEAT_RECVSEND_BUNDLE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6426ee382276..624ca9076a50 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -62,16 +62,12 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) } bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); -void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool 
io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -79,7 +75,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); @@ -97,7 +92,6 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); -int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); @@ -110,9 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -void *io_mem_alloc(size_t size); -void io_mem_free(void *ptr); - enum { IO_EVENTFD_OP_SIGNAL_BIT, IO_EVENTFD_OP_FREE_BIT, @@ -121,9 +112,9 @@ enum { void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); -#if defined(CONFIG_PROVE_LOCKING) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { +#if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); if (ctx->flags & IORING_SETUP_IOPOLL) { @@ -142,18 +133,21 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) else lockdep_assert(current == ctx->submitter_task); } -} -#else -static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) -{ -} #endif +} static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cq_flush) + __io_submit_flush_completions(ctx); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -340,15 +334,12 @@ static inline int io_run_task_work(void) static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); + return task_work_pending(current) || !llist_empty(&ctx->work_llist); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!ts->locked) { - mutex_lock(&ctx->uring_lock); - ts->locked = true; - } + lockdep_assert_held(&ctx->uring_lock); } /* diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3aa16e27f509..d2945c9c812b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/vmalloc.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -14,8 +15,7 @@ #include "io_uring.h" #include "opdef.h" 
#include "kbuf.h" - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) +#include "memmap.h" /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -31,25 +31,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; - -static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - return xa_load(&ctx->io_bl_xa, bgid); -} - static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -130,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 0; +} + +static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, + __u16 head, __u16 mask) +{ + return &br->bufs[head & mask]; +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -145,19 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; - head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + buf = io_ring_head_to_buf(br, head, bl->mask); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; @@ -172,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). 
*/ + req->flags &= ~REQ_F_BUFFERS_COMMIT; req->buf_list = NULL; bl->head++; } @@ -198,22 +198,134 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +/* cap it at a reasonable 256, will be one page even for 4K */ +#define PEEK_MAX_IMPORT 256 + +static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_buffer_list *bl) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct iovec *iov = arg->iovs; + int nr_iovs = arg->nr_iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; + + tail = smp_load_acquire(&br->tail); + head = bl->head; + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(br, head, bl->mask); + if (arg->max_len) { + int needed; + + needed = (arg->max_len + buf->len - 1) / buf->len; + needed = min(needed, PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } + + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. + */ + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + if (arg->mode & KBUF_MODE_FREE) + kfree(arg->iovs); + arg->iovs = iov; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; + } + + /* set it to max, if not set, so we can use it unconditionally */ + if (!arg->max_len) + arg->max_len = INT_MAX; + + req->buf_index = buf->bid; + do { + /* truncate end piece, if needed */ + if (buf->len > arg->max_len) + buf->len = arg->max_len; + + iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_len = buf->len; + iov++; + + arg->out_len += buf->len; + arg->max_len -= buf->len; + if (!arg->max_len) + break; + + buf = io_ring_head_to_buf(br, ++head, bl->mask); + } while (--nr_iovs); + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + return iov - arg->iovs; +} + +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags) { - struct io_buf_free *ibf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + goto out_unlock; + + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. 
+ */ + if (ret > 0) { + req->flags |= REQ_F_BL_NO_RECYCLE; + req->buf_list->head += ret; } + } else { + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + return -ENOENT; + + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + if (ret > 0) + req->flags |= REQ_F_BUFFERS_COMMIT; + return ret; } - /* can't happen... */ - WARN_ON_ONCE(1); + /* don't support multiple buffer selections for legacy */ + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } static int __io_remove_buffers(struct io_ring_ctx *ctx, @@ -227,22 +339,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_buf_ring) { i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. - */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { + if (bl->buf_nr_pages) { int j; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (!bl->is_mmap) { + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + } + io_pages_unmap(bl->buf_ring, &bl->buf_pages, + &bl->buf_nr_pages, bl->is_mmap); + bl->is_mmap = 0; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); @@ -498,9 +604,9 @@ err: static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_uring_buf_ring *br; + struct io_uring_buf_ring *br = NULL; struct page **pages; - int i, nr_pages; + int nr_pages, ret; pages = io_pin_pages(reg->ring_addr, flex_array_size(br, bufs, reg->ring_entries), @@ -508,18 +614,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, if (IS_ERR(pages)) return PTR_ERR(pages); - /* - * Apparently some 32-bit boxes (ARM) will return highmem pages, - * which then need to be mapped. We could support that, but it'd - * complicate the code and slowdown the common cases quite a bit. - * So just error out, returning -EINVAL just like we did on kernels - * that didn't support mapped buffer rings. - */ - for (i = 0; i < nr_pages; i++) - if (PageHighMem(pages[i])) - goto error_unpin; + br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!br) { + ret = -ENOMEM; + goto error_unpin; + } - br = page_address(pages[0]); #ifdef SHM_COLOUR /* * On platforms that have specific aliasing requirements, SHM_COLOUR @@ -530,8 +630,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, * should use IOU_PBUF_RING_MMAP instead, and liburing will handle * this transparently. 
*/ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + ret = -EINVAL; goto error_unpin; + } #endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; @@ -540,69 +642,24 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->is_mmap = 0; return 0; error_unpin: - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); + unpin_user_pages(pages, nr_pages); kvfree(pages); - return -EINVAL; -} - -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. - */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; + vunmap(br); + return ret; } static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_buf_free *ibf; size_t ring_size; - void *ptr; ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); - } - ibf->inuse = 1; - bl->buf_ring = ibf->mem; + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); + if (!bl->buf_ring) + return -ENOMEM; + bl->is_buf_ring = 1; bl->is_mmap = 1; return 0; @@ -750,18 +807,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, return ERR_PTR(-EINVAL); } -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. 
- */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) { - struct io_buf_free *ibf; - struct hlist_node *tmp; + struct io_ring_ctx *ctx = file->private_data; + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; + struct io_buffer_list *bl; + int bgid, ret; - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + io_put_bl(ctx, bl); + return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index df365b8860cf..b90aca3a57fa 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -41,8 +41,26 @@ struct io_buffer { __u16 bgid; }; +enum { + /* can alloc a bigger vec */ + KBUF_MODE_EXPAND = 1, + /* if bigger vec allocated, free old one */ + KBUF_MODE_FREE = 2, +}; + +struct buf_sel_arg { + struct iovec *iovs; + size_t out_len; + size_t max_len; + int nr_iovs; + int mode; +}; + void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -55,8 +73,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -64,6 +80,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { @@ -76,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) */ if (req->buf_list) { req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } return false; @@ -100,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req) +static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; + struct io_buffer_list *bl = req->buf_list; + + if (bl) { + if (req->flags & REQ_F_BUFFERS_COMMIT) { + bl->head += nr; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + } + req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; } @@ -113,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -121,22 +143,18 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, } } -static inline unsigned int 
io_put_kbuf_comp(struct io_kiocb *req) +static inline void io_kbuf_drop(struct io_kiocb *req) { - unsigned int ret; - lockdep_assert_held(&req->ctx->completion_lock); if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; + return; - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); - return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) { unsigned int ret; @@ -145,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, nbufs); else __io_put_kbuf(req, issue_flags); return ret; } + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + return __io_put_kbufs(req, 1, issue_flags); +} + +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) +{ + return __io_put_kbufs(req, nbufs, issue_flags); +} #endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c new file mode 100644 index 000000000000..523d982af2b0 --- /dev/null +++ b/io_uring/memmap.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> +#include <asm/shmparam.h> + +#include "memmap.h" +#include "kbuf.h" + +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); +} + +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) +{ + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } + + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } + + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; +} + +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages) +{ + bool do_vunmap = false; + + if (!ptr) + return; + + if (put_pages && *npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. 
+ */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + +void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + void *page_addr; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = 0; + page_array = io_pin_pages(uaddr, size, &nr_pages); + if (IS_ERR(page_array)) + return page_array; + + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); + if (page_addr) { + *pages = page_array; + *npages = nr_pages; + return page_addr; + } + + io_pages_free(&page_array, nr_pages); + return ERR_PTR(-ENOMEM); +} + +static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, + size_t sz) +{ + struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; + + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->rings; + case IORING_OFF_SQES: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->sq_sqes; + case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; + unsigned int bgid; + void *ptr; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); + return ptr; + } + } + + return ERR_PTR(-EINVAL); +} + +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + +#ifdef CONFIG_MMU + +__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + 
case IORING_OFF_CQ_RING: + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, + ctx->n_ring_pages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + case IORING_OFF_PBUF_RING: + return io_pbuf_mmap(file, vma); + } + + return -EINVAL; +} + +unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. + */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + /* + * Some architectures have strong cache aliasing requirements. + * For such architectures we need a coherent mapping which aliases + * kernel memory *and* userspace memory. To achieve that: + * - use a NULL file pointer to reference physical memory, and + * - use the kernel virtual address of the shared io_uring context + * (instead of the userspace-provided address, which has to be 0UL + * anyway). + * - use the same pgoff which the get_unmapped_area() uses to + * calculate the page colouring. + * For architectures without such aliasing requirements, the + * architecture will return any suitable mapping because addr is 0. + */ + filp = NULL; + flags |= MAP_SHARED; + pgoff = 0; /* has been translated to ptr above */ +#ifdef SHM_COLOUR + addr = (uintptr_t) ptr; + pgoff = addr >> PAGE_SHIFT; +#else + addr = 0UL; +#endif + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +#else /* !CONFIG_MMU */ + +int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; +} + +unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ diff --git a/io_uring/memmap.h b/io_uring/memmap.h new file mode 100644 index 000000000000..5cec5b7ac49a --- /dev/null +++ b/io_uring/memmap.h @@ -0,0 +1,25 @@ +#ifndef IO_URING_MEMMAP_H +#define IO_URING_MEMMAP_H + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +void io_pages_free(struct page ***pages, int npages); +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages); + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size); +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages); + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size); + +#ifndef CONFIG_MMU +unsigned int io_uring_nommu_mmap_capabilities(struct file *file); +#endif +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +int io_uring_mmap(struct file *file, struct vm_area_struct *vma); + +#endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index cd6dcf634ba3..81c4a9d43729 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -83,7 +83,7 @@ static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) return -EOWNERDEAD; init_task_work(&msg->tw, func); - if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) return -EOWNERDEAD; return IOU_ISSUE_SKIP_COMPLETE; @@ -147,13 +147,11 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; - io_double_unlock_ctx(target_ctx); - } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; } + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) + ret = 0; + if (target_ctx->flags & IORING_SETUP_IOPOLL) + io_double_unlock_ctx(target_ctx); return ret; } diff --git a/io_uring/net.c b/io_uring/net.c index 4afb475d4197..070dea9a4eda 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -28,6 +28,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + int iou_flags; u32 file_slot; unsigned long nofile; }; @@ -57,7 +58,7 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - unsigned len; + int len; unsigned done_io; unsigned msg_flags; unsigned nr_multishot_loops; @@ -115,80 +116,85 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) +{ + if (kmsg->free_iov) { + kfree(kmsg->free_iov); + kmsg->free_iov_nr = 0; + kmsg->free_iov = NULL; + } +} + static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; + struct iovec 
*iov; - if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) + /* can't recycle, ensure we free the iovec if we have one */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_netmsg_iovec_free(hdr); return; + } /* Let normal cleanup path reap it if we fail adding to the cache */ - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + iov = hdr->free_iov; + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { + if (iov) + kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } } -static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, - unsigned int issue_flags) +static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; + hdr = io_alloc_cache_get(&ctx->netmsg_cache); + if (hdr) { + if (hdr->free_iov) { + kasan_mempool_unpoison_object(hdr->free_iov, + hdr->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; } + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; } if (!io_alloc_async_data(req)) { hdr = req->async_data; + hdr->free_iov_nr = 0; hdr->free_iov = NULL; return hdr; } return NULL; } -static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) +/* assign new iovec to kmsg, if we need to */ +static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, + struct iovec *iov) { - /* ->prep_async is always called from the submission context */ - return io_msg_alloc_async(req, 0); + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; + if (kmsg->free_iov) + kfree(kmsg->free_iov); + kmsg->free_iov = iov; + } + return 0; } -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg, - unsigned int issue_flags) +static inline void io_mshot_prep_retry(struct io_kiocb *req, + struct io_async_msghdr *kmsg) { - struct io_async_msghdr *async_msg; - - if (req_has_async_data(req)) - return -EAGAIN; - async_msg = io_msg_alloc_async(req, issue_flags); - if (!async_msg) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - if (async_msg->msg.msg_name) - async_msg->msg.msg_name = &async_msg->addr; - - if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) - return -EAGAIN; - - /* if were using fast_iov, set it to the new one */ - if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; - async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; - } + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - return -EAGAIN; + req->flags &= ~REQ_F_BL_EMPTY; + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ + req->buf_index = sr->buf_group; } #ifdef CONFIG_COMPAT @@ -198,7 +204,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_iovec __user *uiov; - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = 
&iomsg->fast_iov; + nr_segs = 1; + } if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; @@ -207,9 +222,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - iomsg->free_iov = NULL; if (msg->msg_iovlen == 0) { - sr->len = 0; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { @@ -225,14 +240,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return 0; } - iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); + nr_segs, &iov, &iomsg->msg.msg_iter, true); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } #endif @@ -240,7 +253,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr *msg, int ddir) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) return -EFAULT; @@ -256,9 +278,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iomsg->fast_iov[0].iov_len = 0; - iomsg->fast_iov[0].iov_base = NULL; - iomsg->free_iov = NULL; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { ret = -EINVAL; goto ua_end; @@ -266,10 +287,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, /* we only need the length for provided buffers */ if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) goto ua_end; - unsafe_get_user(iomsg->fast_iov[0].iov_len, - &msg->msg_iov[0].iov_len, ua_end); - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, + ua_end); + sr->len = iov->iov_len; } ret = 0; ua_end: @@ -278,13 +298,12 @@ ua_end: } user_access_end(); - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, false); + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, + &iov, &iomsg->msg.msg_iter, false); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -320,60 +339,58 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, return ret; } -int io_send_prep_async(struct io_kiocb *req) +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; - int ret; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) - return 0; - zc->done_io = 0; - if (!zc->addr) - return 0; - io = io_msg_alloc_async_prep(req); - if (!io) - return -ENOMEM; - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); - return ret; + io_netmsg_iovec_free(io); } -static int io_setup_async_addr(struct io_kiocb *req, - struct sockaddr_storage *addr_storage, - unsigned int issue_flags) +static int io_send_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr 
*kmsg = req->async_data; + int ret; - if (!sr->addr || req_has_async_data(req)) - return -EAGAIN; - io = io_msg_alloc_async(req, issue_flags); - if (!io) - return -ENOMEM; - memcpy(&io->addr, addr_storage, sizeof(io->addr)); - return -EAGAIN; + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_ubuf = NULL; + + if (sr->addr) { + ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); + if (unlikely(ret < 0)) + return ret; + kmsg->msg.msg_name = &kmsg->addr; + kmsg->msg.msg_namelen = sr->addr_len; + } + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret < 0)) + return ret; + } + return 0; } -int io_sendmsg_prep_async(struct io_kiocb *req) +static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + kmsg = io_msg_alloc_async(req); + if (unlikely(!kmsg)) return -ENOMEM; - ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!is_msg) + return io_send_setup(req); + ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -393,34 +410,114 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~SENDMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_SENDMSG) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + sr->msg_flags |= MSG_WAITALL; + sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (req->flags & REQ_F_BUFFER_SELECT && sr->len) + return -EINVAL; #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); } static void io_req_msg_cleanup(struct io_kiocb *req, - struct io_async_msghdr *kmsg, unsigned int issue_flags) { req->flags &= ~REQ_F_NEED_CLEANUP; - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); io_netmsg_recycle(req, issue_flags); } +/* + * For bundle completions, we need to figure out how many segments we consumed. + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it + * could be using an ITER_IOVEC. If the latter, then if we consumed all of + * the segments, then it's a trivial questiont o answer. If we have residual + * data in the iter, then loop the segments to figure out how much we + * transferred. 
+ */ +static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) +{ + struct iovec *iov; + int nbufs; + + /* no data is always zero segments, and a ubuf is always 1 segment */ + if (ret <= 0) + return 0; + if (iter_is_ubuf(&kmsg->msg.msg_iter)) + return 1; + + iov = kmsg->free_iov; + if (!iov) + iov = &kmsg->fast_iov; + + /* if all data was transferred, it's basic pointer math */ + if (!iov_iter_count(&kmsg->msg.msg_iter)) + return iter_iov(&kmsg->msg.msg_iter) - iov; + + /* short transfer, count segments */ + nbufs = 0; + do { + int this_len = min_t(int, iov[nbufs].iov_len, ret); + + nbufs++; + ret -= this_len; + } while (ret); + + return nbufs; +} + +static inline bool io_send_finish(struct io_kiocb *req, int *ret, + struct io_async_msghdr *kmsg, + unsigned issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + bool bundle_finished = *ret <= 0; + unsigned int cflags; + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { + cflags = io_put_kbuf(req, issue_flags); + goto finish; + } + + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + + if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + goto finish; + + /* + * Fill CQE for this receive and see if we should keep trying to + * receive from this socket. + */ + if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + io_mshot_prep_retry(req, kmsg); + return false; + } + + /* Otherwise stop bundle and use the current result. */ +finish: + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; +} + int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; @@ -430,19 +527,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -450,23 +537,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - io_req_msg_cleanup(req, kmsg, issue_flags); + io_req_msg_cleanup(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -477,65 +566,77 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) int io_send(struct io_kiocb *req, unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; 
int min_ret = 0; int ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_ubuf = NULL; - - if (sr->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = sr->addr_len; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); - if (unlikely(ret)) - return ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + +retry_bundle: + if (io_do_buffer_select(req)) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = INT_MAX, + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + sr->len = arg.out_len; + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + } + } + + /* + * If MSG_WAITALL is set, or this is a bundle send, then we need + * the full amount. If just bundle is set, if we do a short send + * then we complete the bundle sequence rather than continue on. 
+ */ + if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = flags; + ret = sock_sendmsg(sock, &kmsg->msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -545,8 +646,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; + + if (!io_send_finish(req, &ret, kmsg, issue_flags)) + goto retry_bundle; + + io_req_msg_cleanup(req, issue_flags); + return ret; } static int io_recvmsg_mshot_prep(struct io_kiocb *req, @@ -611,23 +716,42 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, msg.msg_controllen); } -int io_recvmsg_prep_async(struct io_kiocb *req) +static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *iomsg; + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + kmsg = io_msg_alloc_async(req); + if (unlikely(!kmsg)) return -ENOMEM; - iomsg = req->async_data; - ret = io_recvmsg_copy_hdr(req, iomsg); + + if (req->opcode == IORING_OP_RECV) { + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_iocb = NULL; + kmsg->msg.msg_ubuf = NULL; + + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + return 0; + } + + ret = io_recvmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ + IORING_RECVSEND_BUNDLE) int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -641,21 +765,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~(RECVMSG_FLAGS)) + if (sr->flags & ~RECVMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (sr->flags & IORING_RECV_MULTISHOT) { - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sr->msg_flags & MSG_WAITALL) - return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) - return -EINVAL; - req->flags |= REQ_F_APOLL_MULTISHOT; + if (req->flags & REQ_F_BUFFER_SELECT) { /* * Store the buffer group for this multishot receive separately, * as if we end up doing an io-wq based issue that selects a @@ -665,6 +782,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * restore it. 
*/ sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; } #ifdef CONFIG_COMPAT @@ -672,17 +803,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->nr_multishot_loops = 0; - return 0; -} - -static inline void io_recv_prep_retry(struct io_kiocb *req) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - req->flags &= ~REQ_F_BL_EMPTY; - sr->done_io = 0; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; + return io_recvmsg_prep_setup(req); } /* @@ -692,28 +813,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - struct msghdr *msg, bool mshot_finished, - unsigned issue_flags) + struct io_async_msghdr *kmsg, + bool mshot_finished, unsigned issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); unsigned int cflags; - cflags = io_put_kbuf(req, issue_flags); - if (msg->msg_inq > 0) + if (sr->flags & IORING_RECVSEND_BUNDLE) + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + issue_flags); + else + cflags = io_put_kbuf(req, issue_flags); + + if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + /* bundle with no more immediate buffers, we're done */ + if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) + goto finish; + /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - io_recv_prep_retry(req); + io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ @@ -728,12 +857,14 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } /* Finish the request / stop multishot. 
*/ +finish: io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; + io_req_msg_cleanup(req, issue_flags); return true; } @@ -824,7 +955,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -835,18 +966,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (force_nonblock) @@ -888,17 +1010,16 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg, issue_flags); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { + if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } - return ret; + return -EAGAIN; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -914,21 +1035,79 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; - if (mshot_finished) - io_req_msg_cleanup(req, kmsg, issue_flags); - else if (ret == -EAGAIN) - return io_setup_async_msg(req, kmsg, issue_flags); - return ret; } +static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, + size_t *len, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + /* + * If the ring isn't locked, then don't use the peek interface + * to grab multiple buffers as we will lock/unlock between + * this selection and posting the buffers. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + sr->flags & IORING_RECVSEND_BUNDLE) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } + + if (kmsg->msg.msg_inq > 0) + arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + + ret = io_buffers_peek(req, &arg); + if (unlikely(ret < 0)) + return ret; + + /* special case 1 vec, can be a fast path */ + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + sr->len = arg.iovs[0].iov_len; + goto map_ubuf; + } + iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + } + } else { + void __user *buf; + + *len = sr->len; + buf = io_buffer_select(req, len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + sr->len = *len; +map_ubuf: + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + + return 0; +} + int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -943,40 +1122,25 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; - flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; retry_multishot: if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - sr->len = len; + ret = io_recv_buf_select(req, kmsg, &len, issue_flags); + if (unlikely(ret)) + goto out_free; + sr->buf = NULL; } - ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_inq = -1; - msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; + kmsg->msg.msg_flags = 0; if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = sock_recvmsg(sock, &msg, flags); + ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { if (issue_flags & IO_URING_F_MULTISHOT) { @@ -996,7 +1160,7 @@ retry_multishot: if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } @@ -1008,7 +1172,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) goto retry_multishot; return ret; @@ -1017,14 +1181,10 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) { - io = req->async_data; - /* might be ->fast_iov if *msg_copy_hdr failed */ - if (io->free_iov != io->fast_iov) - kfree(io->free_iov); - } + if (req_has_async_data(req)) + 
io_netmsg_iovec_free(io); if (zc->notif) { io_notif_flush(zc->notif); zc->notif = NULL; @@ -1041,6 +1201,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *notif; zc->done_io = 0; + req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1061,8 +1222,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->flags & ~IO_ZC_FLAGS_VALID) return -EINVAL; if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_set_extended(notif); - io_notif_to_data(notif)->zc_report = true; + struct io_notif_data *nd = io_notif_to_data(notif); + + nd->zc_report = true; + nd->zc_used = false; + nd->zc_copied = false; } } @@ -1090,7 +1254,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -1098,7 +1262,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); } static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, @@ -1159,11 +1323,34 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, return ret; } +static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, + (u64)(uintptr_t)sr->buf, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; + } else { + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + ret = io_notif_account_mem(sr->notif, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + } + + return ret; +} + int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1174,67 +1361,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - if (zc->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = zc->addr_len; - } - if (!(req->flags & REQ_F_POLLED) && (zc->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, - (u64)(uintptr_t)zc->buf, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter; - } else { - io_notif_set_extended(zc->notif); - ret = 
import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); + if (!zc->done_io) { + ret = io_send_zc_import(req, kmsg); if (unlikely(ret)) return ret; - ret = io_notif_account_mem(zc->notif, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter_iovec; } - msg_flags = zc->msg_flags | MSG_ZEROCOPY; + msg_flags = zc->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = msg_flags; - msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = msg_flags; + kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; + ret = sock_sendmsg(sock, &kmsg->msg); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { + if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1252,7 +1409,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1261,63 +1418,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; - io_notif_set_extended(sr->notif); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (req_has_async_data(req)) { - kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; - flags = sr->msg_flags | MSG_ZEROCOPY; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov = NULL; - } - io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -1329,7 
+1469,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1347,10 +1487,12 @@ void io_sendrecv_fail(struct io_kiocb *req) req->cqe.flags |= IORING_CQE_F_MORE; } +#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ + IORING_ACCEPT_POLL_FIRST) + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); - unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; @@ -1359,15 +1501,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) + accept->iou_flags = READ_ONCE(sqe->ioprio); + if (accept->iou_flags & ~ACCEPT_FLAGS) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); if (accept->file_slot) { if (accept->flags & SOCK_CLOEXEC) return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && accept->file_slot != IORING_FILE_INDEX_ALLOC) return -EINVAL; } @@ -1375,8 +1517,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) req->flags |= REQ_F_APOLL_MULTISHOT; + if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) + req->flags |= REQ_F_NOWAIT; return 0; } @@ -1389,6 +1533,10 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; + if (!(req->flags & REQ_F_POLLED) && + accept->iou_flags & IORING_ACCEPT_POLL_FIRST) + return -EAGAIN; + retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); @@ -1401,7 +1549,8 @@ retry: if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { + if (ret == -EAGAIN && force_nonblock && + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { /* * if it's multishot and polled, we don't need to * return EAGAIN to arm the poll infra since it @@ -1429,8 +1578,7 @@ retry: if (ret < 0) return ret; - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, IORING_CQE_F_MORE)) + if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE)) goto retry; io_req_set_res(req, ret, 0); @@ -1491,17 +1639,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); + struct io_async_msghdr *io; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1509,32 +1650,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); conn->in_progress = 
conn->seen_econnaborted = false; - return 0; + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); } int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); - struct io_async_connect __io, *io; + struct io_async_msghdr *io = req->async_data; unsigned file_flags; int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - file_flags = force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); + ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, + file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) && force_nonblock) { if (ret == -EINPROGRESS) { @@ -1544,13 +1679,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) goto out; connect->seen_econnaborted = true; } - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (connect->in_progress) { @@ -1568,12 +1696,20 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); + io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } -void io_netmsg_cache_free(struct io_cache_entry *entry) +void io_netmsg_cache_free(const void *entry) { - kfree(container_of(entry, struct io_async_msghdr, cache)); + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; + + if (kmsg->free_iov) { + kasan_mempool_unpoison_object(kmsg->free_iov, + kmsg->free_iov_nr * sizeof(struct iovec)); + io_netmsg_iovec_free(kmsg); + } + kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index 191009979bcb..0eb1c1920fc9 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,22 +3,15 @@ #include <linux/net.h> #include <linux/uio.h> -#include "alloc_cache.h" - struct io_async_msghdr { #if defined(CONFIG_NET) - union { - struct iovec fast_iov[UIO_FASTIOV]; - struct { - struct iovec fast_iov_one; - __kernel_size_t controllen; - int namelen; - __kernel_size_t payloadlen; - }; - struct io_cache_entry cache; - }; + struct iovec fast_iov; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + int free_iov_nr; + int namelen; + __kernel_size_t controllen; + __kernel_size_t payloadlen; struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; @@ -27,22 +20,15 @@ struct io_async_msghdr { #if defined(CONFIG_NET) -struct io_async_connect { - struct sockaddr_storage address; -}; - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); -int io_sendmsg_prep_async(struct io_kiocb *req); void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); -int io_send_prep_async(struct io_kiocb *req); -int io_recvmsg_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_recvmsg(struct io_kiocb *req, unsigned int 
issue_flags); int io_recv(struct io_kiocb *req, unsigned int issue_flags); @@ -55,7 +41,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); -int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); @@ -64,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); -void io_netmsg_cache_free(struct io_cache_entry *entry); +void io_netmsg_cache_free(const void *entry); #else -static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +static inline void io_netmsg_cache_free(const void *entry) { } #endif diff --git a/io_uring/nop.c b/io_uring/nop.c index d956599a3c1b..a5bcf3d6984f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -10,16 +10,34 @@ #include "io_uring.h" #include "nop.h" +struct io_nop { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct file *file; + int result; +}; + int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + unsigned int flags; + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + flags = READ_ONCE(sqe->nop_flags); + if (flags & ~IORING_NOP_INJECT_RESULT) + return -EINVAL; + + if (flags & IORING_NOP_INJECT_RESULT) + nop->result = READ_ONCE(sqe->len); + else + nop->result = 0; return 0; } -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - io_req_set_res(req, 0, 0); + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + if (nop->result < 0) + req_set_fail(req); + io_req_set_res(req, nop->result, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index 204f0fc7d966..28859ae3ee6e 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,35 +9,36 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) +static const struct ubuf_info_ops io_ubuf_ops; + +static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); - struct io_ring_ctx *ctx = notif->ctx; - if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + do { + notif = cmd_to_io_kiocb(nd); - if (nd->account_pages && ctx->user) { - __io_unaccount_mem(ctx->user, nd->account_pages); - nd->account_pages = 0; - } - io_req_task_complete(notif, ts); -} + lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); -static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) -{ - struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); - struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + + if (nd->account_pages && notif->ctx->user) { + __io_unaccount_mem(notif->ctx->user, nd->account_pages); + nd->account_pages = 0; + } - if (refcount_dec_and_test(&uarg->refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + nd = nd->next; + io_req_task_complete(notif, ts); + } while (nd); } -static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) 
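The IORING_OP_NOP rework above lets userspace ask for an arbitrary completion result via IORING_NOP_INJECT_RESULT, which is mainly useful for exercising completion and error handling in tests. Below is a minimal, illustrative userspace sketch; it assumes the matching uapi additions from this series (the sqe->nop_flags field and the IORING_NOP_INJECT_RESULT flag) and uses stock liburing helpers, so treat it as an example rather than a canonical test case.

#include <errno.h>
#include <liburing.h>

/* Queue a NOP whose CQE carries -EINVAL in cqe->res instead of 0. */
static int queue_failing_nop(struct io_uring *ring)
{
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    if (!sqe)
        return -EBUSY;
    io_uring_prep_nop(sqe);
    sqe->nop_flags = IORING_NOP_INJECT_RESULT; /* opt in to result injection */
    sqe->len = (__u32)-EINVAL;                 /* io_nop() copies this value to cqe->res */
    return io_uring_submit(ring);
}

Because io_nop() also calls req_set_fail() for a negative injected result, link and error paths can be exercised this way without involving a real file.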
+void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + struct io_kiocb *notif = cmd_to_io_kiocb(nd); + unsigned tw_flags; if (nd->zc_report) { if (success && !nd->zc_used && skb) @@ -45,31 +46,64 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } - io_tx_ubuf_complete(skb, uarg, success); -} -static const struct ubuf_info_ops io_ubuf_ops = { - .complete = io_tx_ubuf_complete, -}; + if (!refcount_dec_and_test(&uarg->refcnt)) + return; -static const struct ubuf_info_ops io_ubuf_ops_ext = { - .complete = io_tx_ubuf_callback_ext, -}; + if (nd->head != nd) { + io_tx_ubuf_complete(skb, &nd->head->uarg, success); + return; + } + + tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE; + notif->io_task_work.func = io_notif_tw_complete; + __io_req_task_work_add(notif, tw_flags); +} -void io_notif_set_extended(struct io_kiocb *notif) +static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) { - struct io_notif_data *nd = io_notif_to_data(notif); + struct io_notif_data *nd, *prev_nd; + struct io_kiocb *prev_notif, *notif; + struct ubuf_info *prev_uarg = skb_zcopy(skb); - if (nd->uarg.ops != &io_ubuf_ops_ext) { - nd->account_pages = 0; - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - nd->uarg.ops = &io_ubuf_ops_ext; - notif->io_task_work.func = io_notif_complete_tw_ext; + nd = container_of(uarg, struct io_notif_data, uarg); + notif = cmd_to_io_kiocb(nd); + + if (!prev_uarg) { + net_zcopy_get(&nd->uarg); + skb_zcopy_init(skb, &nd->uarg); + return 0; } + /* handle it separately as we can't link a notif to itself */ + if (unlikely(prev_uarg == &nd->uarg)) + return 0; + /* we can't join two links together, just request a fresh skb */ + if (unlikely(nd->head != nd || nd->next)) + return -EEXIST; + /* don't mix zc providers */ + if (unlikely(prev_uarg->ops != &io_ubuf_ops)) + return -EEXIST; + + prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); + prev_notif = cmd_to_io_kiocb(nd); + + /* make sure all noifications can be finished in the same task_work */ + if (unlikely(notif->ctx != prev_notif->ctx || + notif->task != prev_notif->task)) + return -EEXIST; + + nd->head = prev_nd->head; + nd->next = prev_nd->next; + prev_nd->next = nd; + net_zcopy_get(&nd->head->uarg); + return 0; } +static const struct ubuf_info_ops io_ubuf_ops = { + .complete = io_tx_ubuf_complete, + .link_skb = io_link_skb, +}; + struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { @@ -84,9 +118,13 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); + nd->zc_report = false; + nd->account_pages = 0; + nd->next = NULL; + nd->head = nd; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.ops = &io_ubuf_ops; refcount_set(&nd->uarg.refcnt, 1); diff --git a/io_uring/notif.h b/io_uring/notif.h index 86d32bd9f856..f3589cfef4a9 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,14 +13,19 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; - unsigned long account_pages; + + struct io_notif_data *next; + struct io_notif_data *head; + + unsigned account_pages; bool zc_report; bool zc_used; bool zc_copied; }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); -void 
io_notif_set_extended(struct io_kiocb *notif); +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { @@ -32,9 +37,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + io_tx_ubuf_complete(NULL, &nd->uarg, true); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9c080aadc5a6..2de5cca9504e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -81,7 +82,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { @@ -99,7 +101,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -112,7 +115,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { @@ -138,8 +142,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, #else @@ -152,8 +156,8 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, #else @@ -162,6 +166,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_timeout_prep, .issue = io_timeout, }, @@ -191,6 +196,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -199,6 +205,7 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_connect_prep, .issue = io_connect, #else @@ -239,7 +246,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { @@ -252,7 +260,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { @@ -272,8 +281,9 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, + 
.buffer_select = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, #else @@ -288,6 +298,7 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recv, #else @@ -403,6 +414,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = 2 * sizeof(struct io_uring_sqe), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -412,8 +424,8 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, #else @@ -425,8 +437,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else @@ -439,10 +451,12 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { + .async_size = sizeof(struct io_waitid_async), .prep = io_waitid_prep, .issue = io_waitid, }, @@ -488,16 +502,12 @@ const struct io_cold_def io_cold_defs[] = { .name = "NOP", }, [IORING_OP_READV] = { - .async_size = sizeof(struct io_async_rw), .name = "READV", - .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITEV", - .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, @@ -505,12 +515,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", .fail = io_rw_fail, }, @@ -526,8 +534,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -535,14 +541,11 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_recvmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { @@ -555,15 +558,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep_async = io_connect_prep_async, -#endif }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", @@ -583,12 +581,10 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { - .async_size = sizeof(struct io_async_rw), .name = "READ", .fail = io_rw_fail, }, [IORING_OP_WRITE] = { - .async_size = 
sizeof(struct io_async_rw), .name = "WRITE", .fail = io_rw_fail, }, @@ -601,14 +597,14 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, @@ -679,14 +675,10 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", - .async_size = 2 * sizeof(struct io_uring_sqe), - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_send_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -694,8 +686,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -705,7 +695,6 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_WAITID] = { .name = "WAITID", - .async_size = sizeof(struct io_waitid_async), }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 9e5435ec27d0..7ee6f5aa90aa 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -27,22 +27,19 @@ struct io_issue_def { unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; - /* opcode specific path will handle ->async_data allocation if needed */ - unsigned manual_alloc : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* size of async data needed, if any */ + unsigned short async_size; + int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); }; struct io_cold_def { - /* size of async data needed, if any */ - unsigned short async_size; - const char *name; - int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; diff --git a/io_uring/poll.c b/io_uring/poll.c index 6db1dcb2c797..0a8e02944689 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -14,6 +14,7 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" @@ -322,8 +323,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, ts->locked, mask, - IORING_CQE_F_MORE)) { + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } @@ -687,17 +687,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->apoll_cache); - if (entry == NULL) + apoll = io_alloc_cache_get(&ctx->apoll_cache); + if (!apoll) goto alloc_apoll; - apoll = container_of(entry, struct async_poll, cache); apoll->poll.retries = APOLL_MAX_RETRY; } else { alloc_apoll: @@ -1056,8 
+1054,3 @@ out: io_req_set_res(req, ret, 0); return IOU_OK; } - -void io_apoll_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct async_poll, cache)); -} diff --git a/io_uring/poll.h b/io_uring/poll.h index 1dacae9e816c..b0e3745f5a29 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "alloc_cache.h" +#define IO_POLL_ALLOC_CACHE_MAX 32 enum { IO_APOLL_OK, @@ -17,10 +17,7 @@ struct io_poll { }; struct async_poll { - union { - struct io_poll poll; - struct io_cache_entry cache; - }; + struct io_poll poll; struct io_poll *double_poll; }; @@ -46,6 +43,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_apoll_cache_free(struct io_cache_entry *entry); - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/refs.h b/io_uring/refs.h index 1336de3f2a30..63982ead9f7d 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -33,6 +33,13 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_dec(&req->refs); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { diff --git a/io_uring/register.c b/io_uring/register.c index 99c37775f974..ef8c908346a4 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -368,8 +368,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - + tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4818b79231dd..65417c9553b1 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,8 +13,10 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" +#include "memmap.h" struct io_rsrc_update { struct file *file; @@ -169,7 +171,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } @@ -197,12 +199,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; - struct io_cache_entry *entry; - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (entry) { - ref_node = container_of(entry, struct io_rsrc_node, cache); - } else { + ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; @@ -872,42 +871,6 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct page **pages = NULL; - int ret; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - WARN_ON(!nr_pages); - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), 
GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - mmap_read_lock(current->mm); - ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); - mmap_read_unlock(current->mm); - - /* success, mapped all pages */ - if (ret == nr_pages) { - *npages = nr_pages; - return pages; - } - - /* partial map, or didn't map anything */ - if (ret >= 0) { - /* if we did partial map, release any pages we did get */ - if (ret) - unpin_user_pages(pages, ret); - ret = -EFAULT; - } - kvfree(pages); - return ERR_PTR(ret); -} - static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index e21000238954..c032ca3436ca 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,6 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include "alloc_cache.h" - #define IO_NODE_ALLOC_CACHE_MAX 32 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) @@ -36,10 +34,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - union { - struct io_cache_entry cache; - struct io_ring_ctx *ctx; - }; + struct io_ring_ctx *ctx; int refs; bool empty; u16 type; @@ -88,12 +83,6 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node io_rsrc_node_ref_zero(node); } -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) -{ - io_put_rsrc_node(ctx, req->rsrc_node); -} - static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { diff --git a/io_uring/rw.c b/io_uring/rw.c index c8d48287439e..a6bf2ea8db91 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -18,6 +18,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" @@ -75,7 +76,179 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_import_iovec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct iovec *iov; + void __user *buf; + int nr_segs, ret; + size_t sqe_len; + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + return import_ubuf(ddir, buf, sqe_len, &io->iter); + } + + if (io->free_iovec) { + nr_segs = io->free_iov_nr; + iov = io->free_iovec; + } else { + iov = &io->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ret; + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io->free_iov_nr = io->iter.nr_segs; + kfree(io->free_iovec); + io->free_iovec = iov; + } + return 0; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + int ret; + + ret = __io_import_iovec(rw, req, io, issue_flags); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&io->iter, &io->iter_state); + return 0; +} + +static void io_rw_iovec_free(struct io_async_rw *rw) +{ + if (rw->free_iovec) { + kfree(rw->free_iovec); + rw->free_iov_nr = 0; + rw->free_iovec = NULL; + } +} + +static void io_rw_recycle(struct 
io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_rw *rw = req->async_data; + struct iovec *iov; + + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_rw_iovec_free(rw); + return; + } + iov = rw->free_iovec; + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { + if (iov) + kasan_mempool_poison_object(iov); + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * Disable quick recycling for anything that's gone through io-wq. + * In theory, this should be fine to cleanup. However, some read or + * write iter handling touches the iovec AFTER having called into the + * handler, eg to reexpand or revert. This means we can have: + * + * task io-wq + * issue + * punt to io-wq + * issue + * blkdev_write_iter() + * ->ki_complete() + * io_complete_rw() + * queue tw complete + * run tw + * req_rw_cleanup + * iov_iter_count() <- look at iov_iter again + * + * which can lead to a UAF. This is only possible for io-wq offload + * as the cleanup can run in parallel. As io-wq is not the fast path, + * just leave cleanup to the end. + * + * This is really a bug in the core code that does this, any issue + * path should assume that a successful (or -EIOCBQUEUED) return can + * mean that the underlying data can be gone at any time. But that + * should be fixed seperately, and then this check could be killed. + */ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags &= ~REQ_F_NEED_CLEANUP; + io_rw_recycle(req, issue_flags); + } +} + +static int io_rw_alloc_async(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *rw; + + rw = io_alloc_cache_get(&ctx->rw_cache); + if (rw) { + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; + } + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = rw; + goto done; + } + + if (!io_alloc_async_data(req)) { + rw = req->async_data; + rw->free_iovec = NULL; + rw->free_iov_nr = 0; +done: + rw->bytes_done = 0; + return 0; + } + + return -ENOMEM; +} + +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +{ + struct io_async_rw *rw; + int ret; + + if (io_rw_alloc_async(req)) + return -ENOMEM; + + if (!do_import || io_do_buffer_select(req)) + return 0; + + rw = req->async_data; + ret = io_import_iovec(ddir, req, rw, 0); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&rw->iter, &rw->iter_state); + return 0; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; @@ -100,34 +273,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return 0; + return io_prep_rw_setup(req, ddir, do_import); +} + +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_DEST, true); } -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + return io_prep_rw(req, sqe, ITER_SOURCE, true); +} + +static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, do_import); 
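io_rw_recycle() and io_rw_alloc_async() above add a per-ring cache of struct io_async_rw in which the most recent iovec allocation stays attached to the cached entry and is poisoned while it sits idle, so any stray access trips KASAN. The core of that idiom, as a rough standalone sketch: struct cached_obj, obj_cache_store() and obj_cache_fetch() are placeholders standing in for io_async_rw and the io_uring alloc-cache helpers, not real kernel APIs.

#include <linux/kasan.h>
#include <linux/slab.h>
#include <linux/uio.h>

struct obj_cache;                                          /* placeholder cache type */
bool obj_cache_store(struct obj_cache *cache, void *obj);  /* think io_alloc_cache_put() */
void *obj_cache_fetch(struct obj_cache *cache);            /* think io_alloc_cache_get() */

struct cached_obj {
    struct iovec *iov;  /* allocation reused across requests */
    int iov_nr;
};

static void cached_obj_put(struct obj_cache *cache, struct cached_obj *obj)
{
    if (obj_cache_store(cache, obj)) {
        /* Kept for reuse: hide the idle iovec array from KASAN. */
        if (obj->iov)
            kasan_mempool_poison_object(obj->iov);
    } else {
        /* Cache full: free everything for real. */
        kfree(obj->iov);
        kfree(obj);
    }
}

static struct cached_obj *cached_obj_get(struct obj_cache *cache)
{
    struct cached_obj *obj = obj_cache_fetch(cache);

    if (!obj)
        return kzalloc(sizeof(*obj), GFP_KERNEL);
    /* Make the stashed array addressable again before handing it out. */
    if (obj->iov)
        kasan_mempool_unpoison_object(obj->iov,
                                      obj->iov_nr * sizeof(struct iovec));
    return obj;
}

This is also why the cache-free callbacks in the diff (io_rw_cache_free(), io_netmsg_cache_free()) unpoison free_iovec before freeing it: the array a cached entry points at stays poisoned until it is either reused or released.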
if (unlikely(ret)) return ret; + if (do_import) + return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ - if (req->flags & REQ_F_BUFFER_SELECT) - return io_iov_buffer_select_prep(req); + return io_iov_buffer_select_prep(req); +} - return 0; +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_DEST); +} + +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_SOURCE); } -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *io; u16 index; int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, false); if (unlikely(ret)) return ret; @@ -136,7 +333,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); - return 0; + + io = req->async_data; + ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->iter, &io->iter_state); + return ret; +} + +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_DEST); +} + +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); } /* @@ -152,7 +363,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ITER_DEST, false); if (unlikely(ret)) return ret; @@ -165,9 +376,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); + io_rw_iovec_free(req->async_data); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -187,21 +396,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - #ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; + iov_iter_restore(&io->iter, &io->iter_state); } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -230,9 +430,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return true; } #else -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { - return false; } static bool io_rw_should_reissue(struct io_kiocb *req) { @@ -311,11 +510,10 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) + req->cqe.flags |= io_put_kbuf(req, 0); - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } + io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); } @@ -396,6 +594,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); + io_req_rw_cleanup(req, issue_flags); return IOU_OK; } } else { @@ -404,71 +603,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, final_ret); + io_resubmit_prep(req); + return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; } -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_ubuf(ddir, buf, sqe_len, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (IS_ERR(*iovec)) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? 
NULL : &kiocb->ki_pos; @@ -540,89 +680,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) return ret; } -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.__iov = io->s.fast_iov; - if (iter->__iov != fast_iov) { - iov_off = iter_iov(iter) - fast_iov; - io->s.iter.__iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_cold_defs[req->opcode].prep_async) - return 0; - /* opcode type doesn't need async data */ - if (!io_cold_defs[req->opcode].async_size) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - iorw->bytes_done = 0; - iorw->free_iovec = NULL; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - if (iov) { - iorw->free_iovec = iov; - req->flags |= REQ_F_NEED_CLEANUP; - } - - return 0; -} - -int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_DEST); -} - -int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_SOURCE); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -762,54 +819,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; + ssize_t ret; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); + if (io_do_buffer_select(req)) { + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. 
It's cheap enough that we don't - * need to make this conditional. - */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } + if (unlikely(!io_file_supports_nowait(req))) + return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ @@ -819,20 +850,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* - * If we can poll, just do that. For a vectored read, we'll - * need to copy state first. - */ - if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) + /* If we can poll, just do that. */ + if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -842,8 +868,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - if (iovec) - kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { @@ -856,21 +880,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - iovec = NULL; - if (ret2) { - ret = ret > 0 ? ret : ret2; - goto done; - } - - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ + iov_iter_restore(&io->iter, &io->iter_state); do { /* @@ -878,11 +888,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) + iov_iter_advance(&io->iter, ret); + if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -890,24 +900,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. 
*/ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); return ret; } @@ -970,9 +978,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ - if (io_fill_cqe_req_aux(req, - issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) { /* * Force retry, as we might have more data to @@ -991,6 +997,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * multishot request, hitting overflow will terminate it. */ io_req_set_res(req, ret, cflags); + io_req_rw_cleanup(req, issue_flags); if (issue_flags & IO_URING_F_MULTISHOT) return IOU_STOP_MULTISHOT; return IOU_OK; @@ -998,42 +1005,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) int io_write(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; + goto ret_eagain; /* File path supports NOWAIT for non-direct_IO only for block devices. 
*/ if (!(kiocb->ki_flags & IOCB_DIRECT) && !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && (req->flags & REQ_F_ISREG)) - goto copy_iov; + goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -1044,19 +1037,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } if (req->flags & REQ_F_ISREG) kiocb_start_write(kiocb); kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = call_write_iter(req->file, kiocb, &io->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; @@ -1077,11 +1068,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; + goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *io; - trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -1090,34 +1079,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. */ - iov_iter_save_state(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, true); - - io = req->async_data; - if (io) - io->bytes_done += ret2; + iov_iter_save_state(&io->iter, &io->iter_state); + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - return ret ? ret : -EAGAIN; + return -EAGAIN; } done: - ret = kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, issue_flags); } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - if (!ret) { - if (kiocb->ki_flags & IOCB_WRITE) - io_req_end_write(req); - return -EAGAIN; - } - return ret; +ret_eagain: + iov_iter_restore(&io->iter, &io->iter_state); + if (kiocb->ki_flags & IOCB_WRITE) + io_req_end_write(req); + return -EAGAIN; } - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; } void io_rw_fail(struct io_kiocb *req) @@ -1191,6 +1168,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) break; nr_events++; req->cqe.flags = io_put_kbuf(req, 0); + if (req->opcode != IORING_OP_URING_CMD) + io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; @@ -1204,3 +1183,15 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) __io_submit_flush_completions(ctx); return nr_events; } + +void io_rw_cache_free(const void *entry) +{ + struct io_async_rw *rw = (struct io_async_rw *) entry; + + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + io_rw_iovec_free(rw); + } + kfree(rw); +} diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4da..3f432dc75441 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,28 +2,27 @@ #include <linux/pagemap.h> -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; size_t bytes_done; + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov; + 
struct iovec *free_iovec; + int free_iov_nr; struct wait_page_queue wpq; }; -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); -int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); -int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); +void io_rw_cache_free(const void *entry); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 3983708cef5b..554c7212aa46 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -291,6 +291,14 @@ static int io_sq_thread(void *data) sqd->sq_cpu = raw_smp_processor_id(); } + /* + * Force audit context to get setup, in case we do prep side async + * operations that would trigger an audit call before any issue side + * audit has been done. + */ + audit_uring_entry(IORING_OP_NOP); + audit_uring_exit(true, 0); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7fd7dbb211d6..1c9bf07499b1 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - bool filled; - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, - IORING_CQE_F_MORE); - if (filled) { + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); @@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, 0); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); @@ -541,7 +537,6 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 42f63adfa54a..21ac5fb2d5f0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> +#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> #include <net/sock.h> @@ -11,9 +12,71 @@ #include <asm/ioctls.h> #include "io_uring.h" +#include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" +static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct uring_cache *cache; + + cache = io_alloc_cache_get(&ctx->uring_cache); + if (cache) { + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = cache; + return cache; + } + if (!io_alloc_async_data(req)) + return req->async_data; + return NULL; +} + +static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache = req->async_data; + + if (issue_flags & IO_URING_F_UNLOCKED) + return; + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + ioucmd->sqe = NULL; + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->task != task) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + /* ->sqe isn't available if no async data */ + if (!req_has_async_data(req)) + cmd->sqe = NULL; + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | + IO_URING_F_COMPLETE_DEFER); + ret = true; + } + } + io_submit_flush_completions(ctx); + return ret; +} + static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -56,9 +119,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; - ioucmd->task_work_cb(ioucmd, issue_flags); + /* task_work executor checks the deffered list completion */ + ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -97,23 +160,38 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); + io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); + } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED)) + return; + io_req_complete_defer(req); } else { - struct io_tw_state ts = { - .locked = !(issue_flags & IO_URING_F_UNLOCKED), - }; - io_req_task_complete(req, &ts); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -int io_uring_cmd_prep_async(struct io_kiocb *req) +static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache; + + cache = io_uring_async_get(req); + if (unlikely(!cache)) + return -ENOMEM; + + if (!(req->flags & REQ_F_FORCE_ASYNC)) { + /* defer memcpy until we need it */ + ioucmd->sqe = sqe; + return 0; + } - memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); + memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); ioucmd->sqe = req->async_data; return 0; } @@ -140,9 +218,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); } - ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; + + return io_uring_cmd_prep_setup(req, sqe); } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -174,22 +252,20 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } + struct uring_cache *cache = req->async_data; - if (ret != -EIOCBQUEUED) { - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return ret; + if (ioucmd->sqe != (void *) cache) + memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { + return -EIOCBQUEUED; } - return IOU_ISSUE_SKIP_COMPLETE; + if (ret < 0) + req_set_fail(req); + io_req_uring_cleanup(req, issue_flags); + io_req_set_res(req, ret, 0); + return ret; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 8117684ec3ca..a361f98664d2 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,5 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +struct uring_cache { + struct io_uring_sqe sqes[2]; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_uring_cmd_prep_async(struct io_kiocb *req); + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 77d340666cb9..6362ec20abc0 100644 --- 
a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index ccba875d2234..84413114db5c 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1596,7 +1596,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) } while (i < tmigr_hierarchy_levels); - do { + while (i > 0) { group = stack[--i]; if (err < 0) { @@ -1645,7 +1645,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) tmigr_connect_child_parent(child, group); } } - } while (i > 0); + } kfree(stack); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 55e1b35bf877..2d7d27e6ae3c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5109,18 +5109,18 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, if (size == 0 || max - min < size - 1) return -EINVAL; - if (mas_is_start(mas)) { + if (mas_is_start(mas)) mas_start(mas); - mas->offset = mas_data_end(mas); - } else if (mas->offset >= 2) { - mas->offset -= 2; - } else if (!mas_rewind_node(mas)) { + else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; - } - /* Empty set. */ - if (mas_is_none(mas) || mas_is_ptr(mas)) + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); + else if (mas->offset >= 2) + mas->offset -= 2; + else + mas->offset = mas_data_end(mas); + /* The start of the window can only be within these values. */ mas->index = min; diff --git a/lib/test_xarray.c b/lib/test_xarray.c index ebe2af2e072d..928fc20337e6 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -744,15 +744,20 @@ static noinline void check_xa_multi_store_adv_add(struct xarray *xa, do { xas_lock_irq(&xas); - xas_store(&xas, p); - XA_BUG_ON(xa, xas_error(&xas)); - XA_BUG_ON(xa, xa_load(xa, index) != p); - xas_unlock_irq(&xas); + /* + * In our selftest case the only failure we can expect is for + * there not to be enough memory as we're not mimicking the + * entire page cache, so verify that's the only error we can run + * into here. The xas_nomem() which follows will ensure to fix + * that condition for us so to chug on on the loop. 
+ */ + XA_BUG_ON(xa, xas_error(&xas) && xas_error(&xas) != -ENOMEM); } while (xas_nomem(&xas, GFP_KERNEL)); XA_BUG_ON(xa, xas_error(&xas)); + XA_BUG_ON(xa, xa_load(xa, index) != p); } /* mimics page_cache_delete() */ @@ -1783,9 +1788,11 @@ static void check_split_1(struct xarray *xa, unsigned long index, unsigned int order, unsigned int new_order) { XA_STATE_ORDER(xas, xa, index, new_order); - unsigned int i; + unsigned int i, found; + void *entry; xa_store_order(xa, index, order, xa, GFP_KERNEL); + xa_set_mark(xa, index, XA_MARK_1); xas_split_alloc(&xas, xa, order, GFP_KERNEL); xas_lock(&xas); @@ -1802,6 +1809,16 @@ static void check_split_1(struct xarray *xa, unsigned long index, xa_set_mark(xa, index, XA_MARK_0); XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + xas_set_order(&xas, index, 0); + found = 0; + rcu_read_lock(); + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_1) { + found++; + XA_BUG_ON(xa, xa_is_internal(entry)); + } + rcu_read_unlock(); + XA_BUG_ON(xa, found != 1 << (order - new_order)); + xa_destroy(xa); } diff --git a/lib/xarray.c b/lib/xarray.c index 39f07bfc4dcc..5e7d6334d70d 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -969,8 +969,22 @@ static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) return marks; } +static inline void node_mark_slots(struct xa_node *node, unsigned int sibs, + xa_mark_t mark) +{ + int i; + + if (sibs == 0) + node_mark_all(node, mark); + else { + for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) + node_set_mark(node, i, mark); + } +} + static void node_set_marks(struct xa_node *node, unsigned int offset, - struct xa_node *child, unsigned int marks) + struct xa_node *child, unsigned int sibs, + unsigned int marks) { xa_mark_t mark = XA_MARK_0; @@ -978,7 +992,7 @@ static void node_set_marks(struct xa_node *node, unsigned int offset, if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) - node_mark_all(child, mark); + node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; @@ -1077,7 +1091,8 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) child->nr_values = xa_is_value(entry) ? 
XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); - node_set_marks(node, offset, child, marks); + node_set_marks(node, offset, child, xas->xa_sibs, + marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) @@ -1086,7 +1101,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) } else { unsigned int canon = offset - xas->xa_sibs; - node_set_marks(node, canon, NULL, marks); + node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7ce9..a34a0e376611 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -355,6 +355,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 742f432e5bf0..8eed0f3dc085 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -170,7 +170,7 @@ static void add_stack_record_to_list(struct stack_record *stack_record, /* Filter gfp_mask the same way stackdepot does, for consistency */ gfp_mask &= ~GFP_ZONEMASK; - gfp_mask &= (GFP_ATOMIC | GFP_KERNEL); + gfp_mask &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP); gfp_mask |= __GFP_NOWARN; set_current_in_page_owner(); @@ -328,7 +328,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, if (unlikely(!page_ext)) return; __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, - current->pid, current->tgid, ts_nsec, + ts_nsec, current->pid, current->tgid, current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask, 1 << order); diff --git a/mm/readahead.c b/mm/readahead.c index 130c0e7df99f..d55138e9560b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -490,6 +490,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; + unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -504,6 +505,8 @@ void page_cache_ra_order(struct readahead_control *ractl, new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } + /* See comment in page_cache_ra_unbounded() */ + nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; @@ -527,6 +530,7 @@ void page_cache_ra_order(struct readahead_control *ractl, read_pages(ractl); filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 68fa001648cc..125427cbdb87 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2710,7 +2710,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) * get_order(0) returns funny result. Just warn and terminate * early. 
*/ - return NULL; + return ERR_PTR(-EINVAL); } order = get_order(size); diff --git a/net/socket.c b/net/socket.c index e5f3af49a8b6..01a71ae10c35 100644 --- a/net/socket.c +++ b/net/socket.c @@ -88,7 +88,7 @@ #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> -#include <linux/io_uring.h> +#include <linux/io_uring/net.h> #include <linux/uaccess.h> #include <asm/unistd.h> diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index c5c9d05f29da..c0a2bb785b92 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -18,6 +18,8 @@ #define pr_info printk #define pr_debug printk #define pr_cont printk +#define schedule() +#define PAGE_SHIFT 12 #define __acquires(x) #define __releases(x) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eb5f39a2668b..410495e0a611 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -12,7 +12,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/') endif # Without this, failed build products remain, with up-to-date timestamps, @@ -98,13 +98,13 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),ppc64)) +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index cdfed403ba13..7b543e7f04d7 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -53,15 +53,19 @@ #if __riscv_xlen == 32 #define VDSO_32BIT 1 #endif +#elif defined(__loongarch__) +#define VDSO_VERSION 6 +#define VDSO_NAMES 1 #endif -static const char *versions[6] = { +static const char *versions[7] = { "LINUX_2.6", "LINUX_2.6.15", "LINUX_2.6.29", "LINUX_2.6.39", "LINUX_4", "LINUX_4.15", + "LINUX_5.10" }; static const char *names[2][6] = { diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index 1df5d057d79f..b758f68c6c9c 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -13,13 +13,7 @@ #include "../kselftest.h" #include "parse_vdso.h" - -#if defined(__riscv) -const char *version = "LINUX_4.15"; -#else -const char *version = "LINUX_2.6"; -#endif -const char *name = "__vdso_getcpu"; +#include "vdso_config.h" struct getcpu_cache; typedef long (*getcpu_t)(unsigned int *, unsigned int *, @@ -27,6 +21,8 @@ typedef long (*getcpu_t)(unsigned int *, unsigned int *, int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; unsigned long sysinfo_ehdr; unsigned int cpu, node; getcpu_t get_cpu; @@ -40,9 +36,9 @@ int main(int argc, char **argv) vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); - get_cpu = (getcpu_t)vdso_sym(version, name); + get_cpu = 
(getcpu_t)vdso_sym(version, name[4]); if (!get_cpu) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[4]); return KSFT_SKIP; } @@ -50,7 +46,7 @@ int main(int argc, char **argv) if (ret == 0) { printf("Running on CPU %u node %u\n", cpu, node); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[4]); return KSFT_FAIL; } diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e411f287a426..ee4f1ca56a71 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -18,25 +18,13 @@ #include "../kselftest.h" #include "parse_vdso.h" - -/* - * ARM64's vDSO exports its gettimeofday() implementation with a different - * name and version from other architectures, so we need to handle it as - * a special case. - */ -#if defined(__aarch64__) -const char *version = "LINUX_2.6.39"; -const char *name = "__kernel_gettimeofday"; -#elif defined(__riscv) -const char *version = "LINUX_4.15"; -const char *name = "__vdso_gettimeofday"; -#else -const char *version = "LINUX_2.6"; -const char *name = "__vdso_gettimeofday"; -#endif +#include "vdso_config.h" int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -47,10 +35,10 @@ int main(int argc, char **argv) /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym(version, name); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); if (!gtod) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[0]); return KSFT_SKIP; } @@ -61,7 +49,7 @@ printf("The time is %lld.%06lld\n", (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[0]); return KSFT_FAIL; }
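
The vDSO selftest hunks above replace per-architecture #ifdef ladders in each test with lookups into the versions[]/names[] tables from vdso_config.h, indexed by the per-arch VDSO_VERSION and VDSO_NAMES macros. A minimal stand-alone C sketch of that table-driven pattern follows; the macro values and the reduced two-column names table here are illustrative, not the selftest's actual layout:

#include <stdio.h>

/*
 * Illustrative stand-in values; the real selftest header derives these
 * from the target architecture and carries more name columns.
 */
#if defined(__aarch64__)
#define VDSO_VERSION 3	/* "LINUX_2.6.39" */
#define VDSO_NAMES 1	/* __kernel_* naming */
#else
#define VDSO_VERSION 0	/* "LINUX_2.6" */
#define VDSO_NAMES 0	/* __vdso_* naming */
#endif

static const char *versions[7] = {
	"LINUX_2.6",
	"LINUX_2.6.15",
	"LINUX_2.6.29",
	"LINUX_2.6.39",
	"LINUX_4",
	"LINUX_4.15",
	"LINUX_5.10",
};

/* Column 0: gettimeofday, column 1: getcpu (reduced for the example). */
static const char *names[2][2] = {
	{ "__vdso_gettimeofday", "__vdso_getcpu" },
	{ "__kernel_gettimeofday", "__kernel_getcpu" },
};

int main(void)
{
	const char *version = versions[VDSO_VERSION];
	const char **name = names[VDSO_NAMES];

	/* Each test indexes the shared tables instead of carrying its own
	 * #ifdef ladder for the symbol name and version. */
	printf("would resolve %s and %s against version %s\n",
	       name[0], name[1], version);
	return 0;
}

The benefit of this layout is that supporting a new architecture (as the diff does for loongarch) only touches the config header, not every individual test.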
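
Further up, the kernel/time/timer_migration.c hunk converts a do/while over the group stack into a pre-checked while (i > 0) loop, so the body that executes stack[--i] can never run when i is already 0. A tiny compilable illustration of the difference; the names here are made up for the example, not the kernel's:

#include <stdio.h>

static void connect_levels(const int *stack, unsigned int i)
{
	/*
	 * The post-checked form, do { level = stack[--i]; ... } while (i > 0),
	 * always runs the body once and would decrement i even when it is 0.
	 * The pre-checked loop simply skips the body in that case.
	 */
	while (i > 0) {
		int level = stack[--i];

		printf("connecting level %d\n", level);
	}
}

int main(void)
{
	int stack[3] = { 10, 20, 30 };

	connect_levels(stack, 3);	/* prints 30, 20, 10 */
	connect_levels(stack, 0);	/* nothing on the stack: safe no-op */
	return 0;
}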