105 files changed, 5205 insertions, 2371 deletions
diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h b/Documentation/gpu/rfc/i915_parallel_execbuf.h
deleted file mode 100644
index 8cbe2c4e0172..000000000000
--- a/Documentation/gpu/rfc/i915_parallel_execbuf.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * Copyright © 2021 Intel Corporation
- */
-
-#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
-
-/**
- * struct drm_i915_context_engines_parallel_submit - Configure engine for
- * parallel submission.
- *
- * Setup a slot in the context engine map to allow multiple BBs to be submitted
- * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU
- * in parallel. Multiple hardware contexts are created internally in the i915
- * run these BBs. Once a slot is configured for N BBs only N BBs can be
- * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user
- * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how
- * many BBs there are based on the slot's configuration. The N BBs are the last
- * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set.
- *
- * The default placement behavior is to create implicit bonds between each
- * context if each context maps to more than 1 physical engine (e.g. context is
- * a virtual engine). Also we only allow contexts of same engine class and these
- * contexts must be in logically contiguous order. Examples of the placement
- * behavior described below. Lastly, the default is to not allow BBs to
- * preempted mid BB rather insert coordinated preemption on all hardware
- * contexts between each set of BBs. Flags may be added in the future to change
- * both of these default behaviors.
- *
- * Returns -EINVAL if hardware context placement configuration is invalid or if
- * the placement configuration isn't supported on the platform / submission
- * interface.
- * Returns -ENODEV if extension isn't supported on the platform / submission
- * interface.
- *
- * .. code-block:: none
- *
- *	Example 1 pseudo code:
- *	CS[X] = generic engine of same class, logical instance X
- *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
- *	set_engines(INVALID)
- *	set_parallel(engine_index=0, width=2, num_siblings=1,
- *		     engines=CS[0],CS[1])
- *
- *	Results in the following valid placement:
- *	CS[0], CS[1]
- *
- *	Example 2 pseudo code:
- *	CS[X] = generic engine of same class, logical instance X
- *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
- *	set_engines(INVALID)
- *	set_parallel(engine_index=0, width=2, num_siblings=2,
- *		     engines=CS[0],CS[2],CS[1],CS[3])
- *
- *	Results in the following valid placements:
- *	CS[0], CS[1]
- *	CS[2], CS[3]
- *
- *	This can also be thought of as 2 virtual engines described by 2-D array
- *	in the engines the field with bonds placed between each index of the
- *	virtual engines. e.g. CS[0] is bonded to CS[1], CS[2] is bonded to
- *	CS[3].
- *	VE[0] = CS[0], CS[2]
- *	VE[1] = CS[1], CS[3]
- *
- *	Example 3 pseudo code:
- *	CS[X] = generic engine of same class, logical instance X
- *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
- *	set_engines(INVALID)
- *	set_parallel(engine_index=0, width=2, num_siblings=2,
- *		     engines=CS[0],CS[1],CS[1],CS[3])
- *
- *	Results in the following valid and invalid placements:
- *	CS[0], CS[1]
- *	CS[1], CS[3] - Not logical contiguous, return -EINVAL
- */
-struct drm_i915_context_engines_parallel_submit {
-	/**
-	 * @base: base user extension.
-	 */
-	struct i915_user_extension base;
-
-	/**
-	 * @engine_index: slot for parallel engine
-	 */
-	__u16 engine_index;
-
-	/**
-	 * @width: number of contexts per parallel engine
-	 */
-	__u16 width;
-
-	/**
-	 * @num_siblings: number of siblings per context
-	 */
-	__u16 num_siblings;
-
-	/**
-	 * @mbz16: reserved for future use; must be zero
-	 */
-	__u16 mbz16;
-
-	/**
-	 * @flags: all undefined flags must be zero, currently not defined flags
-	 */
-	__u64 flags;
-
-	/**
-	 * @mbz64: reserved for future use; must be zero
-	 */
-	__u64 mbz64[3];
-
-	/**
-	 * @engines: 2-d array of engine instances to configure parallel engine
-	 *
-	 * length = width (i) * num_siblings (j)
-	 * index = j + i * num_siblings
-	 */
-	struct i915_engine_class_instance engines[0];
-
-} __packed;
-
diff --git a/Documentation/gpu/rfc/i915_scheduler.rst b/Documentation/gpu/rfc/i915_scheduler.rst
index cbda75065dad..d630f15ab795 100644
--- a/Documentation/gpu/rfc/i915_scheduler.rst
+++ b/Documentation/gpu/rfc/i915_scheduler.rst
@@ -135,8 +135,8 @@ Add I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT and
 drm_i915_context_engines_parallel_submit to the uAPI to implement this
 extension.
 
-.. kernel-doc:: Documentation/gpu/rfc/i915_parallel_execbuf.h
-        :functions: drm_i915_context_engines_parallel_submit
+.. kernel-doc:: include/uapi/drm/i915_drm.h
+        :functions: i915_context_engines_parallel_submit
 
 Extend execbuf2 IOCTL to support submitting N BBs in a single IOCTL
 -------------------------------------------------------------------
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 8859444943a0..bf041b26ffec 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -132,15 +132,15 @@ config DRM_I915_GVT_KVMGT
 	  Intel GVT-g.
 
 config DRM_I915_PXP
-	bool "Enable Intel PXP support for Intel Gen12 and newer platform"
+	bool "Enable Intel PXP support"
 	depends on DRM_I915
 	depends on INTEL_MEI && INTEL_MEI_PXP
 	default n
 	help
-	  PXP (Protected Xe Path) is an i915 component, available on GEN12 and
-	  newer GPUs, that helps to establish the hardware protected session and
-	  manage the status of the alive software session, as well as its life
-	  cycle.
+	  PXP (Protected Xe Path) is an i915 component, available on graphics
+	  version 12 and newer GPUs, that helps to establish the hardware
+	  protected session and manage the status of the alive software session,
+	  as well as its life cycle.
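
[Editor's note] The kernel-doc deleted above now lives in include/uapi/drm/i915_drm.h (see the i915_scheduler.rst hunk): a slot holds `width` hardware contexts with `num_siblings` possible placements each, flattened into engines[] with index = j + i * num_siblings. A minimal illustrative sketch, not part of this series, of how userspace might fill the extension for the kernel-doc's "Example 2" (width=2, num_siblings=2, engines=CS[0],CS[2],CS[1],CS[3]) follows. The fixed-size wrapper, the helper name and the choice of I915_ENGINE_CLASS_RENDER are assumptions for illustration; attaching the extension through the context engines set-param is omitted.

	#include <string.h>
	#include <drm/i915_drm.h>

	/* Fixed-size wrapper so the flexible engines[] array can be initialised. */
	struct parallel_2x2 {
		struct i915_context_engines_parallel_submit base;
		struct i915_engine_class_instance engines[4];
	};

	static void fill_parallel_2x2(struct parallel_2x2 *p)
	{
		/* logical instances taken from the kernel-doc's Example 2 */
		const __u16 instance[2][2] = {
			{ 0, 2 },	/* BB 0 may be placed on CS[0] or CS[2] */
			{ 1, 3 },	/* BB 1 may be placed on CS[1] or CS[3] */
		};
		int i, j;

		memset(p, 0, sizeof(*p));
		p->base.base.name = I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
		p->base.engine_index = 0;	/* slot in the context engine map */
		p->base.width = 2;		/* BBs per execbuf */
		p->base.num_siblings = 2;	/* placements per BB */

		/* kernel-doc indexing: index = j + i * num_siblings */
		for (i = 0; i < 2; i++)
			for (j = 0; j < 2; j++) {
				p->engines[j + i * 2].engine_class = I915_ENGINE_CLASS_RENDER;
				p->engines[j + i * 2].engine_instance = instance[i][j];
			}
	}
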
menu "drm/i915 Debugging" depends on DRM_I915 diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index cdc244bbbfc1..660bb03de6fc 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -47,13 +47,15 @@ i915-y += i915_drv.o \ intel_dram.o \ intel_memory_region.o \ intel_pch.o \ + intel_pcode.o \ intel_pm.o \ intel_region_ttm.o \ intel_runtime_pm.o \ - intel_sideband.o \ + intel_sbi.o \ intel_step.o \ intel_uncore.o \ intel_wakeref.o \ + vlv_sideband.o \ vlv_suspend.o # core library code @@ -214,6 +216,7 @@ i915-y += \ display/intel_drrs.o \ display/intel_dsb.o \ display/intel_fb.o \ + display/intel_fb_pin.o \ display/intel_fbc.o \ display/intel_fdi.o \ display/intel_fifo_underrun.o \ @@ -223,6 +226,7 @@ i915-y += \ display/intel_hotplug.o \ display/intel_lpe_audio.o \ display/intel_overlay.o \ + display/intel_plane_initial.o \ display/intel_psr.o \ display/intel_quirks.o \ display/intel_sprite.o \ diff --git a/drivers/gpu/drm/i915/display/g4x_dp.c b/drivers/gpu/drm/i915/display/g4x_dp.c index 60ae2ba52006..dc41868d01ef 100644 --- a/drivers/gpu/drm/i915/display/g4x_dp.c +++ b/drivers/gpu/drm/i915/display/g4x_dp.c @@ -18,7 +18,7 @@ #include "intel_hdmi.h" #include "intel_hotplug.h" #include "intel_pps.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" struct dp_link_dpll { int clock; @@ -637,7 +637,7 @@ static void intel_dp_enable_port(struct intel_dp *intel_dp, /* enable with pattern 1 (as per spec) */ intel_dp_program_link_training_pattern(intel_dp, crtc_state, - DP_TRAINING_PATTERN_1); + DP_PHY_DPRX, DP_TRAINING_PATTERN_1); /* * Magic for VLV/CHV. We _must_ first set up the register diff --git a/drivers/gpu/drm/i915/display/g4x_hdmi.c b/drivers/gpu/drm/i915/display/g4x_hdmi.c index be352e9f0afc..88c427f3c346 100644 --- a/drivers/gpu/drm/i915/display/g4x_hdmi.c +++ b/drivers/gpu/drm/i915/display/g4x_hdmi.c @@ -14,8 +14,8 @@ #include "intel_fifo_underrun.h" #include "intel_hdmi.h" #include "intel_hotplug.h" -#include "intel_sideband.h" #include "intel_sdvo.h" +#include "vlv_sideband.h" static void intel_hdmi_prepare(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 9ee62707ec72..168c84a74d30 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -233,7 +233,7 @@ static void dsi_program_swing_and_deemphasis(struct intel_encoder *encoder) * Program voltage swing and pre-emphasis level values as per * table in BSPEC under DDI buffer programing */ - tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); tmp &= ~(SCALING_MODE_SEL_MASK | RTERM_SELECT_MASK); tmp |= SCALING_MODE_SEL(0x2); tmp |= TAP2_DISABLE | TAP3_DISABLE; @@ -247,7 +247,7 @@ static void dsi_program_swing_and_deemphasis(struct intel_encoder *encoder) tmp |= RTERM_SELECT(0x6); intel_de_write(dev_priv, ICL_PORT_TX_DW5_AUX(phy), tmp); - tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN0(phy)); + tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN(0, phy)); tmp &= ~(SWING_SEL_LOWER_MASK | SWING_SEL_UPPER_MASK | RCOMP_SCALAR_MASK); tmp |= SWING_SEL_UPPER(0x2); @@ -455,7 +455,7 @@ static void gen11_dsi_config_phy_lanes_sequence(struct intel_encoder *encoder) tmp &= ~FRC_LATENCY_OPTIM_MASK; tmp |= FRC_LATENCY_OPTIM_VAL(0x5); intel_de_write(dev_priv, ICL_PORT_TX_DW2_AUX(phy), tmp); - tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN0(phy)); + tmp = 
intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN(0, phy)); tmp &= ~FRC_LATENCY_OPTIM_MASK; tmp |= FRC_LATENCY_OPTIM_VAL(0x5); intel_de_write(dev_priv, ICL_PORT_TX_DW2_GRP(phy), tmp); @@ -470,7 +470,7 @@ static void gen11_dsi_config_phy_lanes_sequence(struct intel_encoder *encoder) tmp); tmp = intel_de_read(dev_priv, - ICL_PORT_PCS_DW1_LN0(phy)); + ICL_PORT_PCS_DW1_LN(0, phy)); tmp &= ~LATENCY_OPTIM_MASK; tmp |= LATENCY_OPTIM_VAL(0x1); intel_de_write(dev_priv, ICL_PORT_PCS_DW1_GRP(phy), @@ -489,7 +489,7 @@ static void gen11_dsi_voltage_swing_program_seq(struct intel_encoder *encoder) /* clear common keeper enable bit */ for_each_dsi_phy(phy, intel_dsi->phys) { - tmp = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN0(phy)); + tmp = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN(0, phy)); tmp &= ~COMMON_KEEPER_EN; intel_de_write(dev_priv, ICL_PORT_PCS_DW1_GRP(phy), tmp); tmp = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_AUX(phy)); @@ -510,7 +510,7 @@ static void gen11_dsi_voltage_swing_program_seq(struct intel_encoder *encoder) /* Clear training enable to change swing values */ for_each_dsi_phy(phy, intel_dsi->phys) { - tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); tmp &= ~TX_TRAINING_EN; intel_de_write(dev_priv, ICL_PORT_TX_DW5_GRP(phy), tmp); tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_AUX(phy)); @@ -523,7 +523,7 @@ static void gen11_dsi_voltage_swing_program_seq(struct intel_encoder *encoder) /* Set training enable to trigger update */ for_each_dsi_phy(phy, intel_dsi->phys) { - tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); tmp |= TX_TRAINING_EN; intel_de_write(dev_priv, ICL_PORT_TX_DW5_GRP(phy), tmp); tmp = intel_de_read(dev_priv, ICL_PORT_TX_DW5_AUX(phy)); diff --git a/drivers/gpu/drm/i915/display/intel_acpi.c b/drivers/gpu/drm/i915/display/intel_acpi.c index 72cac55c0f0f..e78430001f07 100644 --- a/drivers/gpu/drm/i915/display/intel_acpi.c +++ b/drivers/gpu/drm/i915/display/intel_acpi.c @@ -186,13 +186,16 @@ void intel_dsm_get_bios_data_funcs_supported(struct drm_i915_private *i915) { struct pci_dev *pdev = to_pci_dev(i915->drm.dev); acpi_handle dhandle; + union acpi_object *obj; dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return; - acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, - INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + obj = acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, + INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + if (obj) + ACPI_FREE(obj); } /* diff --git a/drivers/gpu/drm/i915/display/intel_atomic_plane.c b/drivers/gpu/drm/i915/display/intel_atomic_plane.c index 47234d898549..0be8c00e3db9 100644 --- a/drivers/gpu/drm/i915/display/intel_atomic_plane.c +++ b/drivers/gpu/drm/i915/display/intel_atomic_plane.c @@ -39,8 +39,10 @@ #include "intel_atomic_plane.h" #include "intel_cdclk.h" #include "intel_display_types.h" +#include "intel_fb_pin.h" #include "intel_pm.h" #include "intel_sprite.h" +#include "gt/intel_rps.h" static void intel_plane_state_reset(struct intel_plane_state *plane_state, struct intel_plane *plane) @@ -601,6 +603,213 @@ int intel_atomic_plane_check_clipping(struct intel_plane_state *plane_state, return 0; } +struct wait_rps_boost { + struct wait_queue_entry wait; + + struct drm_crtc *crtc; + struct i915_request *request; +}; + +static int do_rps_boost(struct wait_queue_entry *_wait, + unsigned mode, int sync, void *key) +{ + struct wait_rps_boost *wait = container_of(_wait, 
typeof(*wait), wait); + struct i915_request *rq = wait->request; + + /* + * If we missed the vblank, but the request is already running it + * is reasonable to assume that it will complete before the next + * vblank without our intervention, so leave RPS alone. + */ + if (!i915_request_started(rq)) + intel_rps_boost(rq); + i915_request_put(rq); + + drm_crtc_vblank_put(wait->crtc); + + list_del(&wait->wait.entry); + kfree(wait); + return 1; +} + +static void add_rps_boost_after_vblank(struct drm_crtc *crtc, + struct dma_fence *fence) +{ + struct wait_rps_boost *wait; + + if (!dma_fence_is_i915(fence)) + return; + + if (DISPLAY_VER(to_i915(crtc->dev)) < 6) + return; + + if (drm_crtc_vblank_get(crtc)) + return; + + wait = kmalloc(sizeof(*wait), GFP_KERNEL); + if (!wait) { + drm_crtc_vblank_put(crtc); + return; + } + + wait->request = to_request(dma_fence_get(fence)); + wait->crtc = crtc; + + wait->wait.func = do_rps_boost; + wait->wait.flags = 0; + + add_wait_queue(drm_crtc_vblank_waitqueue(crtc), &wait->wait); +} + +/** + * intel_prepare_plane_fb - Prepare fb for usage on plane + * @_plane: drm plane to prepare for + * @_new_plane_state: the plane state being prepared + * + * Prepares a framebuffer for usage on a display plane. Generally this + * involves pinning the underlying object and updating the frontbuffer tracking + * bits. Some older platforms need special physical address handling for + * cursor planes. + * + * Returns 0 on success, negative error code on failure. + */ +static int +intel_prepare_plane_fb(struct drm_plane *_plane, + struct drm_plane_state *_new_plane_state) +{ + struct i915_sched_attr attr = { .priority = I915_PRIORITY_DISPLAY }; + struct intel_plane *plane = to_intel_plane(_plane); + struct intel_plane_state *new_plane_state = + to_intel_plane_state(_new_plane_state); + struct intel_atomic_state *state = + to_intel_atomic_state(new_plane_state->uapi.state); + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + const struct intel_plane_state *old_plane_state = + intel_atomic_get_old_plane_state(state, plane); + struct drm_i915_gem_object *obj = intel_fb_obj(new_plane_state->hw.fb); + struct drm_i915_gem_object *old_obj = intel_fb_obj(old_plane_state->hw.fb); + int ret; + + if (old_obj) { + const struct intel_crtc_state *crtc_state = + intel_atomic_get_new_crtc_state(state, + to_intel_crtc(old_plane_state->hw.crtc)); + + /* Big Hammer, we also need to ensure that any pending + * MI_WAIT_FOR_EVENT inside a user batch buffer on the + * current scanout is retired before unpinning the old + * framebuffer. Note that we rely on userspace rendering + * into the buffer attached to the pipe they are waiting + * on. If not, userspace generates a GPU hang with IPEHR + * point to the MI_WAIT_FOR_EVENT. + * + * This should only fail upon a hung GPU, in which case we + * can safely continue. 
+ */ + if (intel_crtc_needs_modeset(crtc_state)) { + ret = i915_sw_fence_await_reservation(&state->commit_ready, + old_obj->base.resv, NULL, + false, 0, + GFP_KERNEL); + if (ret < 0) + return ret; + } + } + + if (new_plane_state->uapi.fence) { /* explicit fencing */ + i915_gem_fence_wait_priority(new_plane_state->uapi.fence, + &attr); + ret = i915_sw_fence_await_dma_fence(&state->commit_ready, + new_plane_state->uapi.fence, + i915_fence_timeout(dev_priv), + GFP_KERNEL); + if (ret < 0) + return ret; + } + + if (!obj) + return 0; + + + ret = intel_plane_pin_fb(new_plane_state); + if (ret) + return ret; + + i915_gem_object_wait_priority(obj, 0, &attr); + + if (!new_plane_state->uapi.fence) { /* implicit fencing */ + struct dma_fence *fence; + + ret = i915_sw_fence_await_reservation(&state->commit_ready, + obj->base.resv, NULL, + false, + i915_fence_timeout(dev_priv), + GFP_KERNEL); + if (ret < 0) + goto unpin_fb; + + fence = dma_resv_get_excl_unlocked(obj->base.resv); + if (fence) { + add_rps_boost_after_vblank(new_plane_state->hw.crtc, + fence); + dma_fence_put(fence); + } + } else { + add_rps_boost_after_vblank(new_plane_state->hw.crtc, + new_plane_state->uapi.fence); + } + + /* + * We declare pageflips to be interactive and so merit a small bias + * towards upclocking to deliver the frame on time. By only changing + * the RPS thresholds to sample more regularly and aim for higher + * clocks we can hopefully deliver low power workloads (like kodi) + * that are not quite steady state without resorting to forcing + * maximum clocks following a vblank miss (see do_rps_boost()). + */ + if (!state->rps_interactive) { + intel_rps_mark_interactive(&dev_priv->gt.rps, true); + state->rps_interactive = true; + } + + return 0; + +unpin_fb: + intel_plane_unpin_fb(new_plane_state); + + return ret; +} + +/** + * intel_cleanup_plane_fb - Cleans up an fb after plane use + * @plane: drm plane to clean up for + * @_old_plane_state: the state from the previous modeset + * + * Cleans up a framebuffer that has just been removed from a plane. + */ +static void +intel_cleanup_plane_fb(struct drm_plane *plane, + struct drm_plane_state *_old_plane_state) +{ + struct intel_plane_state *old_plane_state = + to_intel_plane_state(_old_plane_state); + struct intel_atomic_state *state = + to_intel_atomic_state(old_plane_state->uapi.state); + struct drm_i915_private *dev_priv = to_i915(plane->dev); + struct drm_i915_gem_object *obj = intel_fb_obj(old_plane_state->hw.fb); + + if (!obj) + return; + + if (state->rps_interactive) { + intel_rps_mark_interactive(&dev_priv->gt.rps, false); + state->rps_interactive = false; + } + + /* Should only be called after a successful intel_prepare_plane_fb()! 
*/ + intel_plane_unpin_fb(old_plane_state); +} + static const struct drm_plane_helper_funcs intel_plane_helper_funcs = { .prepare_fb = intel_prepare_plane_fb, .cleanup_fb = intel_cleanup_plane_fb, diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index f9776ca85de3..b99907c656bb 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -1930,6 +1930,50 @@ static int _intel_bios_max_tmds_clock(const struct intel_bios_encoder_data *devd } } +static enum port get_edp_port(struct drm_i915_private *i915) +{ + const struct intel_bios_encoder_data *devdata; + enum port port; + + for_each_port(port) { + devdata = i915->vbt.ports[port]; + + if (devdata && intel_bios_encoder_supports_edp(devdata)) + return port; + } + + return PORT_NONE; +} + +/* + * FIXME: The power sequencer and backlight code currently do not support more + * than one set registers, at least not on anything other than VLV/CHV. It will + * clobber the registers. As a temporary workaround, gracefully prevent more + * than one eDP from being registered. + */ +static void sanitize_dual_edp(struct intel_bios_encoder_data *devdata, + enum port port) +{ + struct drm_i915_private *i915 = devdata->i915; + struct child_device_config *child = &devdata->child; + enum port p; + + /* CHV might not clobber PPS registers. */ + if (IS_CHERRYVIEW(i915)) + return; + + p = get_edp_port(i915); + if (p == PORT_NONE) + return; + + drm_dbg_kms(&i915->drm, "both ports %c and %c configured as eDP, " + "disabling port %c eDP\n", port_name(p), port_name(port), + port_name(port)); + + child->device_type &= ~DEVICE_TYPE_DISPLAYPORT_OUTPUT; + child->device_type &= ~DEVICE_TYPE_INTERNAL_CONNECTOR; +} + static bool is_port_valid(struct drm_i915_private *i915, enum port port) { /* @@ -1987,6 +2031,9 @@ static void parse_ddi_port(struct drm_i915_private *i915, supports_typec_usb, supports_tbt, devdata->dsc != NULL); + if (is_edp) + sanitize_dual_edp(devdata, port); + if (is_dvi) sanitize_ddc_pin(devdata, port); diff --git a/drivers/gpu/drm/i915/display/intel_bw.c b/drivers/gpu/drm/i915/display/intel_bw.c index 4b94256d7319..8d9d888e9316 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.c +++ b/drivers/gpu/drm/i915/display/intel_bw.c @@ -9,8 +9,8 @@ #include "intel_bw.h" #include "intel_cdclk.h" #include "intel_display_types.h" +#include "intel_pcode.h" #include "intel_pm.h" -#include "intel_sideband.h" /* Parameters for Qclk Geyserville (QGV) */ struct intel_qgv_point { diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index ecb28e8f1eb6..9e466d829019 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -28,8 +28,9 @@ #include "intel_cdclk.h" #include "intel_de.h" #include "intel_display_types.h" +#include "intel_pcode.h" #include "intel_psr.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" /** * DOC: CDCLK / RAWCLK diff --git a/drivers/gpu/drm/i915/display/intel_combo_phy.c b/drivers/gpu/drm/i915/display/intel_combo_phy.c index bacdf8a16bcb..634e8d449457 100644 --- a/drivers/gpu/drm/i915/display/intel_combo_phy.c +++ b/drivers/gpu/drm/i915/display/intel_combo_phy.c @@ -220,13 +220,13 @@ static bool icl_combo_phy_verify_state(struct drm_i915_private *dev_priv, return false; if (DISPLAY_VER(dev_priv) >= 12) { - ret &= check_phy_reg(dev_priv, phy, ICL_PORT_TX_DW8_LN0(phy), + ret &= check_phy_reg(dev_priv, phy, ICL_PORT_TX_DW8_LN(0, phy), 
ICL_PORT_TX_DW8_ODCC_CLK_SEL | ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_MASK, ICL_PORT_TX_DW8_ODCC_CLK_SEL | ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_DIV2); - ret &= check_phy_reg(dev_priv, phy, ICL_PORT_PCS_DW1_LN0(phy), + ret &= check_phy_reg(dev_priv, phy, ICL_PORT_PCS_DW1_LN(0, phy), DCC_MODE_SELECT_MASK, DCC_MODE_SELECT_CONTINUOSLY); } @@ -343,13 +343,13 @@ static void icl_combo_phys_init(struct drm_i915_private *dev_priv) skip_phy_misc: if (DISPLAY_VER(dev_priv) >= 12) { - val = intel_de_read(dev_priv, ICL_PORT_TX_DW8_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW8_LN(0, phy)); val &= ~ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_MASK; val |= ICL_PORT_TX_DW8_ODCC_CLK_SEL; val |= ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_DIV2; intel_de_write(dev_priv, ICL_PORT_TX_DW8_GRP(phy), val); - val = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN(0, phy)); val &= ~DCC_MODE_SELECT_MASK; val |= DCC_MODE_SELECT_CONTINUOSLY; intel_de_write(dev_priv, ICL_PORT_PCS_DW1_GRP(phy), val); diff --git a/drivers/gpu/drm/i915/display/intel_cursor.c b/drivers/gpu/drm/i915/display/intel_cursor.c index f6dcb5aa63f6..11842f212613 100644 --- a/drivers/gpu/drm/i915/display/intel_cursor.c +++ b/drivers/gpu/drm/i915/display/intel_cursor.c @@ -17,7 +17,7 @@ #include "intel_display_types.h" #include "intel_display.h" #include "intel_fb.h" - +#include "intel_fb_pin.h" #include "intel_frontbuffer.h" #include "intel_pm.h" #include "intel_psr.h" diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 0d4cf7fa8720..1dcfe31e6c6f 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -1023,6 +1023,18 @@ static u8 intel_ddi_dp_preemph_max(struct intel_dp *intel_dp) return DP_TRAIN_PRE_EMPH_LEVEL_3; } +static u32 icl_combo_phy_loadgen_select(const struct intel_crtc_state *crtc_state, + int lane) +{ + if (crtc_state->port_clock > 600000) + return 0; + + if (crtc_state->lane_count == 4) + return lane >= 1 ? LOADGEN_SELECT : 0; + else + return lane == 1 || lane == 2 ? LOADGEN_SELECT : 0; +} + static void icl_ddi_combo_vswing_program(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { @@ -1047,7 +1059,7 @@ static void icl_ddi_combo_vswing_program(struct intel_encoder *encoder, } /* Set PORT_TX_DW5 */ - val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); val &= ~(SCALING_MODE_SEL_MASK | RTERM_SELECT_MASK | TAP2_DISABLE | TAP3_DISABLE); val |= SCALING_MODE_SEL(0x2); @@ -1056,7 +1068,7 @@ static void icl_ddi_combo_vswing_program(struct intel_encoder *encoder, intel_de_write(dev_priv, ICL_PORT_TX_DW5_GRP(phy), val); /* Program PORT_TX_DW2 */ - val = intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW2_LN(0, phy)); val &= ~(SWING_SEL_LOWER_MASK | SWING_SEL_UPPER_MASK | RCOMP_SCALAR_MASK); val |= SWING_SEL_UPPER(trans->entries[level].icl.dw2_swing_sel); @@ -1067,7 +1079,7 @@ static void icl_ddi_combo_vswing_program(struct intel_encoder *encoder, /* Program PORT_TX_DW4 */ /* We cannot write to GRP. It would overwrite individual loadgen. 
*/ - for (ln = 0; ln <= 3; ln++) { + for (ln = 0; ln < 4; ln++) { val = intel_de_read(dev_priv, ICL_PORT_TX_DW4_LN(ln, phy)); val &= ~(POST_CURSOR_1_MASK | POST_CURSOR_2_MASK | CURSOR_COEFF_MASK); @@ -1078,7 +1090,7 @@ static void icl_ddi_combo_vswing_program(struct intel_encoder *encoder, } /* Program PORT_TX_DW7 */ - val = intel_de_read(dev_priv, ICL_PORT_TX_DW7_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW7_LN(0, phy)); val &= ~N_SCALAR_MASK; val |= N_SCALAR(trans->entries[level].icl.dw7_n_scalar); intel_de_write(dev_priv, ICL_PORT_TX_DW7_GRP(phy), val); @@ -1089,18 +1101,15 @@ static void icl_combo_phy_set_signal_levels(struct intel_encoder *encoder, { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); enum phy phy = intel_port_to_phy(dev_priv, encoder->port); - int width, rate, ln; u32 val; - - width = crtc_state->lane_count; - rate = crtc_state->port_clock; + int ln; /* * 1. If port type is eDP or DP, * set PORT_PCS_DW1 cmnkeeper_enable to 1b, * else clear to 0b. */ - val = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_PCS_DW1_LN(0, phy)); if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) val &= ~COMMON_KEEPER_EN; else @@ -1109,19 +1118,15 @@ static void icl_combo_phy_set_signal_levels(struct intel_encoder *encoder, /* 2. Program loadgen select */ /* - * Program PORT_TX_DW4_LN depending on Bit rate and used lanes + * Program PORT_TX_DW4 depending on Bit rate and used lanes * <= 6 GHz and 4 lanes (LN0=0, LN1=1, LN2=1, LN3=1) * <= 6 GHz and 1,2 lanes (LN0=0, LN1=1, LN2=1, LN3=0) * > 6 GHz (LN0=0, LN1=0, LN2=0, LN3=0) */ - for (ln = 0; ln <= 3; ln++) { + for (ln = 0; ln < 4; ln++) { val = intel_de_read(dev_priv, ICL_PORT_TX_DW4_LN(ln, phy)); val &= ~LOADGEN_SELECT; - - if ((rate <= 600000 && width == 4 && ln >= 1) || - (rate <= 600000 && width < 4 && (ln == 1 || ln == 2))) { - val |= LOADGEN_SELECT; - } + val |= icl_combo_phy_loadgen_select(crtc_state, ln); intel_de_write(dev_priv, ICL_PORT_TX_DW4_LN(ln, phy), val); } @@ -1131,7 +1136,7 @@ static void icl_combo_phy_set_signal_levels(struct intel_encoder *encoder, intel_de_write(dev_priv, ICL_PORT_CL_DW5(phy), val); /* 4. Clear training enable to change swing values */ - val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); val &= ~TX_TRAINING_EN; intel_de_write(dev_priv, ICL_PORT_TX_DW5_GRP(phy), val); @@ -1139,7 +1144,7 @@ static void icl_combo_phy_set_signal_levels(struct intel_encoder *encoder, icl_ddi_combo_vswing_program(encoder, crtc_state); /* 6. 
Set training enable to trigger update */ - val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN0(phy)); + val = intel_de_read(dev_priv, ICL_PORT_TX_DW5_LN(0, phy)); val |= TX_TRAINING_EN; intel_de_write(dev_priv, ICL_PORT_TX_DW5_GRP(phy), val); } @@ -1285,9 +1290,9 @@ static void tgl_dkl_phy_set_signal_levels(struct intel_encoder *encoder, dpcnt_mask = (DKL_TX_PRESHOOT_COEFF_MASK | DKL_TX_DE_EMPAHSIS_COEFF_MASK | DKL_TX_VSWING_CONTROL_MASK); - dpcnt_val = DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.dkl_vswing_control); - dpcnt_val |= DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.dkl_de_emphasis_control); - dpcnt_val |= DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.dkl_preshoot_control); + dpcnt_val = DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.vswing); + dpcnt_val |= DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.de_emphasis); + dpcnt_val |= DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.preshoot); for (ln = 0; ln < 2; ln++) { intel_de_write(dev_priv, HIP_INDEX_REG(tc_port), @@ -1309,14 +1314,6 @@ static void tgl_dkl_phy_set_signal_levels(struct intel_encoder *encoder, val = intel_de_read(dev_priv, DKL_TX_DPCNTL2(tc_port)); val &= ~DKL_TX_DP20BITMODE; intel_de_write(dev_priv, DKL_TX_DPCNTL2(tc_port), val); - - if ((intel_crtc_has_dp_encoder(crtc_state) && - crtc_state->port_clock == 162000) || - (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI) && - crtc_state->port_clock == 594000)) - val |= DKL_TX_LOADGEN_SHARING_PMD_DISABLE; - else - val &= ~DKL_TX_LOADGEN_SHARING_PMD_DISABLE; } } @@ -1338,13 +1335,20 @@ static int translate_signal_level(struct intel_dp *intel_dp, return 0; } -static int intel_ddi_dp_level(struct intel_dp *intel_dp, int lane) +static int intel_ddi_dp_level(struct intel_dp *intel_dp, + const struct intel_crtc_state *crtc_state, + int lane) { u8 train_set = intel_dp->train_set[lane]; - u8 signal_levels = train_set & (DP_TRAIN_VOLTAGE_SWING_MASK | - DP_TRAIN_PRE_EMPHASIS_MASK); - return translate_signal_level(intel_dp, signal_levels); + if (intel_dp_is_uhbr(crtc_state)) { + return train_set & DP_TX_FFE_PRESET_VALUE_MASK; + } else { + u8 signal_levels = train_set & (DP_TRAIN_VOLTAGE_SWING_MASK | + DP_TRAIN_PRE_EMPHASIS_MASK); + + return translate_signal_level(intel_dp, signal_levels); + } } int intel_ddi_level(struct intel_encoder *encoder, @@ -1362,7 +1366,8 @@ int intel_ddi_level(struct intel_encoder *encoder, if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) level = intel_ddi_hdmi_level(encoder, trans); else - level = intel_ddi_dp_level(enc_to_intel_dp(encoder), lane); + level = intel_ddi_dp_level(enc_to_intel_dp(encoder), crtc_state, + lane); if (drm_WARN_ON_ONCE(&i915->drm, level >= n_entries)) level = n_entries - 1; @@ -3937,8 +3942,7 @@ static void intel_ddi_encoder_destroy(struct drm_encoder *encoder) intel_display_power_flush_work(i915); drm_encoder_cleanup(encoder); - if (dig_port) - kfree(dig_port->hdcp_port_data.streams); + kfree(dig_port->hdcp_port_data.streams); kfree(dig_port); } diff --git a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c index a2d39131ea53..78cd8f77b49d 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c +++ b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c @@ -8,6 +8,7 @@ #include "intel_ddi_buf_trans.h" #include "intel_de.h" #include "intel_display_types.h" +#include "intel_dp.h" /* HDMI/DVI modes ignore everything but the last 2 items. 
So we share * them for both DP and FDI transports, allowing those ports to @@ -1611,7 +1612,8 @@ dg2_get_snps_buf_trans(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state, int *n_entries) { - if (crtc_state->port_clock > 1000000) + if (intel_crtc_has_dp_encoder(crtc_state) && + intel_dp_is_uhbr(crtc_state)) return intel_get_buf_trans(&dg2_snps_trans_uhbr, n_entries); else return intel_get_buf_trans(&dg2_snps_trans, n_entries); diff --git a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.h b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.h index 6cdb8e9073c7..2133984a572b 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.h +++ b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.h @@ -34,21 +34,21 @@ struct icl_ddi_buf_trans { }; struct icl_mg_phy_ddi_buf_trans { - u32 cri_txdeemph_override_11_6; - u32 cri_txdeemph_override_5_0; - u32 cri_txdeemph_override_17_12; + u8 cri_txdeemph_override_11_6; + u8 cri_txdeemph_override_5_0; + u8 cri_txdeemph_override_17_12; }; struct tgl_dkl_phy_ddi_buf_trans { - u32 dkl_vswing_control; - u32 dkl_preshoot_control; - u32 dkl_de_emphasis_control; + u8 vswing; + u8 preshoot; + u8 de_emphasis; }; struct dg2_snps_phy_buf_trans { - u8 snps_vswing; - u8 snps_pre_cursor; - u8 snps_post_cursor; + u8 vswing; + u8 pre_cursor; + u8 post_cursor; }; union intel_ddi_buf_trans_entry { diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 8faa7f729547..ff598b6cd953 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -68,7 +68,6 @@ #include "gem/i915_gem_lmem.h" #include "gem/i915_gem_object.h" -#include "gt/intel_rps.h" #include "gt/gen8_ppgtt.h" #include "pxp/intel_pxp.h" @@ -89,26 +88,29 @@ #include "intel_dp_link_training.h" #include "intel_dpt.h" #include "intel_fbc.h" -#include "intel_fdi.h" #include "intel_fbdev.h" +#include "intel_fdi.h" #include "intel_fifo_underrun.h" #include "intel_frontbuffer.h" #include "intel_hdcp.h" #include "intel_hotplug.h" #include "intel_overlay.h" #include "intel_panel.h" +#include "intel_pcode.h" #include "intel_pipe_crc.h" +#include "intel_plane_initial.h" #include "intel_pm.h" #include "intel_pps.h" #include "intel_psr.h" #include "intel_quirks.h" -#include "intel_sideband.h" +#include "intel_sbi.h" #include "intel_sprite.h" #include "intel_tc.h" #include "intel_vga.h" #include "i9xx_plane.h" #include "skl_scaler.h" #include "skl_universal_plane.h" +#include "vlv_sideband.h" static void i9xx_crtc_clock_get(struct intel_crtc *crtc, struct intel_crtc_state *pipe_config); @@ -854,7 +856,7 @@ unsigned int intel_remapped_info_size(const struct intel_remapped_info *rem_info return size; } -static bool intel_plane_uses_fence(const struct intel_plane_state *plane_state) +bool intel_plane_uses_fence(const struct intel_plane_state *plane_state) { struct intel_plane *plane = to_intel_plane(plane_state->uapi.plane); struct drm_i915_private *dev_priv = to_i915(plane->base.dev); @@ -864,198 +866,6 @@ static bool intel_plane_uses_fence(const struct intel_plane_state *plane_state) plane_state->view.gtt.type == I915_GGTT_VIEW_NORMAL); } -static struct i915_vma * -intel_pin_fb_obj_dpt(struct drm_framebuffer *fb, - const struct i915_ggtt_view *view, - bool uses_fence, - unsigned long *out_flags, - struct i915_address_space *vm) -{ - struct drm_device *dev = fb->dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_i915_gem_object *obj = intel_fb_obj(fb); - struct i915_vma *vma; - u32 
alignment; - int ret; - - if (WARN_ON(!i915_gem_object_is_framebuffer(obj))) - return ERR_PTR(-EINVAL); - - alignment = 4096 * 512; - - atomic_inc(&dev_priv->gpu_error.pending_fb_pin); - - ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE); - if (ret) { - vma = ERR_PTR(ret); - goto err; - } - - vma = i915_vma_instance(obj, vm, view); - if (IS_ERR(vma)) - goto err; - - if (i915_vma_misplaced(vma, 0, alignment, 0)) { - ret = i915_vma_unbind(vma); - if (ret) { - vma = ERR_PTR(ret); - goto err; - } - } - - ret = i915_vma_pin(vma, 0, alignment, PIN_GLOBAL); - if (ret) { - vma = ERR_PTR(ret); - goto err; - } - - vma->display_alignment = max_t(u64, vma->display_alignment, alignment); - - i915_gem_object_flush_if_display(obj); - - i915_vma_get(vma); -err: - atomic_dec(&dev_priv->gpu_error.pending_fb_pin); - - return vma; -} - -struct i915_vma * -intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, - bool phys_cursor, - const struct i915_ggtt_view *view, - bool uses_fence, - unsigned long *out_flags) -{ - struct drm_device *dev = fb->dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_i915_gem_object *obj = intel_fb_obj(fb); - intel_wakeref_t wakeref; - struct i915_gem_ww_ctx ww; - struct i915_vma *vma; - unsigned int pinctl; - u32 alignment; - int ret; - - if (drm_WARN_ON(dev, !i915_gem_object_is_framebuffer(obj))) - return ERR_PTR(-EINVAL); - - if (phys_cursor) - alignment = intel_cursor_alignment(dev_priv); - else - alignment = intel_surf_alignment(fb, 0); - if (drm_WARN_ON(dev, alignment && !is_power_of_2(alignment))) - return ERR_PTR(-EINVAL); - - /* Note that the w/a also requires 64 PTE of padding following the - * bo. We currently fill all unused PTE with the shadow page and so - * we should always have valid PTE following the scanout preventing - * the VT-d warning. - */ - if (intel_scanout_needs_vtd_wa(dev_priv) && alignment < 256 * 1024) - alignment = 256 * 1024; - - /* - * Global gtt pte registers are special registers which actually forward - * writes to a chunk of system memory. Which means that there is no risk - * that the register values disappear as soon as we call - * intel_runtime_pm_put(), so it is correct to wrap only the - * pin/unpin/fence and not more. - */ - wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm); - - atomic_inc(&dev_priv->gpu_error.pending_fb_pin); - - /* - * Valleyview is definitely limited to scanning out the first - * 512MiB. Lets presume this behaviour was inherited from the - * g4x display engine and that all earlier gen are similarly - * limited. Testing suggests that it is a little more - * complicated than this. For example, Cherryview appears quite - * happy to scanout from anywhere within its global aperture. - */ - pinctl = 0; - if (HAS_GMCH(dev_priv)) - pinctl |= PIN_MAPPABLE; - - i915_gem_ww_ctx_init(&ww, true); -retry: - ret = i915_gem_object_lock(obj, &ww); - if (!ret && phys_cursor) - ret = i915_gem_object_attach_phys(obj, alignment); - else if (!ret && HAS_LMEM(dev_priv)) - ret = i915_gem_object_migrate(obj, &ww, INTEL_REGION_LMEM); - /* TODO: Do we need to sync when migration becomes async? */ - if (!ret) - ret = i915_gem_object_pin_pages(obj); - if (ret) - goto err; - - if (!ret) { - vma = i915_gem_object_pin_to_display_plane(obj, &ww, alignment, - view, pinctl); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto err_unpin; - } - } - - if (uses_fence && i915_vma_is_map_and_fenceable(vma)) { - /* - * Install a fence for tiled scan-out. 
Pre-i965 always needs a - * fence, whereas 965+ only requires a fence if using - * framebuffer compression. For simplicity, we always, when - * possible, install a fence as the cost is not that onerous. - * - * If we fail to fence the tiled scanout, then either the - * modeset will reject the change (which is highly unlikely as - * the affected systems, all but one, do not have unmappable - * space) or we will not be able to enable full powersaving - * techniques (also likely not to apply due to various limits - * FBC and the like impose on the size of the buffer, which - * presumably we violated anyway with this unmappable buffer). - * Anyway, it is presumably better to stumble onwards with - * something and try to run the system in a "less than optimal" - * mode that matches the user configuration. - */ - ret = i915_vma_pin_fence(vma); - if (ret != 0 && DISPLAY_VER(dev_priv) < 4) { - i915_vma_unpin(vma); - goto err_unpin; - } - ret = 0; - - if (vma->fence) - *out_flags |= PLANE_HAS_FENCE; - } - - i915_vma_get(vma); - -err_unpin: - i915_gem_object_unpin_pages(obj); -err: - if (ret == -EDEADLK) { - ret = i915_gem_ww_ctx_backoff(&ww); - if (!ret) - goto retry; - } - i915_gem_ww_ctx_fini(&ww); - if (ret) - vma = ERR_PTR(ret); - - atomic_dec(&dev_priv->gpu_error.pending_fb_pin); - intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref); - return vma; -} - -void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags) -{ - if (flags & PLANE_HAS_FENCE) - i915_vma_unpin_fence(vma); - i915_vma_unpin(vma); - i915_vma_put(vma); -} - /* * Convert the x/y offsets into a linear offset. * Only valid with 0/180 degree rotation, which is fine since linear @@ -1241,123 +1051,6 @@ u32 intel_plane_fb_max_stride(struct drm_i915_private *dev_priv, DRM_MODE_ROTATE_0); } -static struct i915_vma * -initial_plane_vma(struct drm_i915_private *i915, - struct intel_initial_plane_config *plane_config) -{ - struct drm_i915_gem_object *obj; - struct i915_vma *vma; - u32 base, size; - - if (plane_config->size == 0) - return NULL; - - base = round_down(plane_config->base, - I915_GTT_MIN_ALIGNMENT); - size = round_up(plane_config->base + plane_config->size, - I915_GTT_MIN_ALIGNMENT); - size -= base; - - /* - * If the FB is too big, just don't use it since fbdev is not very - * important and we should probably use that space with FBC or other - * features. - */ - if (IS_ENABLED(CONFIG_FRAMEBUFFER_CONSOLE) && - size * 2 > i915->stolen_usable_size) - return NULL; - - obj = i915_gem_object_create_stolen_for_preallocated(i915, base, size); - if (IS_ERR(obj)) - return NULL; - - /* - * Mark it WT ahead of time to avoid changing the - * cache_level during fbdev initialization. The - * unbind there would get stuck waiting for rcu. - */ - i915_gem_object_set_cache_coherency(obj, HAS_WT(i915) ? 
- I915_CACHE_WT : I915_CACHE_NONE); - - switch (plane_config->tiling) { - case I915_TILING_NONE: - break; - case I915_TILING_X: - case I915_TILING_Y: - obj->tiling_and_stride = - plane_config->fb->base.pitches[0] | - plane_config->tiling; - break; - default: - MISSING_CASE(plane_config->tiling); - goto err_obj; - } - - vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); - if (IS_ERR(vma)) - goto err_obj; - - if (i915_ggtt_pin(vma, NULL, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base)) - goto err_obj; - - if (i915_gem_object_is_tiled(obj) && - !i915_vma_is_map_and_fenceable(vma)) - goto err_obj; - - return vma; - -err_obj: - i915_gem_object_put(obj); - return NULL; -} - -static bool -intel_alloc_initial_plane_obj(struct intel_crtc *crtc, - struct intel_initial_plane_config *plane_config) -{ - struct drm_device *dev = crtc->base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_mode_fb_cmd2 mode_cmd = { 0 }; - struct drm_framebuffer *fb = &plane_config->fb->base; - struct i915_vma *vma; - - switch (fb->modifier) { - case DRM_FORMAT_MOD_LINEAR: - case I915_FORMAT_MOD_X_TILED: - case I915_FORMAT_MOD_Y_TILED: - break; - default: - drm_dbg(&dev_priv->drm, - "Unsupported modifier for initial FB: 0x%llx\n", - fb->modifier); - return false; - } - - vma = initial_plane_vma(dev_priv, plane_config); - if (!vma) - return false; - - mode_cmd.pixel_format = fb->format->format; - mode_cmd.width = fb->width; - mode_cmd.height = fb->height; - mode_cmd.pitches[0] = fb->pitches[0]; - mode_cmd.modifier[0] = fb->modifier; - mode_cmd.flags = DRM_MODE_FB_MODIFIERS; - - if (intel_framebuffer_init(to_intel_framebuffer(fb), - vma->obj, &mode_cmd)) { - drm_dbg_kms(&dev_priv->drm, "intel fb init failed\n"); - goto err_vma; - } - - plane_config->vma = vma; - return true; - -err_vma: - i915_vma_put(vma); - return false; -} - static void intel_set_plane_visible(struct intel_crtc_state *crtc_state, struct intel_plane_state *plane_state, @@ -1393,8 +1086,8 @@ static void fixup_plane_bitmasks(struct intel_crtc_state *crtc_state) } } -static void intel_plane_disable_noatomic(struct intel_crtc *crtc, - struct intel_plane *plane) +void intel_plane_disable_noatomic(struct intel_crtc *crtc, + struct intel_plane *plane) { struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); struct intel_crtc_state *crtc_state = @@ -1439,123 +1132,6 @@ static void intel_plane_disable_noatomic(struct intel_crtc *crtc, intel_wait_for_vblank(dev_priv, crtc->pipe); } -static bool -intel_reuse_initial_plane_obj(struct drm_i915_private *i915, - const struct intel_initial_plane_config *plane_config, - struct drm_framebuffer **fb, - struct i915_vma **vma) -{ - struct intel_crtc *crtc; - - for_each_intel_crtc(&i915->drm, crtc) { - struct intel_crtc_state *crtc_state = - to_intel_crtc_state(crtc->base.state); - struct intel_plane *plane = - to_intel_plane(crtc->base.primary); - struct intel_plane_state *plane_state = - to_intel_plane_state(plane->base.state); - - if (!crtc_state->uapi.active) - continue; - - if (!plane_state->ggtt_vma) - continue; - - if (intel_plane_ggtt_offset(plane_state) == plane_config->base) { - *fb = plane_state->hw.fb; - *vma = plane_state->ggtt_vma; - return true; - } - } - - return false; -} - -static void -intel_find_initial_plane_obj(struct intel_crtc *crtc, - struct intel_initial_plane_config *plane_config) -{ - struct drm_device *dev = crtc->base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct intel_crtc_state *crtc_state = - to_intel_crtc_state(crtc->base.state); - struct intel_plane 
*plane = - to_intel_plane(crtc->base.primary); - struct intel_plane_state *plane_state = - to_intel_plane_state(plane->base.state); - struct drm_framebuffer *fb; - struct i915_vma *vma; - - /* - * TODO: - * Disable planes if get_initial_plane_config() failed. - * Make sure things work if the surface base is not page aligned. - */ - if (!plane_config->fb) - return; - - if (intel_alloc_initial_plane_obj(crtc, plane_config)) { - fb = &plane_config->fb->base; - vma = plane_config->vma; - goto valid_fb; - } - - /* - * Failed to alloc the obj, check to see if we should share - * an fb with another CRTC instead - */ - if (intel_reuse_initial_plane_obj(dev_priv, plane_config, &fb, &vma)) - goto valid_fb; - - /* - * We've failed to reconstruct the BIOS FB. Current display state - * indicates that the primary plane is visible, but has a NULL FB, - * which will lead to problems later if we don't fix it up. The - * simplest solution is to just disable the primary plane now and - * pretend the BIOS never had it enabled. - */ - intel_plane_disable_noatomic(crtc, plane); - if (crtc_state->bigjoiner) { - struct intel_crtc *slave = - crtc_state->bigjoiner_linked_crtc; - intel_plane_disable_noatomic(slave, to_intel_plane(slave->base.primary)); - } - - return; - -valid_fb: - plane_state->uapi.rotation = plane_config->rotation; - intel_fb_fill_view(to_intel_framebuffer(fb), - plane_state->uapi.rotation, &plane_state->view); - - __i915_vma_pin(vma); - plane_state->ggtt_vma = i915_vma_get(vma); - if (intel_plane_uses_fence(plane_state) && - i915_vma_pin_fence(vma) == 0 && vma->fence) - plane_state->flags |= PLANE_HAS_FENCE; - - plane_state->uapi.src_x = 0; - plane_state->uapi.src_y = 0; - plane_state->uapi.src_w = fb->width << 16; - plane_state->uapi.src_h = fb->height << 16; - - plane_state->uapi.crtc_x = 0; - plane_state->uapi.crtc_y = 0; - plane_state->uapi.crtc_w = fb->width; - plane_state->uapi.crtc_h = fb->height; - - if (plane_config->tiling) - dev_priv->preserve_bios_swizzle = true; - - plane_state->uapi.fb = fb; - drm_framebuffer_get(fb); - - plane_state->uapi.crtc = &crtc->base; - intel_plane_copy_uapi_to_hw_state(plane_state, plane_state, crtc); - - atomic_or(plane->frontbuffer_bit, &to_intel_frontbuffer(fb)->bits); -} - unsigned int intel_plane_fence_y_offset(const struct intel_plane_state *plane_state) { @@ -2313,6 +1889,33 @@ static bool needs_cursorclk_wa(const struct intel_crtc_state *crtc_state) return false; } +static void intel_async_flip_vtd_wa(struct drm_i915_private *i915, + enum pipe pipe, bool enable) +{ + if (DISPLAY_VER(i915) == 9) { + /* + * "Plane N strech max must be programmed to 11b (x1) + * when Async flips are enabled on that plane." + */ + intel_de_rmw(i915, CHICKEN_PIPESL_1(pipe), + SKL_PLANE1_STRETCH_MAX_MASK, + enable ? SKL_PLANE1_STRETCH_MAX_X1 : SKL_PLANE1_STRETCH_MAX_X8); + } else { + /* Also needed on HSW/BDW albeit undocumented */ + intel_de_rmw(i915, CHICKEN_PIPESL_1(pipe), + HSW_PRI_STRETCH_MAX_MASK, + enable ? 
HSW_PRI_STRETCH_MAX_X1 : HSW_PRI_STRETCH_MAX_X8); + } +} + +static bool needs_async_flip_vtd_wa(const struct intel_crtc_state *crtc_state) +{ + struct drm_i915_private *i915 = to_i915(crtc_state->uapi.crtc->dev); + + return crtc_state->uapi.async_flip && intel_vtd_active() && + (DISPLAY_VER(i915) == 9 || IS_BROADWELL(i915) || IS_HASWELL(i915)); +} + static bool planes_enabling(const struct intel_crtc_state *old_crtc_state, const struct intel_crtc_state *new_crtc_state) { @@ -2348,6 +1951,10 @@ static void intel_post_plane_update(struct intel_atomic_state *state, intel_fbc_post_update(state, crtc); intel_drrs_page_flip(state, crtc); + if (needs_async_flip_vtd_wa(old_crtc_state) && + !needs_async_flip_vtd_wa(new_crtc_state)) + intel_async_flip_vtd_wa(dev_priv, pipe, false); + if (needs_nv12_wa(old_crtc_state) && !needs_nv12_wa(new_crtc_state)) skl_wa_827(dev_priv, pipe, false); @@ -2446,6 +2053,10 @@ static void intel_pre_plane_update(struct intel_atomic_state *state, if (intel_fbc_pre_update(state, crtc)) intel_wait_for_vblank(dev_priv, pipe); + if (!needs_async_flip_vtd_wa(old_crtc_state) && + needs_async_flip_vtd_wa(new_crtc_state)) + intel_async_flip_vtd_wa(dev_priv, pipe, true); + /* Display WA 827 */ if (!needs_nv12_wa(old_crtc_state) && needs_nv12_wa(new_crtc_state)) @@ -10478,279 +10089,6 @@ static int intel_atomic_commit(struct drm_device *dev, return 0; } -struct wait_rps_boost { - struct wait_queue_entry wait; - - struct drm_crtc *crtc; - struct i915_request *request; -}; - -static int do_rps_boost(struct wait_queue_entry *_wait, - unsigned mode, int sync, void *key) -{ - struct wait_rps_boost *wait = container_of(_wait, typeof(*wait), wait); - struct i915_request *rq = wait->request; - - /* - * If we missed the vblank, but the request is already running it - * is reasonable to assume that it will complete before the next - * vblank without our intervention, so leave RPS alone. 
- */ - if (!i915_request_started(rq)) - intel_rps_boost(rq); - i915_request_put(rq); - - drm_crtc_vblank_put(wait->crtc); - - list_del(&wait->wait.entry); - kfree(wait); - return 1; -} - -static void add_rps_boost_after_vblank(struct drm_crtc *crtc, - struct dma_fence *fence) -{ - struct wait_rps_boost *wait; - - if (!dma_fence_is_i915(fence)) - return; - - if (DISPLAY_VER(to_i915(crtc->dev)) < 6) - return; - - if (drm_crtc_vblank_get(crtc)) - return; - - wait = kmalloc(sizeof(*wait), GFP_KERNEL); - if (!wait) { - drm_crtc_vblank_put(crtc); - return; - } - - wait->request = to_request(dma_fence_get(fence)); - wait->crtc = crtc; - - wait->wait.func = do_rps_boost; - wait->wait.flags = 0; - - add_wait_queue(drm_crtc_vblank_waitqueue(crtc), &wait->wait); -} - -int intel_plane_pin_fb(struct intel_plane_state *plane_state) -{ - struct intel_plane *plane = to_intel_plane(plane_state->uapi.plane); - struct drm_i915_private *dev_priv = to_i915(plane->base.dev); - struct drm_framebuffer *fb = plane_state->hw.fb; - struct i915_vma *vma; - bool phys_cursor = - plane->id == PLANE_CURSOR && - INTEL_INFO(dev_priv)->display.cursor_needs_physical; - - if (!intel_fb_uses_dpt(fb)) { - vma = intel_pin_and_fence_fb_obj(fb, phys_cursor, - &plane_state->view.gtt, - intel_plane_uses_fence(plane_state), - &plane_state->flags); - if (IS_ERR(vma)) - return PTR_ERR(vma); - - plane_state->ggtt_vma = vma; - } else { - struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); - - vma = intel_dpt_pin(intel_fb->dpt_vm); - if (IS_ERR(vma)) - return PTR_ERR(vma); - - plane_state->ggtt_vma = vma; - - vma = intel_pin_fb_obj_dpt(fb, &plane_state->view.gtt, false, - &plane_state->flags, intel_fb->dpt_vm); - if (IS_ERR(vma)) { - intel_dpt_unpin(intel_fb->dpt_vm); - plane_state->ggtt_vma = NULL; - return PTR_ERR(vma); - } - - plane_state->dpt_vma = vma; - - WARN_ON(plane_state->ggtt_vma == plane_state->dpt_vma); - } - - return 0; -} - -void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state) -{ - struct drm_framebuffer *fb = old_plane_state->hw.fb; - struct i915_vma *vma; - - if (!intel_fb_uses_dpt(fb)) { - vma = fetch_and_zero(&old_plane_state->ggtt_vma); - if (vma) - intel_unpin_fb_vma(vma, old_plane_state->flags); - } else { - struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); - - vma = fetch_and_zero(&old_plane_state->dpt_vma); - if (vma) - intel_unpin_fb_vma(vma, old_plane_state->flags); - - vma = fetch_and_zero(&old_plane_state->ggtt_vma); - if (vma) - intel_dpt_unpin(intel_fb->dpt_vm); - } -} - -/** - * intel_prepare_plane_fb - Prepare fb for usage on plane - * @_plane: drm plane to prepare for - * @_new_plane_state: the plane state being prepared - * - * Prepares a framebuffer for usage on a display plane. Generally this - * involves pinning the underlying object and updating the frontbuffer tracking - * bits. Some older platforms need special physical address handling for - * cursor planes. - * - * Returns 0 on success, negative error code on failure. 
- */ -int -intel_prepare_plane_fb(struct drm_plane *_plane, - struct drm_plane_state *_new_plane_state) -{ - struct i915_sched_attr attr = { .priority = I915_PRIORITY_DISPLAY }; - struct intel_plane *plane = to_intel_plane(_plane); - struct intel_plane_state *new_plane_state = - to_intel_plane_state(_new_plane_state); - struct intel_atomic_state *state = - to_intel_atomic_state(new_plane_state->uapi.state); - struct drm_i915_private *dev_priv = to_i915(plane->base.dev); - const struct intel_plane_state *old_plane_state = - intel_atomic_get_old_plane_state(state, plane); - struct drm_i915_gem_object *obj = intel_fb_obj(new_plane_state->hw.fb); - struct drm_i915_gem_object *old_obj = intel_fb_obj(old_plane_state->hw.fb); - int ret; - - if (old_obj) { - const struct intel_crtc_state *crtc_state = - intel_atomic_get_new_crtc_state(state, - to_intel_crtc(old_plane_state->hw.crtc)); - - /* Big Hammer, we also need to ensure that any pending - * MI_WAIT_FOR_EVENT inside a user batch buffer on the - * current scanout is retired before unpinning the old - * framebuffer. Note that we rely on userspace rendering - * into the buffer attached to the pipe they are waiting - * on. If not, userspace generates a GPU hang with IPEHR - * point to the MI_WAIT_FOR_EVENT. - * - * This should only fail upon a hung GPU, in which case we - * can safely continue. - */ - if (intel_crtc_needs_modeset(crtc_state)) { - ret = i915_sw_fence_await_reservation(&state->commit_ready, - old_obj->base.resv, NULL, - false, 0, - GFP_KERNEL); - if (ret < 0) - return ret; - } - } - - if (new_plane_state->uapi.fence) { /* explicit fencing */ - i915_gem_fence_wait_priority(new_plane_state->uapi.fence, - &attr); - ret = i915_sw_fence_await_dma_fence(&state->commit_ready, - new_plane_state->uapi.fence, - i915_fence_timeout(dev_priv), - GFP_KERNEL); - if (ret < 0) - return ret; - } - - if (!obj) - return 0; - - - ret = intel_plane_pin_fb(new_plane_state); - if (ret) - return ret; - - i915_gem_object_wait_priority(obj, 0, &attr); - - if (!new_plane_state->uapi.fence) { /* implicit fencing */ - struct dma_fence *fence; - - ret = i915_sw_fence_await_reservation(&state->commit_ready, - obj->base.resv, NULL, - false, - i915_fence_timeout(dev_priv), - GFP_KERNEL); - if (ret < 0) - goto unpin_fb; - - fence = dma_resv_get_excl_unlocked(obj->base.resv); - if (fence) { - add_rps_boost_after_vblank(new_plane_state->hw.crtc, - fence); - dma_fence_put(fence); - } - } else { - add_rps_boost_after_vblank(new_plane_state->hw.crtc, - new_plane_state->uapi.fence); - } - - /* - * We declare pageflips to be interactive and so merit a small bias - * towards upclocking to deliver the frame on time. By only changing - * the RPS thresholds to sample more regularly and aim for higher - * clocks we can hopefully deliver low power workloads (like kodi) - * that are not quite steady state without resorting to forcing - * maximum clocks following a vblank miss (see do_rps_boost()). - */ - if (!state->rps_interactive) { - intel_rps_mark_interactive(&dev_priv->gt.rps, true); - state->rps_interactive = true; - } - - return 0; - -unpin_fb: - intel_plane_unpin_fb(new_plane_state); - - return ret; -} - -/** - * intel_cleanup_plane_fb - Cleans up an fb after plane use - * @plane: drm plane to clean up for - * @_old_plane_state: the state from the previous modeset - * - * Cleans up a framebuffer that has just been removed from a plane. 
- */ -void -intel_cleanup_plane_fb(struct drm_plane *plane, - struct drm_plane_state *_old_plane_state) -{ - struct intel_plane_state *old_plane_state = - to_intel_plane_state(_old_plane_state); - struct intel_atomic_state *state = - to_intel_atomic_state(old_plane_state->uapi.state); - struct drm_i915_private *dev_priv = to_i915(plane->dev); - struct drm_i915_gem_object *obj = intel_fb_obj(old_plane_state->hw.fb); - - if (!obj) - return; - - if (state->rps_interactive) { - intel_rps_mark_interactive(&dev_priv->gt.rps, false); - state->rps_interactive = false; - } - - /* Should only be called after a successful intel_prepare_plane_fb()! */ - intel_plane_unpin_fb(old_plane_state); -} - /** * intel_plane_destroy - destroy a plane * @plane: plane to destroy @@ -11580,22 +10918,6 @@ static void intel_mode_config_cleanup(struct drm_i915_private *i915) drm_mode_config_cleanup(&i915->drm); } -static void plane_config_fini(struct intel_initial_plane_config *plane_config) -{ - if (plane_config->fb) { - struct drm_framebuffer *fb = &plane_config->fb->base; - - /* We may only have the stub and not a full framebuffer */ - if (drm_framebuffer_read_refcount(fb)) - drm_framebuffer_put(fb); - else - kfree(fb); - } - - if (plane_config->vma) - i915_vma_put(plane_config->vma); -} - /* part #1: call before irq install */ int intel_modeset_init_noirq(struct drm_i915_private *i915) { @@ -11728,27 +11050,9 @@ int intel_modeset_init_nogem(struct drm_i915_private *i915) drm_modeset_unlock_all(dev); for_each_intel_crtc(dev, crtc) { - struct intel_initial_plane_config plane_config = {}; - if (!to_intel_crtc_state(crtc->base.state)->uapi.active) continue; - - /* - * Note that reserving the BIOS fb up front prevents us - * from stuffing other stolen allocations like the ring - * on top. This prevents some ugliness at boot time, and - * can even allow for smooth boot transitions if the BIOS - * fb is large enough for the active pipe configuration. - */ - i915->display->get_initial_plane_config(crtc, &plane_config); - - /* - * If the fb is shared between multiple heads, we'll - * just get the first one. 
- */ - intel_find_initial_plane_obj(crtc, &plane_config); - - plane_config_fini(&plane_config); + intel_crtc_initial_plane_config(crtc); } /* diff --git a/drivers/gpu/drm/i915/display/intel_display.h b/drivers/gpu/drm/i915/display/intel_display.h index 3028072c2cf3..0c76bf57f86b 100644 --- a/drivers/gpu/drm/i915/display/intel_display.h +++ b/drivers/gpu/drm/i915/display/intel_display.h @@ -576,19 +576,9 @@ int intel_get_load_detect_pipe(struct drm_connector *connector, void intel_release_load_detect_pipe(struct drm_connector *connector, struct intel_load_detect_pipe *old, struct drm_modeset_acquire_ctx *ctx); -struct i915_vma * -intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, bool phys_cursor, - const struct i915_ggtt_view *view, - bool uses_fence, - unsigned long *out_flags); -void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags); struct drm_framebuffer * intel_framebuffer_create(struct drm_i915_gem_object *obj, struct drm_mode_fb_cmd2 *mode_cmd); -int intel_prepare_plane_fb(struct drm_plane *plane, - struct drm_plane_state *new_state); -void intel_cleanup_plane_fb(struct drm_plane *plane, - struct drm_plane_state *old_state); void assert_pch_transcoder_disabled(struct drm_i915_private *dev_priv, enum pipe pipe); @@ -619,15 +609,16 @@ void ilk_pfit_disable(const struct intel_crtc_state *old_crtc_state); int bdw_get_pipemisc_bpp(struct intel_crtc *crtc); unsigned int intel_plane_fence_y_offset(const struct intel_plane_state *plane_state); +bool intel_plane_uses_fence(const struct intel_plane_state *plane_state); bool intel_format_info_is_yuv_semiplanar(const struct drm_format_info *info, u64 modifier); -int intel_plane_pin_fb(struct intel_plane_state *plane_state); -void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state); struct intel_encoder * intel_get_crtc_new_encoder(const struct intel_atomic_state *state, const struct intel_crtc_state *crtc_state); +void intel_plane_disable_noatomic(struct intel_crtc *crtc, + struct intel_plane *plane); void intel_display_driver_register(struct drm_i915_private *i915); void intel_display_driver_unregister(struct drm_i915_private *i915); diff --git a/drivers/gpu/drm/i915/display/intel_display_debugfs.c b/drivers/gpu/drm/i915/display/intel_display_debugfs.c index 309d74fd86ce..e04767695530 100644 --- a/drivers/gpu/drm/i915/display/intel_display_debugfs.c +++ b/drivers/gpu/drm/i915/display/intel_display_debugfs.c @@ -7,19 +7,19 @@ #include <drm/drm_fourcc.h> #include "i915_debugfs.h" +#include "intel_de.h" #include "intel_display_debugfs.h" #include "intel_display_power.h" -#include "intel_de.h" #include "intel_display_types.h" #include "intel_dmc.h" #include "intel_dp.h" +#include "intel_dp_mst.h" #include "intel_drrs.h" #include "intel_fbc.h" #include "intel_hdcp.h" #include "intel_hdmi.h" #include "intel_pm.h" #include "intel_psr.h" -#include "intel_sideband.h" #include "intel_sprite.h" static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) @@ -1379,7 +1379,7 @@ static int i915_dp_mst_info(struct seq_file *m, void *unused) continue; dig_port = enc_to_dig_port(intel_encoder); - if (!dig_port->dp.can_mst) + if (!intel_dp_mst_source_support(&dig_port->dp)) continue; seq_printf(m, "MST Source Port [ENCODER:%d:%s]\n", diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 06e9879aedd7..1672604f9ef7 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -3,12 +3,11 
@@ * Copyright © 2019 Intel Corporation */ -#include "display/intel_crt.h" - #include "i915_drv.h" #include "i915_irq.h" #include "intel_cdclk.h" #include "intel_combo_phy.h" +#include "intel_crt.h" #include "intel_de.h" #include "intel_display_power.h" #include "intel_display_types.h" @@ -16,12 +15,13 @@ #include "intel_dpio_phy.h" #include "intel_dpll.h" #include "intel_hotplug.h" +#include "intel_pcode.h" #include "intel_pm.h" #include "intel_pps.h" -#include "intel_sideband.h" #include "intel_snps_phy.h" #include "intel_tc.h" #include "intel_vga.h" +#include "vlv_sideband.h" bool intel_display_power_well_is_enabled(struct drm_i915_private *dev_priv, enum i915_power_well_id power_well_id); diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 21ce8bccc645..39e11eaec1a3 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -1580,7 +1580,6 @@ struct intel_dp { struct intel_pps pps; - bool can_mst; /* this port supports mst */ bool is_mst; int active_mst_links; diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 74a657ae131a..9d8132dd4cc5 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -66,7 +66,6 @@ #include "intel_panel.h" #include "intel_pps.h" #include "intel_psr.h" -#include "intel_sideband.h" #include "intel_tc.h" #include "intel_vdsc.h" #include "intel_vrr.h" @@ -140,6 +139,9 @@ static void intel_dp_set_sink_rates(struct intel_dp *intel_dp) return; } + /* + * Sink rates for 8b/10b. + */ max_rate = drm_dp_bw_code_to_link_rate(intel_dp->dpcd[DP_MAX_LINK_RATE]); max_lttpr_rate = drm_dp_lttpr_max_link_rate(intel_dp->lttpr_common_caps); if (max_lttpr_rate) @@ -163,6 +165,21 @@ static void intel_dp_set_sink_rates(struct intel_dp *intel_dp) drm_dp_dpcd_readb(&intel_dp->aux, DP_128B132B_SUPPORTED_LINK_RATES, &uhbr_rates); + if (drm_dp_lttpr_count(intel_dp->lttpr_common_caps)) { + /* We have a repeater */ + if (intel_dp->lttpr_common_caps[0] >= 0x20 && + intel_dp->lttpr_common_caps[DP_MAIN_LINK_CHANNEL_CODING_PHY_REPEATER - + DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV] & + DP_PHY_REPEATER_128B132B_SUPPORTED) { + /* Repeater supports 128b/132b, valid UHBR rates */ + uhbr_rates &= intel_dp->lttpr_common_caps[DP_PHY_REPEATER_128B132B_RATES - + DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV]; + } else { + /* Does not support 128b/132b */ + uhbr_rates = 0; + } + } + if (uhbr_rates & DP_UHBR10) intel_dp->sink_rates[i++] = 1000000; if (uhbr_rates & DP_UHBR13_5) @@ -2649,7 +2666,7 @@ intel_dp_can_mst(struct intel_dp *intel_dp) struct drm_i915_private *i915 = dp_to_i915(intel_dp); return i915->params.enable_dp_mst && - intel_dp->can_mst && + intel_dp_mst_source_support(intel_dp) && drm_dp_read_mst_cap(&intel_dp->aux, intel_dp->dpcd); } @@ -2664,10 +2681,10 @@ intel_dp_configure_mst(struct intel_dp *intel_dp) drm_dbg_kms(&i915->drm, "[ENCODER:%d:%s] MST support: port: %s, sink: %s, modparam: %s\n", encoder->base.base.id, encoder->base.name, - yesno(intel_dp->can_mst), yesno(sink_can_mst), + yesno(intel_dp_mst_source_support(intel_dp)), yesno(sink_can_mst), yesno(i915->params.enable_dp_mst)); - if (!intel_dp->can_mst) + if (!intel_dp_mst_source_support(intel_dp)) return; intel_dp->is_mst = sink_can_mst && @@ -5067,7 +5084,7 @@ void intel_dp_mst_suspend(struct drm_i915_private *dev_priv) intel_dp = enc_to_intel_dp(encoder); - if (!intel_dp->can_mst) + 
if (!intel_dp_mst_source_support(intel_dp)) continue; if (intel_dp->is_mst) @@ -5091,7 +5108,7 @@ void intel_dp_mst_resume(struct drm_i915_private *dev_priv) intel_dp = enc_to_intel_dp(encoder); - if (!intel_dp->can_mst) + if (!intel_dp_mst_source_support(intel_dp)) continue; ret = drm_dp_mst_topology_mgr_resume(&intel_dp->mst_mgr, diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c index c052ce14894d..85676c953e0a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c +++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c @@ -25,15 +25,6 @@ #include "intel_dp.h" #include "intel_dp_link_training.h" -static void -intel_dp_dump_link_status(struct drm_device *drm, - const u8 link_status[DP_LINK_STATUS_SIZE]) -{ - drm_dbg_kms(drm, - "ln0_1:0x%x ln2_3:0x%x align:0x%x sink:0x%x adj_req0_1:0x%x adj_req2_3:0x%x\n", - link_status[0], link_status[1], link_status[2], - link_status[3], link_status[4], link_status[5]); -} static void intel_dp_reset_lttpr_common_caps(struct intel_dp *intel_dp) { @@ -66,6 +57,7 @@ static u8 *intel_dp_lttpr_phy_caps(struct intel_dp *intel_dp, static void intel_dp_read_lttpr_phy_caps(struct intel_dp *intel_dp, enum drm_dp_phy dp_phy) { + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; u8 *phy_caps = intel_dp_lttpr_phy_caps(intel_dp, dp_phy); char phy_name[10]; @@ -73,21 +65,22 @@ static void intel_dp_read_lttpr_phy_caps(struct intel_dp *intel_dp, if (drm_dp_read_lttpr_phy_caps(&intel_dp->aux, dp_phy, phy_caps) < 0) { drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "failed to read the PHY caps for %s\n", - phy_name); + "[ENCODER:%d:%s][%s] failed to read the PHY caps\n", + encoder->base.base.id, encoder->base.name, phy_name); return; } drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "%s PHY capabilities: %*ph\n", - phy_name, + "[ENCODER:%d:%s][%s] PHY capabilities: %*ph\n", + encoder->base.base.id, encoder->base.name, phy_name, (int)sizeof(intel_dp->lttpr_phy_caps[0]), phy_caps); } static bool intel_dp_read_lttpr_common_caps(struct intel_dp *intel_dp) { - struct drm_i915_private *i915 = dp_to_i915(intel_dp); + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); if (intel_dp_is_edp(intel_dp)) return false; @@ -104,7 +97,8 @@ static bool intel_dp_read_lttpr_common_caps(struct intel_dp *intel_dp) goto reset_caps; drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "LTTPR common capabilities: %*ph\n", + "[ENCODER:%d:%s] LTTPR common capabilities: %*ph\n", + encoder->base.base.id, encoder->base.name, (int)sizeof(intel_dp->lttpr_common_caps), intel_dp->lttpr_common_caps); @@ -130,6 +124,8 @@ intel_dp_set_lttpr_transparent_mode(struct intel_dp *intel_dp, bool enable) static int intel_dp_init_lttpr(struct intel_dp *intel_dp) { + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); int lttpr_count; int i; @@ -161,8 +157,9 @@ static int intel_dp_init_lttpr(struct intel_dp *intel_dp) return 0; if (!intel_dp_set_lttpr_transparent_mode(intel_dp, false)) { - drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "Switching to LTTPR non-transparent LT mode failed, fall-back to transparent mode\n"); + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s] Switching to LTTPR non-transparent LT mode failed, fall-back to transparent mode\n", + encoder->base.base.id, encoder->base.name); intel_dp_set_lttpr_transparent_mode(intel_dp, true); intel_dp_reset_lttpr_count(intel_dp); @@ -307,11 
+304,32 @@ static bool has_per_lane_signal_levels(struct intel_dp *intel_dp, return !intel_dp_phy_is_downstream_of_source(intel_dp, dp_phy); } -static u8 intel_dp_get_lane_adjust_train(struct intel_dp *intel_dp, - const struct intel_crtc_state *crtc_state, - enum drm_dp_phy dp_phy, - const u8 link_status[DP_LINK_STATUS_SIZE], - int lane) +/* 128b/132b */ +static u8 intel_dp_get_lane_adjust_tx_ffe_preset(struct intel_dp *intel_dp, + const struct intel_crtc_state *crtc_state, + enum drm_dp_phy dp_phy, + const u8 link_status[DP_LINK_STATUS_SIZE], + int lane) +{ + u8 tx_ffe = 0; + + if (has_per_lane_signal_levels(intel_dp, dp_phy)) { + lane = min(lane, crtc_state->lane_count - 1); + tx_ffe = drm_dp_get_adjust_tx_ffe_preset(link_status, lane); + } else { + for (lane = 0; lane < crtc_state->lane_count; lane++) + tx_ffe = max(tx_ffe, drm_dp_get_adjust_tx_ffe_preset(link_status, lane)); + } + + return tx_ffe; +} + +/* 8b/10b */ +static u8 intel_dp_get_lane_adjust_vswing_preemph(struct intel_dp *intel_dp, + const struct intel_crtc_state *crtc_state, + enum drm_dp_phy dp_phy, + const u8 link_status[DP_LINK_STATUS_SIZE], + int lane) { u8 v = 0; u8 p = 0; @@ -343,14 +361,72 @@ static u8 intel_dp_get_lane_adjust_train(struct intel_dp *intel_dp, return v | p; } +static u8 intel_dp_get_lane_adjust_train(struct intel_dp *intel_dp, + const struct intel_crtc_state *crtc_state, + enum drm_dp_phy dp_phy, + const u8 link_status[DP_LINK_STATUS_SIZE], + int lane) +{ + if (intel_dp_is_uhbr(crtc_state)) + return intel_dp_get_lane_adjust_tx_ffe_preset(intel_dp, crtc_state, + dp_phy, link_status, lane); + else + return intel_dp_get_lane_adjust_vswing_preemph(intel_dp, crtc_state, + dp_phy, link_status, lane); +} + +#define TRAIN_REQ_FMT "%d/%d/%d/%d" +#define _TRAIN_REQ_VSWING_ARGS(link_status, lane) \ + (drm_dp_get_adjust_request_voltage((link_status), (lane)) >> DP_TRAIN_VOLTAGE_SWING_SHIFT) +#define TRAIN_REQ_VSWING_ARGS(link_status) \ + _TRAIN_REQ_VSWING_ARGS(link_status, 0), \ + _TRAIN_REQ_VSWING_ARGS(link_status, 1), \ + _TRAIN_REQ_VSWING_ARGS(link_status, 2), \ + _TRAIN_REQ_VSWING_ARGS(link_status, 3) +#define _TRAIN_REQ_PREEMPH_ARGS(link_status, lane) \ + (drm_dp_get_adjust_request_pre_emphasis((link_status), (lane)) >> DP_TRAIN_PRE_EMPHASIS_SHIFT) +#define TRAIN_REQ_PREEMPH_ARGS(link_status) \ + _TRAIN_REQ_PREEMPH_ARGS(link_status, 0), \ + _TRAIN_REQ_PREEMPH_ARGS(link_status, 1), \ + _TRAIN_REQ_PREEMPH_ARGS(link_status, 2), \ + _TRAIN_REQ_PREEMPH_ARGS(link_status, 3) +#define _TRAIN_REQ_TX_FFE_ARGS(link_status, lane) \ + drm_dp_get_adjust_tx_ffe_preset((link_status), (lane)) +#define TRAIN_REQ_TX_FFE_ARGS(link_status) \ + _TRAIN_REQ_TX_FFE_ARGS(link_status, 0), \ + _TRAIN_REQ_TX_FFE_ARGS(link_status, 1), \ + _TRAIN_REQ_TX_FFE_ARGS(link_status, 2), \ + _TRAIN_REQ_TX_FFE_ARGS(link_status, 3) + void intel_dp_get_adjust_train(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, enum drm_dp_phy dp_phy, const u8 link_status[DP_LINK_STATUS_SIZE]) { + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); + char phy_name[10]; int lane; + if (intel_dp_is_uhbr(crtc_state)) { + drm_dbg_kms(&i915->drm, "[ENCODER:%d:%s][%s] 128b/132b, lanes: %d, " + "TX FFE request: " TRAIN_REQ_FMT "\n", + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), + crtc_state->lane_count, + TRAIN_REQ_TX_FFE_ARGS(link_status)); + } else { + drm_dbg_kms(&i915->drm, "[ENCODER:%d:%s][%s] 8b/10b, lanes: %d, " + 
"vswing request: " TRAIN_REQ_FMT ", " + "pre-emphasis request: " TRAIN_REQ_FMT "\n", + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), + crtc_state->lane_count, + TRAIN_REQ_VSWING_ARGS(link_status), + TRAIN_REQ_PREEMPH_ARGS(link_status)); + } + for (lane = 0; lane < 4; lane++) intel_dp->train_set[lane] = intel_dp_get_lane_adjust_train(intel_dp, crtc_state, @@ -376,7 +452,7 @@ intel_dp_set_link_train(struct intel_dp *intel_dp, int len; intel_dp_program_link_training_pattern(intel_dp, crtc_state, - dp_train_pat); + dp_phy, dp_train_pat); buf[0] = dp_train_pat; /* DP_TRAINING_LANEx_SET follow DP_TRAINING_PATTERN_SET */ @@ -404,16 +480,19 @@ static char dp_training_pattern_name(u8 train_pat) void intel_dp_program_link_training_pattern(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, + enum drm_dp_phy dp_phy, u8 dp_train_pat) { struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; - struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct drm_i915_private *i915 = to_i915(encoder->base.dev); u8 train_pat = intel_dp_training_pattern_symbol(dp_train_pat); + char phy_name[10]; if (train_pat != DP_TRAINING_PATTERN_DISABLE) - drm_dbg_kms(&dev_priv->drm, - "[ENCODER:%d:%s] Using DP training pattern TPS%c\n", + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s][%s] Using DP training pattern TPS%c\n", encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), dp_training_pattern_name(train_pat)); intel_dp->set_link_train(intel_dp, crtc_state, dp_train_pat); @@ -436,23 +515,39 @@ intel_dp_program_link_training_pattern(struct intel_dp *intel_dp, _TRAIN_SET_PREEMPH_ARGS((train_set)[1]), \ _TRAIN_SET_PREEMPH_ARGS((train_set)[2]), \ _TRAIN_SET_PREEMPH_ARGS((train_set)[3]) +#define _TRAIN_SET_TX_FFE_ARGS(train_set) \ + ((train_set) & DP_TX_FFE_PRESET_VALUE_MASK), "" +#define TRAIN_SET_TX_FFE_ARGS(train_set) \ + _TRAIN_SET_TX_FFE_ARGS((train_set)[0]), \ + _TRAIN_SET_TX_FFE_ARGS((train_set)[1]), \ + _TRAIN_SET_TX_FFE_ARGS((train_set)[2]), \ + _TRAIN_SET_TX_FFE_ARGS((train_set)[3]) void intel_dp_set_signal_levels(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, enum drm_dp_phy dp_phy) { struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; - struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct drm_i915_private *i915 = to_i915(encoder->base.dev); char phy_name[10]; - drm_dbg_kms(&dev_priv->drm, "[ENCODER:%d:%s] lanes: %d, " - "vswing levels: " TRAIN_SET_FMT ", " - "pre-emphasis levels: " TRAIN_SET_FMT ", at %s\n", - encoder->base.base.id, encoder->base.name, - crtc_state->lane_count, - TRAIN_SET_VSWING_ARGS(intel_dp->train_set), - TRAIN_SET_PREEMPH_ARGS(intel_dp->train_set), - intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name))); + if (intel_dp_is_uhbr(crtc_state)) { + drm_dbg_kms(&i915->drm, "[ENCODER:%d:%s][%s] 128b/132b, lanes: %d, " + "TX FFE presets: " TRAIN_SET_FMT "\n", + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), + crtc_state->lane_count, + TRAIN_SET_TX_FFE_ARGS(intel_dp->train_set)); + } else { + drm_dbg_kms(&i915->drm, "[ENCODER:%d:%s][%s] 8b/10b, lanes: %d, " + "vswing levels: " TRAIN_SET_FMT ", " + "pre-emphasis levels: " TRAIN_SET_FMT "\n", + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), + crtc_state->lane_count, + TRAIN_SET_VSWING_ARGS(intel_dp->train_set), + TRAIN_SET_PREEMPH_ARGS(intel_dp->train_set)); + 
} if (intel_dp_phy_is_downstream_of_source(intel_dp, dp_phy)) encoder->set_signal_levels(encoder, crtc_state); @@ -487,15 +582,55 @@ intel_dp_update_link_train(struct intel_dp *intel_dp, return ret == crtc_state->lane_count; } +/* 128b/132b */ +static bool intel_dp_lane_max_tx_ffe_reached(u8 train_set_lane) +{ + return (train_set_lane & DP_TX_FFE_PRESET_VALUE_MASK) == + DP_TX_FFE_PRESET_VALUE_MASK; +} + +/* + * 8b/10b + * + * FIXME: The DP spec is very confusing here, also the Link CTS spec seems to + * have self contradicting tests around this area. + * + * In lieu of better ideas let's just stop when we've reached the max supported + * vswing with its max pre-emphasis, which is either 2+1 or 3+0 depending on + * whether vswing level 3 is supported or not. + */ +static bool intel_dp_lane_max_vswing_reached(u8 train_set_lane) +{ + u8 v = (train_set_lane & DP_TRAIN_VOLTAGE_SWING_MASK) >> + DP_TRAIN_VOLTAGE_SWING_SHIFT; + u8 p = (train_set_lane & DP_TRAIN_PRE_EMPHASIS_MASK) >> + DP_TRAIN_PRE_EMPHASIS_SHIFT; + + if ((train_set_lane & DP_TRAIN_MAX_SWING_REACHED) == 0) + return false; + + if (v + p != 3) + return false; + + return true; +} + static bool intel_dp_link_max_vswing_reached(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state) { int lane; - for (lane = 0; lane < crtc_state->lane_count; lane++) - if ((intel_dp->train_set[lane] & - DP_TRAIN_MAX_SWING_REACHED) == 0) - return false; + for (lane = 0; lane < crtc_state->lane_count; lane++) { + u8 train_set_lane = intel_dp->train_set[lane]; + + if (intel_dp_is_uhbr(crtc_state)) { + if (!intel_dp_lane_max_tx_ffe_reached(train_set_lane)) + return false; + } else { + if (!intel_dp_lane_max_vswing_reached(train_set_lane)) + return false; + } + } return true; } @@ -508,7 +643,8 @@ static bool intel_dp_prepare_link_train(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state) { - struct drm_i915_private *i915 = dp_to_i915(intel_dp); + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); u8 link_config[2]; u8 link_bw, rate_select; @@ -520,10 +656,12 @@ intel_dp_prepare_link_train(struct intel_dp *intel_dp, if (link_bw) drm_dbg_kms(&i915->drm, - "Using LINK_BW_SET value %02x\n", link_bw); + "[ENCODER:%d:%s] Using LINK_BW_SET value %02x\n", + encoder->base.base.id, encoder->base.name, link_bw); else drm_dbg_kms(&i915->drm, - "Using LINK_RATE_SET value %02x\n", rate_select); + "[ENCODER:%d:%s] Using LINK_RATE_SET value %02x\n", + encoder->base.base.id, encoder->base.name, rate_select); /* Write the link configuration data */ link_config[0] = link_bw; @@ -554,17 +692,24 @@ static void intel_dp_link_training_clock_recovery_delay(struct intel_dp *intel_d drm_dp_lttpr_link_train_clock_recovery_delay(); } -static bool intel_dp_adjust_request_changed(int lane_count, +static bool intel_dp_adjust_request_changed(const struct intel_crtc_state *crtc_state, const u8 old_link_status[DP_LINK_STATUS_SIZE], const u8 new_link_status[DP_LINK_STATUS_SIZE]) { int lane; - for (lane = 0; lane < lane_count; lane++) { - u8 old = drm_dp_get_adjust_request_voltage(old_link_status, lane) | - drm_dp_get_adjust_request_pre_emphasis(old_link_status, lane); - u8 new = drm_dp_get_adjust_request_voltage(new_link_status, lane) | - drm_dp_get_adjust_request_pre_emphasis(new_link_status, lane); + for (lane = 0; lane < crtc_state->lane_count; lane++) { + u8 old, new; + + if (intel_dp_is_uhbr(crtc_state)) { + old = drm_dp_get_adjust_tx_ffe_preset(old_link_status, lane); + new = 
drm_dp_get_adjust_tx_ffe_preset(new_link_status, lane); + } else { + old = drm_dp_get_adjust_request_voltage(old_link_status, lane) | + drm_dp_get_adjust_request_pre_emphasis(old_link_status, lane); + new = drm_dp_get_adjust_request_voltage(new_link_status, lane) | + drm_dp_get_adjust_request_pre_emphasis(new_link_status, lane); + } if (old != new) return true; @@ -573,6 +718,22 @@ static bool intel_dp_adjust_request_changed(int lane_count, return false; } +static void +intel_dp_dump_link_status(struct intel_dp *intel_dp, enum drm_dp_phy dp_phy, + const u8 link_status[DP_LINK_STATUS_SIZE]) +{ + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); + char phy_name[10]; + + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s][%s] ln0_1:0x%x ln2_3:0x%x align:0x%x sink:0x%x adj_req0_1:0x%x adj_req2_3:0x%x\n", + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), + link_status[0], link_status[1], link_status[2], + link_status[3], link_status[4], link_status[5]); +} + /* * Perform the link training clock recovery phase on the given DP PHY using * training pattern 1. @@ -582,16 +743,22 @@ intel_dp_link_training_clock_recovery(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, enum drm_dp_phy dp_phy) { - struct drm_i915_private *i915 = dp_to_i915(intel_dp); + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); u8 old_link_status[DP_LINK_STATUS_SIZE] = {}; int voltage_tries, cr_tries, max_cr_tries; + u8 link_status[DP_LINK_STATUS_SIZE]; bool max_vswing_reached = false; + char phy_name[10]; + + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)); /* clock recovery */ if (!intel_dp_reset_link_train(intel_dp, crtc_state, dp_phy, DP_TRAINING_PATTERN_1 | DP_LINK_SCRAMBLING_DISABLE)) { - drm_err(&i915->drm, "failed to enable link training\n"); + drm_err(&i915->drm, "[ENCODER:%d:%s][%s] Failed to enable link training\n", + encoder->base.base.id, encoder->base.name, phy_name); return false; } @@ -610,29 +777,35 @@ intel_dp_link_training_clock_recovery(struct intel_dp *intel_dp, voltage_tries = 1; for (cr_tries = 0; cr_tries < max_cr_tries; ++cr_tries) { - u8 link_status[DP_LINK_STATUS_SIZE]; - intel_dp_link_training_clock_recovery_delay(intel_dp, dp_phy); if (drm_dp_dpcd_read_phy_link_status(&intel_dp->aux, dp_phy, link_status) < 0) { - drm_err(&i915->drm, "failed to get link status\n"); + drm_err(&i915->drm, "[ENCODER:%d:%s][%s] Failed to get link status\n", + encoder->base.base.id, encoder->base.name, phy_name); return false; } if (drm_dp_clock_recovery_ok(link_status, crtc_state->lane_count)) { - drm_dbg_kms(&i915->drm, "clock recovery OK\n"); + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s][%s] Clock recovery OK\n", + encoder->base.base.id, encoder->base.name, phy_name); return true; } if (voltage_tries == 5) { + intel_dp_dump_link_status(intel_dp, dp_phy, link_status); drm_dbg_kms(&i915->drm, - "Same voltage tried 5 times\n"); + "[ENCODER:%d:%s][%s] Same voltage tried 5 times\n", + encoder->base.base.id, encoder->base.name, phy_name); return false; } if (max_vswing_reached) { - drm_dbg_kms(&i915->drm, "Max Voltage Swing reached\n"); + intel_dp_dump_link_status(intel_dp, dp_phy, link_status); + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s][%s] Max Voltage Swing reached\n", + encoder->base.base.id, encoder->base.name, phy_name); return false; } @@ -641,12 +814,12 @@ 
intel_dp_link_training_clock_recovery(struct intel_dp *intel_dp, link_status); if (!intel_dp_update_link_train(intel_dp, crtc_state, dp_phy)) { drm_err(&i915->drm, - "failed to update link training\n"); + "[ENCODER:%d:%s][%s] Failed to update link training\n", + encoder->base.base.id, encoder->base.name, phy_name); return false; } - if (!intel_dp_adjust_request_changed(crtc_state->lane_count, - old_link_status, link_status)) + if (!intel_dp_adjust_request_changed(crtc_state, old_link_status, link_status)) ++voltage_tries; else voltage_tries = 1; @@ -655,10 +828,13 @@ intel_dp_link_training_clock_recovery(struct intel_dp *intel_dp, if (intel_dp_link_max_vswing_reached(intel_dp, crtc_state)) max_vswing_reached = true; - } + + intel_dp_dump_link_status(intel_dp, dp_phy, link_status); drm_err(&i915->drm, - "Failed clock recovery %d times, giving up!\n", max_cr_tries); + "[ENCODER:%d:%s][%s] Failed clock recovery %d times, giving up!\n", + encoder->base.base.id, encoder->base.name, phy_name, max_cr_tries); + return false; } @@ -742,11 +918,15 @@ intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, enum drm_dp_phy dp_phy) { - struct drm_i915_private *i915 = dp_to_i915(intel_dp); + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; + struct drm_i915_private *i915 = to_i915(encoder->base.dev); int tries; u32 training_pattern; u8 link_status[DP_LINK_STATUS_SIZE]; bool channel_eq = false; + char phy_name[10]; + + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)); training_pattern = intel_dp_training_pattern(intel_dp, crtc_state, dp_phy); /* Scrambling is disabled for TPS2/3 and enabled for TPS4 */ @@ -756,7 +936,10 @@ intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp, /* channel equalization */ if (!intel_dp_set_link_train(intel_dp, crtc_state, dp_phy, training_pattern)) { - drm_err(&i915->drm, "failed to start channel equalization\n"); + drm_err(&i915->drm, + "[ENCODER:%d:%s][%s] Failed to start channel equalization\n", + encoder->base.base.id, encoder->base.name, + phy_name); return false; } @@ -766,25 +949,28 @@ intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp, if (drm_dp_dpcd_read_phy_link_status(&intel_dp->aux, dp_phy, link_status) < 0) { drm_err(&i915->drm, - "failed to get link status\n"); + "[ENCODER:%d:%s][%s] Failed to get link status\n", + encoder->base.base.id, encoder->base.name, phy_name); break; } /* Make sure clock is still ok */ if (!drm_dp_clock_recovery_ok(link_status, crtc_state->lane_count)) { - intel_dp_dump_link_status(&i915->drm, link_status); + intel_dp_dump_link_status(intel_dp, dp_phy, link_status); drm_dbg_kms(&i915->drm, - "Clock recovery check failed, cannot " - "continue channel equalization\n"); + "[ENCODER:%d:%s][%s] Clock recovery check failed, cannot " + "continue channel equalization\n", + encoder->base.base.id, encoder->base.name, phy_name); break; } if (drm_dp_channel_eq_ok(link_status, crtc_state->lane_count)) { channel_eq = true; - drm_dbg_kms(&i915->drm, "Channel EQ done. DP Training " - "successful\n"); + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s][%s] Channel EQ done. 
DP Training successful\n", + encoder->base.base.id, encoder->base.name, phy_name); break; } @@ -793,16 +979,18 @@ intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp, link_status); if (!intel_dp_update_link_train(intel_dp, crtc_state, dp_phy)) { drm_err(&i915->drm, - "failed to update link training\n"); + "[ENCODER:%d:%s][%s] Failed to update link training\n", + encoder->base.base.id, encoder->base.name, phy_name); break; } } /* Try 5 times, else fail and try at lower BW */ if (tries == 5) { - intel_dp_dump_link_status(&i915->drm, link_status); + intel_dp_dump_link_status(intel_dp, dp_phy, link_status); drm_dbg_kms(&i915->drm, - "Channel equalization failed 5 times\n"); + "[ENCODER:%d:%s][%s] Channel equalization failed 5 times\n", + encoder->base.base.id, encoder->base.name, phy_name); } return channel_eq; @@ -839,7 +1027,7 @@ void intel_dp_stop_link_train(struct intel_dp *intel_dp, intel_dp->link_trained = true; intel_dp_disable_dpcd_training_pattern(intel_dp, DP_PHY_DPRX); - intel_dp_program_link_training_pattern(intel_dp, crtc_state, + intel_dp_program_link_training_pattern(intel_dp, crtc_state, DP_PHY_DPRX, DP_TRAINING_PATTERN_DISABLE); } @@ -848,7 +1036,8 @@ intel_dp_link_train_phy(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, enum drm_dp_phy dp_phy) { - struct intel_connector *intel_connector = intel_dp->attached_connector; + struct intel_connector *connector = intel_dp->attached_connector; + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; char phy_name[10]; bool ret = false; @@ -862,12 +1051,12 @@ intel_dp_link_train_phy(struct intel_dp *intel_dp, out: drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "[CONNECTOR:%d:%s] Link Training %s at link rate = %d, lane count = %d, at %s\n", - intel_connector->base.base.id, - intel_connector->base.name, + "[CONNECTOR:%d:%s][ENCODER:%d:%s][%s] Link Training %s at link rate = %d, lane count = %d\n", + connector->base.base.id, connector->base.name, + encoder->base.base.id, encoder->base.name, + intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name)), ret ? 
"passed" : "failed", - crtc_state->port_clock, crtc_state->lane_count, - intel_dp_phy_name(dp_phy, phy_name, sizeof(phy_name))); + crtc_state->port_clock, crtc_state->lane_count); return ret; } @@ -876,10 +1065,13 @@ static void intel_dp_schedule_fallback_link_training(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state) { struct intel_connector *intel_connector = intel_dp->attached_connector; + struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; if (intel_dp->hobl_active) { drm_dbg_kms(&dp_to_i915(intel_dp)->drm, - "Link Training failed with HOBL active, not enabling it from now on"); + "[ENCODER:%d:%s] Link Training failed with HOBL active, " + "not enabling it from now on", + encoder->base.base.id, encoder->base.name); intel_dp->hobl_failed = true; } else if (intel_dp_get_link_train_fallback_values(intel_dp, crtc_state->port_clock, diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.h b/drivers/gpu/drm/i915/display/intel_dp_link_training.h index 9d24d594368c..6a3a7b37349a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_link_training.h +++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.h @@ -19,6 +19,7 @@ void intel_dp_get_adjust_train(struct intel_dp *intel_dp, const u8 link_status[DP_LINK_STATUS_SIZE]); void intel_dp_program_link_training_pattern(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, + enum drm_dp_phy dp_phy, u8 dp_train_pat); void intel_dp_set_signal_levels(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state, diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index fd0a31bc3dcd..0de0b4ff4d73 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -977,24 +977,31 @@ intel_dp_mst_encoder_init(struct intel_digital_port *dig_port, int conn_base_id) dig_port->max_lanes, max_source_rate, conn_base_id); - if (ret) + if (ret) { + intel_dp->mst_mgr.cbs = NULL; return ret; - - intel_dp->can_mst = true; + } return 0; } +bool intel_dp_mst_source_support(struct intel_dp *intel_dp) +{ + return intel_dp->mst_mgr.cbs; +} + void intel_dp_mst_encoder_cleanup(struct intel_digital_port *dig_port) { struct intel_dp *intel_dp = &dig_port->dp; - if (!intel_dp->can_mst) + if (!intel_dp_mst_source_support(intel_dp)) return; drm_dp_mst_topology_mgr_destroy(&intel_dp->mst_mgr); /* encoders will get killed by normal cleanup */ + + intel_dp->mst_mgr.cbs = NULL; } bool intel_dp_mst_is_master_trans(const struct intel_crtc_state *crtc_state) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.h b/drivers/gpu/drm/i915/display/intel_dp_mst.h index 6afda4e86b3c..f7301de6cdfb 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.h +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.h @@ -8,13 +8,15 @@ #include <linux/types.h> -struct intel_digital_port; struct intel_crtc_state; +struct intel_digital_port; +struct intel_dp; int intel_dp_mst_encoder_init(struct intel_digital_port *dig_port, int conn_id); void intel_dp_mst_encoder_cleanup(struct intel_digital_port *dig_port); int intel_dp_mst_encoder_active_links(struct intel_digital_port *dig_port); bool intel_dp_mst_is_master_trans(const struct intel_crtc_state *crtc_state); bool intel_dp_mst_is_slave_trans(const struct intel_crtc_state *crtc_state); +bool intel_dp_mst_source_support(struct intel_dp *intel_dp); #endif /* __INTEL_DP_MST_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_dpio_phy.c b/drivers/gpu/drm/i915/display/intel_dpio_phy.c index 
5a2eccb12fe4..44edeb2e55c0 100644 --- a/drivers/gpu/drm/i915/display/intel_dpio_phy.c +++ b/drivers/gpu/drm/i915/display/intel_dpio_phy.c @@ -21,14 +21,13 @@ * DEALINGS IN THE SOFTWARE. */ -#include "display/intel_dp.h" - #include "intel_ddi.h" #include "intel_ddi_buf_trans.h" #include "intel_de.h" #include "intel_display_types.h" +#include "intel_dp.h" #include "intel_dpio_phy.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" /** * DOC: DPIO diff --git a/drivers/gpu/drm/i915/display/intel_dpll.c b/drivers/gpu/drm/i915/display/intel_dpll.c index b84ed4a1bd95..04a7af8340ca 100644 --- a/drivers/gpu/drm/i915/display/intel_dpll.c +++ b/drivers/gpu/drm/i915/display/intel_dpll.c @@ -13,8 +13,8 @@ #include "intel_lvds.h" #include "intel_panel.h" #include "intel_pps.h" -#include "intel_sideband.h" #include "intel_snps_phy.h" +#include "vlv_sideband.h" struct intel_limit { struct { diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c index c2a2cd1f84dc..f241bedb8597 100644 --- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c @@ -31,7 +31,6 @@ #include <linux/pinctrl/machine.h> #include <linux/slab.h> -#include <asm/intel-mid.h> #include <asm/unaligned.h> #include <drm/drm_crtc.h> @@ -42,7 +41,7 @@ #include "i915_drv.h" #include "intel_display_types.h" #include "intel_dsi.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" #define MIPI_TRANSFER_MODE_SHIFT 0 #define MIPI_VIRTUAL_CHANNEL_SHIFT 1 diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c b/drivers/gpu/drm/i915/display/intel_fb_pin.c new file mode 100644 index 000000000000..3f77f3013584 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_fb_pin.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +/** + * DOC: display pinning helpers + */ + +#include "intel_display_types.h" +#include "intel_fb_pin.h" +#include "intel_fb.h" + +#include "intel_dpt.h" + +#include "gem/i915_gem_object.h" + +static struct i915_vma * +intel_pin_fb_obj_dpt(struct drm_framebuffer *fb, + const struct i915_ggtt_view *view, + bool uses_fence, + unsigned long *out_flags, + struct i915_address_space *vm) +{ + struct drm_device *dev = fb->dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_i915_gem_object *obj = intel_fb_obj(fb); + struct i915_vma *vma; + u32 alignment; + int ret; + + if (WARN_ON(!i915_gem_object_is_framebuffer(obj))) + return ERR_PTR(-EINVAL); + + alignment = 4096 * 512; + + atomic_inc(&dev_priv->gpu_error.pending_fb_pin); + + ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE); + if (ret) { + vma = ERR_PTR(ret); + goto err; + } + + vma = i915_vma_instance(obj, vm, view); + if (IS_ERR(vma)) + goto err; + + if (i915_vma_misplaced(vma, 0, alignment, 0)) { + ret = i915_vma_unbind(vma); + if (ret) { + vma = ERR_PTR(ret); + goto err; + } + } + + ret = i915_vma_pin(vma, 0, alignment, PIN_GLOBAL); + if (ret) { + vma = ERR_PTR(ret); + goto err; + } + + vma->display_alignment = max_t(u64, vma->display_alignment, alignment); + + i915_gem_object_flush_if_display(obj); + + i915_vma_get(vma); +err: + atomic_dec(&dev_priv->gpu_error.pending_fb_pin); + + return vma; +} + +struct i915_vma * +intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, + bool phys_cursor, + const struct i915_ggtt_view *view, + bool uses_fence, + unsigned long *out_flags) +{ + struct drm_device *dev = fb->dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_i915_gem_object *obj = 
intel_fb_obj(fb); + intel_wakeref_t wakeref; + struct i915_gem_ww_ctx ww; + struct i915_vma *vma; + unsigned int pinctl; + u32 alignment; + int ret; + + if (drm_WARN_ON(dev, !i915_gem_object_is_framebuffer(obj))) + return ERR_PTR(-EINVAL); + + if (phys_cursor) + alignment = intel_cursor_alignment(dev_priv); + else + alignment = intel_surf_alignment(fb, 0); + if (drm_WARN_ON(dev, alignment && !is_power_of_2(alignment))) + return ERR_PTR(-EINVAL); + + /* Note that the w/a also requires 64 PTE of padding following the + * bo. We currently fill all unused PTE with the shadow page and so + * we should always have valid PTE following the scanout preventing + * the VT-d warning. + */ + if (intel_scanout_needs_vtd_wa(dev_priv) && alignment < 256 * 1024) + alignment = 256 * 1024; + + /* + * Global gtt pte registers are special registers which actually forward + * writes to a chunk of system memory. Which means that there is no risk + * that the register values disappear as soon as we call + * intel_runtime_pm_put(), so it is correct to wrap only the + * pin/unpin/fence and not more. + */ + wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm); + + atomic_inc(&dev_priv->gpu_error.pending_fb_pin); + + /* + * Valleyview is definitely limited to scanning out the first + * 512MiB. Lets presume this behaviour was inherited from the + * g4x display engine and that all earlier gen are similarly + * limited. Testing suggests that it is a little more + * complicated than this. For example, Cherryview appears quite + * happy to scanout from anywhere within its global aperture. + */ + pinctl = 0; + if (HAS_GMCH(dev_priv)) + pinctl |= PIN_MAPPABLE; + + i915_gem_ww_ctx_init(&ww, true); +retry: + ret = i915_gem_object_lock(obj, &ww); + if (!ret && phys_cursor) + ret = i915_gem_object_attach_phys(obj, alignment); + else if (!ret && HAS_LMEM(dev_priv)) + ret = i915_gem_object_migrate(obj, &ww, INTEL_REGION_LMEM); + /* TODO: Do we need to sync when migration becomes async? */ + if (!ret) + ret = i915_gem_object_pin_pages(obj); + if (ret) + goto err; + + if (!ret) { + vma = i915_gem_object_pin_to_display_plane(obj, &ww, alignment, + view, pinctl); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unpin; + } + } + + if (uses_fence && i915_vma_is_map_and_fenceable(vma)) { + /* + * Install a fence for tiled scan-out. Pre-i965 always needs a + * fence, whereas 965+ only requires a fence if using + * framebuffer compression. For simplicity, we always, when + * possible, install a fence as the cost is not that onerous. + * + * If we fail to fence the tiled scanout, then either the + * modeset will reject the change (which is highly unlikely as + * the affected systems, all but one, do not have unmappable + * space) or we will not be able to enable full powersaving + * techniques (also likely not to apply due to various limits + * FBC and the like impose on the size of the buffer, which + * presumably we violated anyway with this unmappable buffer). + * Anyway, it is presumably better to stumble onwards with + * something and try to run the system in a "less than optimal" + * mode that matches the user configuration. 
+ */ + ret = i915_vma_pin_fence(vma); + if (ret != 0 && DISPLAY_VER(dev_priv) < 4) { + i915_vma_unpin(vma); + goto err_unpin; + } + ret = 0; + + if (vma->fence) + *out_flags |= PLANE_HAS_FENCE; + } + + i915_vma_get(vma); + +err_unpin: + i915_gem_object_unpin_pages(obj); +err: + if (ret == -EDEADLK) { + ret = i915_gem_ww_ctx_backoff(&ww); + if (!ret) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + if (ret) + vma = ERR_PTR(ret); + + atomic_dec(&dev_priv->gpu_error.pending_fb_pin); + intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref); + return vma; +} + +void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags) +{ + if (flags & PLANE_HAS_FENCE) + i915_vma_unpin_fence(vma); + i915_vma_unpin(vma); + i915_vma_put(vma); +} + +int intel_plane_pin_fb(struct intel_plane_state *plane_state) +{ + struct intel_plane *plane = to_intel_plane(plane_state->uapi.plane); + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + struct drm_framebuffer *fb = plane_state->hw.fb; + struct i915_vma *vma; + bool phys_cursor = + plane->id == PLANE_CURSOR && + INTEL_INFO(dev_priv)->display.cursor_needs_physical; + + if (!intel_fb_uses_dpt(fb)) { + vma = intel_pin_and_fence_fb_obj(fb, phys_cursor, + &plane_state->view.gtt, + intel_plane_uses_fence(plane_state), + &plane_state->flags); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + plane_state->ggtt_vma = vma; + } else { + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + + vma = intel_dpt_pin(intel_fb->dpt_vm); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + plane_state->ggtt_vma = vma; + + vma = intel_pin_fb_obj_dpt(fb, &plane_state->view.gtt, false, + &plane_state->flags, intel_fb->dpt_vm); + if (IS_ERR(vma)) { + intel_dpt_unpin(intel_fb->dpt_vm); + plane_state->ggtt_vma = NULL; + return PTR_ERR(vma); + } + + plane_state->dpt_vma = vma; + + WARN_ON(plane_state->ggtt_vma == plane_state->dpt_vma); + } + + return 0; +} + +void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state) +{ + struct drm_framebuffer *fb = old_plane_state->hw.fb; + struct i915_vma *vma; + + if (!intel_fb_uses_dpt(fb)) { + vma = fetch_and_zero(&old_plane_state->ggtt_vma); + if (vma) + intel_unpin_fb_vma(vma, old_plane_state->flags); + } else { + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + + vma = fetch_and_zero(&old_plane_state->dpt_vma); + if (vma) + intel_unpin_fb_vma(vma, old_plane_state->flags); + + vma = fetch_and_zero(&old_plane_state->ggtt_vma); + if (vma) + intel_dpt_unpin(intel_fb->dpt_vm); + } +} diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.h b/drivers/gpu/drm/i915/display/intel_fb_pin.h new file mode 100644 index 000000000000..e4fcd0218d9d --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_fb_pin.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __INTEL_FB_PIN_H__ +#define __INTEL_FB_PIN_H__ + +#include <linux/types.h> + +struct drm_framebuffer; +struct i915_vma; +struct intel_plane_state; +struct i915_ggtt_view; + +struct i915_vma * +intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, + bool phys_cursor, + const struct i915_ggtt_view *view, + bool uses_fence, + unsigned long *out_flags); + +void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags); + +int intel_plane_pin_fb(struct intel_plane_state *plane_state); +void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state); + +#endif diff --git a/drivers/gpu/drm/i915/display/intel_fbdev.c b/drivers/gpu/drm/i915/display/intel_fbdev.c index 53484267b2a4..adc3a81be9f7 100644 
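For orientation, a minimal usage sketch of the interface declared in the new intel_fb_pin.h above, assuming the same intel_plane_state flow used by the atomic plane hooks elsewhere in this series; the wrapper function here is hypothetical and not part of the patch.

#include "intel_fb_pin.h"

static int example_flip_pin_cycle(struct intel_plane_state *plane_state)
{
	int ret;

	/* Picks the GGTT or DPT path internally via intel_fb_uses_dpt(). */
	ret = intel_plane_pin_fb(plane_state);
	if (ret)
		return ret;

	/* ... program the plane and wait for the flip to land ... */

	/* Releases the vma(s) and any fence taken at pin time. */
	intel_plane_unpin_fb(plane_state);

	return 0;
}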
--- a/drivers/gpu/drm/i915/display/intel_fbdev.c +++ b/drivers/gpu/drm/i915/display/intel_fbdev.c @@ -46,6 +46,7 @@ #include "i915_drv.h" #include "intel_display_types.h" #include "intel_fb.h" +#include "intel_fb_pin.h" #include "intel_fbdev.h" #include "intel_frontbuffer.h" diff --git a/drivers/gpu/drm/i915/display/intel_fdi.c b/drivers/gpu/drm/i915/display/intel_fdi.c index 51d6f810e69b..dd2cf0c59921 100644 --- a/drivers/gpu/drm/i915/display/intel_fdi.c +++ b/drivers/gpu/drm/i915/display/intel_fdi.c @@ -8,7 +8,7 @@ #include "intel_de.h" #include "intel_display_types.h" #include "intel_fdi.h" -#include "intel_sideband.h" +#include "intel_sbi.h" static void assert_fdi_tx(struct drm_i915_private *dev_priv, enum pipe pipe, bool state) diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index 9b9fd9d13043..4509fe7438e8 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -17,12 +17,12 @@ #include "i915_drv.h" #include "i915_reg.h" -#include "intel_display_power.h" +#include "intel_connector.h" #include "intel_de.h" +#include "intel_display_power.h" #include "intel_display_types.h" #include "intel_hdcp.h" -#include "intel_sideband.h" -#include "intel_connector.h" +#include "intel_pcode.h" #define KEY_LOAD_TRIES 5 #define HDCP2_LC_RETRY_CNT 3 diff --git a/drivers/gpu/drm/i915/display/intel_hotplug.c b/drivers/gpu/drm/i915/display/intel_hotplug.c index 3c1cec953b42..955f6d07b0e1 100644 --- a/drivers/gpu/drm/i915/display/intel_hotplug.c +++ b/drivers/gpu/drm/i915/display/intel_hotplug.c @@ -215,7 +215,7 @@ intel_hpd_irq_storm_switch_to_polling(struct drm_i915_private *dev_priv) static void intel_hpd_irq_setup(struct drm_i915_private *i915) { - if (i915->display_irqs_enabled && i915->hotplug_funcs->hpd_irq_setup) + if (i915->display_irqs_enabled && i915->hotplug_funcs) i915->hotplug_funcs->hpd_irq_setup(i915); } diff --git a/drivers/gpu/drm/i915/display/intel_plane_initial.c b/drivers/gpu/drm/i915/display/intel_plane_initial.c new file mode 100644 index 000000000000..dcd698a02da2 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_plane_initial.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "intel_display_types.h" +#include "intel_plane_initial.h" +#include "intel_atomic_plane.h" +#include "intel_display.h" +#include "intel_fb.h" + +static bool +intel_reuse_initial_plane_obj(struct drm_i915_private *i915, + const struct intel_initial_plane_config *plane_config, + struct drm_framebuffer **fb, + struct i915_vma **vma) +{ + struct intel_crtc *crtc; + + for_each_intel_crtc(&i915->drm, crtc) { + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct intel_plane *plane = + to_intel_plane(crtc->base.primary); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + + if (!crtc_state->uapi.active) + continue; + + if (!plane_state->ggtt_vma) + continue; + + if (intel_plane_ggtt_offset(plane_state) == plane_config->base) { + *fb = plane_state->hw.fb; + *vma = plane_state->ggtt_vma; + return true; + } + } + + return false; +} + +static struct i915_vma * +initial_plane_vma(struct drm_i915_private *i915, + struct intel_initial_plane_config *plane_config) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u32 base, size; + + if (plane_config->size == 0) + return NULL; + + base = round_down(plane_config->base, + I915_GTT_MIN_ALIGNMENT); + size = 
round_up(plane_config->base + plane_config->size, + I915_GTT_MIN_ALIGNMENT); + size -= base; + + /* + * If the FB is too big, just don't use it since fbdev is not very + * important and we should probably use that space with FBC or other + * features. + */ + if (IS_ENABLED(CONFIG_FRAMEBUFFER_CONSOLE) && + size * 2 > i915->stolen_usable_size) + return NULL; + + obj = i915_gem_object_create_stolen_for_preallocated(i915, base, size); + if (IS_ERR(obj)) + return NULL; + + /* + * Mark it WT ahead of time to avoid changing the + * cache_level during fbdev initialization. The + * unbind there would get stuck waiting for rcu. + */ + i915_gem_object_set_cache_coherency(obj, HAS_WT(i915) ? + I915_CACHE_WT : I915_CACHE_NONE); + + switch (plane_config->tiling) { + case I915_TILING_NONE: + break; + case I915_TILING_X: + case I915_TILING_Y: + obj->tiling_and_stride = + plane_config->fb->base.pitches[0] | + plane_config->tiling; + break; + default: + MISSING_CASE(plane_config->tiling); + goto err_obj; + } + + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); + if (IS_ERR(vma)) + goto err_obj; + + if (i915_ggtt_pin(vma, NULL, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base)) + goto err_obj; + + if (i915_gem_object_is_tiled(obj) && + !i915_vma_is_map_and_fenceable(vma)) + goto err_obj; + + return vma; + +err_obj: + i915_gem_object_put(obj); + return NULL; +} + +static bool +intel_alloc_initial_plane_obj(struct intel_crtc *crtc, + struct intel_initial_plane_config *plane_config) +{ + struct drm_device *dev = crtc->base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_mode_fb_cmd2 mode_cmd = { 0 }; + struct drm_framebuffer *fb = &plane_config->fb->base; + struct i915_vma *vma; + + switch (fb->modifier) { + case DRM_FORMAT_MOD_LINEAR: + case I915_FORMAT_MOD_X_TILED: + case I915_FORMAT_MOD_Y_TILED: + break; + default: + drm_dbg(&dev_priv->drm, + "Unsupported modifier for initial FB: 0x%llx\n", + fb->modifier); + return false; + } + + vma = initial_plane_vma(dev_priv, plane_config); + if (!vma) + return false; + + mode_cmd.pixel_format = fb->format->format; + mode_cmd.width = fb->width; + mode_cmd.height = fb->height; + mode_cmd.pitches[0] = fb->pitches[0]; + mode_cmd.modifier[0] = fb->modifier; + mode_cmd.flags = DRM_MODE_FB_MODIFIERS; + + if (intel_framebuffer_init(to_intel_framebuffer(fb), + vma->obj, &mode_cmd)) { + drm_dbg_kms(&dev_priv->drm, "intel fb init failed\n"); + goto err_vma; + } + + plane_config->vma = vma; + return true; + +err_vma: + i915_vma_put(vma); + return false; +} + +static void +intel_find_initial_plane_obj(struct intel_crtc *crtc, + struct intel_initial_plane_config *plane_config) +{ + struct drm_device *dev = crtc->base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct intel_plane *plane = + to_intel_plane(crtc->base.primary); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + struct drm_framebuffer *fb; + struct i915_vma *vma; + + /* + * TODO: + * Disable planes if get_initial_plane_config() failed. + * Make sure things work if the surface base is not page aligned. 
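A worked example of the rounding done in initial_plane_vma() above, with made-up numbers and under the assumption that I915_GTT_MIN_ALIGNMENT equals the 4KiB GTT page size; it only shows why the reservation is derived from the unaligned BIOS base and size rather than used verbatim.

#include <linux/math.h>
#include <linux/sizes.h>
#include <linux/types.h>

static void example_initial_fb_rounding(void)
{
	/* Hypothetical BIOS fb: base 0x00345800 (not page aligned), 0x2fd000 bytes. */
	const u32 fb_base = 0x00345800, fb_size = 0x2fd000;
	u32 base, size;

	base = round_down(fb_base, SZ_4K);		/* 0x00345000 */
	size = round_up(fb_base + fb_size, SZ_4K);	/* 0x00643000 */
	size -= base;					/* 0x002fe000, covers the whole fb */

	(void)base;
	(void)size;
}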
+ */ + if (!plane_config->fb) + return; + + if (intel_alloc_initial_plane_obj(crtc, plane_config)) { + fb = &plane_config->fb->base; + vma = plane_config->vma; + goto valid_fb; + } + + /* + * Failed to alloc the obj, check to see if we should share + * an fb with another CRTC instead + */ + if (intel_reuse_initial_plane_obj(dev_priv, plane_config, &fb, &vma)) + goto valid_fb; + + /* + * We've failed to reconstruct the BIOS FB. Current display state + * indicates that the primary plane is visible, but has a NULL FB, + * which will lead to problems later if we don't fix it up. The + * simplest solution is to just disable the primary plane now and + * pretend the BIOS never had it enabled. + */ + intel_plane_disable_noatomic(crtc, plane); + if (crtc_state->bigjoiner) { + struct intel_crtc *slave = + crtc_state->bigjoiner_linked_crtc; + intel_plane_disable_noatomic(slave, to_intel_plane(slave->base.primary)); + } + + return; + +valid_fb: + plane_state->uapi.rotation = plane_config->rotation; + intel_fb_fill_view(to_intel_framebuffer(fb), + plane_state->uapi.rotation, &plane_state->view); + + __i915_vma_pin(vma); + plane_state->ggtt_vma = i915_vma_get(vma); + if (intel_plane_uses_fence(plane_state) && + i915_vma_pin_fence(vma) == 0 && vma->fence) + plane_state->flags |= PLANE_HAS_FENCE; + + plane_state->uapi.src_x = 0; + plane_state->uapi.src_y = 0; + plane_state->uapi.src_w = fb->width << 16; + plane_state->uapi.src_h = fb->height << 16; + + plane_state->uapi.crtc_x = 0; + plane_state->uapi.crtc_y = 0; + plane_state->uapi.crtc_w = fb->width; + plane_state->uapi.crtc_h = fb->height; + + if (plane_config->tiling) + dev_priv->preserve_bios_swizzle = true; + + plane_state->uapi.fb = fb; + drm_framebuffer_get(fb); + + plane_state->uapi.crtc = &crtc->base; + intel_plane_copy_uapi_to_hw_state(plane_state, plane_state, crtc); + + atomic_or(plane->frontbuffer_bit, &to_intel_frontbuffer(fb)->bits); +} + +static void plane_config_fini(struct intel_initial_plane_config *plane_config) +{ + if (plane_config->fb) { + struct drm_framebuffer *fb = &plane_config->fb->base; + + /* We may only have the stub and not a full framebuffer */ + if (drm_framebuffer_read_refcount(fb)) + drm_framebuffer_put(fb); + else + kfree(fb); + } + + if (plane_config->vma) + i915_vma_put(plane_config->vma); +} + +void intel_crtc_initial_plane_config(struct intel_crtc *crtc) +{ + struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); + struct intel_initial_plane_config plane_config = {}; + + /* + * Note that reserving the BIOS fb up front prevents us + * from stuffing other stolen allocations like the ring + * on top. This prevents some ugliness at boot time, and + * can even allow for smooth boot transitions if the BIOS + * fb is large enough for the active pipe configuration. + */ + dev_priv->display->get_initial_plane_config(crtc, &plane_config); + + /* + * If the fb is shared between multiple heads, we'll + * just get the first one. 
+ */ + intel_find_initial_plane_obj(crtc, &plane_config); + + plane_config_fini(&plane_config); +} diff --git a/drivers/gpu/drm/i915/display/intel_plane_initial.h b/drivers/gpu/drm/i915/display/intel_plane_initial.h new file mode 100644 index 000000000000..c7e35ab3182b --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_plane_initial.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __INTEL_PLANE_INITIAL_H__ +#define __INTEL_PLANE_INITIAL_H__ + +struct intel_crtc; + +void intel_crtc_initial_plane_config(struct intel_crtc *crtc); + +#endif diff --git a/drivers/gpu/drm/i915/display/intel_snps_phy.c b/drivers/gpu/drm/i915/display/intel_snps_phy.c index b18f08c851dc..5e20f340730f 100644 --- a/drivers/gpu/drm/i915/display/intel_snps_phy.c +++ b/drivers/gpu/drm/i915/display/intel_snps_phy.c @@ -68,9 +68,9 @@ void intel_snps_phy_set_signal_levels(struct intel_encoder *encoder, for (ln = 0; ln < 4; ln++) { u32 val = 0; - val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_MAIN, trans->entries[level].snps.snps_vswing); - val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_PRE, trans->entries[level].snps.snps_pre_cursor); - val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_POST, trans->entries[level].snps.snps_post_cursor); + val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_MAIN, trans->entries[level].snps.vswing); + val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_PRE, trans->entries[level].snps.pre_cursor); + val |= REG_FIELD_PREP(SNPS_PHY_TX_EQ_POST, trans->entries[level].snps.post_cursor); intel_de_write(dev_priv, SNPS_PHY_TX_EQ(ln, phy), val); } diff --git a/drivers/gpu/drm/i915/display/vlv_dsi.c b/drivers/gpu/drm/i915/display/vlv_dsi.c index 081b772bfe10..07584695514b 100644 --- a/drivers/gpu/drm/i915/display/vlv_dsi.c +++ b/drivers/gpu/drm/i915/display/vlv_dsi.c @@ -40,8 +40,8 @@ #include "intel_dsi.h" #include "intel_fifo_underrun.h" #include "intel_panel.h" -#include "intel_sideband.h" #include "skl_scaler.h" +#include "vlv_sideband.h" /* return pixels in terms of txbyteclkhs */ static u16 txbyteclkhs(u16 pixels, int bpp, int lane_count, diff --git a/drivers/gpu/drm/i915/display/vlv_dsi_pll.c b/drivers/gpu/drm/i915/display/vlv_dsi_pll.c index 0078973cd219..5413b52ab6ba 100644 --- a/drivers/gpu/drm/i915/display/vlv_dsi_pll.c +++ b/drivers/gpu/drm/i915/display/vlv_dsi_pll.c @@ -31,7 +31,7 @@ #include "intel_de.h" #include "intel_display_types.h" #include "intel_dsi.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" static const u16 lfsr_converts[] = { 426, 469, 234, 373, 442, 221, 110, 311, 411, /* 62 - 70 */ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c index 6234e17259c1..7358bebef15c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c @@ -4,6 +4,8 @@ * Copyright © 2014-2016 Intel Corporation */ +#include <linux/dma-fence-array.h> + #include "gt/intel_engine.h" #include "i915_gem_ioctls.h" @@ -36,7 +38,7 @@ static __always_inline u32 __busy_write_id(u16 id) } static __always_inline unsigned int -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) +__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id)) { const struct i915_request *rq; @@ -46,29 +48,60 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) * to eventually flush us, but to minimise latency just ask the * hardware. * - * Note we only report on the status of native fences. 
+ * Note we only report on the status of native fences and we currently + * have two native fences: + * + * 1. A composite fence (dma_fence_array) constructed of i915 requests + * created during a parallel submission. In this case we deconstruct the + * composite fence into individual i915 requests and check the status of + * each request. + * + * 2. A single i915 request. */ - if (!dma_fence_is_i915(fence)) + if (dma_fence_is_array(fence)) { + struct dma_fence_array *array = to_dma_fence_array(fence); + struct dma_fence **child = array->fences; + unsigned int nchild = array->num_fences; + + do { + struct dma_fence *current_fence = *child++; + + /* Not an i915 fence, can't be busy per above */ + if (!dma_fence_is_i915(current_fence) || + !test_bit(I915_FENCE_FLAG_COMPOSITE, + ¤t_fence->flags)) { + return 0; + } + + rq = to_request(current_fence); + if (!i915_request_completed(rq)) + return flag(rq->engine->uabi_class); + } while (--nchild); + + /* All requests in array complete, not busy */ return 0; + } else { + if (!dma_fence_is_i915(fence)) + return 0; - /* opencode to_request() in order to avoid const warnings */ - rq = container_of(fence, const struct i915_request, fence); - if (i915_request_completed(rq)) - return 0; + rq = to_request(fence); + if (i915_request_completed(rq)) + return 0; - /* Beware type-expansion follies! */ - BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); - return flag(rq->engine->uabi_class); + /* Beware type-expansion follies! */ + BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); + return flag(rq->engine->uabi_class); + } } static __always_inline unsigned int -busy_check_reader(const struct dma_fence *fence) +busy_check_reader(struct dma_fence *fence) { return __busy_set_if_active(fence, __busy_read_flag); } static __always_inline unsigned int -busy_check_writer(const struct dma_fence *fence) +busy_check_writer(struct dma_fence *fence) { if (!fence) return 0; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index d225d3dd0b40..fb33d0322960 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -556,9 +556,147 @@ set_proto_ctx_engines_bond(struct i915_user_extension __user *base, void *data) return 0; } +static int +set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base, + void *data) +{ + struct i915_context_engines_parallel_submit __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_proto_ctx_engines *set = data; + struct drm_i915_private *i915 = set->i915; + u64 flags; + int err = 0, n, i, j; + u16 slot, width, num_siblings; + struct intel_engine_cs **siblings = NULL; + intel_engine_mask_t prev_mask; + + /* FIXME: This is NIY for execlists */ + if (!(intel_uc_uses_guc_submission(&i915->gt.uc))) + return -ENODEV; + + if (get_user(slot, &ext->engine_index)) + return -EFAULT; + + if (get_user(width, &ext->width)) + return -EFAULT; + + if (get_user(num_siblings, &ext->num_siblings)) + return -EFAULT; + + if (slot >= set->num_engines) { + drm_dbg(&i915->drm, "Invalid placement value, %d >= %d\n", + slot, set->num_engines); + return -EINVAL; + } + + if (set->engines[slot].type != I915_GEM_ENGINE_TYPE_INVALID) { + drm_dbg(&i915->drm, + "Invalid placement[%d], already occupied\n", slot); + return -EINVAL; + } + + if (get_user(flags, &ext->flags)) + return -EFAULT; + + if (flags) { + drm_dbg(&i915->drm, "Unknown flags 0x%02llx", flags); + return -EINVAL; + } + + for (n = 0; n < 
ARRAY_SIZE(ext->mbz64); n++) { + err = check_user_mbz(&ext->mbz64[n]); + if (err) + return err; + } + + if (width < 2) { + drm_dbg(&i915->drm, "Width (%d) < 2\n", width); + return -EINVAL; + } + + if (num_siblings < 1) { + drm_dbg(&i915->drm, "Number siblings (%d) < 1\n", + num_siblings); + return -EINVAL; + } + + siblings = kmalloc_array(num_siblings * width, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return -ENOMEM; + + /* Create contexts / engines */ + for (i = 0; i < width; ++i) { + intel_engine_mask_t current_mask = 0; + struct i915_engine_class_instance prev_engine; + + for (j = 0; j < num_siblings; ++j) { + struct i915_engine_class_instance ci; + + n = i * num_siblings + j; + if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) { + err = -EFAULT; + goto out_err; + } + + siblings[n] = + intel_engine_lookup_user(i915, ci.engine_class, + ci.engine_instance); + if (!siblings[n]) { + drm_dbg(&i915->drm, + "Invalid sibling[%d]: { class:%d, inst:%d }\n", + n, ci.engine_class, ci.engine_instance); + err = -EINVAL; + goto out_err; + } + + if (n) { + if (prev_engine.engine_class != + ci.engine_class) { + drm_dbg(&i915->drm, + "Mismatched class %d, %d\n", + prev_engine.engine_class, + ci.engine_class); + err = -EINVAL; + goto out_err; + } + } + + prev_engine = ci; + current_mask |= siblings[n]->logical_mask; + } + + if (i > 0) { + if (current_mask != prev_mask << 1) { + drm_dbg(&i915->drm, + "Non contiguous logical mask 0x%x, 0x%x\n", + prev_mask, current_mask); + err = -EINVAL; + goto out_err; + } + } + prev_mask = current_mask; + } + + set->engines[slot].type = I915_GEM_ENGINE_TYPE_PARALLEL; + set->engines[slot].num_siblings = num_siblings; + set->engines[slot].width = width; + set->engines[slot].siblings = siblings; + + return 0; + +out_err: + kfree(siblings); + + return err; +} + static const i915_user_extension_fn set_proto_ctx_engines_extensions[] = { [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_proto_ctx_engines_balance, [I915_CONTEXT_ENGINES_EXT_BOND] = set_proto_ctx_engines_bond, + [I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT] = + set_proto_ctx_engines_parallel_submit, }; static int set_proto_ctx_engines(struct drm_i915_file_private *fpriv, @@ -794,6 +932,7 @@ static int intel_context_set_gem(struct intel_context *ce, GEM_BUG_ON(rcu_access_pointer(ce->gem_context)); RCU_INIT_POINTER(ce->gem_context, ctx); + GEM_BUG_ON(intel_context_is_pinned(ce)); ce->ring_size = SZ_16K; i915_vm_put(ce->vm); @@ -818,6 +957,25 @@ static int intel_context_set_gem(struct intel_context *ce, return ret; } +static void __unpin_engines(struct i915_gem_engines *e, unsigned int count) +{ + while (count--) { + struct intel_context *ce = e->engines[count], *child; + + if (!ce || !test_bit(CONTEXT_PERMA_PIN, &ce->flags)) + continue; + + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + } +} + +static void unpin_engines(struct i915_gem_engines *e) +{ + __unpin_engines(e, e->num_engines); +} + static void __free_engines(struct i915_gem_engines *e, unsigned int count) { while (count--) { @@ -933,6 +1091,40 @@ free_engines: return err; } +static int perma_pin_contexts(struct intel_context *ce) +{ + struct intel_context *child; + int i = 0, j = 0, ret; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + ret = intel_context_pin(ce); + if (unlikely(ret)) + return ret; + + for_each_child(ce, child) { + ret = intel_context_pin(child); + if (unlikely(ret)) + goto unwind; + ++i; + } + + set_bit(CONTEXT_PERMA_PIN, &ce->flags); + + return 0; + +unwind: + intel_context_unpin(ce); + 
for_each_child(ce, child) { + if (j++ < i) + intel_context_unpin(child); + else + break; + } + + return ret; +} + static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, unsigned int num_engines, struct i915_gem_proto_engine *pe) @@ -946,7 +1138,7 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, e->num_engines = num_engines; for (n = 0; n < num_engines; n++) { - struct intel_context *ce; + struct intel_context *ce, *child; int ret; switch (pe[n].type) { @@ -956,7 +1148,13 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, case I915_GEM_ENGINE_TYPE_BALANCED: ce = intel_engine_create_virtual(pe[n].siblings, - pe[n].num_siblings); + pe[n].num_siblings, 0); + break; + + case I915_GEM_ENGINE_TYPE_PARALLEL: + ce = intel_engine_create_parallel(pe[n].siblings, + pe[n].num_siblings, + pe[n].width); break; case I915_GEM_ENGINE_TYPE_INVALID: @@ -977,6 +1175,30 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, err = ERR_PTR(ret); goto free_engines; } + for_each_child(ce, child) { + ret = intel_context_set_gem(child, ctx, pe->sseu); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } + + /* + * XXX: Must be done after calling intel_context_set_gem as that + * function changes the ring size. The ring is allocated when + * the context is pinned. If the ring size is changed after + * allocation we have a mismatch of the ring size and will cause + * the context to hang. Presumably with a bit of reordering we + * could move the perma-pin step to the backend function + * intel_engine_create_parallel. + */ + if (pe[n].type == I915_GEM_ENGINE_TYPE_PARALLEL) { + ret = perma_pin_contexts(ce); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } } return e; @@ -1219,6 +1441,7 @@ static void context_close(struct i915_gem_context *ctx) /* Flush any concurrent set_engines() */ mutex_lock(&ctx->engines_mutex); + unpin_engines(__context_engines_static(ctx)); engines_idle_release(ctx, rcu_replace_pointer(ctx->engines, NULL, 1)); i915_gem_context_set_closed(ctx); mutex_unlock(&ctx->engines_mutex); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index a627b09c4680..282cdb8a5c5a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -78,13 +78,16 @@ enum i915_gem_engine_type { /** @I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set */ I915_GEM_ENGINE_TYPE_BALANCED, + + /** @I915_GEM_ENGINE_TYPE_PARALLEL: A parallel engine set */ + I915_GEM_ENGINE_TYPE_PARALLEL, }; /** * struct i915_gem_proto_engine - prototype engine * * This struct describes an engine that a context may contain. Engines - * have three types: + * have four types: * * - I915_GEM_ENGINE_TYPE_INVALID: Invalid engines can be created but they * show up as a NULL in i915_gem_engines::engines[i] and any attempt to @@ -97,6 +100,10 @@ enum i915_gem_engine_type { * * - I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set, described * i915_gem_proto_engine::num_siblings and i915_gem_proto_engine::siblings. + * + * - I915_GEM_ENGINE_TYPE_PARALLEL: A parallel submission engine set, described + * i915_gem_proto_engine::width, i915_gem_proto_engine::num_siblings, and + * i915_gem_proto_engine::siblings. 
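A minimal userspace sketch of how this extension might be populated, based on the fields read by set_proto_ctx_engines_parallel_submit() above (engine_index, width, num_siblings, flags, engines[] indexed as i * num_siblings + j). The uapi struct and I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT are assumed to be available from drm/i915_drm.h, the engine instances are placeholders, and chaining the extension into an I915_CONTEXT_PARAM_ENGINES set-param at context creation is not shown:

    #include <stdlib.h>
    #include <drm/i915_drm.h>

    /*
     * Sketch only: describe one parallel slot of width 2 with a single
     * sibling per column, i.e. two batch buffers per execbuf, placed on
     * VCS0 and VCS1.  engines[] is laid out as engines[i * num_siblings + j],
     * matching the kernel-side validation.
     */
    static struct i915_context_engines_parallel_submit *
    make_parallel_ext(void)
    {
            const unsigned int width = 2, num_siblings = 1;
            struct i915_context_engines_parallel_submit *ext;

            ext = calloc(1, sizeof(*ext) + width * num_siblings *
                         sizeof(struct i915_engine_class_instance));
            if (!ext)
                    return NULL;

            ext->base.name = I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
            ext->engine_index = 0;          /* slot in the engines map */
            ext->width = width;             /* batches per execbuf */
            ext->num_siblings = num_siblings;

            /* column 0 -> VCS0, column 1 -> VCS1 (placeholder instances) */
            ext->engines[0].engine_class = I915_ENGINE_CLASS_VIDEO;
            ext->engines[0].engine_instance = 0;
            ext->engines[1].engine_class = I915_ENGINE_CLASS_VIDEO;
            ext->engines[1].engine_instance = 1;

            return ext;
    }

All siblings in a slot must share an engine class and, per the logical-mask check above, the columns must be logically contiguous (e.g. VCS0 then VCS1); anything else is rejected with -EINVAL.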
*/ struct i915_gem_proto_engine { /** @type: Type of this engine */ @@ -105,10 +112,13 @@ struct i915_gem_proto_engine { /** @engine: Engine, for physical */ struct intel_engine_cs *engine; - /** @num_siblings: Number of balanced siblings */ + /** @num_siblings: Number of balanced or parallel siblings */ unsigned int num_siblings; - /** @siblings: Balanced siblings */ + /** @width: Width of each sibling */ + unsigned int width; + + /** @siblings: Balanced siblings or num_siblings * width for parallel */ struct intel_engine_cs **siblings; /** @sseu: Client-set SSEU parameters */ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index afa34111de02..1adcd8e02d29 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -232,6 +232,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_gem_object *gem_obj, int flags) static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); struct sg_table *pages; unsigned int sg_page_sizes; @@ -242,8 +243,11 @@ static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) if (IS_ERR(pages)) return PTR_ERR(pages); - sg_page_sizes = i915_sg_dma_sizes(pages->sgl); + /* XXX: consider doing a vmap flush or something */ + if (!HAS_LLC(i915) || i915_gem_object_can_bypass_llc(obj)) + wbinvd_on_all_cpus(); + sg_page_sizes = i915_sg_dma_sizes(pages->sgl); __i915_gem_object_set_pages(obj, pages, sg_page_sizes); return 0; @@ -301,7 +305,8 @@ struct drm_gem_object *i915_gem_prime_import(struct drm_device *dev, } drm_gem_private_object_init(dev, &obj->base, dma_buf->size); - i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, + I915_BO_ALLOC_USER); obj->base.import_attach = attach; obj->base.resv = dma_buf->resv; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 8b3a25bd93e6..4d7da07442f2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -246,17 +246,25 @@ struct i915_execbuffer { struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ struct eb_vma *vma; - struct intel_engine_cs *engine; /** engine to queue the request to */ + struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ - struct i915_request *request; /** our request to build */ - struct eb_vma *batch; /** identity of the batch obj/vma */ + /** our requests to build */ + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; + /** identity of the batch obj/vma */ + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; struct i915_vma *trampoline; /** trampoline used for chaining */ + /** used for excl fence in dma_resv objects when > 1 BB submitted */ + struct dma_fence *composite_fence; + /** actual size of execobj[] as we may extend it for the cmdparser */ unsigned int buffer_count; + /* number of batches in execbuf IOCTL */ + unsigned int num_batches; + /** list of vma not yet bound during reservation phase */ struct list_head unbound; @@ -283,7 +291,8 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ - u64 batch_len; /** Length of batch within object */ + /** Length of batch within object */ + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; u32 
batch_start_offset; /** Location within object of batch */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -301,14 +310,13 @@ struct i915_execbuffer { }; static int eb_parse(struct i915_execbuffer *eb); -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, - bool throttle); +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); static void eb_unpin_engine(struct i915_execbuffer *eb); static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_requires_cmd_parser(eb->engine) || - (intel_engine_using_cmd_parser(eb->engine) && + return intel_engine_requires_cmd_parser(eb->context->engine) || + (intel_engine_using_cmd_parser(eb->context->engine) && eb->args->batch_len); } @@ -535,11 +543,21 @@ eb_validate_vma(struct i915_execbuffer *eb, return 0; } -static void +static inline bool +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) +{ + return eb->args->flags & I915_EXEC_BATCH_FIRST ? + buffer_idx < eb->num_batches : + buffer_idx >= eb->args->buffer_count - eb->num_batches; +} + +static int eb_add_vma(struct i915_execbuffer *eb, - unsigned int i, unsigned batch_idx, + unsigned int *current_batch, + unsigned int i, struct i915_vma *vma) { + struct drm_i915_private *i915 = eb->i915; struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; struct eb_vma *ev = &eb->vma[i]; @@ -566,15 +584,43 @@ eb_add_vma(struct i915_execbuffer *eb, * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere. */ - if (i == batch_idx) { + if (is_batch_buffer(eb, i)) { if (entry->relocation_count && !(ev->flags & EXEC_OBJECT_PINNED)) ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; if (eb->reloc_cache.has_fence) ev->flags |= EXEC_OBJECT_NEEDS_FENCE; - eb->batch = ev; + eb->batches[*current_batch] = ev; + + if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { + drm_dbg(&i915->drm, + "Attempting to use self-modifying batch buffer\n"); + return -EINVAL; + } + + if (range_overflows_t(u64, + eb->batch_start_offset, + eb->args->batch_len, + ev->vma->size)) { + drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); + return -EINVAL; + } + + if (eb->args->batch_len == 0) + eb->batch_len[*current_batch] = ev->vma->size - + eb->batch_start_offset; + else + eb->batch_len[*current_batch] = eb->args->batch_len; + if (unlikely(eb->batch_len[*current_batch] == 0)) { /* impossible! 
*/ + drm_dbg(&i915->drm, "Invalid batch length\n"); + return -EINVAL; + } + + ++*current_batch; } + + return 0; } static inline int use_cpu_reloc(const struct reloc_cache *cache, @@ -718,14 +764,6 @@ static int eb_reserve(struct i915_execbuffer *eb) } while (1); } -static unsigned int eb_batch_index(const struct i915_execbuffer *eb) -{ - if (eb->args->flags & I915_EXEC_BATCH_FIRST) - return 0; - else - return eb->buffer_count - 1; -} - static int eb_select_context(struct i915_execbuffer *eb) { struct i915_gem_context *ctx; @@ -846,9 +884,7 @@ static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle) static int eb_lookup_vmas(struct i915_execbuffer *eb) { - struct drm_i915_private *i915 = eb->i915; - unsigned int batch = eb_batch_index(eb); - unsigned int i; + unsigned int i, current_batch = 0; int err = 0; INIT_LIST_HEAD(&eb->relocs); @@ -868,7 +904,9 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) goto err; } - eb_add_vma(eb, i, batch, vma); + err = eb_add_vma(eb, ¤t_batch, i, vma); + if (err) + return err; if (i915_gem_object_is_userptr(vma->obj)) { err = i915_gem_object_userptr_submit_init(vma->obj); @@ -891,26 +929,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) } } - if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { - drm_dbg(&i915->drm, - "Attempting to use self-modifying batch buffer\n"); - return -EINVAL; - } - - if (range_overflows_t(u64, - eb->batch_start_offset, eb->batch_len, - eb->batch->vma->size)) { - drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); - return -EINVAL; - } - - if (eb->batch_len == 0) - eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; - if (unlikely(eb->batch_len == 0)) { /* impossible! */ - drm_dbg(&i915->drm, "Invalid batch length\n"); - return -EINVAL; - } - return 0; err: @@ -1643,8 +1661,7 @@ static int eb_reinit_userptr(struct i915_execbuffer *eb) return 0; } -static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, - struct i915_request *rq) +static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb) { bool have_copy = false; struct eb_vma *ev; @@ -1660,21 +1677,6 @@ repeat: eb_release_vmas(eb, false); i915_gem_ww_ctx_fini(&eb->ww); - if (rq) { - /* nonblocking is always false */ - if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, - MAX_SCHEDULE_TIMEOUT) < 0) { - i915_request_put(rq); - rq = NULL; - - err = -EINTR; - goto err_relock; - } - - i915_request_put(rq); - rq = NULL; - } - /* * We take 3 passes through the slowpatch. 
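As a small self-contained illustration of the batch-identification rule introduced by is_batch_buffer() above (names below are local to the sketch, not part of the driver):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Mirrors the predicate above: with I915_EXEC_BATCH_FIRST the first
     * num_batches execobjects are the batches, otherwise the last
     * num_batches are (the single-batch case degenerating to the
     * historical "last object" behaviour).
     */
    static bool idx_is_batch(unsigned int idx, unsigned int buffer_count,
                             unsigned int num_batches, bool batch_first)
    {
            return batch_first ? idx < num_batches :
                                 idx >= buffer_count - num_batches;
    }

    int main(void)
    {
            unsigned int i;

            /* 4 execobjects, 2 batches, default ordering: indices 2 and 3 */
            for (i = 0; i < 4; i++)
                    printf("idx %u: %s\n", i,
                           idx_is_batch(i, 4, 2, false) ? "batch" : "data");
            return 0;
    }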
* @@ -1701,28 +1703,21 @@ repeat: if (!err) err = eb_reinit_userptr(eb); -err_relock: i915_gem_ww_ctx_init(&eb->ww, true); if (err) goto out; /* reacquire the objects */ repeat_validate: - rq = eb_pin_engine(eb, false); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, false); + if (err) goto err; - } - - /* We didn't throttle, should be NULL */ - GEM_WARN_ON(rq); err = eb_validate_vmas(eb); if (err) goto err; - GEM_BUG_ON(!eb->batch); + GEM_BUG_ON(!eb->batches[0]); list_for_each_entry(ev, &eb->relocs, reloc_link) { if (!have_copy) { @@ -1786,46 +1781,23 @@ out: } } - if (rq) - i915_request_put(rq); - return err; } static int eb_relocate_parse(struct i915_execbuffer *eb) { int err; - struct i915_request *rq = NULL; bool throttle = true; retry: - rq = eb_pin_engine(eb, throttle); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, throttle); + if (err) { if (err != -EDEADLK) return err; goto err; } - if (rq) { - bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; - - /* Need to drop all locks now for throttling, take slowpath */ - err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); - if (err == -ETIME) { - if (nonblock) { - err = -EWOULDBLOCK; - i915_request_put(rq); - goto err; - } - goto slow; - } - i915_request_put(rq); - rq = NULL; - } - /* only throttle once, even if we didn't need to throttle */ throttle = false; @@ -1865,7 +1837,7 @@ err: return err; slow: - err = eb_relocate_parse_slow(eb, rq); + err = eb_relocate_parse_slow(eb); if (err) /* * If the user expects the execobject.offset and @@ -1879,11 +1851,40 @@ slow: return err; } +/* + * Using two helper loops for the order of which requests / batches are created + * and added the to backend. Requests are created in order from the parent to + * the last child. Requests are added in the reverse order, from the last child + * to parent. This is done for locking reasons as the timeline lock is acquired + * during request creation and released when the request is added to the + * backend. To make lockdep happy (see intel_context_timeline_lock) this must be + * the ordering. 
+ */ +#define for_each_batch_create_order(_eb, _i) \ + for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i)) +#define for_each_batch_add_order(_eb, _i) \ + BUILD_BUG_ON(!typecheck(int, _i)); \ + for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i)) + +static struct i915_request * +eb_find_first_request_added(struct i915_execbuffer *eb) +{ + int i; + + for_each_batch_add_order(eb, i) + if (eb->requests[i]) + return eb->requests[i]; + + GEM_BUG_ON("Request not found"); + + return NULL; +} + static int eb_move_to_gpu(struct i915_execbuffer *eb) { const unsigned int count = eb->buffer_count; unsigned int i = count; - int err = 0; + int err = 0, j; while (i--) { struct eb_vma *ev = &eb->vma[i]; @@ -1896,11 +1897,17 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) if (flags & EXEC_OBJECT_CAPTURE) { struct i915_capture_list *capture; - capture = kmalloc(sizeof(*capture), GFP_KERNEL); - if (capture) { - capture->next = eb->request->capture_list; - capture->vma = vma; - eb->request->capture_list = capture; + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + capture = kmalloc(sizeof(*capture), GFP_KERNEL); + if (capture) { + capture->next = + eb->requests[j]->capture_list; + capture->vma = vma; + eb->requests[j]->capture_list = capture; + } } } @@ -1915,20 +1922,43 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) * but gcc's optimiser doesn't handle that as well and emits * two jumps instead of one. Maybe one day... + * + * FIXME: There is also sync flushing in set_pages(), which + * serves a different purpose(some of the time at least). + * + * We should consider: + * + * 1. Rip out the async flush code. + * + * 2. Or make the sync flushing use the async clflush path + * using mandatory fences underneath. Currently the below + * async flush happens after we bind the object. */ if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { if (i915_gem_clflush_object(obj, 0)) flags &= ~EXEC_OBJECT_ASYNC; } + /* We only need to await on the first request */ if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { err = i915_request_await_object - (eb->request, obj, flags & EXEC_OBJECT_WRITE); + (eb_find_first_request_added(eb), obj, + flags & EXEC_OBJECT_WRITE); } - if (err == 0) - err = i915_vma_move_to_active(vma, eb->request, - flags | __EXEC_OBJECT_NO_RESERVE); + for_each_batch_add_order(eb, j) { + if (err) + break; + if (!eb->requests[j]) + continue; + + err = _i915_vma_move_to_active(vma, eb->requests[j], + j ? NULL : + eb->composite_fence ? + eb->composite_fence : + &eb->requests[j]->fence, + flags | __EXEC_OBJECT_NO_RESERVE); + } } #ifdef CONFIG_MMU_NOTIFIER @@ -1959,11 +1989,16 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) goto err_skip; /* Unconditionally flush any chipset caches (for streaming writes). 
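A quick stand-alone sketch of the two iteration orders defined by the helper macros above, with stand-ins for the execbuffer state (illustrative only; the BUILD_BUG_ON typecheck of the real add-order macro is dropped here):

    #include <stdio.h>

    struct fake_eb { int num_batches; };    /* stand-in for i915_execbuffer */

    #define for_each_batch_create_order(_eb, _i) \
            for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i))
    #define for_each_batch_add_order(_eb, _i) \
            for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i))

    int main(void)
    {
            struct fake_eb eb = { .num_batches = 3 };
            int i;

            /* requests are created parent first: 0, 1, 2 ... */
            for_each_batch_create_order(&eb, i)
                    printf("create rq[%d]\n", i);

            /* ... and added to the backend in reverse: 2, 1, 0 */
            for_each_batch_add_order(&eb, i)
                    printf("add rq[%d]\n", i);

            return 0;
    }

Index 0 is the parent context's request (see eb_find_context() below); per the comment above, creation takes each context's timeline mutex and adding releases it, which is why the reverse add order keeps the nesting that intel_context_timeline_lock() annotates for lockdep.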
*/ - intel_gt_chipset_flush(eb->engine->gt); + intel_gt_chipset_flush(eb->gt); return 0; err_skip: - i915_request_set_error_once(eb->request, err); + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + i915_request_set_error_once(eb->requests[j], err); + } return err; } @@ -2058,14 +2093,17 @@ static int eb_parse(struct i915_execbuffer *eb) int err; if (!eb_use_cmdparser(eb)) { - batch = eb_dispatch_secure(eb, eb->batch->vma); + batch = eb_dispatch_secure(eb, eb->batches[0]->vma); if (IS_ERR(batch)) return PTR_ERR(batch); goto secure_batch; } - len = eb->batch_len; + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + len = eb->batch_len[0]; if (!CMDPARSER_USES_GGTT(eb->i915)) { /* * ppGTT backed shadow buffers must be mapped RO, to prevent @@ -2079,11 +2117,11 @@ static int eb_parse(struct i915_execbuffer *eb) } else { len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } - if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ + if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */ return -EINVAL; if (!pool) { - pool = intel_gt_get_buffer_pool(eb->engine->gt, len, + pool = intel_gt_get_buffer_pool(eb->gt, len, I915_MAP_WB); if (IS_ERR(pool)) return PTR_ERR(pool); @@ -2108,7 +2146,7 @@ static int eb_parse(struct i915_execbuffer *eb) trampoline = shadow; shadow = shadow_batch_pin(eb, pool->obj, - &eb->engine->gt->ggtt->vm, + &eb->gt->ggtt->vm, PIN_GLOBAL); if (IS_ERR(shadow)) { err = PTR_ERR(shadow); @@ -2130,26 +2168,29 @@ static int eb_parse(struct i915_execbuffer *eb) if (err) goto err_trampoline; - err = intel_engine_cmd_parser(eb->engine, - eb->batch->vma, + err = intel_engine_cmd_parser(eb->context->engine, + eb->batches[0]->vma, eb->batch_start_offset, - eb->batch_len, + eb->batch_len[0], shadow, trampoline); if (err) goto err_unpin_batch; - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->vma = i915_vma_get(shadow); - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->vma = i915_vma_get(shadow); + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; eb->trampoline = trampoline; eb->batch_start_offset = 0; secure_batch: if (batch) { - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; - eb->batch->vma = i915_vma_get(batch); + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0]->vma = i915_vma_get(batch); } return 0; @@ -2165,19 +2206,18 @@ err: return err; } -static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) +static int eb_request_submit(struct i915_execbuffer *eb, + struct i915_request *rq, + struct i915_vma *batch, + u64 batch_len) { int err; - if (intel_context_nopreempt(eb->context)) - __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); - - err = eb_move_to_gpu(eb); - if (err) - return err; + if (intel_context_nopreempt(rq->context)) + __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { - err = i915_reset_gen7_sol_offsets(eb->request); + err = i915_reset_gen7_sol_offsets(rq); if (err) return err; } @@ -2188,26 +2228,26 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) * allows us to determine if the batch is still waiting on the GPU * or actually running by checking the breadcrumb. 
*/ - if (eb->engine->emit_init_breadcrumb) { - err = eb->engine->emit_init_breadcrumb(eb->request); + if (rq->context->engine->emit_init_breadcrumb) { + err = rq->context->engine->emit_init_breadcrumb(rq); if (err) return err; } - err = eb->engine->emit_bb_start(eb->request, - batch->node.start + - eb->batch_start_offset, - eb->batch_len, - eb->batch_flags); + err = rq->context->engine->emit_bb_start(rq, + batch->node.start + + eb->batch_start_offset, + batch_len, + eb->batch_flags); if (err) return err; if (eb->trampoline) { + GEM_BUG_ON(intel_context_is_parallel(rq->context)); GEM_BUG_ON(eb->batch_start_offset); - err = eb->engine->emit_bb_start(eb->request, - eb->trampoline->node.start + - eb->batch_len, - 0, 0); + err = rq->context->engine->emit_bb_start(rq, + eb->trampoline->node.start + + batch_len, 0, 0); if (err) return err; } @@ -2215,6 +2255,27 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) return 0; } +static int eb_submit(struct i915_execbuffer *eb) +{ + unsigned int i; + int err; + + err = eb_move_to_gpu(eb); + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + trace_i915_request_queue(eb->requests[i], eb->batch_flags); + if (!err) + err = eb_request_submit(eb, eb->requests[i], + eb->batches[i]->vma, + eb->batch_len[i]); + } + + return err; +} + static int num_vcs_engines(const struct drm_i915_private *i915) { return hweight_long(VDBOX_MASK(&i915->gt)); @@ -2280,26 +2341,11 @@ static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel return i915_request_get(rq); } -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, + bool throttle) { - struct intel_context *ce = eb->context; struct intel_timeline *tl; struct i915_request *rq = NULL; - int err; - - GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); - - if (unlikely(intel_context_is_banned(ce))) - return ERR_PTR(-EIO); - - /* - * Pinning the contexts may generate requests in order to acquire - * GGTT space, so do this first before we reserve a seqno for - * ourselves. - */ - err = intel_context_pin_ww(ce, &eb->ww); - if (err) - return ERR_PTR(err); /* * Take a local wakeref for preparing to dispatch the execbuf as @@ -2310,33 +2356,108 @@ static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt * taken on the engine, and the parent device. */ tl = intel_context_timeline_lock(ce); - if (IS_ERR(tl)) { - intel_context_unpin(ce); - return ERR_CAST(tl); - } + if (IS_ERR(tl)) + return PTR_ERR(tl); intel_context_enter(ce); if (throttle) rq = eb_throttle(eb, ce); intel_context_timeline_unlock(tl); + if (rq) { + bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; + long timeout = nonblock ? 
0 : MAX_SCHEDULE_TIMEOUT; + + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, + timeout) < 0) { + i915_request_put(rq); + + tl = intel_context_timeline_lock(ce); + intel_context_exit(ce); + intel_context_timeline_unlock(tl); + + if (nonblock) + return -EWOULDBLOCK; + else + return -EINTR; + } + i915_request_put(rq); + } + + return 0; +} + +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +{ + struct intel_context *ce = eb->context, *child; + int err; + int i = 0, j = 0; + + GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); + + if (unlikely(intel_context_is_banned(ce))) + return -EIO; + + /* + * Pinning the contexts may generate requests in order to acquire + * GGTT space, so do this first before we reserve a seqno for + * ourselves. + */ + err = intel_context_pin_ww(ce, &eb->ww); + if (err) + return err; + for_each_child(ce, child) { + err = intel_context_pin_ww(child, &eb->ww); + GEM_BUG_ON(err); /* perma-pinned should incr a counter */ + } + + for_each_child(ce, child) { + err = eb_pin_timeline(eb, child, throttle); + if (err) + goto unwind; + ++i; + } + err = eb_pin_timeline(eb, ce, throttle); + if (err) + goto unwind; + eb->args->flags |= __EXEC_ENGINE_PINNED; - return rq; + return 0; + +unwind: + for_each_child(ce, child) { + if (j++ < i) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + } + } + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + return err; } static void eb_unpin_engine(struct i915_execbuffer *eb) { - struct intel_context *ce = eb->context; - struct intel_timeline *tl = ce->timeline; + struct intel_context *ce = eb->context, *child; if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) return; eb->args->flags &= ~__EXEC_ENGINE_PINNED; - mutex_lock(&tl->mutex); + for_each_child(ce, child) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + + intel_context_unpin(child); + } + + mutex_lock(&ce->timeline->mutex); intel_context_exit(ce); - mutex_unlock(&tl->mutex); + mutex_unlock(&ce->timeline->mutex); intel_context_unpin(ce); } @@ -2387,7 +2508,7 @@ eb_select_legacy_ring(struct i915_execbuffer *eb) static int eb_select_engine(struct i915_execbuffer *eb) { - struct intel_context *ce; + struct intel_context *ce, *child; unsigned int idx; int err; @@ -2400,6 +2521,20 @@ eb_select_engine(struct i915_execbuffer *eb) if (IS_ERR(ce)) return PTR_ERR(ce); + if (intel_context_is_parallel(ce)) { + if (eb->buffer_count < ce->parallel.number_children + 1) { + intel_context_put(ce); + return -EINVAL; + } + if (eb->batch_start_offset || eb->args->batch_len) { + intel_context_put(ce); + return -EINVAL; + } + } + eb->num_batches = ce->parallel.number_children + 1; + + for_each_child(ce, child) + intel_context_get(child); intel_gt_pm_get(ce->engine->gt); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { @@ -2407,6 +2542,13 @@ eb_select_engine(struct i915_execbuffer *eb) if (err) goto err; } + for_each_child(ce, child) { + if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) { + err = intel_context_alloc_state(child); + if (err) + goto err; + } + } /* * ABI: Before userspace accesses the GPU (e.g. 
execbuffer), report @@ -2417,7 +2559,7 @@ eb_select_engine(struct i915_execbuffer *eb) goto err; eb->context = ce; - eb->engine = ce->engine; + eb->gt = ce->engine->gt; /* * Make sure engine pool stays alive even if we call intel_context_put @@ -2428,6 +2570,8 @@ eb_select_engine(struct i915_execbuffer *eb) err: intel_gt_pm_put(ce->engine->gt); + for_each_child(ce, child) + intel_context_put(child); intel_context_put(ce); return err; } @@ -2435,7 +2579,11 @@ err: static void eb_put_engine(struct i915_execbuffer *eb) { - intel_gt_pm_put(eb->engine->gt); + struct intel_context *child; + + intel_gt_pm_put(eb->gt); + for_each_child(eb->context, child) + intel_context_put(child); intel_context_put(eb->context); } @@ -2658,7 +2806,8 @@ static void put_fence_array(struct eb_fence *fences, int num_fences) } static int -await_fence_array(struct i915_execbuffer *eb) +await_fence_array(struct i915_execbuffer *eb, + struct i915_request *rq) { unsigned int n; int err; @@ -2672,8 +2821,7 @@ await_fence_array(struct i915_execbuffer *eb) if (!eb->fences[n].dma_fence) continue; - err = i915_request_await_dma_fence(eb->request, - eb->fences[n].dma_fence); + err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence); if (err < 0) return err; } @@ -2681,9 +2829,9 @@ await_fence_array(struct i915_execbuffer *eb) return 0; } -static void signal_fence_array(const struct i915_execbuffer *eb) +static void signal_fence_array(const struct i915_execbuffer *eb, + struct dma_fence * const fence) { - struct dma_fence * const fence = &eb->request->fence; unsigned int n; for (n = 0; n < eb->num_fences; n++) { @@ -2731,9 +2879,9 @@ static void retire_requests(struct intel_timeline *tl, struct i915_request *end) break; } -static int eb_request_add(struct i915_execbuffer *eb, int err) +static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq, + int err, bool last_parallel) { - struct i915_request *rq = eb->request; struct intel_timeline * const tl = i915_request_timeline(rq); struct i915_sched_attr attr = {}; struct i915_request *prev; @@ -2755,6 +2903,17 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) err = -ENOENT; /* override any transient errors */ } + if (intel_context_is_parallel(eb->context)) { + if (err) { + __i915_request_skip(rq); + set_bit(I915_FENCE_FLAG_SKIP_PARALLEL, + &rq->fence.flags); + } + if (last_parallel) + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, + &rq->fence.flags); + } + __i915_request_queue(rq, &attr); /* Try to clean up the client's timeline after submitting the request */ @@ -2766,6 +2925,25 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) return err; } +static int eb_requests_add(struct i915_execbuffer *eb, int err) +{ + int i; + + /* + * We iterate in reverse order of creation to release timeline mutexes in + * same order. 
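Both perma_pin_contexts() and eb_pin_engine() above rely on the same partial-unwind idiom: pin the parent and then each child, and on failure unpin only the children that were actually pinned. Because for_each_child() walks a list rather than an array, the unwind pass re-walks the list and stops after i iterations, which is what the otherwise odd-looking j++ < i check does. A stripped-down, stand-alone version of the counting pattern (all names local to the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    #define NCHILD 4

    static bool pin_one(int idx)
    {
            return idx != 2;        /* pretend the third pin fails */
    }

    static void unpin_one(int idx)
    {
            printf("unpin %d\n", idx);
    }

    static int pin_children(void)
    {
            int i = 0, j = 0;

            for (i = 0; i < NCHILD; i++)
                    if (!pin_one(i))
                            goto unwind;
            return 0;

    unwind:
            /* only the first i children were pinned, undo just those */
            for (j = 0; j < i; j++)
                    unpin_one(j);
            return -1;
    }

    int main(void)
    {
            return pin_children() ? 1 : 0;
    }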
+ */ + for_each_batch_add_order(eb, i) { + struct i915_request *rq = eb->requests[i]; + + if (!rq) + continue; + err |= eb_request_add(eb, rq, err, i == 0); + } + + return err; +} + static const i915_user_extension_fn execbuf_extensions[] = { [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, }; @@ -2792,6 +2970,185 @@ parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args, eb); } +static void eb_requests_get(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_get(eb->requests[i]); + } +} + +static void eb_requests_put(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_put(eb->requests[i]); + } +} + +static struct sync_file * +eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + struct dma_fence_array *fence_array; + struct dma_fence **fences; + unsigned int i; + + GEM_BUG_ON(!intel_context_is_parent(eb->context)); + + fences = kmalloc_array(eb->num_batches, sizeof(*fences), GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + + for_each_batch_create_order(eb, i) { + fences[i] = &eb->requests[i]->fence; + __set_bit(I915_FENCE_FLAG_COMPOSITE, + &eb->requests[i]->fence.flags); + } + + fence_array = dma_fence_array_create(eb->num_batches, + fences, + eb->context->parallel.fence_context, + eb->context->parallel.seqno, + false); + if (!fence_array) { + kfree(fences); + return ERR_PTR(-ENOMEM); + } + + /* Move ownership to the dma_fence_array created above */ + for_each_batch_create_order(eb, i) + dma_fence_get(fences[i]); + + if (out_fence_fd != -1) { + out_fence = sync_file_create(&fence_array->base); + /* sync_file now owns fence_arry, drop creation ref */ + dma_fence_put(&fence_array->base); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + eb->composite_fence = &fence_array->base; + + return out_fence; +} + +static struct sync_file * +eb_fences_add(struct i915_execbuffer *eb, struct i915_request *rq, + struct dma_fence *in_fence, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + int err; + + if (unlikely(eb->gem_context->syncobj)) { + struct dma_fence *fence; + + fence = drm_syncobj_fence_get(eb->gem_context->syncobj); + err = i915_request_await_dma_fence(rq, fence); + dma_fence_put(fence); + if (err) + return ERR_PTR(err); + } + + if (in_fence) { + if (eb->args->flags & I915_EXEC_FENCE_SUBMIT) + err = i915_request_await_execution(rq, in_fence); + else + err = i915_request_await_dma_fence(rq, in_fence); + if (err < 0) + return ERR_PTR(err); + } + + if (eb->fences) { + err = await_fence_array(eb, rq); + if (err) + return ERR_PTR(err); + } + + if (intel_context_is_parallel(eb->context)) { + out_fence = eb_composite_fence_create(eb, out_fence_fd); + if (IS_ERR(out_fence)) + return ERR_PTR(-ENOMEM); + } else if (out_fence_fd != -1) { + out_fence = sync_file_create(&rq->fence); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + return out_fence; +} + +static struct intel_context * +eb_find_context(struct i915_execbuffer *eb, unsigned int context_number) +{ + struct intel_context *child; + + if (likely(context_number == 0)) + return eb->context; + + for_each_child(eb->context, child) + if (!--context_number) + return child; + + GEM_BUG_ON("Context not found"); + + return NULL; +} + +static struct sync_file * +eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence, + int out_fence_fd) +{ + struct 
sync_file *out_fence = NULL; + unsigned int i; + + for_each_batch_create_order(eb, i) { + /* Allocate a request for this batch buffer nice and early. */ + eb->requests[i] = i915_request_create(eb_find_context(eb, i)); + if (IS_ERR(eb->requests[i])) { + out_fence = ERR_PTR(PTR_ERR(eb->requests[i])); + eb->requests[i] = NULL; + return out_fence; + } + + /* + * Only the first request added (committed to backend) has to + * take the in fences into account as all subsequent requests + * will have fences inserted inbetween them. + */ + if (i + 1 == eb->num_batches) { + out_fence = eb_fences_add(eb, eb->requests[i], + in_fence, out_fence_fd); + if (IS_ERR(out_fence)) + return out_fence; + } + + /* + * Whilst this request exists, batch_obj will be on the + * active_list, and so will hold the active reference. Only when + * this request is retired will the batch_obj be moved onto + * the inactive_list and lose its active reference. Hence we do + * not need to explicitly hold another reference here. + */ + eb->requests[i]->batch = eb->batches[i]->vma; + if (eb->batch_pool) { + GEM_BUG_ON(intel_context_is_parallel(eb->context)); + intel_gt_buffer_pool_mark_active(eb->batch_pool, + eb->requests[i]); + } + } + + return out_fence; +} + static int i915_gem_do_execbuffer(struct drm_device *dev, struct drm_file *file, @@ -2802,7 +3159,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, struct i915_execbuffer eb; struct dma_fence *in_fence = NULL; struct sync_file *out_fence = NULL; - struct i915_vma *batch; int out_fence_fd = -1; int err; @@ -2826,12 +3182,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.buffer_count = args->buffer_count; eb.batch_start_offset = args->batch_start_offset; - eb.batch_len = args->batch_len; eb.trampoline = NULL; eb.fences = NULL; eb.num_fences = 0; + memset(eb.requests, 0, sizeof(struct i915_request *) * + ARRAY_SIZE(eb.requests)); + eb.composite_fence = NULL; + eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { if (GRAPHICS_VER(i915) >= 11) @@ -2915,70 +3274,25 @@ i915_gem_do_execbuffer(struct drm_device *dev, ww_acquire_done(&eb.ww.ctx); - batch = eb.batch->vma; - - /* Allocate a request for this batch buffer nice and early. */ - eb.request = i915_request_create(eb.context); - if (IS_ERR(eb.request)) { - err = PTR_ERR(eb.request); - goto err_vma; - } - - if (unlikely(eb.gem_context->syncobj)) { - struct dma_fence *fence; - - fence = drm_syncobj_fence_get(eb.gem_context->syncobj); - err = i915_request_await_dma_fence(eb.request, fence); - dma_fence_put(fence); - if (err) - goto err_ext; - } - - if (in_fence) { - if (args->flags & I915_EXEC_FENCE_SUBMIT) - err = i915_request_await_execution(eb.request, - in_fence); - else - err = i915_request_await_dma_fence(eb.request, - in_fence); - if (err < 0) - goto err_request; - } - - if (eb.fences) { - err = await_fence_array(&eb); - if (err) + out_fence = eb_requests_create(&eb, in_fence, out_fence_fd); + if (IS_ERR(out_fence)) { + err = PTR_ERR(out_fence); + if (eb.requests[0]) goto err_request; + else + goto err_vma; } - if (out_fence_fd != -1) { - out_fence = sync_file_create(&eb.request->fence); - if (!out_fence) { - err = -ENOMEM; - goto err_request; - } - } - - /* - * Whilst this request exists, batch_obj will be on the - * active_list, and so will hold the active reference. Only when this - * request is retired will the the batch_obj be moved onto the - * inactive_list and lose its active reference. Hence we do not need - * to explicitly hold another reference here. 
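Even with the composite fence above, a parallel execbuf hands back a single sync_file when an out-fence is requested, and that fence only signals once every batch in the set has completed (dma_fence_array_create() is called with signal_on_any = false). A hedged userspace sketch of consuming it, assuming the existing execbuf fence uAPI (I915_EXEC_FENCE_OUT, fd returned in the upper half of rsvd2) and with error handling trimmed:

    #include <poll.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <drm/i915_drm.h>

    /*
     * Submit with an out-fence and wait for the whole parallel set to
     * retire by polling the returned sync_file fd.  'fd' is the DRM fd,
     * 'execbuf' is assumed to be otherwise fully populated (buffers,
     * context with a parallel engine slot, etc).
     */
    static int submit_and_wait(int fd, struct drm_i915_gem_execbuffer2 *execbuf)
    {
            struct pollfd pfd;
            int out_fence;

            execbuf->flags |= I915_EXEC_FENCE_OUT;
            if (ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf))
                    return -1;

            out_fence = execbuf->rsvd2 >> 32;  /* fd for the composite fence */

            pfd.fd = out_fence;
            pfd.events = POLLIN;               /* readable once signalled */
            poll(&pfd, 1, -1);

            close(out_fence);
            return 0;
    }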
- */ - eb.request->batch = batch; - if (eb.batch_pool) - intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); - - trace_i915_request_queue(eb.request, eb.batch_flags); - err = eb_submit(&eb, batch); + err = eb_submit(&eb); err_request: - i915_request_get(eb.request); - err = eb_request_add(&eb, err); + eb_requests_get(&eb); + err = eb_requests_add(&eb, err); if (eb.fences) - signal_fence_array(&eb); + signal_fence_array(&eb, eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); if (out_fence) { if (err == 0) { @@ -2993,10 +3307,15 @@ err_request: if (unlikely(eb.gem_context->syncobj)) { drm_syncobj_replace_fence(eb.gem_context->syncobj, - &eb.request->fence); + eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); } - i915_request_put(eb.request); + if (!out_fence && eb.composite_fence) + dma_fence_put(eb.composite_fence); + + eb_requests_put(&eb); err_vma: eb_release_vmas(&eb, true); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index e5ae9c06510c..a57a6b7013c2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -134,6 +134,8 @@ static void i915_gem_object_put_pages_internal(struct drm_i915_gem_object *obj, internal_free_pages(pages); obj->mm.dirty = false; + + __start_cpu_write(obj); } static const struct drm_i915_gem_object_ops i915_gem_object_internal_ops = { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index 76ce6a1500bc..1e426a42a36c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -128,6 +128,32 @@ void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE); } +bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *i915 = to_i915(obj->base.dev); + + /* + * This is purely from a security perspective, so we simply don't care + * about non-userspace objects being able to bypass the LLC. + */ + if (!(obj->flags & I915_BO_ALLOC_USER)) + return false; + + /* + * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it + * possible for userspace to bypass the GTT caching bits set by the + * kernel, as per the given object cache_level. This is troublesome + * since the heavy flush we apply when first gathering the pages is + * skipped if the kernel thinks the object is coherent with the GPU. As + * a result it might be possible to bypass the cache and read the + * contents of the page directly, which could be stale data. If it's + * just a case of userspace shooting themselves in the foot then so be + * it, but since i915 takes the stance of always zeroing memory before + * handing it to userspace, we need to prevent this. 
+ */ + return IS_JSL_EHL(i915); +} + static void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) { struct drm_i915_gem_object *obj = to_intel_bo(gem); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 9df3ee60604e..59201801cec5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -514,6 +514,7 @@ i915_gem_object_finish_access(struct drm_i915_gem_object *obj) void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, unsigned int cache_level); +bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj); void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj); void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 7c3da4e3e737..da85169006d4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -427,6 +427,33 @@ struct drm_i915_gem_object { * can freely bypass the CPU cache when touching the pages with the GPU, * where the kernel is completely unaware. On such platform we need * apply the sledgehammer-on-acquire regardless of the @cache_coherent. + * + * Special care is taken on non-LLC platforms, to prevent potential + * information leak. The driver currently ensures: + * + * 1. All userspace objects, by default, have @cache_level set as + * I915_CACHE_NONE. The only exception is userptr objects, where we + * instead force I915_CACHE_LLC, but we also don't allow userspace to + * ever change the @cache_level for such objects. Another special case + * is dma-buf, which doesn't rely on @cache_dirty, but there we + * always do a forced flush when acquiring the pages, if there is a + * chance that the pages can be read directly from main memory with + * the GPU. + * + * 2. All I915_CACHE_NONE objects have @cache_dirty initially true. + * + * 3. All swapped-out objects(i.e shmem) have @cache_dirty set to + * true. + * + * 4. The @cache_dirty is never freely reset before the initial + * flush, even if userspace adjusts the @cache_level through the + * i915_gem_set_caching_ioctl. + * + * 5. All @cache_dirty objects(including swapped-in) are initially + * flushed with a synchronous call to drm_clflush_sg in + * __i915_gem_object_set_pages. The @cache_dirty can be freely reset + * at this point. All further asynchronous clfushes are never security + * critical, i.e userspace is free to race against itself. */ unsigned int cache_dirty:1; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 11f072193f3b..d77da59fae04 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -182,22 +182,7 @@ rebuild_st: if (i915_gem_object_needs_bit17_swizzle(obj)) i915_gem_object_do_bit_17_swizzle(obj, st); - /* - * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it - * possible for userspace to bypass the GTT caching bits set by the - * kernel, as per the given object cache_level. This is troublesome - * since the heavy flush we apply when first gathering the pages is - * skipped if the kernel thinks the object is coherent with the GPU. As - * a result it might be possible to bypass the cache and read the - * contents of the page directly, which could be stale data. 
If it's - * just a case of userspace shooting themselves in the foot then so be - * it, but since i915 takes the stance of always zeroing memory before - * handing it to userspace, we need to prevent this. - * - * By setting cache_dirty here we make the clflush in set_pages - * unconditional on such platforms. - */ - if (IS_JSL_EHL(i915) && obj->flags & I915_BO_ALLOC_USER) + if (i915_gem_object_can_bypass_llc(obj)) obj->cache_dirty = true; __i915_gem_object_set_pages(obj, st, sg_page_sizes); @@ -301,6 +286,8 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages, bool needs_clflush) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); + GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); if (obj->mm.madv == I915_MADV_DONTNEED) @@ -312,6 +299,16 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, drm_clflush_sg(pages); __start_cpu_write(obj); + /* + * On non-LLC platforms, force the flush-on-acquire if this is ever + * swapped-in. Our async flush path is not trust worthy enough yet(and + * happens in the wrong order), and with some tricks it's conceivable + * for userspace to change the cache-level to I915_CACHE_NONE after the + * pages are swapped-in, and since execbuf binds the object before doing + * the async flush, we have a race window. + */ + if (!HAS_LLC(i915)) + obj->cache_dirty = true; } void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 8ea0fa665e53..3173c9f9a040 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -165,8 +165,11 @@ alloc_table: goto err; } - sg_page_sizes = i915_sg_dma_sizes(st->sgl); + WARN_ON_ONCE(!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)); + if (i915_gem_object_can_bypass_llc(obj)) + obj->cache_dirty = true; + sg_page_sizes = i915_sg_dma_sizes(st->sgl); __i915_gem_object_set_pages(obj, st, sg_page_sizes); return 0; @@ -546,7 +549,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, return -ENOMEM; drm_gem_private_object_init(dev, &obj->base, args->user_size); - i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, + I915_BO_ALLOC_USER); obj->mem_flags = I915_BO_FLAG_STRUCT_PAGE; obj->read_domains = I915_GEM_DOMAIN_CPU; obj->write_domain = I915_GEM_DOMAIN_CPU; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index 41d0680f3bd7..b2003133deaf 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -136,6 +136,8 @@ static void put_huge_pages(struct drm_i915_gem_object *obj, huge_pages_free_pages(pages); obj->mm.dirty = false; + + __start_cpu_write(obj); } static const struct drm_i915_gem_object_ops huge_page_ops = { @@ -152,6 +154,7 @@ huge_pages_object(struct drm_i915_private *i915, { static struct lock_class_key lock_class; struct drm_i915_gem_object *obj; + unsigned int cache_level; GEM_BUG_ON(!size); GEM_BUG_ON(!IS_ALIGNED(size, BIT(__ffs(page_mask)))); @@ -173,7 +176,9 @@ huge_pages_object(struct drm_i915_private *i915, obj->write_domain = I915_GEM_DOMAIN_CPU; obj->read_domains = I915_GEM_DOMAIN_CPU; - obj->cache_level = I915_CACHE_NONE; + + cache_level = HAS_LLC(i915) ? 
I915_CACHE_LLC : I915_CACHE_NONE; + i915_gem_object_set_cache_coherency(obj, cache_level); obj->mm.page_mask = page_mask; diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c index ecbcbb86ae1e..8402ed925a69 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c @@ -17,13 +17,20 @@ #include "huge_gem_object.h" #include "mock_context.h" +enum client_tiling { + CLIENT_TILING_LINEAR, + CLIENT_TILING_X, + CLIENT_TILING_Y, + CLIENT_NUM_TILING_TYPES +}; + #define WIDTH 512 #define HEIGHT 32 struct blit_buffer { struct i915_vma *vma; u32 start_val; - u32 tiling; + enum client_tiling tiling; }; struct tiled_blits { @@ -53,9 +60,9 @@ static int prepare_blit(const struct tiled_blits *t, *cs++ = MI_LOAD_REGISTER_IMM(1); *cs++ = i915_mmio_reg_offset(BCS_SWCTRL); cmd = (BCS_SRC_Y | BCS_DST_Y) << 16; - if (src->tiling == I915_TILING_Y) + if (src->tiling == CLIENT_TILING_Y) cmd |= BCS_SRC_Y; - if (dst->tiling == I915_TILING_Y) + if (dst->tiling == CLIENT_TILING_Y) cmd |= BCS_DST_Y; *cs++ = cmd; @@ -172,7 +179,7 @@ static int tiled_blits_create_buffers(struct tiled_blits *t, t->buffers[i].vma = vma; t->buffers[i].tiling = - i915_prandom_u32_max_state(I915_TILING_Y + 1, prng); + i915_prandom_u32_max_state(CLIENT_TILING_Y + 1, prng); } return 0; @@ -197,17 +204,17 @@ static u64 swizzle_bit(unsigned int bit, u64 offset) static u64 tiled_offset(const struct intel_gt *gt, u64 v, unsigned int stride, - unsigned int tiling) + enum client_tiling tiling) { unsigned int swizzle; u64 x, y; - if (tiling == I915_TILING_NONE) + if (tiling == CLIENT_TILING_LINEAR) return v; y = div64_u64_rem(v, stride, &x); - if (tiling == I915_TILING_X) { + if (tiling == CLIENT_TILING_X) { v = div64_u64_rem(y, 8, &y) * stride * 8; v += y * 512; v += div64_u64_rem(x, 512, &x) << 12; @@ -244,12 +251,12 @@ static u64 tiled_offset(const struct intel_gt *gt, return v; } -static const char *repr_tiling(int tiling) +static const char *repr_tiling(enum client_tiling tiling) { switch (tiling) { - case I915_TILING_NONE: return "linear"; - case I915_TILING_X: return "X"; - case I915_TILING_Y: return "Y"; + case CLIENT_TILING_LINEAR: return "linear"; + case CLIENT_TILING_X: return "X"; + case CLIENT_TILING_Y: return "Y"; default: return "unknown"; } } diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e9a0cad5c34d..488acd39ff67 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -240,6 +240,8 @@ int __intel_context_do_pin_ww(struct intel_context *ce, if (err) goto err_post_unpin; + intel_engine_pm_might_get(ce->engine); + if (unlikely(intel_context_is_closed(ce))) { err = -ENOENT; goto err_unlock; @@ -362,8 +364,8 @@ static int __intel_context_active(struct i915_active *active) return 0; } -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, - enum i915_sw_fence_notify state) +static int __i915_sw_fence_call +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) { return NOTIFY_DONE; } @@ -399,6 +401,10 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) ce->guc_id.id = GUC_INVALID_LRC_ID; INIT_LIST_HEAD(&ce->guc_id.link); + INIT_LIST_HEAD(&ce->destroyed_link); + + INIT_LIST_HEAD(&ce->parallel.child_list); + /* * Initialize fence to be complete as this is expected to be complete * unless there is a pending schedule disable outstanding. 
@@ -413,10 +419,17 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) void intel_context_fini(struct intel_context *ce) { + struct intel_context *child, *next; + if (ce->timeline) intel_timeline_put(ce->timeline); i915_vm_put(ce->vm); + /* Need to put the creation ref for the children */ + if (intel_context_is_parent(ce)) + for_each_child_safe(ce, child, next) + intel_context_put(child); + mutex_destroy(&ce->pin_mutex); i915_active_fini(&ce->active); i915_sw_fence_fini(&ce->guc_state.blocked); @@ -515,24 +528,53 @@ retry: struct i915_request *intel_context_find_active_request(struct intel_context *ce) { + struct intel_context *parent = intel_context_to_parent(ce); struct i915_request *rq, *active = NULL; unsigned long flags; GEM_BUG_ON(!intel_engine_uses_guc(ce->engine)); - spin_lock_irqsave(&ce->guc_state.lock, flags); - list_for_each_entry_reverse(rq, &ce->guc_state.requests, + /* + * We search the parent list to find an active request on the submitted + * context. The parent list contains the requests for all the contexts + * in the relationship so we have to do a compare of each request's + * context. + */ + spin_lock_irqsave(&parent->guc_state.lock, flags); + list_for_each_entry_reverse(rq, &parent->guc_state.requests, sched.link) { + if (rq->context != ce) + continue; if (i915_request_completed(rq)) break; active = rq; } - spin_unlock_irqrestore(&ce->guc_state.lock, flags); + spin_unlock_irqrestore(&parent->guc_state.lock, flags); return active; } +void intel_context_bind_parent_child(struct intel_context *parent, + struct intel_context *child) +{ + /* + * Callers responsibility to validate that this function is used + * correctly but we use GEM_BUG_ON here ensure that they do. + */ + GEM_BUG_ON(!intel_engine_uses_guc(parent->engine)); + GEM_BUG_ON(intel_context_is_pinned(parent)); + GEM_BUG_ON(intel_context_is_child(parent)); + GEM_BUG_ON(intel_context_is_pinned(child)); + GEM_BUG_ON(intel_context_is_child(child)); + GEM_BUG_ON(intel_context_is_parent(child)); + + parent->parallel.child_index = parent->parallel.number_children++; + list_add_tail(&child->parallel.child_link, + &parent->parallel.child_list); + child->parallel.parent = parent; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_context.c" #endif diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index c41098950746..246c37d72cd7 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -44,6 +44,54 @@ void intel_context_free(struct intel_context *ce); int intel_context_reconfigure_sseu(struct intel_context *ce, const struct intel_sseu sseu); +#define PARENT_SCRATCH_SIZE PAGE_SIZE + +static inline bool intel_context_is_child(struct intel_context *ce) +{ + return !!ce->parallel.parent; +} + +static inline bool intel_context_is_parent(struct intel_context *ce) +{ + return !!ce->parallel.number_children; +} + +static inline bool intel_context_is_pinned(struct intel_context *ce); + +static inline struct intel_context * +intel_context_to_parent(struct intel_context *ce) +{ + if (intel_context_is_child(ce)) { + /* + * The parent holds ref count to the child so it is always safe + * for the parent to access the child, but the child has a + * pointer to the parent without a ref. To ensure this is safe + * the child should only access the parent pointer while the + * parent is pinned. 
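As an illustrative (non-patch) sketch of how the parent/child helpers in this header compose: the parent holds the creation references to its children (see intel_context_fini() above), so walking the relationship is safe for as long as the caller holds a reference to the parent:

    /*
     * Illustrative only: count the children of a parallel parent using the
     * helpers added here; the result should match
     * parent->parallel.number_children.
     */
    static unsigned int count_children(struct intel_context *parent)
    {
            struct intel_context *child;
            unsigned int n = 0;

            GEM_BUG_ON(!intel_context_is_parent(parent));

            for_each_child(parent, child)
                    n++;

            return n;
    }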
+ */ + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); + + return ce->parallel.parent; + } else { + return ce; + } +} + +static inline bool intel_context_is_parallel(struct intel_context *ce) +{ + return intel_context_is_child(ce) || intel_context_is_parent(ce); +} + +void intel_context_bind_parent_child(struct intel_context *parent, + struct intel_context *child); + +#define for_each_child(parent, ce)\ + list_for_each_entry(ce, &(parent)->parallel.child_list,\ + parallel.child_link) +#define for_each_child_safe(parent, ce, cn)\ + list_for_each_entry_safe(ce, cn, &(parent)->parallel.child_list,\ + parallel.child_link) + /** * intel_context_lock_pinned - Stablises the 'pinned' status of the HW context * @ce - the context @@ -193,7 +241,13 @@ intel_context_timeline_lock(struct intel_context *ce) struct intel_timeline *tl = ce->timeline; int err; - err = mutex_lock_interruptible(&tl->mutex); + if (intel_context_is_parent(ce)) + err = mutex_lock_interruptible_nested(&tl->mutex, 0); + else if (intel_context_is_child(ce)) + err = mutex_lock_interruptible_nested(&tl->mutex, + ce->parallel.child_index + 1); + else + err = mutex_lock_interruptible(&tl->mutex); if (err) return ERR_PTR(err); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 12252c411159..9e0177dc5484 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -55,9 +55,13 @@ struct intel_context_ops { void (*reset)(struct intel_context *ce); void (*destroy)(struct kref *kref); - /* virtual engine/context interface */ + /* virtual/parallel engine/context interface */ struct intel_context *(*create_virtual)(struct intel_engine_cs **engine, - unsigned int count); + unsigned int count, + unsigned long flags); + struct intel_context *(*create_parallel)(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width); struct intel_engine_cs *(*get_sibling)(struct intel_engine_cs *engine, unsigned int sibling); }; @@ -113,6 +117,7 @@ struct intel_context { #define CONTEXT_NOPREEMPT 8 #define CONTEXT_LRCA_DIRTY 9 #define CONTEXT_GUC_INIT 10 +#define CONTEXT_PERMA_PIN 11 struct { u64 timeout_us; @@ -197,22 +202,80 @@ struct intel_context { struct { /** * @id: handle which is used to uniquely identify this context - * with the GuC, protected by guc->contexts_lock + * with the GuC, protected by guc->submission_state.lock */ u16 id; /** * @ref: the number of references to the guc_id, when * transitioning in and out of zero protected by - * guc->contexts_lock + * guc->submission_state.lock */ atomic_t ref; /** * @link: in guc->guc_id_list when the guc_id has no refs but is - * still valid, protected by guc->contexts_lock + * still valid, protected by guc->submission_state.lock */ struct list_head link; } guc_id; + /** + * @destroyed_link: link in guc->submission_state.destroyed_contexts, in + * list when context is pending to be destroyed (deregistered with the + * GuC), protected by guc->submission_state.lock + */ + struct list_head destroyed_link; + + /** @parallel: sub-structure for parallel submission members */ + struct { + union { + /** + * @child_list: parent's list of children + * contexts, no protection as immutable after context + * creation + */ + struct list_head child_list; + /** + * @child_link: child's link into parent's list of + * children + */ + struct list_head child_link; + }; + /** @parent: pointer to parent if child */ + struct intel_context *parent; + /** + * @last_rq: last 
request submitted on a parallel context, used + * to insert submit fences between requests in the parallel + * context + */ + struct i915_request *last_rq; + /** + * @fence_context: fence context composite fence when doing + * parallel submission + */ + u64 fence_context; + /** + * @seqno: seqno for composite fence when doing parallel + * submission + */ + u32 seqno; + /** @number_children: number of children if parent */ + u8 number_children; + /** @child_index: index into child_list if child */ + u8 child_index; + /** @guc: GuC specific members for parallel submission */ + struct { + /** @wqi_head: head pointer in work queue */ + u16 wqi_head; + /** @wqi_tail: tail pointer in work queue */ + u16 wqi_tail; + /** + * @parent_page: page in context state (ce->state) used + * by parent for work queue, process descriptor + */ + u8 parent_page; + } guc; + } parallel; + #ifdef CONFIG_DRM_I915_SELFTEST /** * @drop_schedule_enable: Force drop of schedule enable G2H for selftest diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 452248884ef1..08559ace0ada 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -2,6 +2,7 @@ #ifndef _INTEL_RINGBUFFER_H_ #define _INTEL_RINGBUFFER_H_ +#include <asm/cacheflush.h> #include <drm/drm_util.h> #include <linux/hashtable.h> @@ -281,9 +282,19 @@ intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) return intel_engine_has_preemption(engine); } +#define FORCE_VIRTUAL BIT(0) struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, - unsigned int count); + unsigned int count, unsigned long flags); + +static inline struct intel_context * +intel_engine_create_parallel(struct intel_engine_cs **engines, + unsigned int num_engines, + unsigned int width) +{ + GEM_BUG_ON(!engines[0]->cops->create_parallel); + return engines[0]->cops->create_parallel(engines, num_engines, width); +} static inline bool intel_virtual_engine_has_heartbeat(const struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 2ae57e4656a3..ff6753ccb129 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -290,7 +290,8 @@ static void nop_irq_handler(struct intel_engine_cs *engine, u16 iir) GEM_DEBUG_WARN_ON(iir); } -static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) +static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id, + u8 logical_instance) { const struct engine_info *info = &intel_engines[id]; struct drm_i915_private *i915 = gt->i915; @@ -335,6 +336,7 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) engine->class = info->class; engine->instance = info->instance; + engine->logical_mask = BIT(logical_instance); __sprint_engine_name(engine); engine->props.heartbeat_interval_ms = @@ -588,6 +590,37 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) return info->engine_mask; } +static void populate_logical_ids(struct intel_gt *gt, u8 *logical_ids, + u8 class, const u8 *map, u8 num_instances) +{ + int i, j; + u8 current_logical_id = 0; + + for (j = 0; j < num_instances; ++j) { + for (i = 0; i < ARRAY_SIZE(intel_engines); ++i) { + if (!HAS_ENGINE(gt, i) || + intel_engines[i].class != class) + continue; + + if (intel_engines[i].instance == map[j]) { + logical_ids[intel_engines[i].instance] = + current_logical_id++; + break; + } + } + } +} + +static 
void setup_logical_ids(struct intel_gt *gt, u8 *logical_ids, u8 class) +{ + int i; + u8 map[MAX_ENGINE_INSTANCE + 1]; + + for (i = 0; i < MAX_ENGINE_INSTANCE + 1; ++i) + map[i] = i; + populate_logical_ids(gt, logical_ids, class, map, ARRAY_SIZE(map)); +} + /** * intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers * @gt: pointer to struct intel_gt @@ -599,7 +632,8 @@ int intel_engines_init_mmio(struct intel_gt *gt) struct drm_i915_private *i915 = gt->i915; const unsigned int engine_mask = init_engine_mask(gt); unsigned int mask = 0; - unsigned int i; + unsigned int i, class; + u8 logical_ids[MAX_ENGINE_INSTANCE + 1]; int err; drm_WARN_ON(&i915->drm, engine_mask == 0); @@ -609,15 +643,23 @@ int intel_engines_init_mmio(struct intel_gt *gt) if (i915_inject_probe_failure(i915)) return -ENODEV; - for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { - if (!HAS_ENGINE(gt, i)) - continue; + for (class = 0; class < MAX_ENGINE_CLASS + 1; ++class) { + setup_logical_ids(gt, logical_ids, class); - err = intel_engine_setup(gt, i); - if (err) - goto cleanup; + for (i = 0; i < ARRAY_SIZE(intel_engines); ++i) { + u8 instance = intel_engines[i].instance; + + if (intel_engines[i].class != class || + !HAS_ENGINE(gt, i)) + continue; - mask |= BIT(i); + err = intel_engine_setup(gt, i, + logical_ids[instance]); + if (err) + goto cleanup; + + mask |= BIT(i); + } } /* @@ -1911,16 +1953,16 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, - unsigned int count) + unsigned int count, unsigned long flags) { if (count == 0) return ERR_PTR(-EINVAL); - if (count == 1) + if (count == 1 && !(flags & FORCE_VIRTUAL)) return intel_context_create(siblings[0]); GEM_BUG_ON(!siblings[0]->cops->create_virtual); - return siblings[0]->cops->create_virtual(siblings, count); + return siblings[0]->cops->create_virtual(siblings, count, flags); } struct i915_request * diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index dacd62773735..a1334b48dde7 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -162,6 +162,19 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) unsigned long flags; bool result = true; + /* + * This is execlist specific behaviour intended to ensure the GPU is + * idle by switching to a known 'safe' context. With GuC submission, the + * same idle guarantee is achieved by other means (disabling + * scheduling). Further, switching to a 'safe' context has no effect + * with GuC submission as the scheduler can just switch back again. + * + * FIXME: Move this backend scheduler specific behaviour into the + * scheduler backend. + */ + if (intel_engine_uses_guc(engine)) + return true; + /* GPU is pointing to the void, as good as in the kernel context. 
*/ if (intel_gt_is_wedged(engine->gt)) return true; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h index 8520c595f5e1..d68675925b79 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h @@ -6,9 +6,11 @@ #ifndef INTEL_ENGINE_PM_H #define INTEL_ENGINE_PM_H +#include "i915_drv.h" #include "i915_request.h" #include "intel_engine_types.h" #include "intel_wakeref.h" +#include "intel_gt_pm.h" static inline bool intel_engine_pm_is_awake(const struct intel_engine_cs *engine) @@ -16,6 +18,11 @@ intel_engine_pm_is_awake(const struct intel_engine_cs *engine) return intel_wakeref_is_active(&engine->wakeref); } +static inline void __intel_engine_pm_get(struct intel_engine_cs *engine) +{ + __intel_wakeref_get(&engine->wakeref); +} + static inline void intel_engine_pm_get(struct intel_engine_cs *engine) { intel_wakeref_get(&engine->wakeref); @@ -26,6 +33,21 @@ static inline bool intel_engine_pm_get_if_awake(struct intel_engine_cs *engine) return intel_wakeref_get_if_active(&engine->wakeref); } +static inline void intel_engine_pm_might_get(struct intel_engine_cs *engine) +{ + if (!intel_engine_is_virtual(engine)) { + intel_wakeref_might_get(&engine->wakeref); + } else { + struct intel_gt *gt = engine->gt; + struct intel_engine_cs *tengine; + intel_engine_mask_t tmp, mask = engine->mask; + + for_each_engine_masked(tengine, gt, mask, tmp) + intel_wakeref_might_get(&tengine->wakeref); + } + intel_gt_pm_might_get(engine->gt); +} + static inline void intel_engine_pm_put(struct intel_engine_cs *engine) { intel_wakeref_put(&engine->wakeref); @@ -47,6 +69,21 @@ static inline void intel_engine_pm_flush(struct intel_engine_cs *engine) intel_wakeref_unlock_wait(&engine->wakeref); } +static inline void intel_engine_pm_might_put(struct intel_engine_cs *engine) +{ + if (!intel_engine_is_virtual(engine)) { + intel_wakeref_might_put(&engine->wakeref); + } else { + struct intel_gt *gt = engine->gt; + struct intel_engine_cs *tengine; + intel_engine_mask_t tmp, mask = engine->mask; + + for_each_engine_masked(tengine, gt, mask, tmp) + intel_wakeref_might_put(&tengine->wakeref); + } + intel_gt_pm_might_put(engine->gt); +} + static inline struct i915_request * intel_engine_create_kernel_request(struct intel_engine_cs *engine) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 9167ce52487c..e0f773585c29 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -269,6 +269,13 @@ struct intel_engine_cs { unsigned int guc_id; intel_engine_mask_t mask; + /** + * @logical_mask: logical mask of engine, reported to user space via + * query IOCTL and used to communicate with the GuC in logical space. + * The logical instance of a physical engine can change based on product + * and fusing. 
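+ *
+ * For example, on a hypothetical part where fusing leaves only VCS0 and
+ * VCS2 present, populate_logical_ids() assigns them logical instances 0
+ * and 1, so VCS2 has logical_mask == BIT(1) and guc_mapping_table_init()
+ * later in this patch fills mapping_table[class][1] = 2.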
+ */ + intel_engine_mask_t logical_mask; u8 class; u8 instance; diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 73a79c2acd3a..bedb80057046 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -201,7 +201,8 @@ static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) } static struct intel_context * -execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count); +execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags); static struct i915_request * __active_request(const struct intel_timeline * const tl, @@ -3784,7 +3785,8 @@ unlock: } static struct intel_context * -execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count) +execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags) { struct virtual_engine *ve; unsigned int n; @@ -3877,6 +3879,7 @@ execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count) ve->siblings[ve->num_siblings++] = sibling; ve->base.mask |= sibling->mask; + ve->base.logical_mask |= sibling->logical_mask; /* * All physical engines must be compatible for their emission diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.c index 1fe19ccd2794..f103664b71d4 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.c @@ -13,6 +13,59 @@ #include "pxp/intel_pxp_debugfs.h" #include "uc/intel_uc_debugfs.h" +int intel_gt_debugfs_reset_show(struct intel_gt *gt, u64 *val) +{ + int ret = intel_gt_terminally_wedged(gt); + + switch (ret) { + case -EIO: + *val = 1; + return 0; + case 0: + *val = 0; + return 0; + default: + return ret; + } +} + +int intel_gt_debugfs_reset_store(struct intel_gt *gt, u64 val) +{ + /* Flush any previous reset before applying for a new one */ + wait_event(gt->reset.queue, + !test_bit(I915_RESET_BACKOFF, >->reset.flags)); + + intel_gt_handle_error(gt, val, I915_ERROR_CAPTURE, + "Manually reset engine mask to %llx", val); + return 0; +} + +/* + * keep the interface clean where the first parameter + * is a 'struct intel_gt *' instead of 'void *' + */ +static int __intel_gt_debugfs_reset_show(void *data, u64 *val) +{ + return intel_gt_debugfs_reset_show(data, val); +} + +static int __intel_gt_debugfs_reset_store(void *data, u64 val) +{ + return intel_gt_debugfs_reset_store(data, val); +} + +DEFINE_SIMPLE_ATTRIBUTE(reset_fops, __intel_gt_debugfs_reset_show, + __intel_gt_debugfs_reset_store, "%llu\n"); + +static void gt_debugfs_register(struct intel_gt *gt, struct dentry *root) +{ + static const struct intel_gt_debugfs_file files[] = { + { "reset", &reset_fops, NULL }, + }; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), gt); +} + void intel_gt_debugfs_register(struct intel_gt *gt) { struct dentry *root; @@ -24,6 +77,8 @@ void intel_gt_debugfs_register(struct intel_gt *gt) if (IS_ERR(root)) return; + gt_debugfs_register(gt, root); + intel_gt_engines_debugfs_register(gt, root); intel_gt_pm_debugfs_register(gt, root); intel_sseu_debugfs_register(gt, root); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h index 8b6fca09897c..e307ceb99031 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h @@ -35,4 +35,8 @@ void 
intel_gt_debugfs_register_files(struct dentry *root, const struct intel_gt_debugfs_file *files, unsigned long count, void *data); +/* functions that need to be accessed by the upper level non-gt interfaces */ +int intel_gt_debugfs_reset_show(struct intel_gt *gt, u64 *val); +int intel_gt_debugfs_reset_store(struct intel_gt *gt, u64 val); + #endif /* INTEL_GT_DEBUGFS_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h index d0588d8aaa44..bc898df7a48c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h @@ -31,6 +31,11 @@ static inline bool intel_gt_pm_get_if_awake(struct intel_gt *gt) return intel_wakeref_get_if_active(>->wakeref); } +static inline void intel_gt_pm_might_get(struct intel_gt *gt) +{ + intel_wakeref_might_get(>->wakeref); +} + static inline void intel_gt_pm_put(struct intel_gt *gt) { intel_wakeref_put(>->wakeref); @@ -41,6 +46,15 @@ static inline void intel_gt_pm_put_async(struct intel_gt *gt) intel_wakeref_put_async(>->wakeref); } +static inline void intel_gt_pm_might_put(struct intel_gt *gt) +{ + intel_wakeref_might_put(>->wakeref); +} + +#define with_intel_gt_pm(gt, tmp) \ + for (tmp = 1, intel_gt_pm_get(gt); tmp; \ + intel_gt_pm_put(gt), tmp = 0) + static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) { return intel_wakeref_wait_for_idle(>->wakeref); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c index 5f84ad602642..404dfa7673c6 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c @@ -13,11 +13,52 @@ #include "intel_gt_pm.h" #include "intel_gt_pm_debugfs.h" #include "intel_llc.h" +#include "intel_pcode.h" #include "intel_rc6.h" #include "intel_rps.h" #include "intel_runtime_pm.h" -#include "intel_sideband.h" #include "intel_uncore.h" +#include "vlv_sideband.h" + +int intel_gt_pm_debugfs_forcewake_user_open(struct intel_gt *gt) +{ + atomic_inc(>->user_wakeref); + intel_gt_pm_get(gt); + if (GRAPHICS_VER(gt->i915) >= 6) + intel_uncore_forcewake_user_get(gt->uncore); + + return 0; +} + +int intel_gt_pm_debugfs_forcewake_user_release(struct intel_gt *gt) +{ + if (GRAPHICS_VER(gt->i915) >= 6) + intel_uncore_forcewake_user_put(gt->uncore); + intel_gt_pm_put(gt); + atomic_dec(>->user_wakeref); + + return 0; +} + +static int forcewake_user_open(struct inode *inode, struct file *file) +{ + struct intel_gt *gt = inode->i_private; + + return intel_gt_pm_debugfs_forcewake_user_open(gt); +} + +static int forcewake_user_release(struct inode *inode, struct file *file) +{ + struct intel_gt *gt = inode->i_private; + + return intel_gt_pm_debugfs_forcewake_user_release(gt); +} + +static const struct file_operations forcewake_user_fops = { + .owner = THIS_MODULE, + .open = forcewake_user_open, + .release = forcewake_user_release, +}; static int fw_domains_show(struct seq_file *m, void *data) { @@ -627,6 +668,7 @@ void intel_gt_pm_debugfs_register(struct intel_gt *gt, struct dentry *root) { "drpc", &drpc_fops, NULL }, { "frequency", &frequency_fops, NULL }, { "forcewake", &fw_domains_fops, NULL }, + { "forcewake_user", &forcewake_user_fops, NULL}, { "llc", &llc_fops, llc_eval }, { "rps_boost", &rps_boost_fops, rps_eval }, }; diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.h index 2b824289582b..a8457887ec65 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.h @@ -13,4 
+13,8 @@ struct drm_printer; void intel_gt_pm_debugfs_register(struct intel_gt *gt, struct dentry *root); void intel_gt_pm_frequency_dump(struct intel_gt *gt, struct drm_printer *m); +/* functions that need to be accessed by the upper level non-gt interfaces */ +int intel_gt_pm_debugfs_forcewake_user_open(struct intel_gt *gt); +int intel_gt_pm_debugfs_forcewake_user_release(struct intel_gt *gt); + #endif /* INTEL_GT_PM_DEBUGFS_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_llc.c b/drivers/gpu/drm/i915/gt/intel_llc.c index eb1a15deed22..08d7d5ae263a 100644 --- a/drivers/gpu/drm/i915/gt/intel_llc.c +++ b/drivers/gpu/drm/i915/gt/intel_llc.c @@ -3,12 +3,13 @@ * Copyright © 2019 Intel Corporation */ +#include <asm/tsc.h> #include <linux/cpufreq.h> #include "i915_drv.h" #include "intel_gt.h" #include "intel_llc.h" -#include "intel_sideband.h" +#include "intel_pcode.h" struct ia_constants { unsigned int min_gpu_freq; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 3ef9eaf8c50e..56156cf18c41 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -942,6 +942,11 @@ __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine) context_size += PAGE_SIZE; } + if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { + ce->parallel.guc.parent_page = context_size / PAGE_SIZE; + context_size += PARENT_SCRATCH_SIZE; + } + obj = i915_gem_object_create_lmem(engine->i915, context_size, I915_BO_ALLOC_PM_VOLATILE); if (IS_ERR(obj)) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index 799d382eea79..43093dd2d0c9 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -9,8 +9,8 @@ #include "i915_vgpu.h" #include "intel_gt.h" #include "intel_gt_pm.h" +#include "intel_pcode.h" #include "intel_rc6.h" -#include "intel_sideband.h" /** * DOC: RC6 diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index 593524195707..586dca1731ce 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -292,7 +292,7 @@ static void xcs_sanitize(struct intel_engine_cs *engine) sanitize_hwsp(engine); /* And scrub the dirty cachelines for the HWSP */ - clflush_cache_range(engine->status_page.addr, PAGE_SIZE); + drm_clflush_virt_range(engine->status_page.addr, PAGE_SIZE); intel_engine_reset_pinned_contexts(engine); } diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 172de6c9f949..5e275f8dda8c 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -11,8 +11,9 @@ #include "intel_gt_clock_utils.h" #include "intel_gt_irq.h" #include "intel_gt_pm_irq.h" +#include "intel_pcode.h" #include "intel_rps.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" #include "../../../platform/x86/intel_ips.h" #define BUSY_MAX_EI 20u /* ms */ diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index 1257f4f11e66..438bbc7b8147 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -64,7 +64,7 @@ intel_timeline_pin_map(struct intel_timeline *timeline) timeline->hwsp_map = vaddr; timeline->hwsp_seqno = memset(vaddr + ofs, 0, TIMELINE_SEQNO_BYTES); - clflush(vaddr + ofs); + drm_clflush_virt_range(vaddr + ofs, TIMELINE_SEQNO_BYTES); return 0; } @@ -225,7 +225,7 @@ void 
intel_timeline_reset_seqno(const struct intel_timeline *tl) memset(hwsp_seqno + 1, 0, TIMELINE_SEQNO_BYTES - sizeof(*hwsp_seqno)); WRITE_ONCE(*hwsp_seqno, tl->seqno); - clflush(hwsp_seqno); + drm_clflush_virt_range(hwsp_seqno, TIMELINE_SEQNO_BYTES); } void intel_timeline_enter(struct intel_timeline *tl) diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 25a8c4f62b0d..b367ecfa42de 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -3733,7 +3733,7 @@ static int nop_virtual_engine(struct intel_gt *gt, GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ve)); for (n = 0; n < nctx; n++) { - ve[n] = intel_engine_create_virtual(siblings, nsibling); + ve[n] = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve[n])) { err = PTR_ERR(ve[n]); nctx = n; @@ -3929,7 +3929,7 @@ static int mask_virtual_engine(struct intel_gt *gt, * restrict it to our desired engine within the virtual engine. */ - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_close; @@ -4060,7 +4060,7 @@ static int slicein_virtual_engine(struct intel_gt *gt, i915_request_add(rq); } - ce = intel_engine_create_virtual(siblings, nsibling); + ce = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; @@ -4112,7 +4112,7 @@ static int sliceout_virtual_engine(struct intel_gt *gt, /* XXX We do not handle oversubscription and fairness with normal rq */ for (n = 0; n < nsibling; n++) { - ce = intel_engine_create_virtual(siblings, nsibling); + ce = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; @@ -4214,7 +4214,7 @@ static int preserved_virtual_engine(struct intel_gt *gt, if (err) goto out_scratch; - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_scratch; @@ -4354,7 +4354,7 @@ static int reset_virtual_engine(struct intel_gt *gt, if (igt_spinner_init(&spin, gt)) return -ENOMEM; - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_spin; diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h index 8ff582222aff..ba10bd374cee 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h @@ -142,6 +142,7 @@ enum intel_guc_action { INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505, INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506, INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600, + INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601, INTEL_GUC_ACTION_RESET_CLIENT = 0x5507, INTEL_GUC_ACTION_LIMIT }; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 8f8182bf7c11..6e228343e8cb 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -756,3 +756,32 @@ void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p) } } } + +void intel_guc_write_barrier(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + if (i915_gem_object_is_lmem(guc->ct.vma->obj)) { + /* + * Ensure intel_uncore_write_fw can be used rather than + * intel_uncore_write. 
+ */ + GEM_BUG_ON(guc->send_regs.fw_domains); + + /* + * This register is used by the i915 and GuC for MMIO based + * communication. Once we are in this code CTBs are the only + * method the i915 uses to communicate with the GuC so it is + * safe to write to this register (a value of 0 is NOP for MMIO + * communication). If we ever start mixing CTBs and MMIOs a new + * register will have to be chosen. This function is also used + * to enforce ordering of a work queue item write and an update + * to the process descriptor. When a work queue is being used, + * CTBs are also the only mechanism of communication. + */ + intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0); + } else { + /* wmb() sufficient for a barrier if in smem */ + wmb(); + } +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index 5dd174babf7a..31cf9fb48c7e 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -46,6 +46,15 @@ struct intel_guc { * submitted until the stalled request is processed. */ struct i915_request *stalled_request; + /** + * @submission_stall_reason: reason why submission is stalled + */ + enum { + STALL_NONE, + STALL_REGISTER_CONTEXT, + STALL_MOVE_LRC_TAIL, + STALL_ADD_REQUEST, + } submission_stall_reason; /* intel_guc_recv interrupt related state */ /** @irq_lock: protects GuC irq state */ @@ -71,16 +80,41 @@ struct intel_guc { } interrupts; /** - * @contexts_lock: protects guc_ids, guc_id_list, ce->guc_id.id, and - * ce->guc_id.ref when transitioning in and out of zero - */ - spinlock_t contexts_lock; - /** @guc_ids: used to allocate unique ce->guc_id.id values */ - struct ida guc_ids; - /** - * @guc_id_list: list of intel_context with valid guc_ids but no refs + * @submission_state: sub-structure for submission state protected by + * single lock */ - struct list_head guc_id_list; + struct { + /** + * @lock: protects everything in submission_state, + * ce->guc_id.id, and ce->guc_id.ref when transitioning in and + * out of zero + */ + spinlock_t lock; + /** + * @guc_ids: used to allocate new guc_ids, single-lrc + */ + struct ida guc_ids; + /** + * @guc_ids_bitmap: used to allocate new guc_ids, multi-lrc + */ + unsigned long *guc_ids_bitmap; + /** + * @guc_id_list: list of intel_context with valid guc_ids but no + * refs + */ + struct list_head guc_id_list; + /** + * @destroyed_contexts: list of contexts waiting to be destroyed + * (deregistered with the GuC) + */ + struct list_head destroyed_contexts; + /** + * @destroyed_worker: worker to deregister contexts, need as we + * need to take a GT PM reference and can't from destroy + * function as it might be in an atomic context (no sleeping) + */ + struct work_struct destroyed_worker; + } submission_state; /** * @submission_supported: tracks whether we support GuC submission on @@ -342,4 +376,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc); void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p); +void intel_guc_write_barrier(struct intel_guc *guc); + #endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c index 2c6ea64af7ec..621c893a009f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -176,7 +176,7 @@ static void guc_mapping_table_init(struct intel_gt *gt, for_each_engine(engine, gt, id) { u8 guc_class = engine_class_to_guc_class(engine->class); - system_info->mapping_table[guc_class][engine->instance] = + 
system_info->mapping_table[guc_class][ilog2(engine->logical_mask)] = engine->instance; } } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c index 0a3504bc0b61..a0cc34be7b56 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c @@ -383,28 +383,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct) return ++ct->requests.last_fence; } -static void write_barrier(struct intel_guc_ct *ct) -{ - struct intel_guc *guc = ct_to_guc(ct); - struct intel_gt *gt = guc_to_gt(guc); - - if (i915_gem_object_is_lmem(guc->ct.vma->obj)) { - GEM_BUG_ON(guc->send_regs.fw_domains); - /* - * This register is used by the i915 and GuC for MMIO based - * communication. Once we are in this code CTBs are the only - * method the i915 uses to communicate with the GuC so it is - * safe to write to this register (a value of 0 is NOP for MMIO - * communication). If we ever start mixing CTBs and MMIOs a new - * register will have to be chosen. - */ - intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0); - } else { - /* wmb() sufficient for a barrier if in smem */ - wmb(); - } -} - static int ct_write(struct intel_guc_ct *ct, const u32 *action, u32 len /* in dwords */, @@ -474,7 +452,7 @@ static int ct_write(struct intel_guc_ct *ct, * make sure H2G buffer update and LRC tail update (if this triggering a * submission) are visible before updating the descriptor tail */ - write_barrier(ct); + intel_guc_write_barrier(ct_to_guc(ct)); /* update local copies */ ctb->tail = tail; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index fa4be13c8854..722933e26347 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -52,27 +52,27 @@ #define GUC_DOORBELL_INVALID 256 -#define GUC_WQ_SIZE (PAGE_SIZE * 2) - -/* Work queue item header definitions */ +/* + * Work queue item header definitions + * + * Work queue is circular buffer used to submit complex (multi-lrc) submissions + * to the GuC. A work queue item is an entry in the circular buffer. 
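+ *
+ * For example, __guc_wq_item_append() later in this patch builds the first
+ * dword of a multi-LRC work queue item as:
+ *   FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | FIELD_PREP(WQ_LEN_MASK, len_dw)
+ * where len_dw is the item length in dwords excluding this header dword.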
+ */ #define WQ_STATUS_ACTIVE 1 #define WQ_STATUS_SUSPENDED 2 #define WQ_STATUS_CMD_ERROR 3 #define WQ_STATUS_ENGINE_ID_NOT_USED 4 #define WQ_STATUS_SUSPENDED_FROM_RESET 5 -#define WQ_TYPE_SHIFT 0 -#define WQ_TYPE_BATCH_BUF (0x1 << WQ_TYPE_SHIFT) -#define WQ_TYPE_PSEUDO (0x2 << WQ_TYPE_SHIFT) -#define WQ_TYPE_INORDER (0x3 << WQ_TYPE_SHIFT) -#define WQ_TYPE_NOOP (0x4 << WQ_TYPE_SHIFT) -#define WQ_TARGET_SHIFT 10 -#define WQ_LEN_SHIFT 16 -#define WQ_NO_WCFLUSH_WAIT (1 << 27) -#define WQ_PRESENT_WORKLOAD (1 << 28) - -#define WQ_RING_TAIL_SHIFT 20 -#define WQ_RING_TAIL_MAX 0x7FF /* 2^11 QWords */ -#define WQ_RING_TAIL_MASK (WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT) +#define WQ_TYPE_BATCH_BUF 0x1 +#define WQ_TYPE_PSEUDO 0x2 +#define WQ_TYPE_INORDER 0x3 +#define WQ_TYPE_NOOP 0x4 +#define WQ_TYPE_MULTI_LRC 0x5 +#define WQ_TYPE_MASK GENMASK(7, 0) +#define WQ_LEN_MASK GENMASK(26, 16) + +#define WQ_GUC_ID_MASK GENMASK(15, 0) +#define WQ_RING_TAIL_MASK GENMASK(28, 18) #define GUC_STAGE_DESC_ATTR_ACTIVE BIT(0) #define GUC_STAGE_DESC_ATTR_PENDING_DB BIT(1) @@ -186,7 +186,7 @@ struct guc_process_desc { u32 wq_status; u32 engine_presence; u32 priority; - u32 reserved[30]; + u32 reserved[36]; } __packed; #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index ba0de35f6323..d7710debcd47 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -11,6 +11,7 @@ #include "gt/intel_context.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_heartbeat.h" +#include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm.h" @@ -68,7 +69,7 @@ * fence is used to stall all requests associated with this guc_id until the * corresponding G2H returns indicating the guc_id has been deregistered. * - * guc_ids: + * submission_state.guc_ids: * Unique number associated with private GuC context data passed in during * context registration / submission / deregistration. 64k available. Simple ida * is used for allocation. @@ -89,9 +90,9 @@ * sched_engine can be submitting at a time. Currently only one sched_engine is * used for all of GuC submission but that could change in the future. * - * guc->contexts_lock - * Protects guc_id allocation for the given GuC, i.e. only one context can be - * doing guc_id allocation operations at a time for each GuC in the system. + * guc->submission_state.lock + * Global lock for GuC submission state. Protects guc_ids and destroyed contexts + * list. * * ce->guc_state.lock * Protects everything under ce->guc_state. 
Ensures that a context is in the @@ -103,7 +104,7 @@ * * Lock ordering rules: * sched_engine->lock -> ce->guc_state.lock - * guc->contexts_lock -> ce->guc_state.lock + * guc->submission_state.lock -> ce->guc_state.lock * * Reset races: * When a full GT reset is triggered it is assumed that some G2H responses to @@ -124,11 +125,27 @@ struct guc_virtual_engine { }; static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count); +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags); + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width); #define GUC_REQUEST_SIZE 64 /* bytes */ /* + * We reserve 1/16 of the guc_ids for multi-lrc as these need to be contiguous + * per the GuC submission interface. A different allocation algorithm is used + * (bitmap vs. ida) between multi-lrc and single-lrc hence the reason to + * partition the guc_id space. We believe the number of multi-lrc contexts in + * use should be low and 1/16 should be sufficient. Minimum of 32 guc_ids for + * multi-lrc. + */ +#define NUMBER_MULTI_LRC_GUC_ID (GUC_MAX_LRC_DESCRIPTORS / 16) + +/* * Below is a set of functions which control the GuC scheduling state which * require a lock. */ @@ -324,6 +341,12 @@ static inline void decr_context_committed_requests(struct intel_context *ce) GEM_BUG_ON(ce->guc_state.number_committed_requests < 0); } +static struct intel_context * +request_to_scheduling_context(struct i915_request *rq) +{ + return intel_context_to_parent(rq->context); +} + static inline bool context_guc_id_invalid(struct intel_context *ce) { return ce->guc_id.id == GUC_INVALID_LRC_ID; @@ -344,6 +367,104 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) return rb_entry(rb, struct i915_priolist, node); } +/* + * When using multi-lrc submission a scratch memory area is reserved in the + * parent's context state for the process descriptor, work queue, and handshake + * between the parent + children contexts to insert safe preemption points + * between each of the BBs. Currently the scratch area is sized to a page. + * + * The layout of this scratch area is below: + * 0 guc_process_desc + * + sizeof(struct guc_process_desc) child go + * + CACHELINE_BYTES child join[0] + * ... + * + CACHELINE_BYTES child join[n - 1] + * ... unused + * PARENT_SCRATCH_SIZE / 2 work queue start + * ... 
work queue + * PARENT_SCRATCH_SIZE - 1 work queue end + */ +#define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) +#define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) + +struct sync_semaphore { + u32 semaphore; + u8 unused[CACHELINE_BYTES - sizeof(u32)]; +}; + +struct parent_scratch { + struct guc_process_desc pdesc; + + struct sync_semaphore go; + struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; + + u8 unused[WQ_OFFSET - sizeof(struct guc_process_desc) - + sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; + + u32 wq[WQ_SIZE / sizeof(u32)]; +}; + +static u32 __get_parent_scratch_offset(struct intel_context *ce) +{ + GEM_BUG_ON(!ce->parallel.guc.parent_page); + + return ce->parallel.guc.parent_page * PAGE_SIZE; +} + +static u32 __get_wq_offset(struct intel_context *ce) +{ + BUILD_BUG_ON(offsetof(struct parent_scratch, wq) != WQ_OFFSET); + + return __get_parent_scratch_offset(ce) + WQ_OFFSET; +} + +static struct parent_scratch * +__get_parent_scratch(struct intel_context *ce) +{ + BUILD_BUG_ON(sizeof(struct parent_scratch) != PARENT_SCRATCH_SIZE); + BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES); + + /* + * Need to subtract LRC_STATE_OFFSET here as the + * parallel.guc.parent_page is the offset into ce->state while + * ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET. + */ + return (struct parent_scratch *) + (ce->lrc_reg_state + + ((__get_parent_scratch_offset(ce) - + LRC_STATE_OFFSET) / sizeof(u32))); +} + +static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + + return &ps->pdesc; +} + +static u32 *get_wq_pointer(struct guc_process_desc *desc, + struct intel_context *ce, + u32 wqi_size) +{ + /* + * Check for space in work queue. Caching a value of head pointer in + * intel_context structure in order reduce the number accesses to shared + * GPU memory which may be across a PCIe bus. + */ +#define AVAILABLE_SPACE \ + CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE) + if (wqi_size > AVAILABLE_SPACE) { + ce->parallel.guc.wqi_head = READ_ONCE(desc->head); + + if (wqi_size > AVAILABLE_SPACE) + return NULL; + } +#undef AVAILABLE_SPACE + + return &__get_parent_scratch(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; +} + static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) { struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr; @@ -503,10 +624,10 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout) static int guc_lrc_desc_pin(struct intel_context *ce, bool loop); -static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) +static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) { int err = 0; - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u32 action[3]; int len = 0; u32 g2h_len_dw = 0; @@ -527,26 +648,17 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); GEM_BUG_ON(context_guc_id_invalid(ce)); - /* - * Corner case where the GuC firmware was blown away and reloaded while - * this context was pinned. - */ - if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id))) { - err = guc_lrc_desc_pin(ce, false); - if (unlikely(err)) - return err; - } - spin_lock(&ce->guc_state.lock); /* * The request / context will be run on the hardware when scheduling - * gets enabled in the unblock. + * gets enabled in the unblock. For multi-lrc we still submit the + * context to move the LRC tails. 
*/ - if (unlikely(context_blocked(ce))) + if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce))) goto out; - enabled = context_enabled(ce); + enabled = context_enabled(ce) || context_blocked(ce); if (!enabled) { action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET; @@ -565,6 +677,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) trace_intel_context_sched_enable(ce); atomic_inc(&guc->outstanding_submission_g2h); set_context_enabled(ce); + + /* + * Without multi-lrc KMD does the submission step (moving the + * lrc tail) so enabling scheduling is sufficient to submit the + * context. This isn't the case in multi-lrc submission as the + * GuC needs to move the tails, hence the need for another H2G + * to submit a multi-lrc context after enabling scheduling. + */ + if (intel_context_is_parent(ce)) { + action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT; + err = intel_guc_send_nb(guc, action, len - 1, 0); + } } else if (!enabled) { clr_context_pending_enable(ce); intel_context_put(ce); @@ -577,6 +701,18 @@ out: return err; } +static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) +{ + int ret = __guc_add_request(guc, rq); + + if (unlikely(ret == -EBUSY)) { + guc->stalled_request = rq; + guc->submission_stall_reason = STALL_ADD_REQUEST; + } + + return ret; +} + static inline void guc_set_lrc_tail(struct i915_request *rq) { rq->context->lrc_reg_state[CTX_RING_TAIL] = @@ -588,6 +724,135 @@ static inline int rq_prio(const struct i915_request *rq) return rq->sched.attr.priority; } +static bool is_multi_lrc_rq(struct i915_request *rq) +{ + return intel_context_is_parallel(rq->context); +} + +static bool can_merge_rq(struct i915_request *rq, + struct i915_request *last) +{ + return request_to_scheduling_context(rq) == + request_to_scheduling_context(last); +} + +static u32 wq_space_until_wrap(struct intel_context *ce) +{ + return (WQ_SIZE - ce->parallel.guc.wqi_tail); +} + +static void write_wqi(struct guc_process_desc *desc, + struct intel_context *ce, + u32 wqi_size) +{ + BUILD_BUG_ON(!is_power_of_2(WQ_SIZE)); + + /* + * Ensure WQI are visible before updating tail + */ + intel_guc_write_barrier(ce_to_guc(ce)); + + ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) & + (WQ_SIZE - 1); + WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail); +} + +static int guc_wq_noop_append(struct intel_context *ce) +{ + struct guc_process_desc *desc = __get_process_desc(ce); + u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce)); + u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1; + + if (!wqi) + return -EBUSY; + + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); + + *wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | + FIELD_PREP(WQ_LEN_MASK, len_dw); + ce->parallel.guc.wqi_tail = 0; + + return 0; +} + +static int __guc_wq_item_append(struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + struct intel_context *child; + struct guc_process_desc *desc = __get_process_desc(ce); + unsigned int wqi_size = (ce->parallel.number_children + 4) * + sizeof(u32); + u32 *wqi; + u32 len_dw = (wqi_size / sizeof(u32)) - 1; + int ret; + + /* Ensure context is in correct state updating work queue */ + GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); + GEM_BUG_ON(context_guc_id_invalid(ce)); + GEM_BUG_ON(context_wait_for_deregister_to_register(ce)); + GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id)); + + /* Insert NOOP if this work queue item will wrap the tail pointer. 
*/ + if (wqi_size > wq_space_until_wrap(ce)) { + ret = guc_wq_noop_append(ce); + if (ret) + return ret; + } + + wqi = get_wq_pointer(desc, ce, wqi_size); + if (!wqi) + return -EBUSY; + + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); + + *wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | + FIELD_PREP(WQ_LEN_MASK, len_dw); + *wqi++ = ce->lrc.lrca; + *wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) | + FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64)); + *wqi++ = 0; /* fence_id */ + for_each_child(ce, child) + *wqi++ = child->ring->tail / sizeof(u64); + + write_wqi(desc, ce, wqi_size); + + return 0; +} + +static int guc_wq_item_append(struct intel_guc *guc, + struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + int ret = 0; + + if (likely(!intel_context_is_banned(ce))) { + ret = __guc_wq_item_append(rq); + + if (unlikely(ret == -EBUSY)) { + guc->stalled_request = rq; + guc->submission_stall_reason = STALL_MOVE_LRC_TAIL; + } + } + + return ret; +} + +static bool multi_lrc_submit(struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + + intel_ring_set_tail(rq->ring, rq->tail); + + /* + * We expect the front end (execbuf IOCTL) to set this flag on the last + * request generated from a multi-BB submission. This indicates to the + * backend (GuC interface) that we should submit this context thus + * submitting all the requests generated in parallel. + */ + return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) || + intel_context_is_banned(ce); +} + static int guc_dequeue_one_context(struct intel_guc *guc) { struct i915_sched_engine * const sched_engine = guc->sched_engine; @@ -601,7 +866,17 @@ static int guc_dequeue_one_context(struct intel_guc *guc) if (guc->stalled_request) { submit = true; last = guc->stalled_request; - goto resubmit; + + switch (guc->submission_stall_reason) { + case STALL_REGISTER_CONTEXT: + goto register_context; + case STALL_MOVE_LRC_TAIL: + goto move_lrc_tail; + case STALL_ADD_REQUEST: + goto add_request; + default: + MISSING_CASE(guc->submission_stall_reason); + } } while ((rb = rb_first_cached(&sched_engine->queue))) { @@ -609,8 +884,8 @@ static int guc_dequeue_one_context(struct intel_guc *guc) struct i915_request *rq, *rn; priolist_for_each_request_consume(rq, rn, p) { - if (last && rq->context != last->context) - goto done; + if (last && !can_merge_rq(rq, last)) + goto register_context; list_del_init(&rq->sched.link); @@ -618,33 +893,84 @@ static int guc_dequeue_one_context(struct intel_guc *guc) trace_i915_request_in(rq, 0); last = rq; - submit = true; + + if (is_multi_lrc_rq(rq)) { + /* + * We need to coalesce all multi-lrc requests in + * a relationship into a single H2G. We are + * guaranteed that all of these requests will be + * submitted sequentially. 
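+ *
+ * Only the request that multi_lrc_submit() reports as the last in
+ * the set (I915_FENCE_FLAG_SUBMIT_PARALLEL set by the execbuf
+ * IOCTL, or a banned context) triggers the submission below; the
+ * earlier requests in the set only have their ring tails moved.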
+ */ + if (multi_lrc_submit(rq)) { + submit = true; + goto register_context; + } + } else { + submit = true; + } } rb_erase_cached(&p->node, &sched_engine->queue); i915_priolist_free(p); } -done: + +register_context: if (submit) { - guc_set_lrc_tail(last); -resubmit: + struct intel_context *ce = request_to_scheduling_context(last); + + if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) && + !intel_context_is_banned(ce))) { + ret = guc_lrc_desc_pin(ce, false); + if (unlikely(ret == -EPIPE)) { + goto deadlk; + } else if (ret == -EBUSY) { + guc->stalled_request = last; + guc->submission_stall_reason = + STALL_REGISTER_CONTEXT; + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ + goto deadlk; + } + } + +move_lrc_tail: + if (is_multi_lrc_rq(last)) { + ret = guc_wq_item_append(guc, last); + if (ret == -EBUSY) { + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ + goto deadlk; + } + } else { + guc_set_lrc_tail(last); + } + +add_request: ret = guc_add_request(guc, last); - if (unlikely(ret == -EPIPE)) + if (unlikely(ret == -EPIPE)) { + goto deadlk; + } else if (ret == -EBUSY) { + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ goto deadlk; - else if (ret == -EBUSY) { - tasklet_schedule(&sched_engine->tasklet); - guc->stalled_request = last; - return false; } } guc->stalled_request = NULL; + guc->submission_stall_reason = STALL_NONE; return submit; deadlk: sched_engine->tasklet.callback = NULL; tasklet_disable_nosync(&sched_engine->tasklet); return false; + +schedule_tasklet: + tasklet_schedule(&sched_engine->tasklet); + return false; } static void guc_submission_tasklet(struct tasklet_struct *t) @@ -719,6 +1045,7 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) if (deregister) guc_signal_context_fence(ce); if (destroyed) { + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } @@ -797,6 +1124,8 @@ static void guc_flush_submissions(struct intel_guc *guc) spin_unlock_irqrestore(&sched_engine->lock, flags); } +static void guc_flush_destroyed_contexts(struct intel_guc *guc); + void intel_guc_submission_reset_prepare(struct intel_guc *guc) { int i; @@ -815,6 +1144,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc) spin_unlock_irq(&guc_to_gt(guc)->irq_lock); guc_flush_submissions(guc); + guc_flush_destroyed_contexts(guc); /* * Handle any outstanding G2Hs before reset. 
Call IRQ handler directly @@ -929,10 +1259,15 @@ __unwind_incomplete_requests(struct intel_context *ce) static void __guc_reset_context(struct intel_context *ce, bool stalled) { + bool local_stalled; struct i915_request *rq; unsigned long flags; u32 head; + int i, number_children = ce->parallel.number_children; bool skip = false; + struct intel_context *parent = ce; + + GEM_BUG_ON(intel_context_is_child(ce)); intel_context_get(ce); @@ -958,25 +1293,38 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) if (unlikely(skip)) goto out_put; - rq = intel_context_find_active_request(ce); - if (!rq) { - head = ce->ring->tail; - stalled = false; - goto out_replay; - } + /* + * For each context in the relationship find the hanging request + * resetting each context / request as needed + */ + for (i = 0; i < number_children + 1; ++i) { + if (!intel_context_is_pinned(ce)) + goto next_context; - if (!i915_request_started(rq)) - stalled = false; + local_stalled = false; + rq = intel_context_find_active_request(ce); + if (!rq) { + head = ce->ring->tail; + goto out_replay; + } - GEM_BUG_ON(i915_active_is_idle(&ce->active)); - head = intel_ring_wrap(ce->ring, rq->head); - __i915_request_reset(rq, stalled); + if (i915_request_started(rq)) + local_stalled = true; + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + head = intel_ring_wrap(ce->ring, rq->head); + + __i915_request_reset(rq, local_stalled && stalled); out_replay: - guc_reset_state(ce, head, stalled); - __unwind_incomplete_requests(ce); + guc_reset_state(ce, head, local_stalled && stalled); +next_context: + if (i != number_children) + ce = list_next_entry(ce, parallel.child_link); + } + + __unwind_incomplete_requests(parent); out_put: - intel_context_put(ce); + intel_context_put(parent); } void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) @@ -997,7 +1345,8 @@ void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) xa_unlock(&guc->context_lookup); - if (intel_context_is_pinned(ce)) + if (intel_context_is_pinned(ce) && + !intel_context_is_child(ce)) __guc_reset_context(ce, stalled); intel_context_put(ce); @@ -1089,7 +1438,8 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) xa_unlock(&guc->context_lookup); - if (intel_context_is_pinned(ce)) + if (intel_context_is_pinned(ce) && + !intel_context_is_child(ce)) guc_cancel_context_requests(ce); intel_context_put(ce); @@ -1126,6 +1476,8 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc) intel_gt_unpark_heartbeats(guc_to_gt(guc)); } +static void destroyed_worker_func(struct work_struct *w); + /* * Set up the memory resources to be shared with the GuC (via the GGTT) * at firmware loading time. 
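As a worked example of the guc_id partitioning set up in intel_guc_submission_init() below, assuming the 64k guc_id space noted in the documentation comment above: NUMBER_MULTI_LRC_GUC_ID = 65536 / 16 = 4096, so guc_ids_bitmap hands out contiguous blocks of ids from [0, 4096) for multi-lrc (parent/child) contexts, while the guc_ids ida allocates single ids from [4096, 65536); see new_guc_id() later in this patch.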
@@ -1148,9 +1500,17 @@ int intel_guc_submission_init(struct intel_guc *guc) xa_init_flags(&guc->context_lookup, XA_FLAGS_LOCK_IRQ); - spin_lock_init(&guc->contexts_lock); - INIT_LIST_HEAD(&guc->guc_id_list); - ida_init(&guc->guc_ids); + spin_lock_init(&guc->submission_state.lock); + INIT_LIST_HEAD(&guc->submission_state.guc_id_list); + ida_init(&guc->submission_state.guc_ids); + INIT_LIST_HEAD(&guc->submission_state.destroyed_contexts); + INIT_WORK(&guc->submission_state.destroyed_worker, + destroyed_worker_func); + + guc->submission_state.guc_ids_bitmap = + bitmap_zalloc(NUMBER_MULTI_LRC_GUC_ID, GFP_KERNEL); + if (!guc->submission_state.guc_ids_bitmap) + return -ENOMEM; return 0; } @@ -1160,8 +1520,10 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->lrc_desc_pool) return; + guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy(guc); i915_sched_engine_put(guc->sched_engine); + bitmap_free(guc->submission_state.guc_ids_bitmap); } static inline void queue_request(struct i915_sched_engine *sched_engine, @@ -1178,16 +1540,22 @@ static inline void queue_request(struct i915_sched_engine *sched_engine, static int guc_bypass_tasklet_submit(struct intel_guc *guc, struct i915_request *rq) { - int ret; + int ret = 0; __i915_request_submit(rq); trace_i915_request_in(rq, 0); - guc_set_lrc_tail(rq); - ret = guc_add_request(guc, rq); - if (ret == -EBUSY) - guc->stalled_request = rq; + if (is_multi_lrc_rq(rq)) { + if (multi_lrc_submit(rq)) { + ret = guc_wq_item_append(guc, rq); + if (!ret) + ret = guc_add_request(guc, rq); + } + } else { + guc_set_lrc_tail(rq); + ret = guc_add_request(guc, rq); + } if (unlikely(ret == -EPIPE)) disable_submission(guc); @@ -1195,6 +1563,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc, return ret; } +static bool need_tasklet(struct intel_guc *guc, struct i915_request *rq) +{ + struct i915_sched_engine *sched_engine = rq->engine->sched_engine; + struct intel_context *ce = request_to_scheduling_context(rq); + + return submission_disabled(guc) || guc->stalled_request || + !i915_sched_engine_is_empty(sched_engine) || + !lrc_desc_registered(guc, ce->guc_id.id); +} + static void guc_submit_request(struct i915_request *rq) { struct i915_sched_engine *sched_engine = rq->engine->sched_engine; @@ -1204,8 +1582,7 @@ static void guc_submit_request(struct i915_request *rq) /* Will be called from irq-context when using foreign fences. 
*/ spin_lock_irqsave(&sched_engine->lock, flags); - if (submission_disabled(guc) || guc->stalled_request || - !i915_sched_engine_is_empty(sched_engine)) + if (need_tasklet(guc, rq)) queue_request(sched_engine, rq, rq_prio(rq)); else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY) tasklet_hi_schedule(&sched_engine->tasklet); @@ -1213,17 +1590,43 @@ static void guc_submit_request(struct i915_request *rq) spin_unlock_irqrestore(&sched_engine->lock, flags); } -static int new_guc_id(struct intel_guc *guc) +static int new_guc_id(struct intel_guc *guc, struct intel_context *ce) { - return ida_simple_get(&guc->guc_ids, 0, - GUC_MAX_LRC_DESCRIPTORS, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | __GFP_NOWARN); + int ret; + + GEM_BUG_ON(intel_context_is_child(ce)); + + if (intel_context_is_parent(ce)) + ret = bitmap_find_free_region(guc->submission_state.guc_ids_bitmap, + NUMBER_MULTI_LRC_GUC_ID, + order_base_2(ce->parallel.number_children + + 1)); + else + ret = ida_simple_get(&guc->submission_state.guc_ids, + NUMBER_MULTI_LRC_GUC_ID, + GUC_MAX_LRC_DESCRIPTORS, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + __GFP_NOWARN); + if (unlikely(ret < 0)) + return ret; + + ce->guc_id.id = ret; + return 0; } static void __release_guc_id(struct intel_guc *guc, struct intel_context *ce) { + GEM_BUG_ON(intel_context_is_child(ce)); + if (!context_guc_id_invalid(ce)) { - ida_simple_remove(&guc->guc_ids, ce->guc_id.id); + if (intel_context_is_parent(ce)) + bitmap_release_region(guc->submission_state.guc_ids_bitmap, + ce->guc_id.id, + order_base_2(ce->parallel.number_children + + 1)); + else + ida_simple_remove(&guc->submission_state.guc_ids, + ce->guc_id.id); reset_lrc_desc(guc, ce->guc_id.id); set_context_guc_id_invalid(ce); } @@ -1235,54 +1638,69 @@ static void release_guc_id(struct intel_guc *guc, struct intel_context *ce) { unsigned long flags; - spin_lock_irqsave(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); __release_guc_id(guc, ce); - spin_unlock_irqrestore(&guc->contexts_lock, flags); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); } -static int steal_guc_id(struct intel_guc *guc) +static int steal_guc_id(struct intel_guc *guc, struct intel_context *ce) { - struct intel_context *ce; - int guc_id; + struct intel_context *cn; - lockdep_assert_held(&guc->contexts_lock); + lockdep_assert_held(&guc->submission_state.lock); + GEM_BUG_ON(intel_context_is_child(ce)); + GEM_BUG_ON(intel_context_is_parent(ce)); - if (!list_empty(&guc->guc_id_list)) { - ce = list_first_entry(&guc->guc_id_list, + if (!list_empty(&guc->submission_state.guc_id_list)) { + cn = list_first_entry(&guc->submission_state.guc_id_list, struct intel_context, guc_id.link); - GEM_BUG_ON(atomic_read(&ce->guc_id.ref)); - GEM_BUG_ON(context_guc_id_invalid(ce)); + GEM_BUG_ON(atomic_read(&cn->guc_id.ref)); + GEM_BUG_ON(context_guc_id_invalid(cn)); + GEM_BUG_ON(intel_context_is_child(cn)); + GEM_BUG_ON(intel_context_is_parent(cn)); - list_del_init(&ce->guc_id.link); - guc_id = ce->guc_id.id; + list_del_init(&cn->guc_id.link); + ce->guc_id = cn->guc_id; spin_lock(&ce->guc_state.lock); - clr_context_registered(ce); + clr_context_registered(cn); spin_unlock(&ce->guc_state.lock); - set_context_guc_id_invalid(ce); - return guc_id; + set_context_guc_id_invalid(cn); + + return 0; } else { return -EAGAIN; } } -static int assign_guc_id(struct intel_guc *guc, u16 *out) +static int assign_guc_id(struct intel_guc *guc, struct intel_context *ce) { int ret; - lockdep_assert_held(&guc->contexts_lock); + 
lockdep_assert_held(&guc->submission_state.lock); + GEM_BUG_ON(intel_context_is_child(ce)); - ret = new_guc_id(guc); + ret = new_guc_id(guc, ce); if (unlikely(ret < 0)) { - ret = steal_guc_id(guc); + if (intel_context_is_parent(ce)) + return -ENOSPC; + + ret = steal_guc_id(guc, ce); if (ret < 0) return ret; } - *out = ret; + if (intel_context_is_parent(ce)) { + struct intel_context *child; + int i = 1; + + for_each_child(ce, child) + child->guc_id.id = ce->guc_id.id + i++; + } + return 0; } @@ -1295,12 +1713,12 @@ static int pin_guc_id(struct intel_guc *guc, struct intel_context *ce) GEM_BUG_ON(atomic_read(&ce->guc_id.ref)); try_again: - spin_lock_irqsave(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); might_lock(&ce->guc_state.lock); if (context_guc_id_invalid(ce)) { - ret = assign_guc_id(guc, &ce->guc_id.id); + ret = assign_guc_id(guc, ce); if (ret) goto out_unlock; ret = 1; /* Indidcates newly assigned guc_id */ @@ -1310,7 +1728,7 @@ try_again: atomic_inc(&ce->guc_id.ref); out_unlock: - spin_unlock_irqrestore(&guc->contexts_lock, flags); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); /* * -EAGAIN indicates no guc_id are available, let's retire any @@ -1342,15 +1760,42 @@ static void unpin_guc_id(struct intel_guc *guc, struct intel_context *ce) unsigned long flags; GEM_BUG_ON(atomic_read(&ce->guc_id.ref) < 0); + GEM_BUG_ON(intel_context_is_child(ce)); - if (unlikely(context_guc_id_invalid(ce))) + if (unlikely(context_guc_id_invalid(ce) || + intel_context_is_parent(ce))) return; - spin_lock_irqsave(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); if (!context_guc_id_invalid(ce) && list_empty(&ce->guc_id.link) && !atomic_read(&ce->guc_id.ref)) - list_add_tail(&ce->guc_id.link, &guc->guc_id_list); - spin_unlock_irqrestore(&guc->contexts_lock, flags); + list_add_tail(&ce->guc_id.link, + &guc->submission_state.guc_id_list); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static int __guc_action_register_multi_lrc(struct intel_guc *guc, + struct intel_context *ce, + u32 guc_id, + u32 offset, + bool loop) +{ + struct intel_context *child; + u32 action[4 + MAX_ENGINE_INSTANCE]; + int len = 0; + + GEM_BUG_ON(ce->parallel.number_children > MAX_ENGINE_INSTANCE); + + action[len++] = INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; + action[len++] = guc_id; + action[len++] = ce->parallel.number_children + 1; + action[len++] = offset; + for_each_child(ce, child) { + offset += sizeof(struct guc_lrc_desc); + action[len++] = offset; + } + + return guc_submission_send_busy_loop(guc, action, len, 0, loop); } static int __guc_action_register_context(struct intel_guc *guc, @@ -1375,9 +1820,15 @@ static int register_context(struct intel_context *ce, bool loop) ce->guc_id.id * sizeof(struct guc_lrc_desc); int ret; + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_register(ce); - ret = __guc_action_register_context(guc, ce->guc_id.id, offset, loop); + if (intel_context_is_parent(ce)) + ret = __guc_action_register_multi_lrc(guc, ce, ce->guc_id.id, + offset, loop); + else + ret = __guc_action_register_context(guc, ce->guc_id.id, offset, + loop); if (likely(!ret)) { unsigned long flags; @@ -1406,26 +1857,31 @@ static int deregister_context(struct intel_context *ce, u32 guc_id) { struct intel_guc *guc = ce_to_guc(ce); + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_deregister(ce); return __guc_action_deregister_context(guc, guc_id); } -static intel_engine_mask_t adjust_engine_mask(u8 
class, intel_engine_mask_t mask) +static inline void clear_children_join_go_memory(struct intel_context *ce) { - switch (class) { - case RENDER_CLASS: - return mask >> RCS0; - case VIDEO_ENHANCEMENT_CLASS: - return mask >> VECS0; - case VIDEO_DECODE_CLASS: - return mask >> VCS0; - case COPY_ENGINE_CLASS: - return mask >> BCS0; - default: - MISSING_CASE(class); - return 0; - } + struct parent_scratch *ps = __get_parent_scratch(ce); + int i; + + ps->go.semaphore = 0; + for (i = 0; i < ce->parallel.number_children + 1; ++i) + ps->join[i].semaphore = 0; +} + +static inline u32 get_children_go_value(struct intel_context *ce) +{ + return __get_parent_scratch(ce)->go.semaphore; +} + +static inline u32 get_children_join_value(struct intel_context *ce, + u8 child_index) +{ + return __get_parent_scratch(ce)->join[child_index].semaphore; } static void guc_context_policy_init(struct intel_engine_cs *engine, @@ -1450,6 +1906,7 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) struct guc_lrc_desc *desc; bool context_registered; intel_wakeref_t wakeref; + struct intel_context *child; int ret = 0; GEM_BUG_ON(!engine->mask); @@ -1469,14 +1926,50 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc = __get_lrc_desc(guc, desc_idx); desc->engine_class = engine_class_to_guc_class(engine->class); - desc->engine_submit_mask = adjust_engine_mask(engine->class, - engine->mask); + desc->engine_submit_mask = engine->logical_mask; desc->hw_context_desc = ce->lrc.lrca; desc->priority = ce->guc_state.prio; desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); /* + * If context is a parent, we need to register a process descriptor + * describing a work queue and register all child contexts. + */ + if (intel_context_is_parent(ce)) { + struct guc_process_desc *pdesc; + + ce->parallel.guc.wqi_tail = 0; + ce->parallel.guc.wqi_head = 0; + + desc->process_desc = i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce); + desc->wq_addr = i915_ggtt_offset(ce->state) + + __get_wq_offset(ce); + desc->wq_size = WQ_SIZE; + + pdesc = __get_process_desc(ce); + memset(pdesc, 0, sizeof(*(pdesc))); + pdesc->stage_id = ce->guc_id.id; + pdesc->wq_base_addr = desc->wq_addr; + pdesc->wq_size_bytes = desc->wq_size; + pdesc->wq_status = WQ_STATUS_ACTIVE; + + for_each_child(ce, child) { + desc = __get_lrc_desc(guc, child->guc_id.id); + + desc->engine_class = + engine_class_to_guc_class(engine->class); + desc->hw_context_desc = child->lrc.lrca; + desc->priority = ce->guc_state.prio; + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; + guc_context_policy_init(engine, desc); + } + + clear_children_join_go_memory(ce); + } + + /* * The context_lookup xarray is used to determine if the hardware * context is currently registered. 
There are two cases in which it * could be registered either the guc_id has been stolen from another @@ -1559,7 +2052,12 @@ static int guc_context_pre_pin(struct intel_context *ce, static int guc_context_pin(struct intel_context *ce, void *vaddr) { - return __guc_context_pin(ce, ce->engine, vaddr); + int ret = __guc_context_pin(ce, ce->engine, vaddr); + + if (likely(!ret && !intel_context_is_barrier(ce))) + intel_engine_pm_get(ce->engine); + + return ret; } static void guc_context_unpin(struct intel_context *ce) @@ -1568,6 +2066,9 @@ static void guc_context_unpin(struct intel_context *ce) unpin_guc_id(guc, ce); lrc_unpin(ce); + + if (likely(!intel_context_is_barrier(ce))) + intel_engine_pm_put_async(ce->engine); } static void guc_context_post_unpin(struct intel_context *ce) @@ -1602,6 +2103,7 @@ static void __guc_context_sched_disable(struct intel_guc *guc, GEM_BUG_ON(guc_id == GUC_INVALID_LRC_ID); + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_sched_disable(ce); guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), @@ -1653,6 +2155,8 @@ static struct i915_sw_fence *guc_context_block(struct intel_context *ce) u16 guc_id; bool enabled; + GEM_BUG_ON(intel_context_is_child(ce)); + spin_lock_irqsave(&ce->guc_state.lock, flags); incr_context_blocked(ce); @@ -1707,6 +2211,7 @@ static void guc_context_unblock(struct intel_context *ce) bool enable; GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_child(ce)); spin_lock_irqsave(&ce->guc_state.lock, flags); @@ -1733,11 +2238,14 @@ static void guc_context_unblock(struct intel_context *ce) static void guc_context_cancel_request(struct intel_context *ce, struct i915_request *rq) { + struct intel_context *block_context = + request_to_scheduling_context(rq); + if (i915_sw_fence_signaled(&rq->submit)) { struct i915_sw_fence *fence; intel_context_get(ce); - fence = guc_context_block(ce); + fence = guc_context_block(block_context); i915_sw_fence_wait(fence); if (!i915_request_completed(rq)) { __i915_request_skip(rq); @@ -1751,7 +2259,7 @@ static void guc_context_cancel_request(struct intel_context *ce, */ flush_work(&ce_to_guc(ce)->ct.requests.worker); - guc_context_unblock(ce); + guc_context_unblock(block_context); intel_context_put(ce); } } @@ -1777,6 +2285,8 @@ static void guc_context_ban(struct intel_context *ce, struct i915_request *rq) intel_wakeref_t wakeref; unsigned long flags; + GEM_BUG_ON(intel_context_is_child(ce)); + guc_flush_submissions(guc); spin_lock_irqsave(&ce->guc_state.lock, flags); @@ -1827,6 +2337,8 @@ static void guc_context_sched_disable(struct intel_context *ce) intel_wakeref_t wakeref; u16 guc_id; + GEM_BUG_ON(intel_context_is_child(ce)); + spin_lock_irqsave(&ce->guc_state.lock, flags); /* @@ -1857,11 +2369,30 @@ unpin: static inline void guc_lrc_desc_unpin(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); + struct intel_gt *gt = guc_to_gt(guc); + unsigned long flags; + bool disabled; + GEM_BUG_ON(!intel_gt_pm_is_awake(gt)); GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id)); GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id)); GEM_BUG_ON(context_enabled(ce)); + /* Seal race with Reset */ + spin_lock_irqsave(&ce->guc_state.lock, flags); + disabled = submission_disabled(guc); + if (likely(!disabled)) { + __intel_gt_pm_get(gt); + set_context_destroyed(ce); + clr_context_registered(ce); + } + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + if (unlikely(disabled)) { + release_guc_id(guc, ce); + __guc_context_destroy(ce); + return; + } + deregister_context(ce, 
ce->guc_id.id); } @@ -1889,78 +2420,86 @@ static void __guc_context_destroy(struct intel_context *ce) } } +static void guc_flush_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + GEM_BUG_ON(!submission_disabled(guc) && + guc_submission_initialized(guc)); + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + &guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + __release_guc_id(guc, ce); + __guc_context_destroy(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void deregister_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + &guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + guc_lrc_desc_unpin(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void destroyed_worker_func(struct work_struct *w) +{ + struct intel_guc *guc = container_of(w, struct intel_guc, + submission_state.destroyed_worker); + struct intel_gt *gt = guc_to_gt(guc); + int tmp; + + with_intel_gt_pm(gt, tmp) + deregister_destroyed_contexts(guc); +} + static void guc_context_destroy(struct kref *kref) { struct intel_context *ce = container_of(kref, typeof(*ce), ref); - struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; struct intel_guc *guc = ce_to_guc(ce); - intel_wakeref_t wakeref; unsigned long flags; - bool disabled; + bool destroy; /* * If the guc_id is invalid this context has been stolen and we can free * it immediately. Also can be freed immediately if the context is not * registered with the GuC or the GuC is in the middle of a reset. */ - if (context_guc_id_invalid(ce)) { - __guc_context_destroy(ce); - return; - } else if (submission_disabled(guc) || - !lrc_desc_registered(guc, ce->guc_id.id)) { - release_guc_id(guc, ce); - __guc_context_destroy(ce); - return; - } - - /* - * We have to acquire the context spinlock and check guc_id again, if it - * is valid it hasn't been stolen and needs to be deregistered. We - * delete this context from the list of unpinned guc_id available to - * steal to seal a race with guc_lrc_desc_pin(). When the G2H CTB - * returns indicating this context has been deregistered the guc_id is - * returned to the pool of available guc_id. 
- */ - spin_lock_irqsave(&guc->contexts_lock, flags); - if (context_guc_id_invalid(ce)) { - spin_unlock_irqrestore(&guc->contexts_lock, flags); - __guc_context_destroy(ce); - return; - } - - if (!list_empty(&ce->guc_id.link)) - list_del_init(&ce->guc_id.link); - spin_unlock_irqrestore(&guc->contexts_lock, flags); - - /* Seal race with Reset */ - spin_lock_irqsave(&ce->guc_state.lock, flags); - disabled = submission_disabled(guc); - if (likely(!disabled)) { - set_context_destroyed(ce); - clr_context_registered(ce); + spin_lock_irqsave(&guc->submission_state.lock, flags); + destroy = submission_disabled(guc) || context_guc_id_invalid(ce) || + !lrc_desc_registered(guc, ce->guc_id.id); + if (likely(!destroy)) { + if (!list_empty(&ce->guc_id.link)) + list_del_init(&ce->guc_id.link); + list_add_tail(&ce->destroyed_link, + &guc->submission_state.destroyed_contexts); + } else { + __release_guc_id(guc, ce); } - spin_unlock_irqrestore(&ce->guc_state.lock, flags); - if (unlikely(disabled)) { - release_guc_id(guc, ce); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); + if (unlikely(destroy)) { __guc_context_destroy(ce); return; } /* - * We defer GuC context deregistration until the context is destroyed - * in order to save on CTBs. With this optimization ideally we only need - * 1 CTB to register the context during the first pin and 1 CTB to - * deregister the context when the context is destroyed. Without this - * optimization, a CTB would be needed every pin & unpin. - * - * XXX: Need to acqiure the runtime wakeref as this can be triggered - * from context_free_worker when runtime wakeref is not held. - * guc_lrc_desc_unpin requires the runtime as a GuC register is written - * in H2G CTB to deregister the context. A future patch may defer this - * H2G CTB if the runtime wakeref is zero. + * We use a worker to issue the H2G to deregister the context as we can + * take the GT PM for the first time which isn't allowed from an atomic + * context. 
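The new destroy path parks a context on submission_state.destroyed_contexts and lets destroyed_worker_func deregister it, because the final kref put can run in atomic context, where taking the GT PM wakeref (a potentially sleeping operation) is not allowed. The program below is only a host-side analogy of that defer-to-worker pattern, with invented names and a pthread standing in for the workqueue item; it assumes nothing about the driver beyond what the hunk shows.

/* Defer "deregistration" from a non-blocking caller to a worker that may block. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_ctx {
	int id;
	struct fake_ctx *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static struct fake_ctx *destroyed_list;
static int done;

/* "Atomic" side: only queue the context and wake the worker, never block. */
static void fake_context_destroy(struct fake_ctx *ctx)
{
	pthread_mutex_lock(&lock);
	ctx->next = destroyed_list;
	destroyed_list = ctx;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

/* Worker side: blocking work (PM wakeref, H2G, id release in the driver) is fine here. */
static void *destroyed_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || destroyed_list) {
		while (destroyed_list) {
			struct fake_ctx *ctx = destroyed_list;

			destroyed_list = ctx->next;
			pthread_mutex_unlock(&lock);
			printf("deregistering context %d\n", ctx->id);
			free(ctx);
			pthread_mutex_lock(&lock);
		}
		if (!done)
			pthread_cond_wait(&kick, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t worker;
	int i;

	pthread_create(&worker, NULL, destroyed_worker, NULL);
	for (i = 0; i < 4; i++) {
		struct fake_ctx *ctx = malloc(sizeof(*ctx));

		ctx->id = i;
		fake_context_destroy(ctx);
	}
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(worker, NULL);
	return 0;		/* build with: cc -pthread */
}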
*/ - with_intel_runtime_pm(runtime_pm, wakeref) - guc_lrc_desc_unpin(ce); + queue_work(system_unbound_wq, &guc->submission_state.destroyed_worker); } static int guc_context_alloc(struct intel_context *ce) @@ -2056,9 +2595,10 @@ static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio) static void add_to_context(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq)); + GEM_BUG_ON(intel_context_is_child(ce)); GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI); spin_lock(&ce->guc_state.lock); @@ -2091,7 +2631,9 @@ static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce) static void remove_from_context(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); + + GEM_BUG_ON(intel_context_is_child(ce)); spin_lock_irq(&ce->guc_state.lock); @@ -2132,6 +2674,7 @@ static const struct intel_context_ops guc_context_ops = { .destroy = guc_context_destroy, .create_virtual = guc_create_virtual, + .create_parallel = guc_create_parallel, }; static void submit_work_cb(struct irq_work *wrk) @@ -2168,6 +2711,8 @@ static void guc_signal_context_fence(struct intel_context *ce) { unsigned long flags; + GEM_BUG_ON(intel_context_is_child(ce)); + spin_lock_irqsave(&ce->guc_state.lock, flags); clr_context_wait_for_deregister_to_register(ce); __guc_signal_context_fence(ce); @@ -2198,7 +2743,7 @@ static void guc_context_init(struct intel_context *ce) static int guc_request_alloc(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); struct intel_guc *guc = ce_to_guc(ce); unsigned long flags; int ret; @@ -2302,8 +2847,30 @@ static int guc_virtual_context_pre_pin(struct intel_context *ce, static int guc_virtual_context_pin(struct intel_context *ce, void *vaddr) { struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + int ret = __guc_context_pin(ce, engine, vaddr); + intel_engine_mask_t tmp, mask = ce->engine->mask; - return __guc_context_pin(ce, engine, vaddr); + if (likely(!ret)) + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) + intel_engine_pm_get(engine); + + return ret; +} + +static void guc_virtual_context_unpin(struct intel_context *ce) +{ + intel_engine_mask_t tmp, mask = ce->engine->mask; + struct intel_engine_cs *engine; + struct intel_guc *guc = ce_to_guc(ce); + + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + + unpin_guc_id(guc, ce); + lrc_unpin(ce); + + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) + intel_engine_pm_put_async(engine); } static void guc_virtual_context_enter(struct intel_context *ce) @@ -2340,7 +2907,98 @@ static const struct intel_context_ops virtual_guc_context_ops = { .pre_pin = guc_virtual_context_pre_pin, .pin = guc_virtual_context_pin, - .unpin = guc_context_unpin, + .unpin = guc_virtual_context_unpin, + .post_unpin = guc_context_post_unpin, + + .ban = guc_context_ban, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .sched_disable = guc_context_sched_disable, + + .destroy = guc_context_destroy, + + .get_sibling = guc_virtual_get_sibling, +}; + +static int guc_parent_context_pin(struct intel_context *ce, void *vaddr) +{ + struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + struct intel_guc *guc = 
ce_to_guc(ce); + int ret; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + ret = pin_guc_id(guc, ce); + if (unlikely(ret < 0)) + return ret; + + return __guc_context_pin(ce, engine, vaddr); +} + +static int guc_child_context_pin(struct intel_context *ce, void *vaddr) +{ + struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + __intel_context_pin(ce->parallel.parent); + return __guc_context_pin(ce, engine, vaddr); +} + +static void guc_parent_context_unpin(struct intel_context *ce) +{ + struct intel_guc *guc = ce_to_guc(ce); + + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + GEM_BUG_ON(!intel_context_is_parent(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + if (ce->parallel.last_rq) + i915_request_put(ce->parallel.last_rq); + unpin_guc_id(guc, ce); + lrc_unpin(ce); +} + +static void guc_child_context_unpin(struct intel_context *ce) +{ + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + lrc_unpin(ce); +} + +static void guc_child_context_post_unpin(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + lrc_post_unpin(ce); + intel_context_unpin(ce->parallel.parent); +} + +static void guc_child_context_destroy(struct kref *kref) +{ + struct intel_context *ce = container_of(kref, typeof(*ce), ref); + + __guc_context_destroy(ce); +} + +static const struct intel_context_ops virtual_parent_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_parent_context_pin, + .unpin = guc_parent_context_unpin, .post_unpin = guc_context_post_unpin, .ban = guc_context_ban, @@ -2357,6 +3015,110 @@ static const struct intel_context_ops virtual_guc_context_ops = { .get_sibling = guc_virtual_get_sibling, }; +static const struct intel_context_ops virtual_child_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_child_context_pin, + .unpin = guc_child_context_unpin, + .post_unpin = guc_child_context_post_unpin, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .destroy = guc_child_context_destroy, + + .get_sibling = guc_virtual_get_sibling, +}; + +/* + * The below override of the breadcrumbs is enabled when the user configures a + * context for parallel submission (multi-lrc, parent-child). + * + * The overridden breadcrumbs implements an algorithm which allows the GuC to + * safely preempt all the hw contexts configured for parallel submission + * between each BB. The contract between the i915 and GuC is if the parent + * context can be preempted, all the children can be preempted, and the GuC will + * always try to preempt the parent before the children. A handshake between the + * parent / children breadcrumbs ensures the i915 holds up its end of the deal + * creating a window to preempt between each set of BBs. 
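As a rough illustration of the handshake described in this comment (and implemented further down with MI_SEMAPHORE_WAIT polls and GGTT writes), the sketch below models the per-child "join" slots and the parent-owned "go" semaphore with shared memory and threads. It is a host-side analogy only: the names and the child count are invented, and on the GPU the waits are command-streamer polls while the no-preemption window comes from MI_ARB_ON_OFF.

/* Host-side analogy of the parent/children join-go handshake around a set of BBs. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define CHILDREN	2
#define GO_BB		1	/* analogous to PARENT_GO_BB / CHILD_GO_BB */
#define GO_FINI		0	/* analogous to the *_FINI_BREADCRUMB values */

static atomic_int go;			/* parent-owned "go" semaphore, starts at 0 */
static atomic_int join_sem[CHILDREN];	/* one "join" slot per child, start at 0 */

static void spin_until(atomic_int *sem, int value)
{
	while (atomic_load(sem) != value)
		;	/* MI_SEMAPHORE_WAIT | POLL | SAD_EQ_SDD analogue */
}

static void *child_thread(void *arg)
{
	int idx = (int)(long)arg;

	atomic_store(&join_sem[idx], GO_BB);	/* signal parent: ready to run */
	spin_until(&go, GO_BB);			/* wait for the parent's go */
	printf("child %d: batch runs (preemption off)\n", idx);

	/* fini breadcrumb: preemption back on, then re-sync with the parent */
	atomic_store(&join_sem[idx], GO_FINI);
	spin_until(&go, GO_FINI);
	return NULL;
}

int main(void)
{
	pthread_t tid[CHILDREN];
	long i;

	for (i = 0; i < CHILDREN; i++)
		pthread_create(&tid[i], NULL, child_thread, (void *)i);

	for (i = 0; i < CHILDREN; i++)
		spin_until(&join_sem[i], GO_BB);	/* wait until every child arrived */
	atomic_store(&go, GO_BB);			/* tell children go */
	printf("parent: batch runs (preemption off)\n");

	/* fini breadcrumb: wait for all children, re-enable preemption, release them */
	for (i = 0; i < CHILDREN; i++)
		spin_until(&join_sem[i], GO_FINI);
	atomic_store(&go, GO_FINI);

	for (i = 0; i < CHILDREN; i++)
		pthread_join(tid[i], NULL);
	printf("parent: preemptible window between sets of BBs\n");
	return 0;		/* build with: cc -pthread */
}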
+ */ +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width) +{ + struct intel_engine_cs **siblings = NULL; + struct intel_context *parent = NULL, *ce, *err; + int i, j; + + siblings = kmalloc_array(num_siblings, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < width; ++i) { + for (j = 0; j < num_siblings; ++j) + siblings[j] = engines[i * num_siblings + j]; + + ce = intel_engine_create_virtual(siblings, num_siblings, + FORCE_VIRTUAL); + if (!ce) { + err = ERR_PTR(-ENOMEM); + goto unwind; + } + + if (i == 0) { + parent = ce; + parent->ops = &virtual_parent_context_ops; + } else { + ce->ops = &virtual_child_context_ops; + intel_context_bind_parent_child(parent, ce); + } + } + + parent->parallel.fence_context = dma_fence_context_alloc(1); + + parent->engine->emit_bb_start = + emit_bb_start_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb_dw = + 12 + 4 * parent->parallel.number_children; + for_each_child(parent, ce) { + ce->engine->emit_bb_start = + emit_bb_start_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb_dw = 16; + } + + kfree(siblings); + return parent; + +unwind: + if (parent) + intel_context_put(parent); + kfree(siblings); + return err; +} + static bool guc_irq_enable_breadcrumbs(struct intel_breadcrumbs *b) { @@ -2416,7 +3178,7 @@ static void guc_init_breadcrumbs(struct intel_engine_cs *engine) static void guc_bump_inflight_request_prio(struct i915_request *rq, int prio) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u8 new_guc_prio = map_i915_prio_to_guc_prio(prio); /* Short circuit function */ @@ -2439,7 +3201,7 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq, static void guc_retire_inflight_request_prio(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); spin_lock(&ce->guc_state.lock); guc_prio_fini(rq, ce); @@ -2753,6 +3515,12 @@ g2h_context_lookup(struct intel_guc *guc, u32 desc_idx) return NULL; } + if (unlikely(intel_context_is_child(ce))) { + drm_err(&guc_to_gt(guc)->i915->drm, + "Context is child, desc_idx %u", desc_idx); + return NULL; + } + return ce; } @@ -2796,6 +3564,7 @@ int intel_guc_deregister_done_process_msg(struct intel_guc *guc, intel_context_put(ce); } else if (context_destroyed(ce)) { /* Context has been destroyed */ + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } @@ -3122,6 +3891,25 @@ static inline void guc_log_context_priority(struct drm_printer *p, drm_printf(p, "\n"); } +static inline void guc_log_context(struct drm_printer *p, + struct intel_context *ce) +{ + drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); + drm_printf(p, "\tHW Context Desc: 
0x%08x\n", ce->lrc.lrca); + drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", + ce->ring->head, + ce->lrc_reg_state[CTX_RING_HEAD]); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", + ce->ring->tail, + ce->lrc_reg_state[CTX_RING_TAIL]); + drm_printf(p, "\t\tContext Pin Count: %u\n", + atomic_read(&ce->pin_count)); + drm_printf(p, "\t\tGuC ID Ref Count: %u\n", + atomic_read(&ce->guc_id.ref)); + drm_printf(p, "\t\tSchedule State: 0x%x\n\n", + ce->guc_state.sched_state); +} + void intel_guc_submission_print_context_info(struct intel_guc *guc, struct drm_printer *p) { @@ -3131,28 +3919,310 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, index, ce) { - drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); - drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); - drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", - ce->ring->head, - ce->lrc_reg_state[CTX_RING_HEAD]); - drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", - ce->ring->tail, - ce->lrc_reg_state[CTX_RING_TAIL]); - drm_printf(p, "\t\tContext Pin Count: %u\n", - atomic_read(&ce->pin_count)); - drm_printf(p, "\t\tGuC ID Ref Count: %u\n", - atomic_read(&ce->guc_id.ref)); - drm_printf(p, "\t\tSchedule State: 0x%x\n\n", - ce->guc_state.sched_state); + GEM_BUG_ON(intel_context_is_child(ce)); + guc_log_context(p, ce); guc_log_context_priority(p, ce); + + if (intel_context_is_parent(ce)) { + struct guc_process_desc *desc = __get_process_desc(ce); + struct intel_context *child; + + drm_printf(p, "\t\tNumber children: %u\n", + ce->parallel.number_children); + drm_printf(p, "\t\tWQI Head: %u\n", + READ_ONCE(desc->head)); + drm_printf(p, "\t\tWQI Tail: %u\n", + READ_ONCE(desc->tail)); + drm_printf(p, "\t\tWQI Status: %u\n\n", + READ_ONCE(desc->wq_status)); + + if (ce->engine->emit_bb_start == + emit_bb_start_parent_no_preempt_mid_batch) { + u8 i; + + drm_printf(p, "\t\tChildren Go: %u\n\n", + get_children_go_value(ce)); + for (i = 0; i < ce->parallel.number_children; ++i) + drm_printf(p, "\t\tChildren Join: %u\n", + get_children_join_value(ce, i)); + } + + for_each_child(ce, child) + guc_log_context(p, child); + } } xa_unlock_irqrestore(&guc->context_lookup, flags); } +static inline u32 get_children_go_addr(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, go.semaphore); +} + +static inline u32 get_children_join_addr(struct intel_context *ce, + u8 child_index) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, join[child_index].semaphore); +} + +#define PARENT_GO_BB 1 +#define PARENT_GO_FINI_BREADCRUMB 0 +#define CHILD_GO_BB 1 +#define CHILD_GO_FINI_BREADCRUMB 0 +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + u32 *cs; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_BB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + 
+ /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_BB, + get_children_go_addr(ce), + 0); + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + u32 *cs; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_BB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait on parent for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_BB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 * +__emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_FINI_BREADCRUMB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_FINI_BREADCRUMB, + get_children_go_addr(ce), + 0); + + return cs; +} + +/* + * If this true, a submission of multi-lrc requests had an error and the + * requests need to be skipped. The front end (execuf IOCTL) should've called + * i915_request_skip which squashes the BB but we still need to emit the fini + * breadrcrumbs seqno write. At this point we don't know how many of the + * requests in the multi-lrc submission were generated so we can't do the + * handshake between the parent and children (e.g. if 4 requests should be + * generated but 2nd hit an error only 1 would be seen by the GuC backend). + * Simply skip the handshake, but still emit the breadcrumbd seqno, if an error + * has occurred on any of the requests in submission / relationship. + */ +static inline bool skip_handshake(struct i915_request *rq) +{ + return test_bit(I915_FENCE_FLAG_SKIP_PARALLEL, &rq->fence.flags); +} + +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + if (unlikely(skip_handshake(rq))) { + /* + * NOP everything in __emit_fini_breadcrumb_parent_no_preempt_mid_batch, + * the -6 comes from the length of the emits below. 
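The "-6" used with skip_handshake follows from straightforward dword accounting: a gen8 GGTT write of one dword costs 4 dwords of command stream and the MI_USER_INTERRUPT + MI_NOOP tail costs 2, so the last 6 dwords of every fini breadcrumb are the seqno write plus interrupt and must still be emitted even when the handshake is NOP'ed out. The stand-alone check below only restates the dword counts visible in the surrounding emitters (12 + 4 * children for the parent, 16 for a child); the macro names are illustrative.

/* Stand-alone check of the fini-breadcrumb dword accounting. */
#include <assert.h>
#include <stdio.h>

#define SEM_WAIT_DWORDS		4	/* MI_SEMAPHORE_WAIT + data + addr lo/hi */
#define GGTT_WRITE_DWORDS	4	/* gen8 GGTT store of one dword */
#define ARB_DWORDS		2	/* MI_ARB_ON_OFF + MI_NOOP padding */
#define TAIL_DWORDS		6	/* seqno GGTT write (4) + MI_USER_INTERRUPT + MI_NOOP */

static unsigned int parent_fini_dw(unsigned int children)
{
	/* wait on every child, re-enable preemption, tell children go, then tail */
	return children * SEM_WAIT_DWORDS + ARB_DWORDS + GGTT_WRITE_DWORDS + TAIL_DWORDS;
}

static unsigned int child_fini_dw(void)
{
	/* re-enable preemption, signal parent, wait for go, then tail */
	return ARB_DWORDS + GGTT_WRITE_DWORDS + SEM_WAIT_DWORDS + TAIL_DWORDS;
}

int main(void)
{
	unsigned int children;

	for (children = 1; children <= 8; children++)
		assert(parent_fini_dw(children) == 12 + 4 * children);
	assert(child_fini_dw() == 16);

	/* skip_handshake NOPs everything except the final TAIL_DWORDS dwords */
	printf("parent(3 children): %u dw, NOP'ed on skip: %u dw\n",
	       parent_fini_dw(3), parent_fini_dw(3) - TAIL_DWORDS);
	return 0;
}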
+ */ + memset(cs, 0, sizeof(u32) * + (ce->engine->emit_fini_breadcrumb_dw - 6)); + cs += ce->engine->emit_fini_breadcrumb_dw - 6; + } else { + cs = __emit_fini_breadcrumb_parent_no_preempt_mid_batch(rq, cs); + } + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + +static u32 * +__emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + + GEM_BUG_ON(!intel_context_is_child(ce)); + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_FINI_BREADCRUMB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait parent on for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_FINI_BREADCRUMB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + return cs; +} + +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + if (unlikely(skip_handshake(rq))) { + /* + * NOP everything in __emit_fini_breadcrumb_child_no_preempt_mid_batch, + * the -6 comes from the length of the emits below. + */ + memset(cs, 0, sizeof(u32) * + (ce->engine->emit_fini_breadcrumb_dw - 6)); + cs += ce->engine->emit_fini_breadcrumb_dw - 6; + } else { + cs = __emit_fini_breadcrumb_child_no_preempt_mid_batch(rq, cs); + } + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags) { struct guc_virtual_engine *ve; struct intel_guc *guc; @@ -3201,6 +4271,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) } ve->base.mask |= sibling->mask; + ve->base.logical_mask |= sibling->logical_mask; if (n != 0 && ve->base.class != sibling->class) { DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", @@ -3259,4 +4330,5 @@ bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve) #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_guc.c" +#include "selftest_guc_multi_lrc.c" #endif diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc_multi_lrc.c b/drivers/gpu/drm/i915/gt/uc/selftest_guc_multi_lrc.c new file mode 100644 index 000000000000..50953c8e8b53 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc_multi_lrc.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright �� 2019 Intel Corporation + */ + +#include "selftests/igt_spinner.h" +#include "selftests/igt_reset.h" +#include "selftests/intel_scheduler_helpers.h" +#include "gt/intel_engine_heartbeat.h" +#include "gem/selftests/mock_context.h" + +static void logical_sort(struct intel_engine_cs **engines, int num_engines) +{ + struct intel_engine_cs *sorted[MAX_ENGINE_INSTANCE + 1]; + int i, j; + + for (i = 
0; i < num_engines; ++i) + for (j = 0; j < MAX_ENGINE_INSTANCE + 1; ++j) { + if (engines[j]->logical_mask & BIT(i)) { + sorted[i] = engines[j]; + break; + } + } + + memcpy(*engines, *sorted, + sizeof(struct intel_engine_cs *) * num_engines); +} + +static struct intel_context * +multi_lrc_create_parent(struct intel_gt *gt, u8 class, + unsigned long flags) +{ + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int i = 0; + + for_each_engine(engine, gt, id) { + if (engine->class != class) + continue; + + siblings[i++] = engine; + } + + if (i <= 1) + return ERR_PTR(0); + + logical_sort(siblings, i); + + return intel_engine_create_parallel(siblings, 1, i); +} + +static void multi_lrc_context_unpin(struct intel_context *ce) +{ + struct intel_context *child; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); +} + +static void multi_lrc_context_put(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* + * Only the parent gets the creation ref put in the uAPI, the parent + * itself is responsible for creation ref put on the children. + */ + intel_context_put(ce); +} + +static struct i915_request * +multi_lrc_nop_request(struct intel_context *ce) +{ + struct intel_context *child; + struct i915_request *rq, *child_rq; + int i = 0; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + i915_request_get(rq); + i915_request_add(rq); + + for_each_child(ce, child) { + child_rq = intel_context_create_request(child); + if (IS_ERR(child_rq)) + goto child_error; + + if (++i == ce->parallel.number_children) + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, + &child_rq->fence.flags); + i915_request_add(child_rq); + } + + return rq; + +child_error: + i915_request_put(rq); + + return ERR_PTR(-ENOMEM); +} + +static int __intel_guc_multi_lrc_basic(struct intel_gt *gt, unsigned int class) +{ + struct intel_context *parent; + struct i915_request *rq; + int ret; + + parent = multi_lrc_create_parent(gt, class, 0); + if (IS_ERR(parent)) { + pr_err("Failed creating contexts: %ld", PTR_ERR(parent)); + return PTR_ERR(parent); + } else if (!parent) { + pr_debug("Not enough engines in class: %d", class); + return 0; + } + + rq = multi_lrc_nop_request(parent); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + pr_err("Failed creating requests: %d", ret); + goto out; + } + + ret = intel_selftest_wait_for_rq(rq); + if (ret) + pr_err("Failed waiting on request: %d", ret); + + i915_request_put(rq); + + if (ret >= 0) { + ret = intel_gt_wait_for_idle(gt, HZ * 5); + if (ret < 0) + pr_err("GT failed to idle: %d\n", ret); + } + +out: + multi_lrc_context_unpin(parent); + multi_lrc_context_put(parent); + return ret; +} + +static int intel_guc_multi_lrc_basic(void *arg) +{ + struct intel_gt *gt = arg; + unsigned int class; + int ret; + + for (class = 0; class < MAX_ENGINE_CLASS + 1; ++class) { + ret = __intel_guc_multi_lrc_basic(gt, class); + if (ret) + return ret; + } + + return 0; +} + +int intel_guc_multi_lrc_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(intel_guc_multi_lrc_basic), + }; + struct intel_gt *gt = &i915->gt; + + if (intel_gt_is_wedged(gt)) + return 0; + + if (!intel_uc_uses_guc_submission(>->uc)) + return 0; + + return intel_gt_live_subtests(tests, gt); +} diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c index fdbd46ff59e0..fe638b5da7c0 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -35,6 +35,7 @@ #include "gt/intel_gt.h" #include "gt/intel_gt_buffer_pool.h" #include "gt/intel_gt_clock_utils.h" +#include "gt/intel_gt_debugfs.h" #include "gt/intel_gt_pm.h" #include "gt/intel_gt_pm_debugfs.h" #include "gt/intel_gt_requests.h" @@ -49,7 +50,6 @@ #include "i915_scheduler.h" #include "i915_trace.h" #include "intel_pm.h" -#include "intel_sideband.h" static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) { @@ -554,36 +554,18 @@ static int i915_wa_registers(struct seq_file *m, void *unused) return 0; } -static int -i915_wedged_get(void *data, u64 *val) +static int i915_wedged_get(void *data, u64 *val) { struct drm_i915_private *i915 = data; - int ret = intel_gt_terminally_wedged(&i915->gt); - switch (ret) { - case -EIO: - *val = 1; - return 0; - case 0: - *val = 0; - return 0; - default: - return ret; - } + return intel_gt_debugfs_reset_show(&i915->gt, val); } -static int -i915_wedged_set(void *data, u64 val) +static int i915_wedged_set(void *data, u64 val) { struct drm_i915_private *i915 = data; - /* Flush any previous reset before applying for a new one */ - wait_event(i915->gt.reset.queue, - !test_bit(I915_RESET_BACKOFF, &i915->gt.reset.flags)); - - intel_gt_handle_error(&i915->gt, val, I915_ERROR_CAPTURE, - "Manually set wedged engine mask = %llx", val); - return 0; + return intel_gt_debugfs_reset_store(&i915->gt, val); } DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, @@ -728,27 +710,15 @@ static int i915_sseu_status(struct seq_file *m, void *unused) static int i915_forcewake_open(struct inode *inode, struct file *file) { struct drm_i915_private *i915 = inode->i_private; - struct intel_gt *gt = &i915->gt; - - atomic_inc(>->user_wakeref); - intel_gt_pm_get(gt); - if (GRAPHICS_VER(i915) >= 6) - intel_uncore_forcewake_user_get(gt->uncore); - return 0; + return intel_gt_pm_debugfs_forcewake_user_open(&i915->gt); } static int i915_forcewake_release(struct inode *inode, struct file *file) { struct drm_i915_private *i915 = inode->i_private; - struct intel_gt *gt = &i915->gt; - if (GRAPHICS_VER(i915) >= 6) - intel_uncore_forcewake_user_put(&i915->uncore); - intel_gt_pm_put(gt); - atomic_dec(>->user_wakeref); - - return 0; + return intel_gt_pm_debugfs_forcewake_user_release(&i915->gt); } static const struct file_operations i915_forcewake_fops = { diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c65c3742887a..b18a250e5d2e 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -84,9 +84,9 @@ #include "intel_dram.h" #include "intel_gvt.h" #include "intel_memory_region.h" +#include "intel_pcode.h" #include "intel_pm.h" #include "intel_region_ttm.h" -#include "intel_sideband.h" #include "vlv_suspend.h" static const struct drm_driver driver; diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c index 5e2b909827f4..51b368be0fc4 100644 --- a/drivers/gpu/drm/i915/i915_query.c +++ b/drivers/gpu/drm/i915/i915_query.c @@ -124,7 +124,9 @@ query_engine_info(struct drm_i915_private *i915, for_each_uabi_engine(engine, i915) { info.engine.engine_class = engine->uabi_class; info.engine.engine_instance = engine->uabi_instance; + info.flags = I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE; info.capabilities = engine->uabi_capabilities; + info.logical_instance = ilog2(engine->logical_mask); if (copy_to_user(info_ptr, &info, 
sizeof(info))) return -EFAULT; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 3e1baf9356ec..da9055c3ebf0 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -1968,7 +1968,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) _ICL_PORT_PCS_LN(ln) + 4 * (dw)) #define ICL_PORT_PCS_DW1_AUX(phy) _MMIO(_ICL_PORT_PCS_DW_AUX(1, phy)) #define ICL_PORT_PCS_DW1_GRP(phy) _MMIO(_ICL_PORT_PCS_DW_GRP(1, phy)) -#define ICL_PORT_PCS_DW1_LN0(phy) _MMIO(_ICL_PORT_PCS_DW_LN(1, 0, phy)) +#define ICL_PORT_PCS_DW1_LN(ln, phy) _MMIO(_ICL_PORT_PCS_DW_LN(1, ln, phy)) #define DCC_MODE_SELECT_MASK (0x3 << 20) #define DCC_MODE_SELECT_CONTINUOSLY (0x3 << 20) #define COMMON_KEEPER_EN (1 << 26) @@ -1989,7 +1989,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define ICL_PORT_TX_DW2_AUX(phy) _MMIO(_ICL_PORT_TX_DW_AUX(2, phy)) #define ICL_PORT_TX_DW2_GRP(phy) _MMIO(_ICL_PORT_TX_DW_GRP(2, phy)) -#define ICL_PORT_TX_DW2_LN0(phy) _MMIO(_ICL_PORT_TX_DW_LN(2, 0, phy)) +#define ICL_PORT_TX_DW2_LN(ln, phy) _MMIO(_ICL_PORT_TX_DW_LN(2, ln, phy)) #define SWING_SEL_UPPER(x) (((x) >> 3) << 15) #define SWING_SEL_UPPER_MASK (1 << 15) #define SWING_SEL_LOWER(x) (((x) & 0x7) << 11) @@ -2001,7 +2001,6 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define ICL_PORT_TX_DW4_AUX(phy) _MMIO(_ICL_PORT_TX_DW_AUX(4, phy)) #define ICL_PORT_TX_DW4_GRP(phy) _MMIO(_ICL_PORT_TX_DW_GRP(4, phy)) -#define ICL_PORT_TX_DW4_LN0(phy) _MMIO(_ICL_PORT_TX_DW_LN(4, 0, phy)) #define ICL_PORT_TX_DW4_LN(ln, phy) _MMIO(_ICL_PORT_TX_DW_LN(4, ln, phy)) #define LOADGEN_SELECT (1 << 31) #define POST_CURSOR_1(x) ((x) << 12) @@ -2013,7 +2012,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define ICL_PORT_TX_DW5_AUX(phy) _MMIO(_ICL_PORT_TX_DW_AUX(5, phy)) #define ICL_PORT_TX_DW5_GRP(phy) _MMIO(_ICL_PORT_TX_DW_GRP(5, phy)) -#define ICL_PORT_TX_DW5_LN0(phy) _MMIO(_ICL_PORT_TX_DW_LN(5, 0, phy)) +#define ICL_PORT_TX_DW5_LN(ln, phy) _MMIO(_ICL_PORT_TX_DW_LN(5, ln, phy)) #define TX_TRAINING_EN (1 << 31) #define TAP2_DISABLE (1 << 30) #define TAP3_DISABLE (1 << 29) @@ -2024,14 +2023,13 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define ICL_PORT_TX_DW7_AUX(phy) _MMIO(_ICL_PORT_TX_DW_AUX(7, phy)) #define ICL_PORT_TX_DW7_GRP(phy) _MMIO(_ICL_PORT_TX_DW_GRP(7, phy)) -#define ICL_PORT_TX_DW7_LN0(phy) _MMIO(_ICL_PORT_TX_DW_LN(7, 0, phy)) #define ICL_PORT_TX_DW7_LN(ln, phy) _MMIO(_ICL_PORT_TX_DW_LN(7, ln, phy)) #define N_SCALAR(x) ((x) << 24) #define N_SCALAR_MASK (0x7F << 24) #define ICL_PORT_TX_DW8_AUX(phy) _MMIO(_ICL_PORT_TX_DW_AUX(8, phy)) #define ICL_PORT_TX_DW8_GRP(phy) _MMIO(_ICL_PORT_TX_DW_GRP(8, phy)) -#define ICL_PORT_TX_DW8_LN0(phy) _MMIO(_ICL_PORT_TX_DW_LN(8, 0, phy)) +#define ICL_PORT_TX_DW8_LN(ln, phy) _MMIO(_ICL_PORT_TX_DW_LN(8, ln, phy)) #define ICL_PORT_TX_DW8_ODCC_CLK_SEL REG_BIT(31) #define ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_MASK REG_GENMASK(30, 29) #define ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_DIV2 REG_FIELD_PREP(ICL_PORT_TX_DW8_ODCC_CLK_DIV_SEL_MASK, 0x1) @@ -8225,6 +8223,11 @@ enum { #define HSW_SPR_STRETCH_MAX_X1 REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 3) #define HSW_FBCQ_DIS (1 << 22) #define BDW_DPRS_MASK_VBLANK_SRD (1 << 0) +#define SKL_PLANE1_STRETCH_MAX_MASK REG_GENMASK(1, 0) +#define SKL_PLANE1_STRETCH_MAX_X8 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 0) +#define SKL_PLANE1_STRETCH_MAX_X4 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 1) +#define SKL_PLANE1_STRETCH_MAX_X2 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 2) +#define 
SKL_PLANE1_STRETCH_MAX_X1 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 3) #define CHICKEN_PIPESL_1(pipe) _MMIO_PIPE(pipe, _CHICKEN_PIPESL_1_A, _CHICKEN_PIPESL_1_B) #define _CHICKEN_TRANS_A 0x420c0 @@ -11019,7 +11022,6 @@ enum skl_power_gate { _DKL_TX_DPCNTL1) #define _DKL_TX_DPCNTL2 0x2C8 -#define DKL_TX_LOADGEN_SHARING_PMD_DISABLE REG_BIT(12) #define DKL_TX_DP20BITMODE (1 << 2) #define DKL_TX_DPCNTL2(tc_port) _MMIO(_PORT(tc_port, \ _DKL_PHY1_BASE, \ @@ -11104,12 +11106,6 @@ enum skl_power_gate { #define DC_STATE_DEBUG_MASK_CORES (1 << 0) #define DC_STATE_DEBUG_MASK_MEMORY_UP (1 << 1) -#define BXT_P_CR_MC_BIOS_REQ_0_0_0 _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x7114) -#define BXT_REQ_DATA_MASK 0x3F -#define BXT_DRAM_CHANNEL_ACTIVE_SHIFT 12 -#define BXT_DRAM_CHANNEL_ACTIVE_MASK (0xF << 12) -#define BXT_MEMORY_FREQ_MULTIPLIER_HZ 133333333 - #define BXT_D_CR_DRP0_DUNIT8 0x1000 #define BXT_D_CR_DRP0_DUNIT9 0x1200 #define BXT_D_CR_DRP0_DUNIT_START 8 @@ -11140,9 +11136,7 @@ enum skl_power_gate { #define BXT_DRAM_TYPE_LPDDR4 (0x2 << 22) #define BXT_DRAM_TYPE_DDR4 (0x4 << 22) -#define SKL_MEMORY_FREQ_MULTIPLIER_HZ 266666666 #define SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5E04) -#define SKL_REQ_DATA_MASK (0xF << 0) #define DG1_GEAR_TYPE REG_BIT(16) #define SKL_MAD_INTER_CHANNEL_0_0_0_MCHBAR_MCMAIN _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5000) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index ed64fa9defdf..2c3cd6e635b5 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1335,6 +1335,25 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) return err; } +static inline bool is_parallel_rq(struct i915_request *rq) +{ + return intel_context_is_parallel(rq->context); +} + +static inline struct intel_context *request_to_parent(struct i915_request *rq) +{ + return intel_context_to_parent(rq->context); +} + +static bool is_same_parallel_context(struct i915_request *to, + struct i915_request *from) +{ + if (is_parallel_rq(to)) + return request_to_parent(to) == request_to_parent(from); + + return false; +} + int i915_request_await_execution(struct i915_request *rq, struct dma_fence *fence) @@ -1366,11 +1385,14 @@ i915_request_await_execution(struct i915_request *rq, * want to run our callback in all cases. 
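The helpers above let the await paths drop fences that come from the same parallel submission: requests in one multi-LRC submit all resolve to the same parent scheduling context, and their ordering is handled by the parallel submission path itself, so turning their fences into scheduler dependencies would only add redundant waits. A minimal model of the predicate, using invented struct names, is:

/* Hypothetical model of the same-parallel-context filter used in the await paths. */
#include <stdbool.h>
#include <stdio.h>

struct example_context {
	struct example_context *parent;	/* NULL unless this is a child */
	bool parallel;			/* true for parent and children */
};

struct example_request {
	struct example_context *context;
};

static struct example_context *to_parent(struct example_request *rq)
{
	return rq->context->parent ? rq->context->parent : rq->context;
}

static bool same_parallel_context(struct example_request *to,
				  struct example_request *from)
{
	if (!to->context->parallel)
		return false;
	return to_parent(to) == to_parent(from);
}

int main(void)
{
	struct example_context parent = { .parent = NULL, .parallel = true };
	struct example_context child = { .parent = &parent, .parallel = true };
	struct example_context other = { .parent = NULL, .parallel = false };
	struct example_request a = { &parent }, b = { &child }, c = { &other };

	printf("a awaits b: %s\n", same_parallel_context(&a, &b) ? "skip" : "add dependency");
	printf("a awaits c: %s\n", same_parallel_context(&a, &c) ? "skip" : "add dependency");
	return 0;
}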
*/ - if (dma_fence_is_i915(fence)) + if (dma_fence_is_i915(fence)) { + if (is_same_parallel_context(rq, to_request(fence))) + continue; ret = __i915_request_await_execution(rq, to_request(fence)); - else + } else { ret = i915_request_await_external(rq, fence); + } if (ret < 0) return ret; } while (--nchild); @@ -1471,10 +1493,13 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) fence)) continue; - if (dma_fence_is_i915(fence)) + if (dma_fence_is_i915(fence)) { + if (is_same_parallel_context(rq, to_request(fence))) + continue; ret = i915_request_await_request(rq, to_request(fence)); - else + } else { ret = i915_request_await_external(rq, fence); + } if (ret < 0) return ret; @@ -1550,35 +1575,51 @@ i915_request_await_object(struct i915_request *to, } static struct i915_request * -__i915_request_add_to_timeline(struct i915_request *rq) +__i915_request_ensure_parallel_ordering(struct i915_request *rq, + struct intel_timeline *timeline) { - struct intel_timeline *timeline = i915_request_timeline(rq); struct i915_request *prev; - /* - * Dependency tracking and request ordering along the timeline - * is special cased so that we can eliminate redundant ordering - * operations while building the request (we know that the timeline - * itself is ordered, and here we guarantee it). - * - * As we know we will need to emit tracking along the timeline, - * we embed the hooks into our request struct -- at the cost of - * having to have specialised no-allocation interfaces (which will - * be beneficial elsewhere). - * - * A second benefit to open-coding i915_request_await_request is - * that we can apply a slight variant of the rules specialised - * for timelines that jump between engines (such as virtual engines). - * If we consider the case of virtual engine, we must emit a dma-fence - * to prevent scheduling of the second request until the first is - * complete (to maximise our greedy late load balancing) and this - * precludes optimising to use semaphores serialisation of a single - * timeline across engines. - */ + GEM_BUG_ON(!is_parallel_rq(rq)); + + prev = request_to_parent(rq)->parallel.last_rq; + if (prev) { + if (!__i915_request_is_complete(prev)) { + i915_sw_fence_await_sw_fence(&rq->submit, + &prev->submit, + &rq->submitq); + + if (rq->engine->sched_engine->schedule) + __i915_sched_node_add_dependency(&rq->sched, + &prev->sched, + &rq->dep, + 0); + } + i915_request_put(prev); + } + + request_to_parent(rq)->parallel.last_rq = i915_request_get(rq); + + return to_request(__i915_active_fence_set(&timeline->last_request, + &rq->fence)); +} + +static struct i915_request * +__i915_request_ensure_ordering(struct i915_request *rq, + struct intel_timeline *timeline) +{ + struct i915_request *prev; + + GEM_BUG_ON(is_parallel_rq(rq)); + prev = to_request(__i915_active_fence_set(&timeline->last_request, &rq->fence)); + if (prev && !__i915_request_is_complete(prev)) { bool uses_guc = intel_engine_uses_guc(rq->engine); + bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask | + rq->engine->mask); + bool same_context = prev->context == rq->context; /* * The requests are supposed to be kept in order. However, @@ -1586,13 +1627,11 @@ __i915_request_add_to_timeline(struct i915_request *rq) * is used as a barrier for external modification to this * context. 
*/ - GEM_BUG_ON(prev->context == rq->context && + GEM_BUG_ON(same_context && i915_seqno_passed(prev->fence.seqno, rq->fence.seqno)); - if ((!uses_guc && - is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) || - (uses_guc && prev->context == rq->context)) + if ((same_context && uses_guc) || (!uses_guc && pow2)) i915_sw_fence_await_sw_fence(&rq->submit, &prev->submit, &rq->submitq); @@ -1607,6 +1646,50 @@ __i915_request_add_to_timeline(struct i915_request *rq) 0); } + return prev; +} + +static struct i915_request * +__i915_request_add_to_timeline(struct i915_request *rq) +{ + struct intel_timeline *timeline = i915_request_timeline(rq); + struct i915_request *prev; + + /* + * Dependency tracking and request ordering along the timeline + * is special cased so that we can eliminate redundant ordering + * operations while building the request (we know that the timeline + * itself is ordered, and here we guarantee it). + * + * As we know we will need to emit tracking along the timeline, + * we embed the hooks into our request struct -- at the cost of + * having to have specialised no-allocation interfaces (which will + * be beneficial elsewhere). + * + * A second benefit to open-coding i915_request_await_request is + * that we can apply a slight variant of the rules specialised + * for timelines that jump between engines (such as virtual engines). + * If we consider the case of virtual engine, we must emit a dma-fence + * to prevent scheduling of the second request until the first is + * complete (to maximise our greedy late load balancing) and this + * precludes optimising to use semaphores serialisation of a single + * timeline across engines. + * + * We do not order parallel submission requests on the timeline as each + * parallel submission context has its own timeline and the ordering + * rules for parallel requests are that they must be submitted in the + * order received from the execbuf IOCTL. So rather than using the + * timeline we store a pointer to last request submitted in the + * relationship in the gem context and insert a submission fence + * between that request and request passed into this function or + * alternatively we use completion fence if gem context has a single + * timeline and this is the first submission of an execbuf IOCTL. + */ + if (likely(!is_parallel_rq(rq))) + prev = __i915_request_ensure_ordering(rq, timeline); + else + prev = __i915_request_ensure_parallel_ordering(rq, timeline); + /* * Make sure that no request gazumped us - if it was allocated after * our i915_request_alloc() and called __i915_request_add() before diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 7bd9ed20623e..dc359242d1ae 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -139,6 +139,29 @@ enum { * the GPU. Here we track such boost requests on a per-request basis. */ I915_FENCE_FLAG_BOOST, + + /* + * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a + * parent-child relationship (parallel submission, multi-lrc) should + * trigger a submission to the GuC rather than just moving the context + * tail. + */ + I915_FENCE_FLAG_SUBMIT_PARALLEL, + + /* + * I915_FENCE_FLAG_SKIP_PARALLEL - request with a context in a + * parent-child relationship (parallel submission, multi-lrc) that + * hit an error while generating requests in the execbuf IOCTL. + * Indicates this request should be skipped as another request in + * submission / relationship encoutered an error. 
+ */ + I915_FENCE_FLAG_SKIP_PARALLEL, + + /* + * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite + * fence (dma_fence_array) and i915 generated for parallel submission. + */ + I915_FENCE_FLAG_COMPOSITE, }; /** diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c index cdf0e9c6fd73..1804f4142740 100644 --- a/drivers/gpu/drm/i915/i915_sysfs.c +++ b/drivers/gpu/drm/i915/i915_sysfs.c @@ -37,7 +37,6 @@ #include "i915_drv.h" #include "i915_sysfs.h" #include "intel_pm.h" -#include "intel_sideband.h" static inline struct drm_i915_private *kdev_minor_to_i915(struct device *kdev) { diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 4b7fc4647e46..90546fa58fc1 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1234,9 +1234,10 @@ int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq) return i915_active_add_request(&vma->active, rq); } -int i915_vma_move_to_active(struct i915_vma *vma, - struct i915_request *rq, - unsigned int flags) +int _i915_vma_move_to_active(struct i915_vma *vma, + struct i915_request *rq, + struct dma_fence *fence, + unsigned int flags) { struct drm_i915_gem_object *obj = vma->obj; int err; @@ -1257,9 +1258,11 @@ int i915_vma_move_to_active(struct i915_vma *vma, intel_frontbuffer_put(front); } - dma_resv_add_excl_fence(vma->resv, &rq->fence); - obj->write_domain = I915_GEM_DOMAIN_RENDER; - obj->read_domains = 0; + if (fence) { + dma_resv_add_excl_fence(vma->resv, fence); + obj->write_domain = I915_GEM_DOMAIN_RENDER; + obj->read_domains = 0; + } } else { if (!(flags & __EXEC_OBJECT_NO_RESERVE)) { err = dma_resv_reserve_shared(vma->resv, 1); @@ -1267,8 +1270,10 @@ int i915_vma_move_to_active(struct i915_vma *vma, return err; } - dma_resv_add_shared_fence(vma->resv, &rq->fence); - obj->write_domain = 0; + if (fence) { + dma_resv_add_shared_fence(vma->resv, fence); + obj->write_domain = 0; + } } if (flags & EXEC_OBJECT_NEEDS_FENCE && vma->fence) diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index ed69f66c7ab0..648dbe744c96 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -57,9 +57,16 @@ static inline bool i915_vma_is_active(const struct i915_vma *vma) int __must_check __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq); -int __must_check i915_vma_move_to_active(struct i915_vma *vma, - struct i915_request *rq, - unsigned int flags); +int __must_check _i915_vma_move_to_active(struct i915_vma *vma, + struct i915_request *rq, + struct dma_fence *fence, + unsigned int flags); +static inline int __must_check +i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq, + unsigned int flags) +{ + return _i915_vma_move_to_active(vma, rq, &rq->fence, flags); +} #define __i915_vma_flags(v) ((unsigned long *)&(v)->flags.counter) diff --git a/drivers/gpu/drm/i915/intel_dram.c b/drivers/gpu/drm/i915/intel_dram.c index 30a0cab5eff4..84bb212bae4b 100644 --- a/drivers/gpu/drm/i915/intel_dram.c +++ b/drivers/gpu/drm/i915/intel_dram.c @@ -5,7 +5,7 @@ #include "i915_drv.h" #include "intel_dram.h" -#include "intel_sideband.h" +#include "intel_pcode.h" struct dram_dimm_info { u16 size; @@ -244,7 +244,6 @@ static int skl_get_dram_info(struct drm_i915_private *i915) { struct dram_info *dram_info = &i915->dram_info; - u32 mem_freq_khz, val; int ret; dram_info->type = skl_get_dram_type(i915); @@ -255,17 +254,6 @@ skl_get_dram_info(struct drm_i915_private *i915) if (ret) 
return ret; - val = intel_uncore_read(&i915->uncore, - SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU); - mem_freq_khz = DIV_ROUND_UP((val & SKL_REQ_DATA_MASK) * - SKL_MEMORY_FREQ_MULTIPLIER_HZ, 1000); - - if (dram_info->num_channels * mem_freq_khz == 0) { - drm_info(&i915->drm, - "Couldn't get system memory bandwidth\n"); - return -EINVAL; - } - return 0; } @@ -350,24 +338,10 @@ static void bxt_get_dimm_info(struct dram_dimm_info *dimm, u32 val) static int bxt_get_dram_info(struct drm_i915_private *i915) { struct dram_info *dram_info = &i915->dram_info; - u32 dram_channels; - u32 mem_freq_khz, val; - u8 num_active_channels, valid_ranks = 0; + u32 val; + u8 valid_ranks = 0; int i; - val = intel_uncore_read(&i915->uncore, BXT_P_CR_MC_BIOS_REQ_0_0_0); - mem_freq_khz = DIV_ROUND_UP((val & BXT_REQ_DATA_MASK) * - BXT_MEMORY_FREQ_MULTIPLIER_HZ, 1000); - - dram_channels = val & BXT_DRAM_CHANNEL_ACTIVE_MASK; - num_active_channels = hweight32(dram_channels); - - if (mem_freq_khz * num_active_channels == 0) { - drm_info(&i915->drm, - "Couldn't get system memory bandwidth\n"); - return -EINVAL; - } - /* * Now read each DUNIT8/9/10/11 to check the rank of each dimms. */ diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c new file mode 100644 index 000000000000..e8c886e4e78d --- /dev/null +++ b/drivers/gpu/drm/i915/intel_pcode.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2013-2021 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_pcode.h" + +static int gen6_check_mailbox_status(u32 mbox) +{ + switch (mbox & GEN6_PCODE_ERROR_MASK) { + case GEN6_PCODE_SUCCESS: + return 0; + case GEN6_PCODE_UNIMPLEMENTED_CMD: + return -ENODEV; + case GEN6_PCODE_ILLEGAL_CMD: + return -ENXIO; + case GEN6_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: + case GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: + return -EOVERFLOW; + case GEN6_PCODE_TIMEOUT: + return -ETIMEDOUT; + default: + MISSING_CASE(mbox & GEN6_PCODE_ERROR_MASK); + return 0; + } +} + +static int gen7_check_mailbox_status(u32 mbox) +{ + switch (mbox & GEN6_PCODE_ERROR_MASK) { + case GEN6_PCODE_SUCCESS: + return 0; + case GEN6_PCODE_ILLEGAL_CMD: + return -ENXIO; + case GEN7_PCODE_TIMEOUT: + return -ETIMEDOUT; + case GEN7_PCODE_ILLEGAL_DATA: + return -EINVAL; + case GEN11_PCODE_ILLEGAL_SUBCOMMAND: + return -ENXIO; + case GEN11_PCODE_LOCKED: + return -EBUSY; + case GEN11_PCODE_REJECTED: + return -EACCES; + case GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: + return -EOVERFLOW; + default: + MISSING_CASE(mbox & GEN6_PCODE_ERROR_MASK); + return 0; + } +} + +static int __sandybridge_pcode_rw(struct drm_i915_private *i915, + u32 mbox, u32 *val, u32 *val1, + int fast_timeout_us, + int slow_timeout_ms, + bool is_read) +{ + struct intel_uncore *uncore = &i915->uncore; + + lockdep_assert_held(&i915->sb_lock); + + /* + * GEN6_PCODE_* are outside of the forcewake domain, we can use + * intel_uncore_read/write_fw variants to reduce the amount of work + * required when reading/writing. + */ + + if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) + return -EAGAIN; + + intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val); + intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? 
*val1 : 0); + intel_uncore_write_fw(uncore, + GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox); + + if (__intel_wait_for_register_fw(uncore, + GEN6_PCODE_MAILBOX, + GEN6_PCODE_READY, 0, + fast_timeout_us, + slow_timeout_ms, + &mbox)) + return -ETIMEDOUT; + + if (is_read) + *val = intel_uncore_read_fw(uncore, GEN6_PCODE_DATA); + if (is_read && val1) + *val1 = intel_uncore_read_fw(uncore, GEN6_PCODE_DATA1); + + if (GRAPHICS_VER(i915) > 6) + return gen7_check_mailbox_status(mbox); + else + return gen6_check_mailbox_status(mbox); +} + +int sandybridge_pcode_read(struct drm_i915_private *i915, u32 mbox, + u32 *val, u32 *val1) +{ + int err; + + mutex_lock(&i915->sb_lock); + err = __sandybridge_pcode_rw(i915, mbox, val, val1, + 500, 20, + true); + mutex_unlock(&i915->sb_lock); + + if (err) { + drm_dbg(&i915->drm, + "warning: pcode (read from mbox %x) mailbox access failed for %ps: %d\n", + mbox, __builtin_return_address(0), err); + } + + return err; +} + +int sandybridge_pcode_write_timeout(struct drm_i915_private *i915, + u32 mbox, u32 val, + int fast_timeout_us, + int slow_timeout_ms) +{ + int err; + + mutex_lock(&i915->sb_lock); + err = __sandybridge_pcode_rw(i915, mbox, &val, NULL, + fast_timeout_us, slow_timeout_ms, + false); + mutex_unlock(&i915->sb_lock); + + if (err) { + drm_dbg(&i915->drm, + "warning: pcode (write of 0x%08x to mbox %x) mailbox access failed for %ps: %d\n", + val, mbox, __builtin_return_address(0), err); + } + + return err; +} + +static bool skl_pcode_try_request(struct drm_i915_private *i915, u32 mbox, + u32 request, u32 reply_mask, u32 reply, + u32 *status) +{ + *status = __sandybridge_pcode_rw(i915, mbox, &request, NULL, + 500, 0, + true); + + return *status || ((request & reply_mask) == reply); +} + +/** + * skl_pcode_request - send PCODE request until acknowledgment + * @i915: device private + * @mbox: PCODE mailbox ID the request is targeted for + * @request: request ID + * @reply_mask: mask used to check for request acknowledgment + * @reply: value used to check for request acknowledgment + * @timeout_base_ms: timeout for polling with preemption enabled + * + * Keep resending the @request to @mbox until PCODE acknowledges it, PCODE + * reports an error or an overall timeout of @timeout_base_ms+50 ms expires. + * The request is acknowledged once the PCODE reply dword equals @reply after + * applying @reply_mask. Polling is first attempted with preemption enabled + * for @timeout_base_ms and if this times out for another 50 ms with + * preemption disabled. + * + * Returns 0 on success, %-ETIMEDOUT in case of a timeout, <0 in case of some + * other error as reported by PCODE. + */ +int skl_pcode_request(struct drm_i915_private *i915, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms) +{ + u32 status; + int ret; + + mutex_lock(&i915->sb_lock); + +#define COND \ + skl_pcode_try_request(i915, mbox, request, reply_mask, reply, &status) + + /* + * Prime the PCODE by doing a request first. Normally it guarantees + * that a subsequent request, at most @timeout_base_ms later, succeeds. + * _wait_for() doesn't guarantee when its passed condition is evaluated + * first, so send the first request explicitly. + */ + if (COND) { + ret = 0; + goto out; + } + ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10); + if (!ret) + goto out; + + /* + * The above can time out if the number of requests was low (2 in the + * worst case) _and_ PCODE was busy for some reason even after a + * (queued) request and @timeout_base_ms delay. 
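For readers unfamiliar with the pcode mailbox interface being consolidated into this new file, a hedged sketch of a caller follows; EXAMPLE_MBOX, EXAMPLE_REQUEST, EXAMPLE_REPLY_MASK and EXAMPLE_REPLY are placeholder values rather than real mailbox definitions, and only the function signatures match the code above.

/* Illustrative caller of the relocated pcode helpers (placeholder values). */
static int example_pcode_handshake(struct drm_i915_private *i915)
{
	u32 val = 0;
	int ret;

	/* One-shot read; sb_lock is taken internally. */
	ret = sandybridge_pcode_read(i915, EXAMPLE_MBOX, &val, NULL);
	if (ret)
		return ret;

	/* Resend the request until pcode acknowledges it or ~200 ms elapse. */
	return skl_pcode_request(i915, EXAMPLE_MBOX, EXAMPLE_REQUEST,
				 EXAMPLE_REPLY_MASK, EXAMPLE_REPLY, 150);
}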
As a workaround retry + * the poll with preemption disabled to maximize the number of + * requests. Increase the timeout from @timeout_base_ms to 50ms to + * account for interrupts that could reduce the number of these + * requests, and for any quirks of the PCODE firmware that delays + * the request completion. + */ + drm_dbg_kms(&i915->drm, + "PCODE timeout, retrying with preemption disabled\n"); + drm_WARN_ON_ONCE(&i915->drm, timeout_base_ms > 3); + preempt_disable(); + ret = wait_for_atomic(COND, 50); + preempt_enable(); + +out: + mutex_unlock(&i915->sb_lock); + return ret ? ret : status; +#undef COND +} + +int intel_pcode_init(struct drm_i915_private *i915) +{ + int ret = 0; + + if (!IS_DGFX(i915)) + return ret; + + ret = skl_pcode_request(i915, DG1_PCODE_STATUS, + DG1_UNCORE_GET_INIT_STATUS, + DG1_UNCORE_INIT_STATUS_COMPLETE, + DG1_UNCORE_INIT_STATUS_COMPLETE, 180000); + + drm_dbg(&i915->drm, "PCODE init status %d\n", ret); + + if (ret) + drm_err(&i915->drm, "Pcode did not report uncore initialization completion!\n"); + + return ret; +} diff --git a/drivers/gpu/drm/i915/intel_pcode.h b/drivers/gpu/drm/i915/intel_pcode.h new file mode 100644 index 000000000000..50806649d4b6 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_pcode.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2013-2021 Intel Corporation + */ + +#ifndef _INTEL_PCODE_H_ +#define _INTEL_PCODE_H_ + +#include <linux/types.h> + +struct drm_i915_private; + +int sandybridge_pcode_read(struct drm_i915_private *i915, u32 mbox, + u32 *val, u32 *val1); +int sandybridge_pcode_write_timeout(struct drm_i915_private *i915, u32 mbox, + u32 val, int fast_timeout_us, + int slow_timeout_ms); +#define sandybridge_pcode_write(i915, mbox, val) \ + sandybridge_pcode_write_timeout(i915, mbox, val, 500, 0) + +int skl_pcode_request(struct drm_i915_private *i915, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms); + +int intel_pcode_init(struct drm_i915_private *i915); + +#endif /* _INTEL_PCODE_H */ diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index ef5f73934dab..f90fe39cf8ca 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -47,8 +47,9 @@ #include "i915_fixed.h" #include "i915_irq.h" #include "i915_trace.h" +#include "intel_pcode.h" #include "intel_pm.h" -#include "intel_sideband.h" +#include "vlv_sideband.h" #include "../../../platform/x86/intel_ips.h" /* Stores plane specific WM parameters */ @@ -7587,11 +7588,6 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv) intel_uncore_write(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe), intel_uncore_read(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe)) | BDW_DPRS_MASK_VBLANK_SRD); - - /* Undocumented but fixes async flip + VT-d corruption */ - if (intel_vtd_active()) - intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe), - HSW_PRI_STRETCH_MAX_MASK, HSW_PRI_STRETCH_MAX_X1); } /* WaVSRefCountFullforceMissDisable:bdw */ @@ -7627,20 +7623,11 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv) static void hsw_init_clock_gating(struct drm_i915_private *dev_priv) { - enum pipe pipe; - /* WaFbcAsynchFlipDisableFbcQueue:hsw,bdw */ intel_uncore_write(&dev_priv->uncore, CHICKEN_PIPESL_1(PIPE_A), intel_uncore_read(&dev_priv->uncore, CHICKEN_PIPESL_1(PIPE_A)) | HSW_FBCQ_DIS); - for_each_pipe(dev_priv, pipe) { - /* Undocumented but fixes async flip + VT-d corruption */ - if (intel_vtd_active()) - intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe), - 
HSW_PRI_STRETCH_MAX_MASK, HSW_PRI_STRETCH_MAX_X1); - } - /* This is required by WaCatErrorRejectionIssue:hsw */ intel_uncore_write(&dev_priv->uncore, GEN7_SQ_CHICKEN_MBCUNIT_CONFIG, intel_uncore_read(&dev_priv->uncore, GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | diff --git a/drivers/gpu/drm/i915/intel_sbi.c b/drivers/gpu/drm/i915/intel_sbi.c new file mode 100644 index 000000000000..5ba8490a31e6 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_sbi.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2013-2021 Intel Corporation + * + * LPT/WPT IOSF sideband. + */ + +#include "i915_drv.h" +#include "intel_sbi.h" + +/* SBI access */ +static int intel_sbi_rw(struct drm_i915_private *i915, u16 reg, + enum intel_sbi_destination destination, + u32 *val, bool is_read) +{ + struct intel_uncore *uncore = &i915->uncore; + u32 cmd; + + lockdep_assert_held(&i915->sb_lock); + + if (intel_wait_for_register_fw(uncore, + SBI_CTL_STAT, SBI_BUSY, 0, + 100)) { + drm_err(&i915->drm, + "timeout waiting for SBI to become ready\n"); + return -EBUSY; + } + + intel_uncore_write_fw(uncore, SBI_ADDR, (u32)reg << 16); + intel_uncore_write_fw(uncore, SBI_DATA, is_read ? 0 : *val); + + if (destination == SBI_ICLK) + cmd = SBI_CTL_DEST_ICLK | SBI_CTL_OP_CRRD; + else + cmd = SBI_CTL_DEST_MPHY | SBI_CTL_OP_IORD; + if (!is_read) + cmd |= BIT(8); + intel_uncore_write_fw(uncore, SBI_CTL_STAT, cmd | SBI_BUSY); + + if (__intel_wait_for_register_fw(uncore, + SBI_CTL_STAT, SBI_BUSY, 0, + 100, 100, &cmd)) { + drm_err(&i915->drm, + "timeout waiting for SBI to complete read\n"); + return -ETIMEDOUT; + } + + if (cmd & SBI_RESPONSE_FAIL) { + drm_err(&i915->drm, "error during SBI read of reg %x\n", reg); + return -ENXIO; + } + + if (is_read) + *val = intel_uncore_read_fw(uncore, SBI_DATA); + + return 0; +} + +u32 intel_sbi_read(struct drm_i915_private *i915, u16 reg, + enum intel_sbi_destination destination) +{ + u32 result = 0; + + intel_sbi_rw(i915, reg, destination, &result, true); + + return result; +} + +void intel_sbi_write(struct drm_i915_private *i915, u16 reg, u32 value, + enum intel_sbi_destination destination) +{ + intel_sbi_rw(i915, reg, destination, &value, false); +} diff --git a/drivers/gpu/drm/i915/intel_sbi.h b/drivers/gpu/drm/i915/intel_sbi.h new file mode 100644 index 000000000000..f5a862210454 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_sbi.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2013-2021 Intel Corporation + */ + +#ifndef _INTEL_SBI_H_ +#define _INTEL_SBI_H_ + +#include <linux/types.h> + +struct drm_i915_private; + +enum intel_sbi_destination { + SBI_ICLK, + SBI_MPHY, +}; + +u32 intel_sbi_read(struct drm_i915_private *i915, u16 reg, + enum intel_sbi_destination destination); +void intel_sbi_write(struct drm_i915_private *i915, u16 reg, u32 value, + enum intel_sbi_destination destination); + +#endif /* _INTEL_SBI_H_ */ diff --git a/drivers/gpu/drm/i915/intel_sideband.c b/drivers/gpu/drm/i915/intel_sideband.c deleted file mode 100644 index e304bf44e1ff..000000000000 --- a/drivers/gpu/drm/i915/intel_sideband.c +++ /dev/null @@ -1,577 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * 
Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - */ - -#include <asm/iosf_mbi.h> - -#include "i915_drv.h" -#include "intel_sideband.h" - -/* - * IOSF sideband, see VLV2_SidebandMsg_HAS.docx and - * VLV_VLV2_PUNIT_HAS_0.8.docx - */ - -/* Standard MMIO read, non-posted */ -#define SB_MRD_NP 0x00 -/* Standard MMIO write, non-posted */ -#define SB_MWR_NP 0x01 -/* Private register read, double-word addressing, non-posted */ -#define SB_CRRDDA_NP 0x06 -/* Private register write, double-word addressing, non-posted */ -#define SB_CRWRDA_NP 0x07 - -static void ping(void *info) -{ -} - -static void __vlv_punit_get(struct drm_i915_private *i915) -{ - iosf_mbi_punit_acquire(); - - /* - * Prevent the cpu from sleeping while we use this sideband, otherwise - * the punit may cause a machine hang. The issue appears to be isolated - * with changing the power state of the CPU package while changing - * the power state via the punit, and we have only observed it - * reliably on 4-core Baytail systems suggesting the issue is in the - * power delivery mechanism and likely to be be board/function - * specific. Hence we presume the workaround needs only be applied - * to the Valleyview P-unit and not all sideband communications. - */ - if (IS_VALLEYVIEW(i915)) { - cpu_latency_qos_update_request(&i915->sb_qos, 0); - on_each_cpu(ping, NULL, 1); - } -} - -static void __vlv_punit_put(struct drm_i915_private *i915) -{ - if (IS_VALLEYVIEW(i915)) - cpu_latency_qos_update_request(&i915->sb_qos, - PM_QOS_DEFAULT_VALUE); - - iosf_mbi_punit_release(); -} - -void vlv_iosf_sb_get(struct drm_i915_private *i915, unsigned long ports) -{ - if (ports & BIT(VLV_IOSF_SB_PUNIT)) - __vlv_punit_get(i915); - - mutex_lock(&i915->sb_lock); -} - -void vlv_iosf_sb_put(struct drm_i915_private *i915, unsigned long ports) -{ - mutex_unlock(&i915->sb_lock); - - if (ports & BIT(VLV_IOSF_SB_PUNIT)) - __vlv_punit_put(i915); -} - -static int vlv_sideband_rw(struct drm_i915_private *i915, - u32 devfn, u32 port, u32 opcode, - u32 addr, u32 *val) -{ - struct intel_uncore *uncore = &i915->uncore; - const bool is_read = (opcode == SB_MRD_NP || opcode == SB_CRRDDA_NP); - int err; - - lockdep_assert_held(&i915->sb_lock); - if (port == IOSF_PORT_PUNIT) - iosf_mbi_assert_punit_acquired(); - - /* Flush the previous comms, just in case it failed last time. */ - if (intel_wait_for_register(uncore, - VLV_IOSF_DOORBELL_REQ, IOSF_SB_BUSY, 0, - 5)) { - drm_dbg(&i915->drm, "IOSF sideband idle wait (%s) timed out\n", - is_read ? "read" : "write"); - return -EAGAIN; - } - - preempt_disable(); - - intel_uncore_write_fw(uncore, VLV_IOSF_ADDR, addr); - intel_uncore_write_fw(uncore, VLV_IOSF_DATA, is_read ? 
0 : *val); - intel_uncore_write_fw(uncore, VLV_IOSF_DOORBELL_REQ, - (devfn << IOSF_DEVFN_SHIFT) | - (opcode << IOSF_OPCODE_SHIFT) | - (port << IOSF_PORT_SHIFT) | - (0xf << IOSF_BYTE_ENABLES_SHIFT) | - (0 << IOSF_BAR_SHIFT) | - IOSF_SB_BUSY); - - if (__intel_wait_for_register_fw(uncore, - VLV_IOSF_DOORBELL_REQ, IOSF_SB_BUSY, 0, - 10000, 0, NULL) == 0) { - if (is_read) - *val = intel_uncore_read_fw(uncore, VLV_IOSF_DATA); - err = 0; - } else { - drm_dbg(&i915->drm, "IOSF sideband finish wait (%s) timed out\n", - is_read ? "read" : "write"); - err = -ETIMEDOUT; - } - - preempt_enable(); - - return err; -} - -u32 vlv_punit_read(struct drm_i915_private *i915, u32 addr) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_PUNIT, - SB_CRRDDA_NP, addr, &val); - - return val; -} - -int vlv_punit_write(struct drm_i915_private *i915, u32 addr, u32 val) -{ - return vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_PUNIT, - SB_CRWRDA_NP, addr, &val); -} - -u32 vlv_bunit_read(struct drm_i915_private *i915, u32 reg) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_BUNIT, - SB_CRRDDA_NP, reg, &val); - - return val; -} - -void vlv_bunit_write(struct drm_i915_private *i915, u32 reg, u32 val) -{ - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_BUNIT, - SB_CRWRDA_NP, reg, &val); -} - -u32 vlv_nc_read(struct drm_i915_private *i915, u8 addr) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_NC, - SB_CRRDDA_NP, addr, &val); - - return val; -} - -u32 vlv_iosf_sb_read(struct drm_i915_private *i915, u8 port, u32 reg) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), port, - SB_CRRDDA_NP, reg, &val); - - return val; -} - -void vlv_iosf_sb_write(struct drm_i915_private *i915, - u8 port, u32 reg, u32 val) -{ - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), port, - SB_CRWRDA_NP, reg, &val); -} - -u32 vlv_cck_read(struct drm_i915_private *i915, u32 reg) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCK, - SB_CRRDDA_NP, reg, &val); - - return val; -} - -void vlv_cck_write(struct drm_i915_private *i915, u32 reg, u32 val) -{ - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCK, - SB_CRWRDA_NP, reg, &val); -} - -u32 vlv_ccu_read(struct drm_i915_private *i915, u32 reg) -{ - u32 val = 0; - - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCU, - SB_CRRDDA_NP, reg, &val); - - return val; -} - -void vlv_ccu_write(struct drm_i915_private *i915, u32 reg, u32 val) -{ - vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCU, - SB_CRWRDA_NP, reg, &val); -} - -static u32 vlv_dpio_phy_iosf_port(struct drm_i915_private *i915, enum dpio_phy phy) -{ - /* - * IOSF_PORT_DPIO: VLV x2 PHY (DP/HDMI B and C), CHV x1 PHY (DP/HDMI D) - * IOSF_PORT_DPIO_2: CHV x2 PHY (DP/HDMI B and C) - */ - if (IS_CHERRYVIEW(i915)) - return phy == DPIO_PHY0 ? IOSF_PORT_DPIO_2 : IOSF_PORT_DPIO; - else - return IOSF_PORT_DPIO; -} - -u32 vlv_dpio_read(struct drm_i915_private *i915, enum pipe pipe, int reg) -{ - u32 port = vlv_dpio_phy_iosf_port(i915, DPIO_PHY(pipe)); - u32 val = 0; - - vlv_sideband_rw(i915, DPIO_DEVFN, port, SB_MRD_NP, reg, &val); - - /* - * FIXME: There might be some registers where all 1's is a valid value, - * so ideally we should check the register offset instead... 
- */ - drm_WARN(&i915->drm, val == 0xffffffff, - "DPIO read pipe %c reg 0x%x == 0x%x\n", - pipe_name(pipe), reg, val); - - return val; -} - -void vlv_dpio_write(struct drm_i915_private *i915, - enum pipe pipe, int reg, u32 val) -{ - u32 port = vlv_dpio_phy_iosf_port(i915, DPIO_PHY(pipe)); - - vlv_sideband_rw(i915, DPIO_DEVFN, port, SB_MWR_NP, reg, &val); -} - -u32 vlv_flisdsi_read(struct drm_i915_private *i915, u32 reg) -{ - u32 val = 0; - - vlv_sideband_rw(i915, DPIO_DEVFN, IOSF_PORT_FLISDSI, SB_CRRDDA_NP, - reg, &val); - return val; -} - -void vlv_flisdsi_write(struct drm_i915_private *i915, u32 reg, u32 val) -{ - vlv_sideband_rw(i915, DPIO_DEVFN, IOSF_PORT_FLISDSI, SB_CRWRDA_NP, - reg, &val); -} - -/* SBI access */ -static int intel_sbi_rw(struct drm_i915_private *i915, u16 reg, - enum intel_sbi_destination destination, - u32 *val, bool is_read) -{ - struct intel_uncore *uncore = &i915->uncore; - u32 cmd; - - lockdep_assert_held(&i915->sb_lock); - - if (intel_wait_for_register_fw(uncore, - SBI_CTL_STAT, SBI_BUSY, 0, - 100)) { - drm_err(&i915->drm, - "timeout waiting for SBI to become ready\n"); - return -EBUSY; - } - - intel_uncore_write_fw(uncore, SBI_ADDR, (u32)reg << 16); - intel_uncore_write_fw(uncore, SBI_DATA, is_read ? 0 : *val); - - if (destination == SBI_ICLK) - cmd = SBI_CTL_DEST_ICLK | SBI_CTL_OP_CRRD; - else - cmd = SBI_CTL_DEST_MPHY | SBI_CTL_OP_IORD; - if (!is_read) - cmd |= BIT(8); - intel_uncore_write_fw(uncore, SBI_CTL_STAT, cmd | SBI_BUSY); - - if (__intel_wait_for_register_fw(uncore, - SBI_CTL_STAT, SBI_BUSY, 0, - 100, 100, &cmd)) { - drm_err(&i915->drm, - "timeout waiting for SBI to complete read\n"); - return -ETIMEDOUT; - } - - if (cmd & SBI_RESPONSE_FAIL) { - drm_err(&i915->drm, "error during SBI read of reg %x\n", reg); - return -ENXIO; - } - - if (is_read) - *val = intel_uncore_read_fw(uncore, SBI_DATA); - - return 0; -} - -u32 intel_sbi_read(struct drm_i915_private *i915, u16 reg, - enum intel_sbi_destination destination) -{ - u32 result = 0; - - intel_sbi_rw(i915, reg, destination, &result, true); - - return result; -} - -void intel_sbi_write(struct drm_i915_private *i915, u16 reg, u32 value, - enum intel_sbi_destination destination) -{ - intel_sbi_rw(i915, reg, destination, &value, false); -} - -static int gen6_check_mailbox_status(u32 mbox) -{ - switch (mbox & GEN6_PCODE_ERROR_MASK) { - case GEN6_PCODE_SUCCESS: - return 0; - case GEN6_PCODE_UNIMPLEMENTED_CMD: - return -ENODEV; - case GEN6_PCODE_ILLEGAL_CMD: - return -ENXIO; - case GEN6_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: - case GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: - return -EOVERFLOW; - case GEN6_PCODE_TIMEOUT: - return -ETIMEDOUT; - default: - MISSING_CASE(mbox & GEN6_PCODE_ERROR_MASK); - return 0; - } -} - -static int gen7_check_mailbox_status(u32 mbox) -{ - switch (mbox & GEN6_PCODE_ERROR_MASK) { - case GEN6_PCODE_SUCCESS: - return 0; - case GEN6_PCODE_ILLEGAL_CMD: - return -ENXIO; - case GEN7_PCODE_TIMEOUT: - return -ETIMEDOUT; - case GEN7_PCODE_ILLEGAL_DATA: - return -EINVAL; - case GEN11_PCODE_ILLEGAL_SUBCOMMAND: - return -ENXIO; - case GEN11_PCODE_LOCKED: - return -EBUSY; - case GEN11_PCODE_REJECTED: - return -EACCES; - case GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE: - return -EOVERFLOW; - default: - MISSING_CASE(mbox & GEN6_PCODE_ERROR_MASK); - return 0; - } -} - -static int __sandybridge_pcode_rw(struct drm_i915_private *i915, - u32 mbox, u32 *val, u32 *val1, - int fast_timeout_us, - int slow_timeout_ms, - bool is_read) -{ - struct intel_uncore *uncore = &i915->uncore; - - 
lockdep_assert_held(&i915->sb_lock); - - /* - * GEN6_PCODE_* are outside of the forcewake domain, we can use - * intel_uncore_read/write_fw variants to reduce the amount of work - * required when reading/writing. - */ - - if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) - return -EAGAIN; - - intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val); - intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0); - intel_uncore_write_fw(uncore, - GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox); - - if (__intel_wait_for_register_fw(uncore, - GEN6_PCODE_MAILBOX, - GEN6_PCODE_READY, 0, - fast_timeout_us, - slow_timeout_ms, - &mbox)) - return -ETIMEDOUT; - - if (is_read) - *val = intel_uncore_read_fw(uncore, GEN6_PCODE_DATA); - if (is_read && val1) - *val1 = intel_uncore_read_fw(uncore, GEN6_PCODE_DATA1); - - if (GRAPHICS_VER(i915) > 6) - return gen7_check_mailbox_status(mbox); - else - return gen6_check_mailbox_status(mbox); -} - -int sandybridge_pcode_read(struct drm_i915_private *i915, u32 mbox, - u32 *val, u32 *val1) -{ - int err; - - mutex_lock(&i915->sb_lock); - err = __sandybridge_pcode_rw(i915, mbox, val, val1, - 500, 20, - true); - mutex_unlock(&i915->sb_lock); - - if (err) { - drm_dbg(&i915->drm, - "warning: pcode (read from mbox %x) mailbox access failed for %ps: %d\n", - mbox, __builtin_return_address(0), err); - } - - return err; -} - -int sandybridge_pcode_write_timeout(struct drm_i915_private *i915, - u32 mbox, u32 val, - int fast_timeout_us, - int slow_timeout_ms) -{ - int err; - - mutex_lock(&i915->sb_lock); - err = __sandybridge_pcode_rw(i915, mbox, &val, NULL, - fast_timeout_us, slow_timeout_ms, - false); - mutex_unlock(&i915->sb_lock); - - if (err) { - drm_dbg(&i915->drm, - "warning: pcode (write of 0x%08x to mbox %x) mailbox access failed for %ps: %d\n", - val, mbox, __builtin_return_address(0), err); - } - - return err; -} - -static bool skl_pcode_try_request(struct drm_i915_private *i915, u32 mbox, - u32 request, u32 reply_mask, u32 reply, - u32 *status) -{ - *status = __sandybridge_pcode_rw(i915, mbox, &request, NULL, - 500, 0, - true); - - return *status || ((request & reply_mask) == reply); -} - -/** - * skl_pcode_request - send PCODE request until acknowledgment - * @i915: device private - * @mbox: PCODE mailbox ID the request is targeted for - * @request: request ID - * @reply_mask: mask used to check for request acknowledgment - * @reply: value used to check for request acknowledgment - * @timeout_base_ms: timeout for polling with preemption enabled - * - * Keep resending the @request to @mbox until PCODE acknowledges it, PCODE - * reports an error or an overall timeout of @timeout_base_ms+50 ms expires. - * The request is acknowledged once the PCODE reply dword equals @reply after - * applying @reply_mask. Polling is first attempted with preemption enabled - * for @timeout_base_ms and if this times out for another 50 ms with - * preemption disabled. - * - * Returns 0 on success, %-ETIMEDOUT in case of a timeout, <0 in case of some - * other error as reported by PCODE. - */ -int skl_pcode_request(struct drm_i915_private *i915, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms) -{ - u32 status; - int ret; - - mutex_lock(&i915->sb_lock); - -#define COND \ - skl_pcode_try_request(i915, mbox, request, reply_mask, reply, &status) - - /* - * Prime the PCODE by doing a request first. Normally it guarantees - * that a subsequent request, at most @timeout_base_ms later, succeeds. 
- * _wait_for() doesn't guarantee when its passed condition is evaluated - * first, so send the first request explicitly. - */ - if (COND) { - ret = 0; - goto out; - } - ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10); - if (!ret) - goto out; - - /* - * The above can time out if the number of requests was low (2 in the - * worst case) _and_ PCODE was busy for some reason even after a - * (queued) request and @timeout_base_ms delay. As a workaround retry - * the poll with preemption disabled to maximize the number of - * requests. Increase the timeout from @timeout_base_ms to 50ms to - * account for interrupts that could reduce the number of these - * requests, and for any quirks of the PCODE firmware that delays - * the request completion. - */ - drm_dbg_kms(&i915->drm, - "PCODE timeout, retrying with preemption disabled\n"); - drm_WARN_ON_ONCE(&i915->drm, timeout_base_ms > 3); - preempt_disable(); - ret = wait_for_atomic(COND, 50); - preempt_enable(); - -out: - mutex_unlock(&i915->sb_lock); - return ret ? ret : status; -#undef COND -} - -int intel_pcode_init(struct drm_i915_private *i915) -{ - int ret = 0; - - if (!IS_DGFX(i915)) - return ret; - - ret = skl_pcode_request(i915, DG1_PCODE_STATUS, - DG1_UNCORE_GET_INIT_STATUS, - DG1_UNCORE_INIT_STATUS_COMPLETE, - DG1_UNCORE_INIT_STATUS_COMPLETE, 180000); - - drm_dbg(&i915->drm, "PCODE init status %d\n", ret); - - if (ret) - drm_err(&i915->drm, "Pcode did not report uncore initialization completion!\n"); - - return ret; -} diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h index 545c8f277c46..4f4c2e15e736 100644 --- a/drivers/gpu/drm/i915/intel_wakeref.h +++ b/drivers/gpu/drm/i915/intel_wakeref.h @@ -123,6 +123,12 @@ enum { __INTEL_WAKEREF_PUT_LAST_BIT__ }; +static inline void +intel_wakeref_might_get(struct intel_wakeref *wf) +{ + might_lock(&wf->mutex); +} + /** * intel_wakeref_put_flags: Release the wakeref * @wf: the wakeref @@ -170,6 +176,12 @@ intel_wakeref_put_delay(struct intel_wakeref *wf, unsigned long delay) FIELD_PREP(INTEL_WAKEREF_PUT_DELAY, delay)); } +static inline void +intel_wakeref_might_put(struct intel_wakeref *wf) +{ + might_lock(&wf->mutex); +} + /** * intel_wakeref_lock: Lock the wakeref (mutex) * @wf: the wakeref diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index 3cf6758931f9..bdd290f2bf3c 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -48,5 +48,6 @@ selftest(ring_submission, intel_ring_submission_live_selftests) selftest(perf, i915_perf_live_selftests) selftest(slpc, intel_slpc_live_selftests) selftest(guc, intel_guc_live_selftests) +selftest(guc_multi_lrc, intel_guc_multi_lrc_live_selftests) /* Here be dragons: keep last to run last! 
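The intel_wakeref_might_get()/intel_wakeref_might_put() helpers added above exist purely to feed lockdep, annotating paths that may end up taking wf->mutex even when the fast path avoids it. A small sketch of such a call site is below; example_maybe_park() and its condition are invented for illustration.

/* Illustrative lockdep annotation at a conditional wakeref release. */
static void example_maybe_park(struct intel_wakeref *wf, bool idle)
{
	/* Tell lockdep up front that wf->mutex may be taken below. */
	intel_wakeref_might_put(wf);

	if (idle)
		intel_wakeref_put(wf);
}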
*/ selftest(late_gt_pm, intel_gt_pm_late_selftests) diff --git a/drivers/gpu/drm/i915/selftests/mock_region.c b/drivers/gpu/drm/i915/selftests/mock_region.c index efa86dffe3c6..75793008c4ef 100644 --- a/drivers/gpu/drm/i915/selftests/mock_region.c +++ b/drivers/gpu/drm/i915/selftests/mock_region.c @@ -6,8 +6,6 @@ #include <drm/ttm/ttm_placement.h> #include <linux/scatterlist.h> -#include <drm/ttm/ttm_placement.h> - #include "gem/i915_gem_region.h" #include "intel_memory_region.h" #include "intel_region_ttm.h" diff --git a/drivers/gpu/drm/i915/vlv_sideband.c b/drivers/gpu/drm/i915/vlv_sideband.c new file mode 100644 index 000000000000..35380738a951 --- /dev/null +++ b/drivers/gpu/drm/i915/vlv_sideband.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2013-2021 Intel Corporation + */ + +#include <asm/iosf_mbi.h> + +#include "i915_drv.h" +#include "vlv_sideband.h" + +/* + * IOSF sideband, see VLV2_SidebandMsg_HAS.docx and + * VLV_VLV2_PUNIT_HAS_0.8.docx + */ + +/* Standard MMIO read, non-posted */ +#define SB_MRD_NP 0x00 +/* Standard MMIO write, non-posted */ +#define SB_MWR_NP 0x01 +/* Private register read, double-word addressing, non-posted */ +#define SB_CRRDDA_NP 0x06 +/* Private register write, double-word addressing, non-posted */ +#define SB_CRWRDA_NP 0x07 + +static void ping(void *info) +{ +} + +static void __vlv_punit_get(struct drm_i915_private *i915) +{ + iosf_mbi_punit_acquire(); + + /* + * Prevent the cpu from sleeping while we use this sideband, otherwise + * the punit may cause a machine hang. The issue appears to be isolated + * with changing the power state of the CPU package while changing + * the power state via the punit, and we have only observed it + * reliably on 4-core Baytail systems suggesting the issue is in the + * power delivery mechanism and likely to be board/function + * specific. Hence we presume the workaround needs only be applied + * to the Valleyview P-unit and not all sideband communications. + */ + if (IS_VALLEYVIEW(i915)) { + cpu_latency_qos_update_request(&i915->sb_qos, 0); + on_each_cpu(ping, NULL, 1); + } +} + +static void __vlv_punit_put(struct drm_i915_private *i915) +{ + if (IS_VALLEYVIEW(i915)) + cpu_latency_qos_update_request(&i915->sb_qos, + PM_QOS_DEFAULT_VALUE); + + iosf_mbi_punit_release(); +} + +void vlv_iosf_sb_get(struct drm_i915_private *i915, unsigned long ports) +{ + if (ports & BIT(VLV_IOSF_SB_PUNIT)) + __vlv_punit_get(i915); + + mutex_lock(&i915->sb_lock); +} + +void vlv_iosf_sb_put(struct drm_i915_private *i915, unsigned long ports) +{ + mutex_unlock(&i915->sb_lock); + + if (ports & BIT(VLV_IOSF_SB_PUNIT)) + __vlv_punit_put(i915); +} + +static int vlv_sideband_rw(struct drm_i915_private *i915, + u32 devfn, u32 port, u32 opcode, + u32 addr, u32 *val) +{ + struct intel_uncore *uncore = &i915->uncore; + const bool is_read = (opcode == SB_MRD_NP || opcode == SB_CRRDDA_NP); + int err; + + lockdep_assert_held(&i915->sb_lock); + if (port == IOSF_PORT_PUNIT) + iosf_mbi_assert_punit_acquired(); + + /* Flush the previous comms, just in case it failed last time. */ + if (intel_wait_for_register(uncore, + VLV_IOSF_DOORBELL_REQ, IOSF_SB_BUSY, 0, + 5)) { + drm_dbg(&i915->drm, "IOSF sideband idle wait (%s) timed out\n", + is_read ? "read" : "write"); + return -EAGAIN; + } + + preempt_disable(); + + intel_uncore_write_fw(uncore, VLV_IOSF_ADDR, addr); + intel_uncore_write_fw(uncore, VLV_IOSF_DATA, is_read ? 
0 : *val); + intel_uncore_write_fw(uncore, VLV_IOSF_DOORBELL_REQ, + (devfn << IOSF_DEVFN_SHIFT) | + (opcode << IOSF_OPCODE_SHIFT) | + (port << IOSF_PORT_SHIFT) | + (0xf << IOSF_BYTE_ENABLES_SHIFT) | + (0 << IOSF_BAR_SHIFT) | + IOSF_SB_BUSY); + + if (__intel_wait_for_register_fw(uncore, + VLV_IOSF_DOORBELL_REQ, IOSF_SB_BUSY, 0, + 10000, 0, NULL) == 0) { + if (is_read) + *val = intel_uncore_read_fw(uncore, VLV_IOSF_DATA); + err = 0; + } else { + drm_dbg(&i915->drm, "IOSF sideband finish wait (%s) timed out\n", + is_read ? "read" : "write"); + err = -ETIMEDOUT; + } + + preempt_enable(); + + return err; +} + +u32 vlv_punit_read(struct drm_i915_private *i915, u32 addr) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_PUNIT, + SB_CRRDDA_NP, addr, &val); + + return val; +} + +int vlv_punit_write(struct drm_i915_private *i915, u32 addr, u32 val) +{ + return vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_PUNIT, + SB_CRWRDA_NP, addr, &val); +} + +u32 vlv_bunit_read(struct drm_i915_private *i915, u32 reg) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_BUNIT, + SB_CRRDDA_NP, reg, &val); + + return val; +} + +void vlv_bunit_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_BUNIT, + SB_CRWRDA_NP, reg, &val); +} + +u32 vlv_nc_read(struct drm_i915_private *i915, u8 addr) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_NC, + SB_CRRDDA_NP, addr, &val); + + return val; +} + +u32 vlv_iosf_sb_read(struct drm_i915_private *i915, u8 port, u32 reg) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), port, + SB_CRRDDA_NP, reg, &val); + + return val; +} + +void vlv_iosf_sb_write(struct drm_i915_private *i915, + u8 port, u32 reg, u32 val) +{ + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), port, + SB_CRWRDA_NP, reg, &val); +} + +u32 vlv_cck_read(struct drm_i915_private *i915, u32 reg) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCK, + SB_CRRDDA_NP, reg, &val); + + return val; +} + +void vlv_cck_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCK, + SB_CRWRDA_NP, reg, &val); +} + +u32 vlv_ccu_read(struct drm_i915_private *i915, u32 reg) +{ + u32 val = 0; + + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCU, + SB_CRRDDA_NP, reg, &val); + + return val; +} + +void vlv_ccu_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ + vlv_sideband_rw(i915, PCI_DEVFN(0, 0), IOSF_PORT_CCU, + SB_CRWRDA_NP, reg, &val); +} + +static u32 vlv_dpio_phy_iosf_port(struct drm_i915_private *i915, enum dpio_phy phy) +{ + /* + * IOSF_PORT_DPIO: VLV x2 PHY (DP/HDMI B and C), CHV x1 PHY (DP/HDMI D) + * IOSF_PORT_DPIO_2: CHV x2 PHY (DP/HDMI B and C) + */ + if (IS_CHERRYVIEW(i915)) + return phy == DPIO_PHY0 ? IOSF_PORT_DPIO_2 : IOSF_PORT_DPIO; + else + return IOSF_PORT_DPIO; +} + +u32 vlv_dpio_read(struct drm_i915_private *i915, enum pipe pipe, int reg) +{ + u32 port = vlv_dpio_phy_iosf_port(i915, DPIO_PHY(pipe)); + u32 val = 0; + + vlv_sideband_rw(i915, DPIO_DEVFN, port, SB_MRD_NP, reg, &val); + + /* + * FIXME: There might be some registers where all 1's is a valid value, + * so ideally we should check the register offset instead... 
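The punit accessors re-homed into vlv_sideband.c keep their old calling convention: acquire the sideband (which also applies the Valleyview CPU wakeup workaround), perform the access under sb_lock, then release. A hedged sketch of a caller follows; the register address is a placeholder, not a real punit register.

/* Illustrative punit read through the relocated helpers. */
static u32 example_punit_read(struct drm_i915_private *i915, u32 example_addr)
{
	u32 val;

	vlv_punit_get(i915);	/* sb_lock + punit power-state workaround */
	val = vlv_punit_read(i915, example_addr);
	vlv_punit_put(i915);

	return val;
}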
+ */ + drm_WARN(&i915->drm, val == 0xffffffff, + "DPIO read pipe %c reg 0x%x == 0x%x\n", + pipe_name(pipe), reg, val); + + return val; +} + +void vlv_dpio_write(struct drm_i915_private *i915, + enum pipe pipe, int reg, u32 val) +{ + u32 port = vlv_dpio_phy_iosf_port(i915, DPIO_PHY(pipe)); + + vlv_sideband_rw(i915, DPIO_DEVFN, port, SB_MWR_NP, reg, &val); +} + +u32 vlv_flisdsi_read(struct drm_i915_private *i915, u32 reg) +{ + u32 val = 0; + + vlv_sideband_rw(i915, DPIO_DEVFN, IOSF_PORT_FLISDSI, SB_CRRDDA_NP, + reg, &val); + return val; +} + +void vlv_flisdsi_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ + vlv_sideband_rw(i915, DPIO_DEVFN, IOSF_PORT_FLISDSI, SB_CRWRDA_NP, + reg, &val); +} diff --git a/drivers/gpu/drm/i915/intel_sideband.h b/drivers/gpu/drm/i915/vlv_sideband.h index d1d14bcb8f56..d7732f612e7f 100644 --- a/drivers/gpu/drm/i915/intel_sideband.h +++ b/drivers/gpu/drm/i915/vlv_sideband.h @@ -1,18 +1,16 @@ /* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2013-2021 Intel Corporation + */ -#ifndef _INTEL_SIDEBAND_H_ -#define _INTEL_SIDEBAND_H_ +#ifndef _VLV_SIDEBAND_H_ +#define _VLV_SIDEBAND_H_ #include <linux/bitops.h> #include <linux/types.h> -struct drm_i915_private; enum pipe; - -enum intel_sbi_destination { - SBI_ICLK, - SBI_MPHY, -}; +struct drm_i915_private; enum { VLV_IOSF_SB_BUNIT, @@ -122,22 +120,4 @@ static inline void vlv_punit_put(struct drm_i915_private *i915) vlv_iosf_sb_put(i915, BIT(VLV_IOSF_SB_PUNIT)); } -u32 intel_sbi_read(struct drm_i915_private *i915, u16 reg, - enum intel_sbi_destination destination); -void intel_sbi_write(struct drm_i915_private *i915, u16 reg, u32 value, - enum intel_sbi_destination destination); - -int sandybridge_pcode_read(struct drm_i915_private *i915, u32 mbox, - u32 *val, u32 *val1); -int sandybridge_pcode_write_timeout(struct drm_i915_private *i915, u32 mbox, - u32 val, int fast_timeout_us, - int slow_timeout_ms); -#define sandybridge_pcode_write(i915, mbox, val) \ - sandybridge_pcode_write_timeout(i915, mbox, val, 500, 0) - -int skl_pcode_request(struct drm_i915_private *i915, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms); - -int intel_pcode_init(struct drm_i915_private *i915); - -#endif /* _INTEL_SIDEBAND_H */ +#endif /* _VLV_SIDEBAND_H_ */ diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index aa2a7eccfb94..914ebd9290e5 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1522,6 +1522,12 @@ struct drm_i915_gem_caching { #define I915_TILING_NONE 0 #define I915_TILING_X 1 #define I915_TILING_Y 2 +/* + * Do not add new tiling types here. The I915_TILING_* values are for + * de-tiling fence registers that no longer exist on modern platforms. Although + * the hardware may support new types of tiling in general (e.g., Tile4), we + * do not need to add them to the uapi that is specific to now-defunct ioctls. + */ #define I915_TILING_LAST I915_TILING_Y #define I915_BIT_6_SWIZZLE_NONE 0 @@ -1824,6 +1830,7 @@ struct drm_i915_gem_context_param { * Extensions: * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) + * i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT) */ #define I915_CONTEXT_PARAM_ENGINES 0xa @@ -2099,6 +2106,135 @@ struct i915_context_engines_bond { } __attribute__((packed)) name__ /** + * struct i915_context_engines_parallel_submit - Configure engine for + * parallel submission. 
+ * + * Setup a slot in the context engine map to allow multiple BBs to be submitted + * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU + * in parallel. Multiple hardware contexts are created internally in the i915 to + * run these BBs. Once a slot is configured for N BBs only N BBs can be + * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user + * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how + * many BBs there are based on the slot's configuration. The N BBs are the last + * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set. + * + * The default placement behavior is to create implicit bonds between each + * context if each context maps to more than 1 physical engine (e.g. context is + * a virtual engine). Also we only allow contexts of same engine class and these + * contexts must be in logically contiguous order. Examples of the placement + * behavior are described below. Lastly, the default is to not allow BBs to be + * preempted mid-batch. Rather insert coordinated preemption points on all + * hardware contexts between each set of BBs. Flags could be added in the future + * to change both of these default behaviors. + * + * Returns -EINVAL if hardware context placement configuration is invalid or if + * the placement configuration isn't supported on the platform / submission + * interface. + * Returns -ENODEV if extension isn't supported on the platform / submission + * interface. + * + * .. code-block:: none + * + * Examples syntax: + * CS[X] = generic engine of same class, logical instance X + * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE + * + * Example 1 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=1, + * engines=CS[0],CS[1]) + * + * Results in the following valid placement: + * CS[0], CS[1] + * + * Example 2 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=2, + * engines=CS[0],CS[2],CS[1],CS[3]) + * + * Results in the following valid placements: + * CS[0], CS[1] + * CS[2], CS[3] + * + * This can be thought of as two virtual engines, each containing two + * engines thereby making a 2D array. However, there are bonds tying the + * entries together and placing restrictions on how they can be scheduled. + * Specifically, the scheduler can choose only vertical columns from the 2D + * array. That is, CS[0] is bonded to CS[1] and CS[2] to CS[3]. So if the + * scheduler wants to submit to CS[0], it must also choose CS[1] and vice + * versa. Same for CS[2] requires also using CS[3]. + * VE[0] = CS[0], CS[2] + * VE[1] = CS[1], CS[3] + * + * Example 3 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=2, + * engines=CS[0],CS[1],CS[1],CS[3]) + * + * Results in the following valid and invalid placements: + * CS[0], CS[1] + * CS[1], CS[3] - Not logically contiguous, return -EINVAL + */ +struct i915_context_engines_parallel_submit { + /** + * @base: base user extension. 
+ */ + struct i915_user_extension base; + + /** + * @engine_index: slot for parallel engine + */ + __u16 engine_index; + + /** + * @width: number of contexts per parallel engine or in other words the + * number of batches in each submission + */ + __u16 width; + + /** + * @num_siblings: number of siblings per context or in other words the + * number of possible placements for each submission + */ + __u16 num_siblings; + + /** + * @mbz16: reserved for future use; must be zero + */ + __u16 mbz16; + + /** + * @flags: all undefined flags must be zero, currently not defined flags + */ + __u64 flags; + + /** + * @mbz64: reserved for future use; must be zero + */ + __u64 mbz64[3]; + + /** + * @engines: 2-d array of engine instances to configure parallel engine + * + * length = width (i) * num_siblings (j) + * index = j + i * num_siblings + */ + struct i915_engine_class_instance engines[0]; + +} __packed; + +#define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \ + struct i915_user_extension base; \ + __u16 engine_index; \ + __u16 width; \ + __u16 num_siblings; \ + __u16 mbz16; \ + __u64 flags; \ + __u64 mbz64[3]; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + +/** * DOC: Context Engine Map uAPI * * Context engine map is a new way of addressing engines when submitting batch- @@ -2157,6 +2293,7 @@ struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ +#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ struct i915_engine_class_instance engines[0]; } __attribute__((packed)); @@ -2775,14 +2912,20 @@ struct drm_i915_engine_info { /** @flags: Engine flags. */ __u64 flags; +#define I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE (1 << 0) /** @capabilities: Capabilities of this engine. */ __u64 capabilities; #define I915_VIDEO_CLASS_CAPABILITY_HEVC (1 << 0) #define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC (1 << 1) + /** @logical_instance: Logical instance of engine */ + __u16 logical_instance; + /** @rsvd1: Reserved fields. */ - __u64 rsvd1[4]; + __u16 rsvd1[3]; + /** @rsvd2: Reserved fields. */ + __u64 rsvd2[3]; }; /** |
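As a closing illustration of the uAPI added above, the sketch below declares a two-wide parallel engine (one sibling per slot, matching Example 1) and chains it into an engine map via I915_CONTEXT_PARAM_ENGINES. It is userspace-style pseudo-setup, not code from this series: the video-engine instances are placeholders, and the drm_i915_gem_context_create_ext plumbing around it is omitted.

/* Illustrative userspace-style declaration; engine instances are placeholders. */
I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel_ext, 2) = {
	.base.name = I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT,
	.engine_index = 0,		/* configure slot 0 of the engine map */
	.width = 2,			/* two BBs per execbuf */
	.num_siblings = 1,		/* one possible placement per BB */
	.engines = {
		{ .engine_class = I915_ENGINE_CLASS_VIDEO, .engine_instance = 0 },
		{ .engine_class = I915_ENGINE_CLASS_VIDEO, .engine_instance = 1 },
	},
};

I915_DEFINE_CONTEXT_PARAM_ENGINES(engine_map, 1) = {
	.extensions = (__u64)(uintptr_t)&parallel_ext,	/* chain the extension */
	.engines = {
		{ I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE },
	},
};

An execbuf on slot 0 of this map would then supply two batch buffers, the last two (or first two with I915_EXEC_BATCH_FIRST) objects in the object list, as described in the kernel-doc above.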