Diffstat (limited to 'drivers/vfio/pci/mlx5/cmd.c')
-rw-r--r--	drivers/vfio/pci/mlx5/cmd.c	413
1 file changed, 340 insertions, 73 deletions
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c604b70437a5..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
 
 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 {
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+	int err;
 
 	lockdep_assert_held(&mvdev->state_mutex);
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while the device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel, might end-up with a failure in the save
+	 * command once it will try to turn on 'tracking' on a suspended device.
+	 */
+	if (migf) {
+		err = wait_for_completion_interruptible(&migf->save_comp);
+		if (err)
+			return err;
+	}
+
 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
 
-	return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+	if (migf)
+		complete(&migf->save_comp);
+
+	return err;
 }
 
 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
@@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 }
 
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
-					  size_t *state_size)
+					  size_t *state_size, u8 query_flags)
 {
 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+	bool inc = query_flags & MLX5VF_QUERY_INC;
 	int ret;
 
 	lockdep_assert_held(&mvdev->state_mutex);
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel, might end-up with a failure in the
+	 * incremental query command on un-tracked vhca.
+	 */
+	if (inc) {
+		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+		if (ret)
+			return ret;
+		if (mvdev->saving_migf->state ==
+		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+			/*
+			 * In case we had a PRE_COPY error, only query full
+			 * image for final image
+			 */
+			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+				*state_size = 0;
+				complete(&mvdev->saving_migf->save_comp);
+				return 0;
+			}
+			query_flags &= ~MLX5VF_QUERY_INC;
+		}
+	}
+
 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+	MLX5_SET(query_vhca_migration_state_in, in, incremental,
+		 query_flags & MLX5VF_QUERY_INC);
 
 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
 				  out);
+	if (inc)
+		complete(&mvdev->saving_migf->save_comp);
+
 	if (ret)
 		return ret;
 
@@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
 		mvdev->core_device.vdev.log_ops = log_ops;
 
+	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+		mvdev->core_device.vdev.migration_flags |=
+			VFIO_MIGRATION_PRE_COPY;
+
 end:
 	mlx5_vf_put_core_dev(mvdev->mdev);
 }
@@ -210,11 +264,11 @@ err_exec:
 }
 
 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-			struct mlx5_vf_migration_file *migf,
+			struct mlx5_vhca_data_buffer *buf,
 			struct mlx5_vhca_recv_buf *recv_buf,
 			u32 *mkey)
 {
-	size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
+	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
 				recv_buf->npages;
 	int err = 0, inlen;
 	__be64 *mtt;
@@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 		 DIV_ROUND_UP(npages, 2));
 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 
-	if (migf) {
+	if (buf) {
 		struct sg_dma_page_iter dma_iter;
 
-		for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
 	} else {
 		int i;
@@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
-	MLX5_SET64(mkc, mkc, len,
-		   migf ? migf->total_length : (npages * PAGE_SIZE));
+	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
 	kvfree(in);
 	return err;
 }
 
+static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+	struct mlx5_core_dev *mdev = mvdev->mdev;
+	int ret;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	if (buf->dmaed || !buf->allocated_length)
+		return -EINVAL;
+
+	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+	if (ret)
+		return ret;
+
+	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+	if (ret)
+		goto err;
+
+	buf->dmaed = true;
+
+	return 0;
+err:
+	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+	return ret;
+}
+
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5_vf_migration_file *migf = buf->migf;
+	struct sg_page_iter sg_iter;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	WARN_ON(migf->mvdev->mdev_detach);
+
+	if (buf->dmaed) {
+		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
+				  buf->dma_dir, 0);
+	}
+
+	/* Undo alloc_pages_bulk_array() */
+	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+		__free_page(sg_page_iter_page(&sg_iter));
+	sg_free_append_table(&buf->table);
+	kfree(buf);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+			 size_t length,
+			 enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *buf;
+	int ret;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	buf->dma_dir = dma_dir;
+	buf->migf = migf;
+	if (length) {
+		ret = mlx5vf_add_migration_pages(buf,
+				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+		if (ret)
+			goto end;
+
+		if (dma_dir != DMA_NONE) {
+			ret = mlx5vf_dma_data_buffer(buf);
+			if (ret)
+				goto end;
+		}
+	}
+
+	return buf;
+end:
+	mlx5vf_free_data_buffer(buf);
+	return ERR_PTR(ret);
+}
+
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	spin_lock_irq(&buf->migf->list_lock);
+	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+	spin_unlock_irq(&buf->migf->list_lock);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+		       size_t length, enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *buf, *temp_buf;
+	struct list_head free_list;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return ERR_PTR(-ENOTCONN);
+
+	INIT_LIST_HEAD(&free_list);
+
+	spin_lock_irq(&migf->list_lock);
+	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+		if (buf->dma_dir == dma_dir) {
+			list_del_init(&buf->buf_elm);
+			if (buf->allocated_length >= length) {
+				spin_unlock_irq(&migf->list_lock);
+				goto found;
+			}
+			/*
+			 * Prevent holding redundant buffers. Put in a free
+			 * list and call at the end not under the spin lock
+			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
+			 * might sleep.
+			 */
+			list_add(&buf->buf_elm, &free_list);
+		}
+	}
+	spin_unlock_irq(&migf->list_lock);
+	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+found:
+	while ((temp_buf = list_first_entry_or_null(&free_list,
+				struct mlx5_vhca_data_buffer, buf_elm))) {
+		list_del(&temp_buf->buf_elm);
+		mlx5vf_free_data_buffer(temp_buf);
+	}
+
+	return buf;
+}
+
 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
 {
 	struct mlx5vf_async_data *async_data = container_of(_work,
 		struct mlx5vf_async_data, work);
 	struct mlx5_vf_migration_file *migf = container_of(async_data,
 		struct mlx5_vf_migration_file, async_data);
-	struct mlx5_core_dev *mdev = migf->mvdev->mdev;
 
 	mutex_lock(&migf->lock);
 	if (async_data->status) {
-		migf->is_err = true;
+		mlx5vf_put_data_buffer(async_data->buf);
+		if (async_data->header_buf)
+			mlx5vf_put_data_buffer(async_data->header_buf);
+		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+		else
+			migf->state = MLX5_MIGF_STATE_ERROR;
 		wake_up_interruptible(&migf->poll_wait);
 	}
 	mutex_unlock(&migf->lock);
-
-	mlx5_core_destroy_mkey(mdev, async_data->mkey);
-	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
-	mlx5_core_dealloc_pd(mdev, async_data->pdn);
 	kvfree(async_data->out);
+	complete(&migf->save_comp);
 	fput(migf->filp);
 }
 
+static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+			  size_t image_size)
+{
+	struct mlx5_vf_migration_file *migf = header_buf->migf;
+	struct mlx5_vf_migration_header header = {};
+	unsigned long flags;
+	struct page *page;
+	u8 *to_buff;
+
+	header.image_size = cpu_to_le64(image_size);
+	page = mlx5vf_get_migration_page(header_buf, 0);
+	if (!page)
+		return -EINVAL;
+	to_buff = kmap_local_page(page);
+	memcpy(to_buff, &header, sizeof(header));
+	kunmap_local(to_buff);
+	header_buf->length = sizeof(header);
+	header_buf->header_image_size = image_size;
+	header_buf->start_pos = header_buf->migf->max_pos;
+	migf->max_pos += header_buf->length;
+	spin_lock_irqsave(&migf->list_lock, flags);
+	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+	spin_unlock_irqrestore(&migf->list_lock, flags);
+	return 0;
+}
+
 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 {
 	struct mlx5vf_async_data *async_data = container_of(context,
@@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 			struct mlx5_vf_migration_file, async_data);
 
 	if (!status) {
-		WRITE_ONCE(migf->total_length,
-			   MLX5_GET(save_vhca_state_out, async_data->out,
-				    actual_image_size));
+		size_t image_size;
+		unsigned long flags;
+
+		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+				      actual_image_size);
+		if (async_data->header_buf) {
+			status = add_buf_header(async_data->header_buf, image_size);
+			if (status)
+				goto err;
+		}
+		async_data->buf->length = image_size;
+		async_data->buf->start_pos = migf->max_pos;
+		migf->max_pos += async_data->buf->length;
+		spin_lock_irqsave(&migf->list_lock, flags);
+		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+		spin_unlock_irqrestore(&migf->list_lock, flags);
+		migf->state = async_data->last_chunk ?
+			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
 		wake_up_interruptible(&migf->poll_wait);
 	}
 
+err:
 	/*
 	 * The error and the cleanup flows can't run from an
 	 * interrupt context
 	 */
+	if (status == -EREMOTEIO)
+		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
 	async_data->status = status;
 	queue_work(migf->mvdev->cb_wq, &async_data->work);
 }
 
 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
-			       struct mlx5_vf_migration_file *migf)
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf, bool inc,
+			       bool track)
 {
 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+	struct mlx5_vhca_data_buffer *header_buf = NULL;
 	struct mlx5vf_async_data *async_data;
-	struct mlx5_core_dev *mdev;
-	u32 pdn, mkey;
 	int err;
 
 	lockdep_assert_held(&mvdev->state_mutex);
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
-	mdev = mvdev->mdev;
-	err = mlx5_core_alloc_pd(mdev, &pdn);
+	err = wait_for_completion_interruptible(&migf->save_comp);
 	if (err)
 		return err;
 
-	err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
-			      0);
-	if (err)
-		goto err_dma_map;
-
-	err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
-	if (err)
-		goto err_create_mkey;
+	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+		/*
+		 * In case we had a PRE_COPY error, SAVE is triggered only for
+		 * the final image, read device full image.
		 */
+		inc = false;
 
 	MLX5_SET(save_vhca_state_in, in, opcode,
 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
-	MLX5_SET(save_vhca_state_in, in, mkey, mkey);
-	MLX5_SET(save_vhca_state_in, in, size, migf->total_length);
+	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+	MLX5_SET(save_vhca_state_in, in, incremental, inc);
+	MLX5_SET(save_vhca_state_in, in, set_track, track);
 
 	async_data = &migf->async_data;
+	async_data->buf = buf;
+	async_data->last_chunk = !track;
 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
 	if (!async_data->out) {
 		err = -ENOMEM;
 		goto err_out;
 	}
 
-	/* no data exists till the callback comes back */
-	migf->total_length = 0;
+	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+		header_buf = mlx5vf_get_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(header_buf)) {
+			err = PTR_ERR(header_buf);
+			goto err_free;
+		}
+	}
+
+	if (async_data->last_chunk)
+		migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+	async_data->header_buf = header_buf;
 	get_file(migf->filp);
-	async_data->mkey = mkey;
-	async_data->pdn = pdn;
 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
 			       async_data->out,
 			       out_size, mlx5vf_save_callback,
@@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	return 0;
 
 err_exec:
+	if (header_buf)
+		mlx5vf_put_data_buffer(header_buf);
 	fput(migf->filp);
+err_free:
 	kvfree(async_data->out);
 err_out:
-	mlx5_core_destroy_mkey(mdev, mkey);
-err_create_mkey:
-	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
-err_dma_map:
-	mlx5_core_dealloc_pd(mdev, pdn);
+	complete(&migf->save_comp);
 	return err;
 }
 
 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
-			       struct mlx5_vf_migration_file *migf)
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf)
 {
-	struct mlx5_core_dev *mdev;
-	u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
-	u32 pdn, mkey;
+	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
 	int err;
 
 	lockdep_assert_held(&mvdev->state_mutex);
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
-	mutex_lock(&migf->lock);
-	if (!migf->total_length) {
-		err = -EINVAL;
-		goto end;
+	if (!buf->dmaed) {
+		err = mlx5vf_dma_data_buffer(buf);
+		if (err)
+			return err;
 	}
 
-	mdev = mvdev->mdev;
-	err = mlx5_core_alloc_pd(mdev, &pdn);
-	if (err)
-		goto end;
-
-	err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
-	if (err)
-		goto err_reg;
-
-	err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
-	if (err)
-		goto err_mkey;
-
 	MLX5_SET(load_vhca_state_in, in, opcode,
 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
-	MLX5_SET(load_vhca_state_in, in, mkey, mkey);
-	MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
+	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+	MLX5_SET(load_vhca_state_in, in, size, buf->length);
+	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
 
-	err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+	int err;
 
-	mlx5_core_destroy_mkey(mdev, mkey);
-err_mkey:
-	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
-err_reg:
-	mlx5_core_dealloc_pd(mdev, pdn);
-end:
-	mutex_unlock(&migf->lock);
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
 	return err;
 }
 
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return;
+
+	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
+}
+
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+	struct mlx5_vhca_data_buffer *entry;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	WARN_ON(migf->mvdev->mdev_detach);
+
+	if (migf->buf) {
+		mlx5vf_free_data_buffer(migf->buf);
+		migf->buf = NULL;
+	}
+
+	if (migf->buf_header) {
+		mlx5vf_free_data_buffer(migf->buf_header);
+		migf->buf_header = NULL;
+	}
+
+	list_splice(&migf->avail_list, &migf->buf_list);
+
+	while ((entry = list_first_entry_or_null(&migf->buf_list,
+				struct mlx5_vhca_data_buffer, buf_elm))) {
+		list_del(&entry->buf_elm);
+		mlx5vf_free_data_buffer(entry);
+	}
+
+	mlx5vf_cmd_dealloc_pd(migf);
+}
+
 static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
 			   u32 req_nodes)
 {
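
Reviewer notes (illustrative sketches, not part of the patch)

The save_comp handling above serializes SUSPEND/QUERY/SAVE against a single
in-flight asynchronous SAVE command: each path consumes the completion
before issuing its device command and re-completes it when done. The sketch
below is a minimal userspace C model of that pattern, assuming a completion
that starts in the "completed" state; the names (save_comp_*, suspend_vhca)
are invented for illustration, and the model ignores interruptibility,
which wait_for_completion_interruptible() provides in the kernel.

#include <pthread.h>
#include <stdio.h>

/* A completion that starts "completed" behaves like a binary semaphore. */
struct save_comp {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;	/* 1 = no save command in flight */
};

static void save_comp_init(struct save_comp *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 1;	/* like complete() called once at file creation */
}

/* Models wait_for_completion(): block until completed, then consume. */
static void save_comp_wait(struct save_comp *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	c->done = 0;
	pthread_mutex_unlock(&c->lock);
}

/* Models complete(). */
static void save_comp_complete(struct save_comp *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Shape of mlx5vf_cmd_suspend_vhca() under this model (invented name). */
static int suspend_vhca(struct save_comp *c)
{
	int err = 0;

	save_comp_wait(c);	/* wait out any active SAVE */
	/* ... issue the SUSPEND_VHCA device command here ... */
	save_comp_complete(c);	/* allow the next command to run */
	return err;
}

int main(void)
{
	struct save_comp c;

	save_comp_init(&c);
	printf("suspend: %d\n", suspend_vhca(&c));
	return 0;
}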
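
mlx5vf_get_data_buffer()/mlx5vf_put_data_buffer() recycle buffers through
migf->avail_list: a parked buffer is reused only when its DMA direction
matches and its allocation is large enough, and undersized candidates are
freed outside the spinlock because freeing may sleep. A single-threaded
userspace model of that reuse policy, with invented names and no locking:

#include <stdio.h>
#include <stdlib.h>

enum dir { DIR_NONE, DIR_FROM_DEV, DIR_TO_DEV };

struct buf {
	struct buf *next;
	enum dir dir;
	size_t allocated_length;
	void *data;
};

static struct buf *avail_list;	/* models migf->avail_list */

static struct buf *buf_alloc(size_t len, enum dir dir)
{
	struct buf *b = calloc(1, sizeof(*b));

	if (!b)
		return NULL;
	b->data = malloc(len);
	if (!b->data) {
		free(b);
		return NULL;
	}
	b->allocated_length = len;
	b->dir = dir;
	return b;
}

static void buf_free(struct buf *b)
{
	free(b->data);
	free(b);
}

/* Models mlx5vf_put_data_buffer(): park the buffer for reuse. */
static void put_buf(struct buf *b)
{
	b->next = avail_list;
	avail_list = b;
}

/*
 * Models mlx5vf_get_data_buffer(): reuse a matching parked buffer that is
 * large enough, discard undersized matches so the pool does not accumulate
 * useless entries, and fall back to a fresh allocation.
 */
static struct buf *get_buf(size_t len, enum dir dir)
{
	struct buf **p = &avail_list;

	while (*p) {
		struct buf *b = *p;

		if (b->dir == dir) {
			*p = b->next;		/* unlink */
			if (b->allocated_length >= len)
				return b;	/* big enough: reuse */
			buf_free(b);		/* too small: discard */
			continue;
		}
		p = &b->next;
	}
	return buf_alloc(len, dir);
}

int main(void)
{
	struct buf *a, *b;

	a = get_buf(4096, DIR_FROM_DEV);	/* fresh allocation */
	put_buf(a);				/* park it */
	b = get_buf(1024, DIR_FROM_DEV);	/* reuses the 4K buffer */
	printf("reused: %s\n", a == b ? "yes" : "no");
	buf_free(b);
	return 0;
}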
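
With PRE_COPY supported, add_buf_header() prepends each saved image with a
header whose image_size field is written via cpu_to_le64(); the full
struct mlx5_vf_migration_header lives in cmd.h and is not shown in this
diff. Assuming, for the sketch only, a header of just that one
little-endian u64, a reader of the migration stream could walk the chunks
as follows (the parse helper and sample data are invented):

#include <stdint.h>
#include <stdio.h>

#define HDR_LEN 8	/* assumed: one little-endian u64, image_size */

static uint64_t le64_to_host(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

/* Walk a stream of [header][image] chunks and report each image size. */
static void walk_stream(const unsigned char *p, size_t len)
{
	while (len >= HDR_LEN) {
		uint64_t image_size = le64_to_host(p);
		size_t chunk = HDR_LEN + image_size;

		if (len < chunk)
			break;	/* truncated final chunk */
		printf("chunk: %llu byte image\n",
		       (unsigned long long)image_size);
		p += chunk;
		len -= chunk;
	}
}

int main(void)
{
	/* One chunk: header says 4 bytes, followed by 4 bytes of image. */
	unsigned char stream[HDR_LEN + 4] = { 4 };

	walk_stream(stream, sizeof(stream));
	return 0;
}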