From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: [PATCH] vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. In case the queue is empty but the state of the migration file is MLX5_MIGF_STATE_COMPLETE, read() may not be blocked but will return 0 to indicate end of file. This is a step towards maintaining multiple images and their meta data (i.e. headers) on the migration file as part of next patches from the series. Note: At that point, we still use a single chunk on the migration file but becomes ready to support multiple. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++-- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 38 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba123..0e36b4c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e6..6e59468 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int status; void *out; }; @@ -45,6 +48,8 @@ struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c..ca16425 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- 2.7.4