vfio/mlx5: Introduce vfio precopy ioctl implementation
author Yishai Hadas <yishaih@nvidia.com>
Tue, 6 Dec 2022 08:34:34 +0000 (10:34 +0200)
committer Alex Williamson <alex.williamson@redhat.com>
Tue, 6 Dec 2022 19:36:44 +0000 (12:36 -0700)
The vfio precopy ioctl returns an estimate of the data available for
transfer from the device.

Whenever the user issues VFIO_MIG_GET_PRECOPY_INFO, track the current
state of the device and, if needed, append the dirty data to the data
already exposed on the transfer FD. This is done by saving an
intermediate state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active SAVE command. Running both
in parallel might end up with a failure of the incremental query command
on an untracked vhca.

Also, an intermediate state is saved only after the previous state has
completed its SAVE command and has been fully transferred; this prevents
an endless use of resources.
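
For reference, a minimal userspace sketch of querying this estimate on the
saving data FD (an illustration only, not part of this patch; it assumes the
data FD returned when entering VFIO_DEVICE_STATE_PRE_COPY and elides error
details):

  #include <linux/vfio.h>
  #include <string.h>
  #include <sys/ioctl.h>

  /* Illustrative helper, not part of this patch: ask the saving data FD
   * how much pre-copy data is still pending.
   */
  static int query_precopy(int data_fd, __u64 *initial, __u64 *dirty)
  {
          struct vfio_precopy_info info;

          memset(&info, 0, sizeof(info));
          info.argsz = sizeof(info);

          if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
                  return -1;

          *initial = info.initial_bytes;  /* initial data left to read */
          *dirty = info.dirty_bytes;      /* estimate of dirty data to follow */
          return 0;
  }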

Co-developed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
drivers/vfio/pci/mlx5/cmd.c
drivers/vfio/pci/mlx5/main.c

index 160fa38..12e74ec 100644 (file)
@@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 {
        u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
        u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+       bool inc = query_flags & MLX5VF_QUERY_INC;
        int ret;
 
        lockdep_assert_held(&mvdev->state_mutex);
        if (mvdev->mdev_detach)
                return -ENOTCONN;
 
+       /*
+        * In case PRE_COPY is used, saving_migf is exposed while the device
+        * is running. Make sure the incremental query runs only once there is
+        * no active SAVE command. Running both in parallel might end up with
+        * a failure of the incremental query command on an untracked vhca.
+        */
+       if (inc) {
+               ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+               if (ret)
+                       return ret;
+       }
+
        MLX5_SET(query_vhca_migration_state_in, in, opcode,
                 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
        MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
@@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 
        ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
                                  out);
+       if (inc)
+               complete(&mvdev->saving_migf->save_comp);
+
        if (ret)
                return ret;
 
index 9a36e36..2c8ac76 100644 (file)
@@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
        wake_up_interruptible(&migf->poll_wait);
 }
 
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+                                unsigned long arg)
+{
+       struct mlx5_vf_migration_file *migf = filp->private_data;
+       struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+       struct mlx5_vhca_data_buffer *buf;
+       struct vfio_precopy_info info = {};
+       loff_t *pos = &filp->f_pos;
+       unsigned long minsz;
+       size_t inc_length = 0;
+       bool end_of_data;
+       int ret;
+
+       if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+               return -ENOTTY;
+
+       minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+       if (copy_from_user(&info, (void __user *)arg, minsz))
+               return -EFAULT;
+
+       if (info.argsz < minsz)
+               return -EINVAL;
+
+       mutex_lock(&mvdev->state_mutex);
+       if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+           mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+               ret = -EINVAL;
+               goto err_state_unlock;
+       }
+
+       /*
+        * We can't issue a SAVE command when the device is suspended, so
+        * while in VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to
+        * query for extra bytes that can't be read anyway.
+        */
+       if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+               /*
+                * Once the query returns, it is guaranteed that there is no
+                * active SAVE command.
+                * Hence, the code below is safe with the proper locks held.
+                */
+               ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+                                                           MLX5VF_QUERY_INC);
+               if (ret)
+                       goto err_state_unlock;
+       }
+
+       mutex_lock(&migf->lock);
+       if (migf->state == MLX5_MIGF_STATE_ERROR) {
+               ret = -ENODEV;
+               goto err_migf_unlock;
+       }
+
+       buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+       if (buf) {
+               if (buf->start_pos == 0) {
+                       info.initial_bytes = buf->header_image_size - *pos;
+               } else if (buf->start_pos ==
+                               sizeof(struct mlx5_vf_migration_header)) {
+                       /* First data buffer following the header */
+                       info.initial_bytes = buf->start_pos +
+                                               buf->length - *pos;
+               } else {
+                       info.dirty_bytes = buf->start_pos + buf->length - *pos;
+               }
+       } else {
+               if (!end_of_data) {
+                       ret = -EINVAL;
+                       goto err_migf_unlock;
+               }
+
+               info.dirty_bytes = inc_length;
+       }
+
+       if (!end_of_data || !inc_length) {
+               mutex_unlock(&migf->lock);
+               goto done;
+       }
+
+       mutex_unlock(&migf->lock);
+       /*
+        * We finished transferring the current state and the device has a
+        * dirty state; save a new state so it is ready for transfer.
+        */
+       buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+       if (IS_ERR(buf)) {
+               ret = PTR_ERR(buf);
+               mlx5vf_mark_err(migf);
+               goto err_state_unlock;
+       }
+
+       ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+       if (ret) {
+               mlx5vf_mark_err(migf);
+               mlx5vf_put_data_buffer(buf);
+               goto err_state_unlock;
+       }
+
+done:
+       mlx5vf_state_mutex_unlock(mvdev);
+       return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+err_migf_unlock:
+       mutex_unlock(&migf->lock);
+err_state_unlock:
+       mlx5vf_state_mutex_unlock(mvdev);
+       return ret;
+}
+
 static const struct file_operations mlx5vf_save_fops = {
        .owner = THIS_MODULE,
        .read = mlx5vf_save_read,
        .poll = mlx5vf_save_poll,
+       .unlocked_ioctl = mlx5vf_precopy_ioctl,
+       .compat_ioctl = compat_ptr_ioctl,
        .release = mlx5vf_release_file,
        .llseek = no_llseek,
 };
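
Since the ioctl is wired into mlx5vf_save_fops, userspace issues it on the
same saving data FD that it already poll()s and read()s. A rough sketch of
how a migration manager might drive that FD during PRE_COPY (illustrative
only; the threshold, buffer size and the later switch to STOP_COPY are
assumptions outside this patch):

  #include <linux/vfio.h>
  #include <poll.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  /* Illustrative only: read pre-copy data until the device reports little
   * pending dirty state; the caller would then move to STOP_COPY.
   */
  static int precopy_drain(int data_fd, __u64 dirty_threshold)
  {
          char buf[65536];

          for (;;) {
                  struct vfio_precopy_info info = { .argsz = sizeof(info) };
                  struct pollfd pfd = { .fd = data_fd, .events = POLLIN };
                  ssize_t n;

                  if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
                          return -1;
                  if (!info.initial_bytes && info.dirty_bytes <= dirty_threshold)
                          return 0;       /* small enough, go to STOP_COPY */

                  if (poll(&pfd, 1, -1) < 0)
                          return -1;
                  n = read(data_fd, buf, sizeof(buf));
                  if (n < 0)
                          return -1;
                  /* ... forward the 'n' bytes read to the destination ... */
          }
  }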