habanalabs: add device memory scrub ability through debugfs
author Dafna Hirschfeld <dhirschfeld@habana.ai>
Mon, 11 Apr 2022 14:11:23 +0000 (17:11 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 22 May 2022 19:01:20 +0000 (21:01 +0200)
Add the ability to scrub the device memory with a given value.
Add a debugfs file 'memory_scrub_val' to set the value
and a debugfs file 'memory_scrub' to scrub the dram.

This is very important to help during automated tests, when you want
the CI system to randomize the memory before training certain
DL topologies.

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_drv.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c

index 84bf3da2bb275c6316e8bf58128da6499b314e80..0f8d20fe343f32edf60549c89d2424e727f1282f 100644 (file)
@@ -170,6 +170,20 @@ KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Sets the state of the third S/W led on the device
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/memory_scrub
+Date:           May 2022
+KernelVersion:  5.19
+Contact:        dhirschfeld@habana.ai
+Description:    Allows the root user to scrub the dram memory. The scrubbing
+                value can be set using the debugfs file memory_scrub_val.
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/memory_scrub_val
+Date:           May 2022
+KernelVersion:  5.19
+Contact:        dhirschfeld@habana.ai
+Description:    The value to which the dram will be set when the user
+                scrubs the dram using the 'memory_scrub' debugfs file
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/mmu
 Date:           Jan 2019
 KernelVersion:  5.1
index a9c4f2d4139da2250b142e2f79804c66b31075e3..c6744bfc6da425f3a96e3567edb6e80c8eebf5bb 100644 (file)
@@ -538,6 +538,39 @@ static int engines_show(struct seq_file *s, void *data)
        return 0;
 }
 
+static ssize_t hl_memory_scrub(struct file *f, const char __user *buf,
+                                       size_t count, loff_t *ppos)
+{
+       struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+       struct hl_device *hdev = entry->hdev;
+       u64 val = entry->memory_scrub_val;
+       int rc;
+
+       if (!hl_device_operational(hdev, NULL)) {
+               dev_warn_ratelimited(hdev->dev, "Can't scrub memory, device is not operational\n");
+               return -EIO;
+       }
+
+       mutex_lock(&hdev->fpriv_list_lock);
+       if (hdev->is_compute_ctx_active) {
+               mutex_unlock(&hdev->fpriv_list_lock);
+               dev_err(hdev->dev, "can't scrub dram, context exist\n");
+               return -EBUSY;
+       }
+       hdev->is_in_dram_scrub = true;
+       mutex_unlock(&hdev->fpriv_list_lock);
+
+       rc = hdev->asic_funcs->scrub_device_dram(hdev, val);
+
+       mutex_lock(&hdev->fpriv_list_lock);
+       hdev->is_in_dram_scrub = false;
+       mutex_unlock(&hdev->fpriv_list_lock);
+
+       if (rc)
+               return rc;
+       return count;
+}
+
 static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 {
        struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -1316,6 +1349,11 @@ static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf,
        return count;
 }
 
+static const struct file_operations hl_mem_scrub_fops = {
+       .owner = THIS_MODULE,
+       .write = hl_memory_scrub,
+};
+
 static const struct file_operations hl_data32b_fops = {
        .owner = THIS_MODULE,
        .read = hl_data_read32,
@@ -1475,6 +1513,17 @@ void hl_debugfs_add_device(struct hl_device *hdev)
        dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
                                                hl_debug_root);
 
+       debugfs_create_x64("memory_scrub_val",
+                               0644,
+                               dev_entry->root,
+                               &dev_entry->memory_scrub_val);
+
+       debugfs_create_file("memory_scrub",
+                               0200,
+                               dev_entry->root,
+                               dev_entry,
+                               &hl_mem_scrub_fops);
+
        debugfs_create_x64("addr",
                                0644,
                                dev_entry->root,
index 496d61ee07c5138572b16cd72400b81815a62fe4..59150caa98a2db33bf0ebb6b927d7b3a92f743a5 100644 (file)
@@ -1246,6 +1246,7 @@ struct fw_load_mgr {
  *                           its implementation is not trivial when the driver
  *                           is loaded in simulation mode (not upstreamed).
  * @scrub_device_mem: Scrub device memory given an address and size
+ * @scrub_device_dram: Scrub the dram memory of the device.
  * @get_int_queue_base: get the internal queue base address.
  * @test_queues: run simple test on all queues for sanity check.
  * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
@@ -1357,6 +1358,7 @@ struct hl_asic_funcs {
        void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
                                        void *cpu_addr, dma_addr_t dma_handle);
        int (*scrub_device_mem)(struct hl_device *hdev, u64 addr, u64 size);
+       int (*scrub_device_dram)(struct hl_device *hdev, u64 val);
        void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
                                dma_addr_t *dma_handle, u16 *queue_len);
        int (*test_queues)(struct hl_device *hdev);
@@ -2011,6 +2013,7 @@ struct hl_debugfs_entry {
  * @addr: next address to read/write from/to in read/write32.
  * @mmu_addr: next virtual address to translate to physical address in mmu_show.
  * @userptr_lookup: the target user ptr to look up for on demand.
+ * @memory_scrub_val: the value to which the dram will be scrubbed using cb scrub_device_dram
  * @mmu_asid: ASID to use while translating in mmu_show.
  * @state_dump_head: index of the latest state dump
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
@@ -2041,6 +2044,7 @@ struct hl_dbg_device_entry {
        u64                             addr;
        u64                             mmu_addr;
        u64                             userptr_lookup;
+       u64                             memory_scrub_val;
        u32                             mmu_asid;
        u32                             state_dump_head;
        u8                              i2c_bus;
@@ -2704,6 +2708,7 @@ struct hl_reset_info {
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
  *                    addresses.
+ * @is_in_dram_scrub: true if dram scrub operation is ongoing.
  * @disabled: is device disabled.
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
@@ -2834,6 +2839,7 @@ struct hl_device {
        u16                             id;
        u16                             id_control;
        u16                             cpu_pci_msb_addr;
+       u8                              is_in_dram_scrub;
        u8                              disabled;
        u8                              late_init_done;
        u8                              hwmon_initialized;
index 70203433e6cd5416c75290ad9a18e472fa353e58..1210de39d661167cc71fa61112135ff1b8b3e2f0 100644 (file)
@@ -158,6 +158,14 @@ int hl_device_open(struct inode *inode, struct file *filp)
                goto out_err;
        }
 
+       if (hdev->is_in_dram_scrub) {
+               dev_dbg_ratelimited(hdev->dev,
+                       "Can't open %s during dram scrub\n",
+                       dev_name(hdev->dev));
+               rc = -EAGAIN;
+               goto out_err;
+       }
+
        if (hdev->compute_ctx_in_release) {
                dev_dbg_ratelimited(hdev->dev,
                        "Can't open %s because another user is still releasing it\n",
index 8e9bdbac512ed983548873d91f4bdd3e36b73ae3..08cd60300b4f023b74ecdd60f3c6edd6a1e889da 100644 (file)
@@ -4740,12 +4740,11 @@ static void gaudi_dma_free_coherent(struct hl_device *hdev, size_t size,
        dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
 }
 
-static int gaudi_hbm_scrubbing(struct hl_device *hdev)
+static int gaudi_scrub_device_dram(struct hl_device *hdev, u64 val)
 {
        struct asic_fixed_properties *prop = &hdev->asic_prop;
        u64  cur_addr = DRAM_BASE_ADDR_USER;
-       u32 val;
-       u32 chunk_size;
+       u32 chunk_size, busy;
        int rc, dma_id;
 
        while (cur_addr < prop->dram_end_address) {
@@ -4759,8 +4758,10 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
                                "Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
                                cur_addr, cur_addr + chunk_size);
 
-                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf);
-                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf);
+                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset,
+                                       lower_32_bits(val));
+                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset,
+                                       upper_32_bits(val));
                        WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
                                                lower_32_bits(cur_addr));
                        WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,
@@ -4783,8 +4784,8 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
                        rc = hl_poll_timeout(
                                hdev,
                                mmDMA0_CORE_STS0 + dma_offset,
-                               val,
-                               ((val & DMA0_CORE_STS0_BUSY_MASK) == 0),
+                               busy,
+                               ((busy & DMA0_CORE_STS0_BUSY_MASK) == 0),
                                1000,
                                HBM_SCRUBBING_TIMEOUT_US);
 
@@ -4838,7 +4839,7 @@ static int gaudi_scrub_device_mem(struct hl_device *hdev, u64 addr, u64 size)
                }
 
                /* Scrub HBM using all DMA channels in parallel */
-               rc = gaudi_hbm_scrubbing(hdev);
+               rc = gaudi_scrub_device_dram(hdev, 0xdeadbeaf);
                if (rc)
                        dev_err(hdev->dev,
                                "Failed to clear HBM in mem scrub all\n");
@@ -9208,6 +9209,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
        .asic_dma_free_coherent = gaudi_dma_free_coherent,
        .scrub_device_mem = gaudi_scrub_device_mem,
+       .scrub_device_dram = gaudi_scrub_device_dram,
        .get_int_queue_base = gaudi_get_int_queue_base,
        .test_queues = gaudi_test_queues,
        .asic_dma_pool_zalloc = gaudi_dma_pool_zalloc,
index f8fb6dc042694cf77f257aeaeac1a9b1b3c1832f..f2d4362f6a46ee68be52115e1c5093f6f003de6c 100644 (file)
@@ -5434,6 +5434,11 @@ static int goya_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32
        return 0;
 }
 
+static int goya_scrub_device_dram(struct hl_device *hdev, u64 val)
+{
+       return -EOPNOTSUPP;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
        .early_init = goya_early_init,
        .early_fini = goya_early_fini,
@@ -5452,6 +5457,7 @@ static const struct hl_asic_funcs goya_funcs = {
        .asic_dma_alloc_coherent = goya_dma_alloc_coherent,
        .asic_dma_free_coherent = goya_dma_free_coherent,
        .scrub_device_mem = goya_scrub_device_mem,
+       .scrub_device_dram = goya_scrub_device_dram,
        .get_int_queue_base = goya_get_int_queue_base,
        .test_queues = goya_test_queues,
        .asic_dma_pool_zalloc = goya_dma_pool_zalloc,