drm/amdgpu/vcn: Add sriov VCN v4_0 unified queue support
authorJane Jian <Jane.Jian@amd.com>
Mon, 22 Aug 2022 07:20:24 +0000 (15:20 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 1 Sep 2022 19:12:07 +0000 (15:12 -0400)
Enable unified queue support for SR-IOV and abandon all previous
multi-queue settings.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jane Jian <Jane.Jian@amd.com>
Reviewed-by: Ruijing Dong <ruijing.dong@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c

index 06b2d18..9fa2a5c 100644 (file)
@@ -1908,10 +1908,9 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev)
                case IP_VERSION(4, 0, 0):
                case IP_VERSION(4, 0, 2):
                case IP_VERSION(4, 0, 4):
-                       if (!amdgpu_sriov_vf(adev)) {
-                               amdgpu_device_ip_block_add(adev, &vcn_v4_0_ip_block);
+                       amdgpu_device_ip_block_add(adev, &vcn_v4_0_ip_block);
+                       if (!amdgpu_sriov_vf(adev))
                                amdgpu_device_ip_block_add(adev, &jpeg_v4_0_ip_block);
-                       }
                        break;
                default:
                        dev_err(adev->dev,
index 60c6081..80b7a6c 100644 (file)
 #define AMDGPU_VCN_SW_RING_FLAG                (1 << 9)
 #define AMDGPU_VCN_FW_LOGGING_FLAG     (1 << 10)
 #define AMDGPU_VCN_SMU_VERSION_INFO_FLAG (1 << 11)
+#define AMDGPU_VCN_VF_RB_SETUP_FLAG (1 << 12)
 
 #define AMDGPU_VCN_IB_FLAG_DECODE_BUFFER       0x00000001
 #define AMDGPU_VCN_CMD_FLAG_MSG_BUFFER         0x00000001
@@ -317,12 +318,24 @@ struct amdgpu_fw_shared {
        struct amdgpu_fw_shared_smu_interface_info smu_interface_info;
 };
 
+struct amdgpu_fw_shared_rb_setup { /* ring-buffer description embedded in struct amdgpu_vcn4_fw_shared */
+       uint32_t is_rb_enabled_flags; /* vcn_v4_0_start_sriov() ORs RB_ENABLED into this */
+       uint32_t rb_addr_lo; /* lower 32 bits of the unified ring's GPU address */
+       uint32_t rb_addr_hi; /* upper 32 bits of the unified ring's GPU address */
+       uint32_t  rb_size; /* vcn_v4_0_start_sriov() stores ring_size / 4 here */
+       uint32_t  rb4_addr_lo; /* the rb4_* fields are not written by the code in this patch */
+       uint32_t  rb4_addr_hi;
+       uint32_t  rb4_size;
+       uint32_t  reserved[6];
+};
+
 struct amdgpu_vcn4_fw_shared {
        uint32_t present_flag_0; /* bitmask; vcn_v4_0_sw_init() ORs flags in with cpu_to_le32() */
        uint8_t pad[12];
        struct amdgpu_fw_shared_unified_queue_struct sq;
        uint8_t pad1[8];
        struct amdgpu_fw_shared_fw_logging fw_log;
+       struct amdgpu_fw_shared_rb_setup rb_setup; /* SR-IOV VFs set AMDGPU_VCN_VF_RB_SETUP_FLAG in present_flag_0; presumably that marks this member as present — TODO confirm */
 };
 
 struct amdgpu_vcn_fwlog {
index fb2d74f..09c89fa 100644 (file)
@@ -30,6 +30,7 @@
 #include "soc15d.h"
 #include "soc15_hw_ip.h"
 #include "vcn_v2_0.h"
+#include "mmsch_v4_0.h"
 
 #include "vcn/vcn_4_0_0_offset.h"
 #include "vcn/vcn_4_0_0_sh_mask.h"
@@ -45,6 +46,8 @@
 #define VCN_VID_SOC_ADDRESS_2_0                                                        0x1fb00
 #define VCN1_VID_SOC_ADDRESS_3_0                                               0x48300
 
+#define VCN_HARVEST_MMSCH                                                              0
+
 #define RDECODE_MSG_CREATE                                                     0x00000000
 #define RDECODE_MESSAGE_CREATE                                                 0x00000001
 
@@ -53,12 +56,14 @@ static int amdgpu_ih_clientid_vcns[] = {
        SOC15_IH_CLIENTID_VCN1
 };
 
+static int vcn_v4_0_start_sriov(struct amdgpu_device *adev);
 static void vcn_v4_0_set_unified_ring_funcs(struct amdgpu_device *adev);
 static void vcn_v4_0_set_irq_funcs(struct amdgpu_device *adev);
 static int vcn_v4_0_set_powergating_state(void *handle,
         enum amd_powergating_state state);
 static int vcn_v4_0_pause_dpg_mode(struct amdgpu_device *adev,
         int inst_idx, struct dpg_pause_state *new_state);
+static void vcn_v4_0_unified_ring_set_wptr(struct amdgpu_ring *ring);
 
 /**
  * vcn_v4_0_early_init - set function pointers
@@ -71,6 +76,9 @@ static int vcn_v4_0_early_init(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (amdgpu_sriov_vf(adev))
+               adev->vcn.harvest_config = VCN_HARVEST_MMSCH;
+
        /* re-use enc ring as unified ring */
        adev->vcn.num_enc_rings = 1;
 
@@ -92,6 +100,7 @@ static int vcn_v4_0_sw_init(void *handle)
        struct amdgpu_ring *ring;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
        int i, r;
+       int vcn_doorbell_index = 0;
 
        r = amdgpu_vcn_sw_init(adev);
        if (r)
@@ -103,6 +112,12 @@ static int vcn_v4_0_sw_init(void *handle)
        if (r)
                return r;
 
+       if (amdgpu_sriov_vf(adev)) {
+               vcn_doorbell_index = adev->doorbell_index.vcn.vcn_ring0_1 - MMSCH_DOORBELL_OFFSET;
+               /* get DWORD offset */
+               vcn_doorbell_index = vcn_doorbell_index << 1;
+       }
+
        for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
                volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 
@@ -119,7 +134,10 @@ static int vcn_v4_0_sw_init(void *handle)
 
                ring = &adev->vcn.inst[i].ring_enc[0];
                ring->use_doorbell = true;
-               ring->doorbell_index = (adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 2 + 8 * i;
+               if (amdgpu_sriov_vf(adev))
+                       ring->doorbell_index = vcn_doorbell_index + i * (adev->vcn.num_enc_rings + 1) + 1;
+               else
+                       ring->doorbell_index = (adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 2 + 8 * i;
 
                sprintf(ring->name, "vcn_unified_%d", i);
 
@@ -132,10 +150,19 @@ static int vcn_v4_0_sw_init(void *handle)
                fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
                fw_shared->sq.is_enabled = 1;
 
+               if (amdgpu_sriov_vf(adev))
+                       fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_SETUP_FLAG);
+
                if (amdgpu_vcnfw_log)
                        amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
        }
 
+       if (amdgpu_sriov_vf(adev)) {
+               r = amdgpu_virt_alloc_mm_table(adev);
+               if (r)
+                       return r;
+       }
+
        if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
                adev->vcn.pause_dpg_mode = vcn_v4_0_pause_dpg_mode;
 
@@ -169,6 +196,9 @@ static int vcn_v4_0_sw_fini(void *handle)
                drm_dev_exit(idx);
        }
 
+       if (amdgpu_sriov_vf(adev))
+               amdgpu_virt_free_mm_table(adev);
+
        r = amdgpu_vcn_suspend(adev);
        if (r)
                return r;
@@ -191,18 +221,42 @@ static int vcn_v4_0_hw_init(void *handle)
        struct amdgpu_ring *ring;
        int i, r;
 
-       for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
-               if (adev->vcn.harvest_config & (1 << i))
-                       continue;
+       if (amdgpu_sriov_vf(adev)) {
+               r = vcn_v4_0_start_sriov(adev);
+               if (r)
+                       goto done;
 
-               ring = &adev->vcn.inst[i].ring_enc[0];
+               for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+                       if (adev->vcn.harvest_config & (1 << i))
+                               continue;
 
-               adev->nbio.funcs->vcn_doorbell_range(adev, ring->use_doorbell,
-                               ((adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 8 * i), i);
+                       ring = &adev->vcn.inst[i].ring_enc[0];
+                       if (amdgpu_vcn_is_disabled_vcn(adev, VCN_ENCODE_RING, i)) {
+                               ring->sched.ready = false;
+                               ring->no_scheduler = true;
+                               dev_info(adev->dev, "ring %s is disabled by hypervisor\n", ring->name);
+                       } else {
+                               ring->wptr = 0;
+                               ring->wptr_old = 0;
+                               vcn_v4_0_unified_ring_set_wptr(ring);
+                               ring->sched.ready = true;
+                       }
+               }
+       } else {
+               for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+                       if (adev->vcn.harvest_config & (1 << i))
+                               continue;
 
-               r = amdgpu_ring_test_helper(ring);
-               if (r)
-                       goto done;
+                       ring = &adev->vcn.inst[i].ring_enc[0];
+
+                       adev->nbio.funcs->vcn_doorbell_range(adev, ring->use_doorbell,
+                                       ((adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 8 * i), i);
+
+                       r = amdgpu_ring_test_helper(ring);
+                       if (r)
+                               goto done;
+
+               }
        }
 
 done:
@@ -230,12 +284,14 @@ static int vcn_v4_0_hw_fini(void *handle)
        for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
                if (adev->vcn.harvest_config & (1 << i))
                        continue;
-
-               if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
+               if (!amdgpu_sriov_vf(adev)) {
+                       if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
                         (adev->vcn.cur_state != AMD_PG_STATE_GATE &&
                                 RREG32_SOC15(VCN, i, regUVD_STATUS))) {
                         vcn_v4_0_set_powergating_state(adev, AMD_PG_STATE_GATE);
+                       }
                }
+
        }
 
        return 0;
@@ -1107,6 +1163,214 @@ static int vcn_v4_0_start(struct amdgpu_device *adev)
        return 0;
 }
 
+static int vcn_v4_0_start_sriov(struct amdgpu_device *adev)
+{ /* Builds the MMSCH init table in adev->virt.mm_table and asks MMSCH to process it; returns 0, or -EBUSY on mailbox time-out */
+       int i;
+       struct amdgpu_ring *ring_enc;
+       uint64_t cache_addr;
+       uint64_t rb_enc_addr;
+       uint64_t ctx_addr;
+       uint32_t param, resp, expected;
+       uint32_t offset, cache_size;
+       uint32_t tmp, timeout;
+
+       struct amdgpu_mm_table *table = &adev->virt.mm_table;
+       uint32_t *table_loc;
+       uint32_t table_size;
+       uint32_t size, size_dw; /* size_dw is not referenced directly here; presumably used inside the MMSCH_V4_0_INSERT_* macros — TODO confirm */
+       uint32_t init_status;
+       uint32_t enabled_vcn;
+
+       struct mmsch_v4_0_cmd_direct_write
+               direct_wt = { {0} };
+       struct mmsch_v4_0_cmd_direct_read_modify_write
+               direct_rd_mod_wt = { {0} };
+       struct mmsch_v4_0_cmd_end end = { {0} }; /* command templates; presumably consumed by the MMSCH_V4_0_INSERT_* macros — TODO confirm */
+       struct mmsch_v4_0_init_header header; /* not zero-initialised; the fields used are assigned one by one below */
+
+       volatile struct amdgpu_vcn4_fw_shared *fw_shared;
+       volatile struct amdgpu_fw_shared_rb_setup *rb_setup;
+
+       direct_wt.cmd_header.command_type =
+               MMSCH_COMMAND__DIRECT_REG_WRITE;
+       direct_rd_mod_wt.cmd_header.command_type =
+               MMSCH_COMMAND__DIRECT_REG_READ_MODIFY_WRITE;
+       end.cmd_header.command_type =
+               MMSCH_COMMAND__END;
+
+       header.version = MMSCH_VERSION;
+       header.total_size = sizeof(struct mmsch_v4_0_init_header) >> 2; /* header size in dwords */
+       for (i = 0; i < AMDGPU_MAX_VCN_INSTANCES; i++) {
+               header.inst[i].init_status = 0;
+               header.inst[i].table_offset = 0;
+               header.inst[i].table_size = 0;
+       }
+
+       table_loc = (uint32_t *)table->cpu_addr;
+       table_loc += header.total_size; /* leave room for the header, which is copied in after the loop */
+       for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+               if (adev->vcn.harvest_config & (1 << i))
+                       continue;
+
+               table_size = 0; /* per-instance command size; presumably advanced by the INSERT macros — TODO confirm */
+
+               MMSCH_V4_0_INSERT_DIRECT_RD_MOD_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_STATUS),
+                       ~UVD_STATUS__UVD_BUSY, UVD_STATUS__UVD_BUSY);
+
+               cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4);
+
+               if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { /* PSP load: cache window 0 uses the ucode entry's tmr_mc_addr_lo/hi */
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_LMI_VCPU_CACHE_64BIT_BAR_LOW),
+                               adev->firmware.ucode[AMDGPU_UCODE_ID_VCN + i].tmr_mc_addr_lo);
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_LMI_VCPU_CACHE_64BIT_BAR_HIGH),
+                               adev->firmware.ucode[AMDGPU_UCODE_ID_VCN + i].tmr_mc_addr_hi);
+                       offset = 0;
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_VCPU_CACHE_OFFSET0),
+                               0);
+               } else { /* otherwise cache window 0 uses this instance's own buffer (gpu_addr) */
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_LMI_VCPU_CACHE_64BIT_BAR_LOW),
+                               lower_32_bits(adev->vcn.inst[i].gpu_addr));
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_LMI_VCPU_CACHE_64BIT_BAR_HIGH),
+                               upper_32_bits(adev->vcn.inst[i].gpu_addr));
+                       offset = cache_size; /* later windows start after the firmware image */
+                       MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                               regUVD_VCPU_CACHE_OFFSET0),
+                               AMDGPU_UVD_FIRMWARE_OFFSET >> 3);
+               }
+
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_CACHE_SIZE0),
+                       cache_size);
+
+               cache_addr = adev->vcn.inst[i].gpu_addr + offset; /* cache window 1, sized AMDGPU_VCN_STACK_SIZE */
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_CACHE1_64BIT_BAR_LOW),
+                       lower_32_bits(cache_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_CACHE1_64BIT_BAR_HIGH),
+                       upper_32_bits(cache_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_CACHE_OFFSET1),
+                       0);
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_CACHE_SIZE1),
+                       AMDGPU_VCN_STACK_SIZE);
+
+               cache_addr = adev->vcn.inst[i].gpu_addr + offset +
+                       AMDGPU_VCN_STACK_SIZE; /* cache window 2 follows window 1, sized AMDGPU_VCN_CONTEXT_SIZE */
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_CACHE2_64BIT_BAR_LOW),
+                       lower_32_bits(cache_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_CACHE2_64BIT_BAR_HIGH),
+                       upper_32_bits(cache_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_CACHE_OFFSET2),
+                       0);
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_CACHE_SIZE2),
+                       AMDGPU_VCN_CONTEXT_SIZE);
+
+               fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+               rb_setup = &fw_shared->rb_setup;
+
+               ring_enc = &adev->vcn.inst[i].ring_enc[0];
+               ring_enc->wptr = 0;
+               rb_enc_addr = ring_enc->gpu_addr;
+
+               rb_setup->is_rb_enabled_flags |= RB_ENABLED; /* ring address and size are written into fw_shared, not emitted as register writes */
+               rb_setup->rb_addr_lo = lower_32_bits(rb_enc_addr);
+               rb_setup->rb_addr_hi = upper_32_bits(rb_enc_addr);
+               rb_setup->rb_size = ring_enc->ring_size / 4; /* presumably bytes to dwords — TODO confirm */
+               fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_SETUP_FLAG);
+
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_NC0_64BIT_BAR_LOW),
+                       lower_32_bits(adev->vcn.inst[i].fw_shared.gpu_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_LMI_VCPU_NC0_64BIT_BAR_HIGH),
+                       upper_32_bits(adev->vcn.inst[i].fw_shared.gpu_addr));
+               MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i,
+                       regUVD_VCPU_NONCACHE_SIZE0),
+                       AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn4_fw_shared)));
+
+               /* add end packet */
+               MMSCH_V4_0_INSERT_END();
+
+               /* refine header */
+               header.inst[i].init_status = 0;
+               header.inst[i].table_offset = header.total_size; /* running total before this instance's commands are added */
+               header.inst[i].table_size = table_size;
+               header.total_size += table_size;
+       }
+
+       /* Update init table header in memory */
+       size = sizeof(struct mmsch_v4_0_init_header);
+       table_loc = (uint32_t *)table->cpu_addr; /* rewind to the start of the table */
+       memcpy((void *)table_loc, &header, size);
+
+       /* message MMSCH (in VCN[0]) to initialize this client
+        * 1, write to mmsch_vf_ctx_addr_lo/hi register with GPU mc addr
+        * of memory descriptor location
+        */
+       ctx_addr = table->gpu_addr;
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_CTX_ADDR_LO, lower_32_bits(ctx_addr));
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_CTX_ADDR_HI, upper_32_bits(ctx_addr));
+
+       /* 2, update vmid of descriptor */
+       tmp = RREG32_SOC15(VCN, 0, regMMSCH_VF_VMID);
+       tmp &= ~MMSCH_VF_VMID__VF_CTX_VMID_MASK;
+       /* use domain0 for MM scheduler */
+       tmp |= (0 << MMSCH_VF_VMID__VF_CTX_VMID__SHIFT);
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_VMID, tmp);
+
+       /* 3, notify mmsch about the size of this descriptor */
+       size = header.total_size; /* header plus all per-instance commands, same unit as header.total_size */
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_CTX_SIZE, size);
+
+       /* 4, set resp to zero */
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_MAILBOX_RESP, 0);
+
+       /* 5, kick off the initialization and wait until
+        * MMSCH_VF_MAILBOX_RESP becomes non-zero
+        */
+       param = 0x00000001;
+       WREG32_SOC15(VCN, 0, regMMSCH_VF_MAILBOX_HOST, param);
+       tmp = 0; /* tmp is reused as the elapsed-time counter */
+       timeout = 1000;
+       resp = 0;
+       expected = MMSCH_VF_MAILBOX_RESP__OK;
+       while (resp != expected) {
+               resp = RREG32_SOC15(VCN, 0, regMMSCH_VF_MAILBOX_RESP);
+               if (resp != 0) /* any non-zero response ends the wait, not only 'expected' */
+                       break;
+
+               udelay(10);
+               tmp = tmp + 10; /* advances in step with the udelay(10) above */
+               if (tmp >= timeout) {
+                       DRM_ERROR("failed to init MMSCH. TIME-OUT after %d usec"\
+                               " waiting for regMMSCH_VF_MAILBOX_RESP "\
+                               "(expected=0x%08x, readback=0x%08x)\n",
+                               tmp, expected, resp);
+                       return -EBUSY;
+               }
+       }
+       enabled_vcn = amdgpu_vcn_is_disabled_vcn(adev, VCN_DECODE_RING, 0) ? 1 : 0; /* check instance 1's status when instance 0 is reported disabled */
+       init_status = ((struct mmsch_v4_0_init_header *)(table_loc))->inst[enabled_vcn].init_status; /* read back from the shared table, not from the local header copy */
+       if (resp != expected && resp != MMSCH_VF_MAILBOX_RESP__INCOMPLETE
+       && init_status != MMSCH_VF_ENGINE_STATUS__PASS)
+               DRM_ERROR("MMSCH init status is incorrect! readback=0x%08x, header init "\
+                       "status for VCN%x: 0x%x\n", resp, enabled_vcn, init_status); /* logged only; the function still returns 0 below */
+
+       return 0;
+}
+
 /**
  * vcn_v4_0_stop_dpg_mode - VCN stop with dpg mode
  *
@@ -1596,6 +1860,15 @@ static int vcn_v4_0_set_powergating_state(void *handle, enum amd_powergating_sta
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
        int ret;
 
+       /* for SRIOV, guest should not control VCN Power-gating
+        * MMSCH FW should control Power-gating and clock-gating
+        * guest should avoid touching CGC and PG
+        */
+       if (amdgpu_sriov_vf(adev)) {
+               adev->vcn.cur_state = AMD_PG_STATE_UNGATE;
+               return 0;
+       }
+
        if(state == adev->vcn.cur_state)
                return 0;