Currently the maximum number of queues is statically configured.
Using a static value causes redundant cycles when traversing
all queues and consumes more memory than actually needed.
In this patch we configure each ASIC with the exact number of
queues it needs.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
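
For readers skimming the diff, here is a minimal userspace sketch of the
allocation pattern this patch applies: size hw_queues_props per ASIC and
iterate max_queues entries instead of a fixed HL_MAX_QUEUES. The types are
simplified stand-ins (calloc/free instead of kcalloc/kfree), not the actual
habanalabs definitions.

#include <stdio.h>
#include <stdlib.h>

enum queue_type { QUEUE_TYPE_NA, QUEUE_TYPE_EXT, QUEUE_TYPE_INT };

struct hw_queue_properties {
	enum queue_type type;
};

struct asic_fixed_properties {
	struct hw_queue_properties *hw_queues_props; /* was [HL_MAX_QUEUES] */
	unsigned int max_queues;                     /* exact per-ASIC count */
};

/* Loosely mirrors goya/gaudi_get_fixed_properties(): allocate exactly what
 * the ASIC needs and remember the count for later queue traversals.
 */
static int get_fixed_properties(struct asic_fixed_properties *prop,
				unsigned int nr_queues)
{
	unsigned int i;

	prop->max_queues = nr_queues;
	prop->hw_queues_props = calloc(prop->max_queues,
				       sizeof(*prop->hw_queues_props));
	if (!prop->hw_queues_props)
		return -1;

	for (i = 0 ; i < prop->max_queues ; i++)
		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;

	return 0;
}

int main(void)
{
	struct asic_fixed_properties prop;

	if (get_fixed_properties(&prop, 8))
		return 1;

	/* Queue traversals now stop at max_queues, not HL_MAX_QUEUES. */
	printf("configured %u queues\n", prop.max_queues);

	/* Mirrors the kfree() added in *_early_fini() and the error paths. */
	free(prop.hw_queues_props);
	return 0;
}
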
cs_counters_aggregate(hdev, cs->ctx);
+ kfree(cs->jobs_in_queue_cnt);
kfree(cs);
}
other = ctx->cs_pending[cs_cmpl->cs_seq &
(hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
- spin_unlock(&ctx->cs_lock);
dev_dbg(hdev->dev,
"Rejecting CS because of too many in-flights CS\n");
rc = -EAGAIN;
goto free_fence;
}
+ cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
+ sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
+ if (!cs->jobs_in_queue_cnt) {
+ rc = -ENOMEM;
+ goto free_fence;
+ }
+
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
ctx->asid, ctx->cs_sequence);
return 0;
free_fence:
+ spin_unlock(&ctx->cs_lock);
kfree(cs_cmpl);
free_cs:
kfree(cs);
struct asic_fixed_properties *asic = &hdev->asic_prop;
struct hw_queue_properties *hw_queue_prop;
+ /* This must be checked here to prevent out-of-bounds access to
+ * hw_queues_props array
+ */
+ if (chunk->queue_index >= asic->max_queues) {
+ dev_err(hdev->dev, "Queue index %d is invalid\n",
+ chunk->queue_index);
+ return -EINVAL;
+ }
+
hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
- if ((chunk->queue_index >= HL_MAX_QUEUES) ||
- (hw_queue_prop->type == QUEUE_TYPE_NA)) {
+ if (hw_queue_prop->type == QUEUE_TYPE_NA) {
dev_err(hdev->dev, "Queue index %d is invalid\n",
chunk->queue_index);
return -EINVAL;
hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
q_type = hw_queue_prop->type;
- if ((q_idx >= HL_MAX_QUEUES) ||
+ if ((q_idx >= hdev->asic_prop.max_queues) ||
(!hw_queue_prop->supports_sync_stream)) {
dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
rc = -EINVAL;
struct asic_fixed_properties *prop = &hdev->asic_prop;
int i;
- if (GAUDI_QUEUE_ID_SIZE >= HL_MAX_QUEUES) {
- dev_err(hdev->dev,
- "Number of H/W queues must be smaller than %d\n",
- HL_MAX_QUEUES);
- return -EFAULT;
- }
+ prop->max_queues = GAUDI_QUEUE_ID_SIZE;
+ prop->hw_queues_props = kcalloc(prop->max_queues,
+ sizeof(struct hw_queue_properties),
+ GFP_KERNEL);
- for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) {
+ if (!prop->hw_queues_props)
+ return -ENOMEM;
+
+ for (i = 0 ; i < prop->max_queues ; i++) {
if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
}
}
- for (; i < HL_MAX_QUEUES; i++)
- prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
-
prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
prop->sync_stream_first_sob = 0;
prop->sync_stream_first_mon = 0;
(unsigned long long) pci_resource_len(pdev,
SRAM_BAR_ID),
SRAM_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
if (pci_resource_len(pdev, CFG_BAR_ID) != CFG_BAR_SIZE) {
(unsigned long long) pci_resource_len(pdev,
CFG_BAR_ID),
CFG_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
rc = hl_pci_init(hdev);
if (rc)
- return rc;
+ goto free_queue_props;
return 0;
+
+free_queue_props:
+ kfree(hdev->asic_prop.hw_queues_props);
+ return rc;
}
static int gaudi_early_fini(struct hl_device *hdev)
{
+ kfree(hdev->asic_prop.hw_queues_props);
hl_pci_fini(hdev);
return 0;
{
int i, rc, ret_val = 0;
- for (i = 0 ; i < HL_MAX_QUEUES ; i++) {
+ for (i = 0 ; i < hdev->asic_prop.max_queues ; i++) {
if (hdev->asic_prop.hw_queues_props[i].type == QUEUE_TYPE_EXT) {
rc = gaudi_test_queue(hdev, i);
if (rc)
static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
static void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
-void goya_get_fixed_properties(struct hl_device *hdev)
+int goya_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int i;
+ prop->max_queues = GOYA_QUEUE_ID_SIZE;
+ prop->hw_queues_props = kcalloc(prop->max_queues,
+ sizeof(struct hw_queue_properties),
+ GFP_KERNEL);
+
+ if (!prop->hw_queues_props)
+ return -ENOMEM;
+
for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 0;
}
- for (; i < HL_MAX_QUEUES; i++)
- prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
-
prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
prop->dram_base_address = DRAM_PHYS_BASE;
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GOYA_MAX_PENDING_CS;
+
+ return 0;
}
/*
u32 val;
int rc;
- goya_get_fixed_properties(hdev);
+ rc = goya_get_fixed_properties(hdev);
+ if (rc) {
+ dev_err(hdev->dev, "Failed to get fixed properties\n");
+ return rc;
+ }
/* Check BAR sizes */
if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) {
(unsigned long long) pci_resource_len(pdev,
SRAM_CFG_BAR_ID),
CFG_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) {
(unsigned long long) pci_resource_len(pdev,
MSIX_BAR_ID),
MSIX_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
rc = hl_pci_init(hdev);
if (rc)
- return rc;
+ goto free_queue_props;
if (!hdev->pldm) {
val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
}
return 0;
+
+free_queue_props:
+ kfree(hdev->asic_prop.hw_queues_props);
+ return rc;
}
/*
*/
static int goya_early_fini(struct hl_device *hdev)
{
+ kfree(hdev->asic_prop.hw_queues_props);
hl_pci_fini(hdev);
return 0;
*/
#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + 1)
-#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES)
-#error "Number of H/W queues must be smaller than HL_MAX_QUEUES"
-#endif
-
#if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES)
#error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES"
#endif
u8 device_cpu_mmu_mappings_done;
};
-void goya_get_fixed_properties(struct hl_device *hdev);
+int goya_get_fixed_properties(struct hl_device *hdev);
int goya_mmu_init(struct hl_device *hdev);
void goya_init_dma_qmans(struct hl_device *hdev);
void goya_init_mme_qmans(struct hl_device *hdev);
#define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */
-#define HL_MAX_QUEUES 128
-
#define HL_IDLE_BUSY_TS_ARR_SIZE 4096
/* Memory */
* @high_pll: high PLL frequency used by the device.
* @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool.
- * @tpc_enabled_mask: which TPCs are enabled.
+ * @max_pending_cs: maximum number of concurrent pending command submissions
+ * @max_queues: maximum number of queues in the system
* @sync_stream_first_sob: first sync object available for sync stream use
* @sync_stream_first_mon: first monitor available for sync stream use
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
*/
struct asic_fixed_properties {
- struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES];
+ struct hw_queue_properties *hw_queues_props;
struct armcp_info armcp_info;
char uboot_ver[VERSION_MAX_LEN];
char preboot_ver[VERSION_MAX_LEN];
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
u32 max_pending_cs;
+ u32 max_queues;
u16 sync_stream_first_sob;
u16 sync_stream_first_mon;
u8 tpc_enabled_mask;
* @aborted: true if CS was aborted due to some device error.
*/
struct hl_cs {
- u16 jobs_in_queue_cnt[HL_MAX_QUEUES];
+ u16 *jobs_in_queue_cnt;
struct hl_ctx *ctx;
struct list_head job_list;
spinlock_t job_lock;
goto out;
q = &hdev->kernel_queues[0];
- for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+ for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
if (q->queue_type == QUEUE_TYPE_INT) {
q->ci += cs->jobs_in_queue_cnt[i];
q->ci &= ((q->int_queue_len << 1) - 1);
struct hl_device *hdev = ctx->hdev;
struct hl_cs_job *job, *tmp;
struct hl_hw_queue *q;
+ u32 max_queues;
int rc = 0, i, cq_cnt;
hdev->asic_funcs->hw_queues_lock(hdev);
goto out;
}
+ max_queues = hdev->asic_prop.max_queues;
+
q = &hdev->kernel_queues[0];
- for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+ for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) {
if (cs->jobs_in_queue_cnt[i]) {
switch (q->queue_type) {
case QUEUE_TYPE_EXT:
unroll_cq_resv:
q = &hdev->kernel_queues[0];
- for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
+ for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
if ((q->queue_type == QUEUE_TYPE_EXT ||
q->queue_type == QUEUE_TYPE_HW) &&
cs->jobs_in_queue_cnt[i]) {
struct hl_hw_queue *q;
int i, rc, q_ready_cnt;
- hdev->kernel_queues = kcalloc(HL_MAX_QUEUES,
+ hdev->kernel_queues = kcalloc(asic->max_queues,
sizeof(*hdev->kernel_queues), GFP_KERNEL);
if (!hdev->kernel_queues) {
/* Initialize the H/W queues */
for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
- i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
+ i < asic->max_queues ; i++, q_ready_cnt++, q++) {
q->queue_type = asic->hw_queues_props[i].type;
q->supports_sync_stream =
void hl_hw_queues_destroy(struct hl_device *hdev)
{
struct hl_hw_queue *q;
+ u32 max_queues = hdev->asic_prop.max_queues;
int i;
- for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
+ for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++)
queue_fini(hdev, q);
kfree(hdev->kernel_queues);
void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
{
struct hl_hw_queue *q;
+ u32 max_queues = hdev->asic_prop.max_queues;
int i;
- for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) {
+ for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) {
if ((!q->valid) ||
((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
continue;