From c16d45f42b64e91895f4bc1cf19febeb5e0c52b6 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 2 Jun 2020 12:28:27 +0300 Subject: [PATCH] habanalabs: Use pending CS amount per ASIC Training schemes require many more concurrent command submissions than inference does. In addition, training command submissions can be completed in a non-serialized manner. Hence, we add support in which each ASIC will be able to configure the amount of concurrent pending command submissions, rather than using a predefined amount. This change will enhance performance by allowing the user to add more concurrent work without waiting for the previous work to be completed. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 6 ++++-- drivers/misc/habanalabs/context.c | 14 +++++++++++--- drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++ drivers/misc/habanalabs/gaudi/gaudiP.h | 6 ++++++ drivers/misc/habanalabs/goya/goya.c | 2 ++ drivers/misc/habanalabs/goya/goyaP.h | 6 ++++++ drivers/misc/habanalabs/habanalabs.h | 9 +++++---- drivers/misc/habanalabs/hw_queue.c | 2 +- 8 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index b0f62cb..e99c1d1 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -418,7 +418,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, spin_lock(&ctx->cs_lock); cs_cmpl->cs_seq = ctx->cs_sequence; - other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)]; + other = ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)]; if ((other) && (!dma_fence_is_signaled(other))) { spin_unlock(&ctx->cs_lock); dev_dbg(hdev->dev, @@ -432,7 +433,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->sequence = cs_cmpl->cs_seq; - ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 
1)] = + ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)] = &cs_cmpl->base_fence; ctx->cs_sequence++; diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index ec92b35..1b96fef 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx) * to this function unless the ref count is 0 */ - for (i = 0 ; i < HL_MAX_PENDING_CS ; i++) + for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++) dma_fence_put(ctx->cs_pending[i]); + kfree(ctx->cs_pending); + if (ctx->asid != HL_KERNEL_ASID_ID) { /* The engines are stopped as there is no executing CS, but the * Coresight might be still working by accessing addresses @@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) spin_lock_init(&ctx->cs_lock); atomic_set(&ctx->thread_ctx_switch_token, 1); ctx->thread_ctx_switch_wait_token = 0; + ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, + sizeof(struct dma_fence *), + GFP_KERNEL); + if (!ctx->cs_pending) + return -ENOMEM; if (is_kernel_ctx) { ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */ @@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx) struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) { + struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; struct dma_fence *fence; spin_lock(&ctx->cs_lock); @@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) return ERR_PTR(-EINVAL); } - if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) { + if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) { spin_unlock(&ctx->cs_lock); return NULL; } fence = dma_fence_get( - ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]); + ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]); spin_unlock(&ctx->cs_lock); return fence; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 3ee104b..9d9cbcd 
100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + prop->max_pending_cs = GAUDI_MAX_PENDING_CS; + return 0; } diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 41a8d9b..63baef1 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -57,6 +57,12 @@ #define GAUDI_DEFAULT_CARD_NAME "HL2000" +#define GAUDI_MAX_PENDING_CS 1024 + +#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS) +#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + #define PCI_DMA_NUMBER_OF_CHNLS 3 #define HBM_DMA_NUMBER_OF_CHNLS 5 #define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \ diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index a4a20e2..6dccaec 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + + prop->max_pending_cs = GOYA_MAX_PENDING_CS; } /* diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index d36f8d9..9d8a176 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -57,6 +57,12 @@ #define GOYA_DEFAULT_CARD_NAME "HL1000" +#define GOYA_MAX_PENDING_CS 64 + +#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS) +#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 1ecdcf8..64d9b2d 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ 
-42,9 +42,6 @@ #define HL_MAX_QUEUES 128 -/* MUST BE POWER OF 2 and larger than 1 */ -#define HL_MAX_PENDING_CS 64 - #define HL_IDLE_BUSY_TS_ARR_SIZE 4096 /* Memory */ @@ -61,6 +58,9 @@ #define HL_MAX_SOB_VAL (1 << 15) +#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0)) +#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1)) + /** * struct pgt_info - MMU hop page info. * @node: hash linked-list node for the pgts shadow hash of pgts. @@ -285,6 +285,7 @@ struct asic_fixed_properties { u32 high_pll; u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; + u32 max_pending_cs; u8 tpc_enabled_mask; u8 completion_queues_count; }; @@ -782,7 +783,7 @@ struct hl_ctx { struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; - struct dma_fence *cs_pending[HL_MAX_PENDING_CS]; + struct dma_fence **cs_pending; struct hl_va_range *host_va_range; struct hl_va_range *host_huge_va_range; struct hl_va_range *dram_va_range; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index f4434b3..29b96d2 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) * write address offset in the SM block (QMAN LBW message). * The write address offset is calculated as "COMP_OFFSET << 2". */ - offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1); + offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1); ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) | ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK); -- 2.7.4