gpu: host1x: Add support for syncpoint waits in CDMA pushbuffer
author Mikko Perttunen <mperttunen@nvidia.com>
Thu, 10 Jun 2021 11:04:45 +0000 (14:04 +0300)
committer Thierry Reding <treding@nvidia.com>
Tue, 10 Aug 2021 12:41:19 +0000 (14:41 +0200)
Add support for inserting syncpoint waits in the CDMA pushbuffer.
These waits need to be done in HOST1X class, while gathers submitted
by the application execute in engine class.

Support is added by converting the job's gather list into a command
list that can include both gathers and waits. When the job is
submitted, these commands are pushed as the appropriate opcodes
on the CDMA pushbuffer.

Also supported are waits relative to the start of the job,
which are useful for jobs doing multiple things with an engine
that doesn't natively support pipelining.

While at it, use 32-bit waits on chips that support them.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
drivers/gpu/host1x/hw/channel_hw.c
drivers/gpu/host1x/hw/debug_hw.c
drivers/gpu/host1x/hw/hw_host1x02_uclass.h
drivers/gpu/host1x/hw/hw_host1x04_uclass.h
drivers/gpu/host1x/hw/hw_host1x05_uclass.h
drivers/gpu/host1x/hw/hw_host1x06_uclass.h
drivers/gpu/host1x/hw/hw_host1x07_uclass.h
drivers/gpu/host1x/job.c
drivers/gpu/host1x/job.h
include/linux/host1x.h

index bf21512e5078f8a85939dba1f7b24bb8cf505b19..1999780a7203a649b20ee33e3cbc7d8cb2b78f76 100644 (file)
@@ -47,39 +47,84 @@ static void trace_write_gather(struct host1x_cdma *cdma, struct host1x_bo *bo,
        }
 }
 
-static void submit_gathers(struct host1x_job *job)
+static void submit_wait(struct host1x_cdma *cdma, u32 id, u32 threshold,
+                       u32 next_class)
+{      /* Push a wait on syncpoint 'id' for 'threshold' in HOST1X class, then select 'next_class'. */
+#if HOST1X_HW >= 2 /* this branch uses the *_32 payload/wait register pair */
+       host1x_cdma_push_wide(cdma,     /* one four-word push: setclass, threshold, id, setclass */
+               host1x_opcode_setclass(
+                       HOST1X_CLASS_HOST1X,
+                       HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32,
+                       /* WAIT_SYNCPT_32 (0x50) is at LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2, hence mask bits 0 and 2 */
+                       BIT(0) | BIT(2)
+               ),
+               threshold,      /* presumably lands in LOAD_SYNCPT_PAYLOAD_32 (mask bit 0) -- confirm */
+               id,             /* presumably lands in WAIT_SYNCPT_32 (mask bit 2) -- confirm */
+               host1x_opcode_setclass(next_class, 0, 0)        /* leave HOST1X class for the caller-supplied class */
+       );
+#else /* HOST1X_HW < 2 */
+       /* TODO: add waitchk, use waitbases, or apply some other mitigation */
+       host1x_cdma_push(cdma,
+               host1x_opcode_setclass(
+                       HOST1X_CLASS_HOST1X,
+                       host1x_uclass_wait_syncpt_r(),
+                       BIT(0)
+               ),
+               host1x_class_host_wait_syncpt(id, threshold)    /* id and threshold go out as a single data word */
+       );
+       host1x_cdma_push(cdma,
+               host1x_opcode_setclass(next_class, 0, 0),       /* leave HOST1X class for the caller-supplied class */
+               HOST1X_OPCODE_NOP       /* fills the second word of this two-word push */
+       );
+#endif
+}
+
+static void submit_gathers(struct host1x_job *job, u32 job_syncpt_base) /* job_syncpt_base: value that relative wait thresholds are added to */
 {
        struct host1x_cdma *cdma = &job->channel->cdma;
 #if HOST1X_HW < 6
        struct device *dev = job->channel->dev;
 #endif
        unsigned int i;
+       u32 threshold;  /* final threshold handed to submit_wait() */
 
-       for (i = 0; i < job->num_gathers; i++) {
-               struct host1x_job_gather *g = &job->gathers[i];
-               dma_addr_t addr = g->base + g->offset;
-               u32 op2, op3;
+       for (i = 0; i < job->num_cmds; i++) {   /* cmds[] holds both gathers and waits, pushed in order */
+               struct host1x_job_cmd *cmd = &job->cmds[i];
 
-               op2 = lower_32_bits(addr);
-               op3 = upper_32_bits(addr);
+               if (cmd->is_wait) {
+                       if (cmd->wait.relative) /* threshold is an offset from job_syncpt_base */
+                               threshold = job_syncpt_base + cmd->wait.threshold;
+                       else    /* threshold is used as given */
+                               threshold = cmd->wait.threshold;
 
-               trace_write_gather(cdma, g->bo, g->offset, g->words);
+                       submit_wait(cdma, cmd->wait.id, threshold, cmd->wait.next_class);
+               } else {        /* gather command */
+                       struct host1x_job_gather *g = &cmd->gather;
+
+                       dma_addr_t addr = g->base + g->offset;
+                       u32 op2, op3;
+
+                       op2 = lower_32_bits(addr);
+                       op3 = upper_32_bits(addr);
 
-               if (op3 != 0) {
+                       trace_write_gather(cdma, g->bo, g->offset, g->words);
+
+                       if (op3 != 0) { /* address has bits above 32: needs the wide gather opcode */
 #if HOST1X_HW >= 6
-                       u32 op1 = host1x_opcode_gather_wide(g->words);
-                       u32 op4 = HOST1X_OPCODE_NOP;
+                               u32 op1 = host1x_opcode_gather_wide(g->words);
+                               u32 op4 = HOST1X_OPCODE_NOP;    /* fills the fourth word of the wide push */
 
-                       host1x_cdma_push_wide(cdma, op1, op2, op3, op4);
+                               host1x_cdma_push_wide(cdma, op1, op2, op3, op4);
 #else
-                       dev_err(dev, "invalid gather for push buffer %pad\n",
-                               &addr);
-                       continue;
+                               dev_err(dev, "invalid gather for push buffer %pad\n",
+                                       &addr);
+                               continue;       /* gather is logged and skipped when HOST1X_HW < 6 */
 #endif
-               } else {
-                       u32 op1 = host1x_opcode_gather(g->words);
+                       } else {
+                               u32 op1 = host1x_opcode_gather(g->words);
 
-                       host1x_cdma_push(cdma, op1, op2);
+                               host1x_cdma_push(cdma, op1, op2);       /* 32-bit address: two-word gather */
+                       }
                }
        }
 }
@@ -126,7 +171,7 @@ static int channel_submit(struct host1x_job *job)
        struct host1x *host = dev_get_drvdata(ch->dev->parent);
 
        trace_host1x_channel_submit(dev_name(ch->dev),
-                                   job->num_gathers, job->num_relocs,
+                                   job->num_cmds, job->num_relocs,
                                    job->syncpt->id, job->syncpt_incrs);
 
        /* before error checks, return current max */
@@ -181,7 +226,7 @@ static int channel_submit(struct host1x_job *job)
                                 host1x_opcode_setclass(job->class, 0, 0),
                                 HOST1X_OPCODE_NOP);
 
-       submit_gathers(job);
+       submit_gathers(job, syncval - user_syncpt_incrs);
 
        /* end CDMA submit & stash pinned hMems into sync queue */
        host1x_cdma_end(&ch->cdma, job);
index ceb48229d14b37f6a64acecc00f24855f6b55a0e..35952fd5597ea750805ef4661d9a64d734f631ba 100644 (file)
@@ -208,10 +208,15 @@ static void show_channel_gathers(struct output *o, struct host1x_cdma *cdma)
                                    job->first_get, job->timeout,
                                    job->num_slots, job->num_unpins);
 
-               for (i = 0; i < job->num_gathers; i++) {
-                       struct host1x_job_gather *g = &job->gathers[i];
+               for (i = 0; i < job->num_cmds; i++) {
+                       struct host1x_job_gather *g;
                        u32 *mapped;
 
+                       if (job->cmds[i].is_wait)
+                               continue;
+
+                       g = &job->cmds[i].gather;
+
                        if (job->gather_copy_mapped)
                                mapped = (u32 *)job->gather_copy_mapped;
                        else
index 4fc51f70496ba97f75a76d85864f6ec664a1e55d..0a2ab8f1da6f6b8b8a523f236df7fa3cfb626e9d 100644 (file)
@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
 }
 #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
        host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
+{
+       return 0x4e;    /* offset passed to setclass together with HOST1X_CLASS_HOST1X for 32-bit waits */
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
+       host1x_uclass_load_syncpt_payload_32_r()
+static inline u32 host1x_uclass_wait_syncpt_32_r(void)
+{
+       return 0x50;    /* LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2 */
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
+       host1x_uclass_wait_syncpt_32_r()
 
 #endif
index 9e84a4adca9fea01d3a1af0f20eed16f2f5f4756..60c692b92955d0847f940f8792c0a1485480f2c0 100644 (file)
@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
 }
 #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
        host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
+{
+       return 0x4e;    /* offset passed to setclass together with HOST1X_CLASS_HOST1X for 32-bit waits */
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
+       host1x_uclass_load_syncpt_payload_32_r()
+static inline u32 host1x_uclass_wait_syncpt_32_r(void)
+{
+       return 0x50;    /* LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2 */
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
+       host1x_uclass_wait_syncpt_32_r()
 
 #endif
index aee5a4e32877c612eafebd431ece9b463060e598..2fcc9a2ad3ef10126cd01d4c5801547eb466349c 100644 (file)
@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
 }
 #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
        host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
+{
+       return 0x4e;    /* offset passed to setclass together with HOST1X_CLASS_HOST1X for 32-bit waits */
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
+       host1x_uclass_load_syncpt_payload_32_r()
+static inline u32 host1x_uclass_wait_syncpt_32_r(void)
+{
+       return 0x50;    /* LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2 */
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
+       host1x_uclass_wait_syncpt_32_r()
 
 #endif
index c4bacdb7155fe635d917c1d31e7c43e722847707..5f831438d19bb39209055cded9a10ceb5ada825b 100644 (file)
@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
 }
 #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
        host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
+{
+       return 0x4e;    /* offset passed to setclass together with HOST1X_CLASS_HOST1X for 32-bit waits */
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
+       host1x_uclass_load_syncpt_payload_32_r()
+static inline u32 host1x_uclass_wait_syncpt_32_r(void)
+{
+       return 0x50;    /* LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2 */
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
+       host1x_uclass_wait_syncpt_32_r()
 
 #endif
index c74070f3f203108a1f871da7e57b30011c476d13..8cd2ef087d5d03dee9f1a47ae9252e3719bb1abb 100644 (file)
@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
 }
 #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
        host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
+{
+       return 0x4e;    /* offset passed to setclass together with HOST1X_CLASS_HOST1X for 32-bit waits */
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
+       host1x_uclass_load_syncpt_payload_32_r()
+static inline u32 host1x_uclass_wait_syncpt_32_r(void)
+{
+       return 0x50;    /* LOAD_SYNCPT_PAYLOAD_32 (0x4e) + 2 */
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
+       host1x_uclass_wait_syncpt_32_r()
 
 #endif
index 09097e19c0d09d44a250f06f0fa2c33b7d258b8d..32619b73a2fc259b995918906b3414576f98c4c0 100644 (file)
@@ -38,7 +38,7 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
        total = sizeof(struct host1x_job) +
                (u64)num_relocs * sizeof(struct host1x_reloc) +
                (u64)num_unpins * sizeof(struct host1x_job_unpin_data) +
-               (u64)num_cmdbufs * sizeof(struct host1x_job_gather) +
+               (u64)num_cmdbufs * sizeof(struct host1x_job_cmd) +
                (u64)num_unpins * sizeof(dma_addr_t) +
                (u64)num_unpins * sizeof(u32 *);
        if (total > ULONG_MAX)
@@ -57,8 +57,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
        mem += num_relocs * sizeof(struct host1x_reloc);
        job->unpins = num_unpins ? mem : NULL;
        mem += num_unpins * sizeof(struct host1x_job_unpin_data);
-       job->gathers = num_cmdbufs ? mem : NULL;
-       mem += num_cmdbufs * sizeof(struct host1x_job_gather);
+       job->cmds = num_cmdbufs ? mem : NULL;
+       mem += num_cmdbufs * sizeof(struct host1x_job_cmd);
        job->addr_phys = num_unpins ? mem : NULL;
 
        job->reloc_addr_phys = job->addr_phys;
@@ -101,22 +101,38 @@ EXPORT_SYMBOL(host1x_job_put);
 void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo,
                            unsigned int words, unsigned int offset)
 {
-       struct host1x_job_gather *gather = &job->gathers[job->num_gathers];
+       struct host1x_job_gather *gather = &job->cmds[job->num_cmds].gather;
 
        gather->words = words;
        gather->bo = bo;
        gather->offset = offset;
 
-       job->num_gathers++;
+       job->num_cmds++;        /* NOTE(review): is_wait is never written here; presumably cmds[] starts zeroed -- confirm in host1x_job_alloc() */
 }
 EXPORT_SYMBOL(host1x_job_add_gather);
 
+void host1x_job_add_wait(struct host1x_job *job, u32 id, u32 thresh,
+                        bool relative, u32 next_class)
+{      /* Append a syncpoint wait to the job's command list, after any commands already added. */
+       struct host1x_job_cmd *cmd = &job->cmds[job->num_cmds]; /* NOTE(review): no capacity check; caller must stay within the count given to host1x_job_alloc() */
+
+       cmd->is_wait = true;    /* marks the 'wait' union member as the valid one */
+       cmd->wait.id = id;      /* syncpoint to wait on */
+       cmd->wait.threshold = thresh;
+       cmd->wait.next_class = next_class;      /* class selected again once the wait has been pushed */
+       cmd->wait.relative = relative;  /* true: thresh is added to the job's syncpoint base at submit time */
+
+       job->num_cmds++;
+}
+EXPORT_SYMBOL(host1x_job_add_wait);
+
 static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
 {
        struct host1x_client *client = job->client;
        struct device *dev = client->dev;
        struct host1x_job_gather *g;
        struct iommu_domain *domain;
+       struct sg_table *sgt;
        unsigned int i;
        int err;
 
@@ -126,7 +142,6 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
        for (i = 0; i < job->num_relocs; i++) {
                struct host1x_reloc *reloc = &job->relocs[i];
                dma_addr_t phys_addr, *phys;
-               struct sg_table *sgt;
 
                reloc->target.bo = host1x_bo_get(reloc->target.bo);
                if (!reloc->target.bo) {
@@ -202,17 +217,20 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
        if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL))
                return 0;
 
-       for (i = 0; i < job->num_gathers; i++) {
+       for (i = 0; i < job->num_cmds; i++) {
                size_t gather_size = 0;
                struct scatterlist *sg;
-               struct sg_table *sgt;
                dma_addr_t phys_addr;
                unsigned long shift;
                struct iova *alloc;
                dma_addr_t *phys;
                unsigned int j;
 
-               g = &job->gathers[i];
+               if (job->cmds[i].is_wait)
+                       continue;
+
+               g = &job->cmds[i].gather;
+
                g->bo = host1x_bo_get(g->bo);
                if (!g->bo) {
                        err = -EINVAL;
@@ -545,8 +563,13 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job,
        fw.num_relocs = job->num_relocs;
        fw.class = job->class;
 
-       for (i = 0; i < job->num_gathers; i++) {
-               struct host1x_job_gather *g = &job->gathers[i];
+       for (i = 0; i < job->num_cmds; i++) {
+               struct host1x_job_gather *g;
+
+               if (job->cmds[i].is_wait)
+                       continue;
+
+               g = &job->cmds[i].gather;
 
                size += g->words * sizeof(u32);
        }
@@ -568,10 +591,14 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job,
 
        job->gather_copy_size = size;
 
-       for (i = 0; i < job->num_gathers; i++) {
-               struct host1x_job_gather *g = &job->gathers[i];
+       for (i = 0; i < job->num_cmds; i++) {
+               struct host1x_job_gather *g;
                void *gather;
 
+               if (job->cmds[i].is_wait)
+                       continue;
+               g = &job->cmds[i].gather;
+
                /* Copy the gather */
                gather = host1x_bo_mmap(g->bo);
                memcpy(job->gather_copy_mapped + offset, gather + g->offset,
@@ -614,8 +641,12 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev)
        }
 
        /* patch gathers */
-       for (i = 0; i < job->num_gathers; i++) {
-               struct host1x_job_gather *g = &job->gathers[i];
+       for (i = 0; i < job->num_cmds; i++) {
+               struct host1x_job_gather *g;
+
+               if (job->cmds[i].is_wait)
+                       continue;
+               g = &job->cmds[i].gather;
 
                /* process each gather mem only once */
                if (g->handled)
@@ -625,10 +656,11 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev)
                if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL))
                        g->base = job->gather_addr_phys[i];
 
-               for (j = i + 1; j < job->num_gathers; j++) {
-                       if (job->gathers[j].bo == g->bo) {
-                               job->gathers[j].handled = true;
-                               job->gathers[j].base = g->base;
+               for (j = i + 1; j < job->num_cmds; j++) {
+                       if (!job->cmds[j].is_wait &&
+                           job->cmds[j].gather.bo == g->bo) {
+                               job->cmds[j].gather.handled = true;
+                               job->cmds[j].gather.base = g->base;
                        }
                }
 
index 94bc2e4ae241d202fd61e74b90425ca26bb17a35..b4428c5495c936583dd927a6b37589417de8df5f 100644 (file)
@@ -18,6 +18,22 @@ struct host1x_job_gather {
        bool handled;
 };
 
+struct host1x_job_wait {       /* one syncpoint wait in a job's command list */
+       u32 id;         /* syncpoint to wait on */
+       u32 threshold;  /* value to wait for; an offset when 'relative' is set */
+       u32 next_class; /* class selected again after the wait is pushed */
+       bool relative;  /* true: threshold is added to the job's syncpoint base at submit time */
+};
+
+struct host1x_job_cmd {        /* one entry of a job's command list: either a gather or a wait */
+       bool is_wait;   /* true: 'wait' is valid; false: 'gather' is valid */
+
+       union {
+               struct host1x_job_gather gather;
+               struct host1x_job_wait wait;
+       };
+};
+
 struct host1x_job_unpin_data {
        struct host1x_bo *bo;
        struct sg_table *sgt;
index 57271ab1fee84ca6ec283bc89c4db05124227c51..2127762fc63da6ad572dec297eacf3f250137b3c 100644 (file)
@@ -218,8 +218,8 @@ struct host1x_job {
        struct host1x_client *client;
 
        /* Gathers and their memory */
-       struct host1x_job_gather *gathers;
-       unsigned int num_gathers;
+       struct host1x_job_cmd *cmds;
+       unsigned int num_cmds;
 
        /* Array of handles to be pinned & unpinned */
        struct host1x_reloc *relocs;
@@ -278,6 +278,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
                                    u32 num_cmdbufs, u32 num_relocs);
 void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo,
                           unsigned int words, unsigned int offset);
+void host1x_job_add_wait(struct host1x_job *job, u32 id, u32 thresh,
+                        bool relative, u32 next_class);
 struct host1x_job *host1x_job_get(struct host1x_job *job);
 void host1x_job_put(struct host1x_job *job);
 int host1x_job_pin(struct host1x_job *job, struct device *dev);