Merge tag 'v5.15.57' into rpi-5.15.y
[platform/kernel/linux-rpi.git] / drivers / staging / media / rpivid / rpivid_h265.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Raspberry Pi HEVC driver
4  *
5  * Copyright (C) 2020 Raspberry Pi (Trading) Ltd
6  *
7  * Based on the Cedrus VPU driver, that is:
8  *
9  * Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
10  * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
11  * Copyright (C) 2018 Bootlin
12  */
13
14 #include <linux/delay.h>
15 #include <linux/types.h>
16
17 #include <media/videobuf2-dma-contig.h>
18
19 #include "rpivid.h"
20 #include "rpivid_hw.h"
21 #include "rpivid_video.h"
22
23 #define DEBUG_TRACE_P1_CMD 0
24 #define DEBUG_TRACE_EXECUTION 0
25
26 #define USE_REQUEST_PIN 1
27
28 #if DEBUG_TRACE_EXECUTION
29 #define xtrace_in(dev_, de_)\
30         v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: in\n",   __func__,\
31                   (de_) == NULL ? -1 : (de_)->decode_order)
32 #define xtrace_ok(dev_, de_)\
33         v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: ok\n",   __func__,\
34                   (de_) == NULL ? -1 : (de_)->decode_order)
35 #define xtrace_fin(dev_, de_)\
36         v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: finish\n", __func__,\
37                   (de_) == NULL ? -1 : (de_)->decode_order)
38 #define xtrace_fail(dev_, de_)\
39         v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: FAIL\n", __func__,\
40                   (de_) == NULL ? -1 : (de_)->decode_order)
41 #else
42 #define xtrace_in(dev_, de_)
43 #define xtrace_ok(dev_, de_)
44 #define xtrace_fin(dev_, de_)
45 #define xtrace_fail(dev_, de_)
46 #endif
47
/* Slice types - values match the H.265 slice_type syntax element */
enum hevc_slice_type {
	HEVC_SLICE_B = 0,
	HEVC_SLICE_P = 1,
	HEVC_SLICE_I = 2,
};
53
54 enum hevc_layer { L0 = 0, L1 = 1 };
55
56 static int gptr_alloc(struct rpivid_dev *const dev, struct rpivid_gptr *gptr,
57                       size_t size, unsigned long attrs)
58 {
59         gptr->size = size;
60         gptr->attrs = attrs;
61         gptr->addr = 0;
62         gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size, &gptr->addr,
63                                     GFP_KERNEL, gptr->attrs);
64         return !gptr->ptr ? -ENOMEM : 0;
65 }
66
67 static void gptr_free(struct rpivid_dev *const dev,
68                       struct rpivid_gptr *const gptr)
69 {
70         if (gptr->ptr)
71                 dma_free_attrs(dev->dev, gptr->size, gptr->ptr, gptr->addr,
72                                gptr->attrs);
73         gptr->size = 0;
74         gptr->ptr = NULL;
75         gptr->addr = 0;
76         gptr->attrs = 0;
77 }
78
79 /* Realloc but do not copy
80  *
81  * Frees then allocs.
 * If the alloc fails then it attempts to re-allocate the old size
83  * On error then check gptr->ptr to determine if anything is currently
84  * allocated.
85  */
86 static int gptr_realloc_new(struct rpivid_dev * const dev,
87                             struct rpivid_gptr * const gptr, size_t size)
88 {
89         const size_t old_size = gptr->size;
90
91         if (size == gptr->size)
92                 return 0;
93
94         if (gptr->ptr)
95                 dma_free_attrs(dev->dev, gptr->size, gptr->ptr,
96                                gptr->addr, gptr->attrs);
97
98         gptr->addr = 0;
99         gptr->size = size;
100         gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size,
101                                     &gptr->addr, GFP_KERNEL, gptr->attrs);
102
103         if (!gptr->ptr) {
104                 gptr->addr = 0;
105                 gptr->size = old_size;
106                 gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size,
107                                             &gptr->addr, GFP_KERNEL, gptr->attrs);
108                 if (!gptr->ptr) {
109                         gptr->size = 0;
110                         gptr->addr = 0;
111                         gptr->attrs = 0;
112                 }
113                 return -ENOMEM;
114         }
115
116         return 0;
117 }
118
/* Smallest rounded-up buffer size strictly greater than x. */
static size_t next_size(const size_t x)
{
	return rpivid_round_up_size(x + 1);
}
123
124 #define NUM_SCALING_FACTORS 4064 /* Not a typo = 0xbe0 + 0x400 */
125
126 #define AXI_BASE64 0
127
128 #define PROB_BACKUP ((20 << 12) + (20 << 6) + (0 << 0))
129 #define PROB_RELOAD ((20 << 12) + (20 << 0) + (0 << 6))
130
131 #define HEVC_MAX_REFS V4L2_HEVC_DPB_ENTRIES_NUM_MAX
132
133 //////////////////////////////////////////////////////////////////////////////
134
/* One phase-1 APB command: write data to hw register addr. */
struct rpi_cmd {
	u32 addr;
	u32 data;
} __packed;
139
/*
 * Refcounted per-DPB-slot auxiliary storage.
 * Live entries sit in ctx->aux_ents[q_index]; unreferenced entries are
 * recycled on the ctx->aux_free list (see aux_q_new/aux_q_release).
 */
struct rpivid_q_aux {
	unsigned int refcount;
	unsigned int q_index;		// Slot in ctx->aux_ents[]; ~0U when free
	struct rpivid_q_aux *next;	// Free-list link
	struct rpivid_gptr col;		// Buffer sized from ctx->colmv_picsize
};
146
147 //////////////////////////////////////////////////////////////////////////////
148
/* Progress of a decode env through the slice/phase pipeline */
enum rpivid_decode_state {
	RPIVID_DECODE_SLICE_START,
	RPIVID_DECODE_SLICE_CONTINUE,
	RPIVID_DECODE_ERROR_CONTINUE,
	RPIVID_DECODE_ERROR_DONE,
	RPIVID_DECODE_PHASE1,
	RPIVID_DECODE_END,
};
157
/*
 * Per-decode-job environment: the phase-1 command fifo being built,
 * detached src/dst buffers and the addresses/strides programmed into
 * the hw for this frame.
 */
struct rpivid_dec_env {
	struct rpivid_ctx *ctx;
	struct rpivid_dec_env *next;

	enum rpivid_decode_state state;
	unsigned int decode_order;
	int p1_status;		/* P1 status - what to realloc */

	struct rpi_cmd *cmd_fifo;	// P1 cmd fifo (grown by cmds_check_space)
	unsigned int cmd_len, cmd_max;	// Used / allocated entries in cmd_fifo
	unsigned int num_slice_msgs;	// Valid entries in slice_msgs[]
	unsigned int pic_width_in_ctbs_y;
	unsigned int pic_height_in_ctbs_y;
	unsigned int dpbno_col;		// DPB slot of the collocated ref
	u32 reg_slicestart;
	int collocated_from_l0_flag;
	/*
	 * Last CTB/Tile X,Y processed by (wpp_)entry_point
	 * Could be in _state as P0 only but needs updating where _state
	 * is const
	 */
	unsigned int entry_ctb_x;
	unsigned int entry_ctb_y;
	unsigned int entry_tile_x;
	unsigned int entry_tile_y;
	unsigned int entry_qp;
	u32 entry_slice;

	u32 rpi_config2;
	u32 rpi_framesize;
	u32 rpi_currpoc;

	struct vb2_v4l2_buffer *frame_buf; // Detached dest buffer
	struct vb2_v4l2_buffer *src_buf;   // Detached src buffer
	unsigned int frame_c_offset;
	unsigned int frame_stride;
	dma_addr_t frame_addr;
	dma_addr_t ref_addrs[16];
	struct rpivid_q_aux *frame_aux;
	struct rpivid_q_aux *col_aux;

	dma_addr_t cmd_addr;
	size_t cmd_size;

	dma_addr_t pu_base_vc;
	dma_addr_t coeff_base_vc;
	u32 pu_stride;
	u32 coeff_stride;

	struct rpivid_gptr *bit_copy_gptr;	// Buffer for copied bitstream
	size_t bit_copy_len;			// Bytes used (64-byte aligned)

#define SLICE_MSGS_MAX (2 * HEVC_MAX_REFS * 8 + 3)
	u16 slice_msgs[SLICE_MSGS_MAX];
	u8 scaling_factors[NUM_SCALING_FACTORS];

#if USE_REQUEST_PIN
	struct media_request *req_pin;
#else
	struct media_request_object *req_obj;
#endif
	struct rpivid_hw_irq_ent irq_ent;
};
221
222 #define member_size(type, member) sizeof(((type *)0)->member)
223
/*
 * Per-context decode state: parsed SPS/PPS controls plus tables and
 * per-slice variables derived from them while building P1 commands.
 */
struct rpivid_dec_state {
	struct v4l2_ctrl_hevc_sps sps;
	struct v4l2_ctrl_hevc_pps pps;

	// Helper vars & tables derived from sps/pps
	unsigned int log2_ctb_size;     /* log2 width of a CTB */
	unsigned int ctb_width;         /* Width in CTBs */
	unsigned int ctb_height;        /* Height in CTBs */
	unsigned int ctb_size;          /* Pic area in CTBs */
	unsigned int tile_width;        /* Width in tiles */
	unsigned int tile_height;       /* Height in tiles */

	int *col_bd;			/* Tile column boundaries in CTBs */
	int *row_bd;			/* Tile row boundaries in CTBs */
	int *ctb_addr_rs_to_ts;		/* Raster-scan -> tile-scan CTB addr */
	int *ctb_addr_ts_to_rs;		/* Tile-scan -> raster-scan CTB addr */

	// Aux storage for DPB
	// Hold refs
	struct rpivid_q_aux *ref_aux[HEVC_MAX_REFS];
	struct rpivid_q_aux *frame_aux;

	// Slice vars
	unsigned int slice_idx;
	bool slice_temporal_mvp;  /* Slice flag but constant for frame */
	bool use_aux;
	bool mk_aux;

	// Temp vars per run - don't actually need to persist
	u8 *src_buf;
	dma_addr_t src_addr;
	const struct v4l2_ctrl_hevc_slice_params *sh;
	const struct v4l2_ctrl_hevc_decode_params *dec;
	unsigned int nb_refs[2];	/* Indexed by enum hevc_layer */
	unsigned int slice_qp;
	unsigned int max_num_merge_cand; // 0 if I-slice
	bool dependent_slice_segment_flag;

	unsigned int start_ts;          /* slice_segment_addr -> ts */
	unsigned int start_ctb_x;       /* CTB X,Y of start_ts */
	unsigned int start_ctb_y;
	unsigned int prev_ctb_x;        /* CTB X,Y of start_ts - 1 */
	unsigned int prev_ctb_y;
};
268
269 #if !USE_REQUEST_PIN
/*
 * Release callback for the media request object used when
 * USE_REQUEST_PIN is 0 (see req_obj in struct rpivid_dec_env).
 */
static void dst_req_obj_release(struct media_request_object *object)
{
	kfree(object);
}

static const struct media_request_object_ops dst_req_obj_ops = {
	.release = dst_req_obj_release,
};
278 #endif
279
/* Clamp x to the inclusive range [lo, hi]. */
static inline int clip_int(const int x, const int lo, const int hi)
{
	if (x < lo)
		return lo;
	if (x > hi)
		return hi;
	return x;
}
284
285 //////////////////////////////////////////////////////////////////////////////
286 // Phase 1 command and bit FIFOs
287
288 #if DEBUG_TRACE_P1_CMD
289 static int p1_z;
290 #endif
291
292 static int cmds_check_space(struct rpivid_dec_env *const de, unsigned int n)
293 {
294         struct rpi_cmd *a;
295         unsigned int newmax;
296
297         if (n > 0x100000) {
298                 v4l2_err(&de->ctx->dev->v4l2_dev,
299                          "%s: n %u implausible\n", __func__, n);
300                 return -ENOMEM;
301         }
302
303         if (de->cmd_len + n <= de->cmd_max)
304                 return 0;
305
306         newmax = roundup_pow_of_two(de->cmd_len + n);
307
308         a = krealloc(de->cmd_fifo, newmax * sizeof(struct rpi_cmd),
309                      GFP_KERNEL);
310         if (!a) {
311                 v4l2_err(&de->ctx->dev->v4l2_dev,
312                          "Failed cmd buffer realloc from %u to %u\n",
313                          de->cmd_max, newmax);
314                 return -ENOMEM;
315         }
316         v4l2_info(&de->ctx->dev->v4l2_dev,
317                   "cmd buffer realloc from %u to %u\n", de->cmd_max, newmax);
318
319         de->cmd_fifo = a;
320         de->cmd_max = newmax;
321         return 0;
322 }
323
324 // ???? u16 addr - put in u32
/*
 * Queue one phase-1 APB register write (addr, data) on the cmd fifo.
 * Space should already have been reserved via cmds_check_space(); on
 * overflow the write is logged and dropped rather than stored OOB.
 */
static void p1_apb_write(struct rpivid_dec_env *const de, const u16 addr,
			 const u32 data)
{
	if (de->cmd_len >= de->cmd_max) {
		v4l2_err(&de->ctx->dev->v4l2_dev,
			 "%s: Overflow @ %d\n", __func__, de->cmd_len);
		return;
	}

	de->cmd_fifo[de->cmd_len].addr = addr;
	de->cmd_fifo[de->cmd_len].data = data;

#if DEBUG_TRACE_P1_CMD
	/* Trace only the first few hundred cmds to avoid log flood */
	if (++p1_z < 256) {
		v4l2_info(&de->ctx->dev->v4l2_dev, "[%02x] %x %x\n",
			  de->cmd_len, addr, data);
	}
#endif
	de->cmd_len++;
}
345
/*
 * Find the tile index containing CTB coordinate ctb.
 *
 * bd[] holds num + 1 boundary values with bd[0] == 0 and bd[num] being
 * the total picture size in CTBs.  The scan is bounded by num so that
 * an out-of-range ctb clamps to the last tile instead of reading past
 * the end of bd[] (in-range results are unchanged).
 */
static int ctb_to_tile(unsigned int ctb, unsigned int *bd, int num)
{
	int i;

	for (i = 1; i < num && ctb >= bd[i]; i++)
		;
	return i - 1;
}
354
355 static unsigned int ctb_to_tile_x(const struct rpivid_dec_state *const s,
356                                   const unsigned int ctb_x)
357 {
358         return ctb_to_tile(ctb_x, s->col_bd, s->tile_width);
359 }
360
361 static unsigned int ctb_to_tile_y(const struct rpivid_dec_state *const s,
362                                   const unsigned int ctb_y)
363 {
364         return ctb_to_tile(ctb_y, s->row_bd, s->tile_height);
365 }
366
367 static void aux_q_free(struct rpivid_ctx *const ctx,
368                        struct rpivid_q_aux *const aq)
369 {
370         struct rpivid_dev *const dev = ctx->dev;
371
372         gptr_free(dev, &aq->col);
373         kfree(aq);
374 }
375
376 static struct rpivid_q_aux *aux_q_alloc(struct rpivid_ctx *const ctx,
377                                         const unsigned int q_index)
378 {
379         struct rpivid_dev *const dev = ctx->dev;
380         struct rpivid_q_aux *const aq = kzalloc(sizeof(*aq), GFP_KERNEL);
381
382         if (!aq)
383                 return NULL;
384
385         if (gptr_alloc(dev, &aq->col, ctx->colmv_picsize,
386                        DMA_ATTR_FORCE_CONTIGUOUS | DMA_ATTR_NO_KERNEL_MAPPING))
387                 goto fail;
388
389         /*
390          * Spinlock not required as called in P0 only and
391          * aux checks done by _new
392          */
393         aq->refcount = 1;
394         aq->q_index = q_index;
395         ctx->aux_ents[q_index] = aq;
396         return aq;
397
398 fail:
399         kfree(aq);
400         return NULL;
401 }
402
403 static struct rpivid_q_aux *aux_q_new(struct rpivid_ctx *const ctx,
404                                       const unsigned int q_index)
405 {
406         struct rpivid_q_aux *aq;
407         unsigned long lockflags;
408
409         spin_lock_irqsave(&ctx->aux_lock, lockflags);
410         /*
411          * If we already have this allocated to a slot then use that
412          * and assume that it will all work itself out in the pipeline
413          */
414         if ((aq = ctx->aux_ents[q_index]) != NULL) {
415                 ++aq->refcount;
416         } else if ((aq = ctx->aux_free) != NULL) {
417                 ctx->aux_free = aq->next;
418                 aq->next = NULL;
419                 aq->refcount = 1;
420                 aq->q_index = q_index;
421                 ctx->aux_ents[q_index] = aq;
422         }
423         spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
424
425         if (!aq)
426                 aq = aux_q_alloc(ctx, q_index);
427
428         return aq;
429 }
430
431 static struct rpivid_q_aux *aux_q_ref_idx(struct rpivid_ctx *const ctx,
432                                           const int q_index)
433 {
434         unsigned long lockflags;
435         struct rpivid_q_aux *aq;
436
437         spin_lock_irqsave(&ctx->aux_lock, lockflags);
438         if ((aq = ctx->aux_ents[q_index]) != NULL)
439                 ++aq->refcount;
440         spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
441
442         return aq;
443 }
444
445 static struct rpivid_q_aux *aux_q_ref(struct rpivid_ctx *const ctx,
446                                       struct rpivid_q_aux *const aq)
447 {
448         if (aq) {
449                 unsigned long lockflags;
450
451                 spin_lock_irqsave(&ctx->aux_lock, lockflags);
452
453                 ++aq->refcount;
454
455                 spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
456         }
457         return aq;
458 }
459
460 static void aux_q_release(struct rpivid_ctx *const ctx,
461                           struct rpivid_q_aux **const paq)
462 {
463         struct rpivid_q_aux *const aq = *paq;
464         unsigned long lockflags;
465
466         if (!aq)
467                 return;
468
469         *paq = NULL;
470
471         spin_lock_irqsave(&ctx->aux_lock, lockflags);
472         if (--aq->refcount == 0) {
473                 aq->next = ctx->aux_free;
474                 ctx->aux_free = aq;
475                 ctx->aux_ents[aq->q_index] = NULL;
476                 aq->q_index = ~0U;
477         }
478         spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
479 }
480
/* One-time init of the aux free list and its lock. */
static void aux_q_init(struct rpivid_ctx *const ctx)
{
	spin_lock_init(&ctx->aux_lock);
	ctx->aux_free = NULL;
}
486
487 static void aux_q_uninit(struct rpivid_ctx *const ctx)
488 {
489         struct rpivid_q_aux *aq;
490
491         ctx->colmv_picsize = 0;
492         ctx->colmv_stride = 0;
493         while ((aq = ctx->aux_free) != NULL) {
494                 ctx->aux_free = aq->next;
495                 aux_q_free(ctx, aq);
496         }
497 }
498
499 //////////////////////////////////////////////////////////////////////////////
500
501 /*
502  * Initialisation process for context variables (CABAC init)
503  * see H.265 9.3.2.2
504  *
505  * N.B. If comparing with FFmpeg note that this h/w uses slightly different
506  * offsets to FFmpegs array
507  */
508
509 /* Actual number of values */
510 #define RPI_PROB_VALS 154U
511 /* Rounded up as we copy words */
512 #define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
513
514 /* Initialiser values - see tables H.265 9-4 through 9-42 */
static const u8 prob_init[3][156] = {
	/* init_type 0 (I slices without cabac_init_flag swap) */
	{
		153, 200, 139, 141, 157, 154, 154, 154, 154, 154, 184, 154, 154,
		154, 184, 63,  154, 154, 154, 154, 154, 154, 154, 154, 154, 154,
		154, 154, 154, 153, 138, 138, 111, 141, 94,  138, 182, 154, 154,
		154, 140, 92,  137, 138, 140, 152, 138, 139, 153, 74,  149, 92,
		139, 107, 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, 110,
		110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
		79,  108, 123, 63,  110, 110, 124, 125, 140, 153, 125, 127, 140,
		109, 111, 143, 127, 111, 79,  108, 123, 63,  91,  171, 134, 141,
		138, 153, 136, 167, 152, 152, 139, 139, 111, 111, 125, 110, 110,
		94,  124, 108, 124, 107, 125, 141, 179, 153, 125, 107, 125, 141,
		179, 153, 125, 107, 125, 141, 179, 153, 125, 140, 139, 182, 182,
		152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, 0,   0,
	},
	/* init_type 1 */
	{
		153, 185, 107, 139, 126, 197, 185, 201, 154, 149, 154, 139, 154,
		154, 154, 152, 110, 122, 95,  79,  63,  31,  31,  153, 153, 168,
		140, 198, 79,  124, 138, 94,  153, 111, 149, 107, 167, 154, 154,
		154, 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136,
		153, 121, 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, 125,
		110, 94,  110, 95,  79,  125, 111, 110, 78,  110, 111, 111, 95,
		94,  108, 123, 108, 125, 110, 94,  110, 95,  79,  125, 111, 110,
		78,  110, 111, 111, 95,  94,  108, 123, 108, 121, 140, 61,  154,
		107, 167, 91,  122, 107, 167, 139, 139, 155, 154, 139, 153, 139,
		123, 123, 63,  153, 166, 183, 140, 136, 153, 154, 166, 183, 140,
		136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 123, 123,
		107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, 0,   0,
	},
	/* init_type 2 */
	{
		153, 160, 107, 139, 126, 197, 185, 201, 154, 134, 154, 139, 154,
		154, 183, 152, 154, 137, 95,  79,  63,  31,  31,  153, 153, 168,
		169, 198, 79,  224, 167, 122, 153, 111, 149, 92,  167, 154, 154,
		154, 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136,
		153, 121, 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, 125,
		110, 124, 110, 95,  94,  125, 111, 111, 79,  125, 126, 111, 111,
		79,  108, 123, 93,  125, 110, 124, 110, 95,  94,  125, 111, 111,
		79,  125, 126, 111, 111, 79,  108, 123, 93,  121, 140, 61,  154,
		107, 167, 91,  107, 107, 167, 139, 139, 170, 154, 139, 153, 139,
		123, 123, 63,  124, 166, 183, 140, 136, 153, 154, 166, 183, 140,
		136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 138, 138,
		122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, 0,   0,
	},
};
559
560 #define CMDS_WRITE_PROB ((RPI_PROB_ARRAY_SIZE / 4) + 1)
/*
 * Derive the initial CABAC context states for this slice (H.265
 * 9.3.2.2) and write them to the coprocessor, then back them up.
 */
static void write_prob(struct rpivid_dec_env *const de,
		       const struct rpivid_dec_state *const s)
{
	u8 dst[RPI_PROB_ARRAY_SIZE];

	/*
	 * Table select: without cabac_init_flag B->2, P->1, I->0
	 * (2 - slice_type); with the flag set P & B swap tables
	 * (slice_type + 1).  I slices never use the flag.
	 */
	const unsigned int init_type =
		((s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT) != 0 &&
		 s->sh->slice_type != HEVC_SLICE_I) ?
			s->sh->slice_type + 1 :
			2 - s->sh->slice_type;
	const u8 *p = prob_init[init_type];
	const int q = clip_int(s->slice_qp, 0, 51);
	unsigned int i;

	/* Scale each init value by slice QP - m/n derivation of 9.3.2.2 */
	for (i = 0; i < RPI_PROB_VALS; i++) {
		int init_value = p[i];
		int m = (init_value >> 4) * 5 - 45;
		int n = ((init_value & 15) << 3) - 16;
		int pre = 2 * (((m * q) >> 4) + n) - 127;

		pre ^= pre >> 31;	/* ~pre when negative */
		if (pre > 124)
			pre = 124 + (pre & 1);
		dst[i] = pre;
	}
	/* Zero pad to a whole number of 32-bit words */
	for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i)
		dst[i] = 0;

	for (i = 0; i < RPI_PROB_ARRAY_SIZE; i += 4)
		p1_apb_write(de, 0x1000 + i,
			     dst[i] + (dst[i + 1] << 8) + (dst[i + 2] << 16) +
				     (dst[i + 3] << 24));

	/*
	 * Having written the prob array back it up
	 * This is not always needed but is a small overhead that simplifies
	 * (and speeds up) some multi-tile & WPP scenarios
	 * There are no scenarios where having written a prob we ever want
	 * a previous (non-initial) state back
	 */
	p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
}
603
604 #define CMDS_WRITE_SCALING_FACTORS NUM_SCALING_FACTORS
605 static void write_scaling_factors(struct rpivid_dec_env *const de)
606 {
607         int i;
608         const u8 *p = (u8 *)de->scaling_factors;
609
610         for (i = 0; i < NUM_SCALING_FACTORS; i += 4, p += 4)
611                 p1_apb_write(de, 0x2000 + i,
612                              p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24));
613 }
614
/* Convert a bus address to the hw's AXI form (units of 64 bytes). */
static inline __u32 dma_to_axi_addr(dma_addr_t a)
{
	return (__u32)(a >> 6);
}
619
620 #define CMDS_WRITE_BITSTREAM 4
/*
 * Point the hw bit fifo at this slice's bitstream data.
 * Data is fed directly from the source buffer when it is DMA-visible
 * (s->src_addr set), otherwise it is copied into the bit_copy buffer.
 * Returns 0 on success, -ENOMEM if the copy buffer would overflow.
 */
static int write_bitstream(struct rpivid_dec_env *const de,
			   const struct rpivid_dec_state *const s)
{
	// Note that FFmpeg V4L2 does not remove emulation prevention bytes,
	// so this is matched in the configuration here.
	// Whether that is the correct behaviour or not is not clear in the
	// spec.
	const int rpi_use_emu = 1;
	// Byte offset of the slice data within the source buffer.
	// NOTE(review): the +1 appears to skip a byte around data_bit_offset
	// for the hw's benefit - confirm against the hw documentation.
	unsigned int offset = s->sh->data_bit_offset / 8 + 1;
	const unsigned int len = (s->sh->bit_size + 7) / 8 - offset;
	dma_addr_t addr;

	if (s->src_addr != 0) {
		// Source is directly addressable - no copy needed
		addr = s->src_addr + offset;
	} else {
		if (len + de->bit_copy_len > de->bit_copy_gptr->size) {
			v4l2_warn(&de->ctx->dev->v4l2_dev,
				  "Bit copy buffer overflow: size=%zu, offset=%zu, len=%u\n",
				  de->bit_copy_gptr->size,
				  de->bit_copy_len, len);
			return -ENOMEM;
		}
		memcpy(de->bit_copy_gptr->ptr + de->bit_copy_len,
		       s->src_buf + offset, len);
		addr = de->bit_copy_gptr->addr + de->bit_copy_len;
		// Keep each copied slice 64-byte aligned in the buffer
		de->bit_copy_len += (len + 63) & ~63;
	}
	// hw takes a 64-byte aligned base plus a byte offset within it
	offset = addr & 63;

	p1_apb_write(de, RPI_BFBASE, dma_to_axi_addr(addr));
	p1_apb_write(de, RPI_BFNUM, len);
	p1_apb_write(de, RPI_BFCONTROL, offset + (1 << 7)); // Stop
	p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu << 6));
	return 0;
}
656
657 //////////////////////////////////////////////////////////////////////////////
658
659 /*
660  * The slice constant part of the slice register - width and height need to
661  * be ORed in later as they are per-tile / WPP-row
662  */
663 static u32 slice_reg_const(const struct rpivid_dec_state *const s)
664 {
665         u32 x = (s->max_num_merge_cand << 0) |
666                 (s->nb_refs[L0] << 4) |
667                 (s->nb_refs[L1] << 8) |
668                 (s->sh->slice_type << 12);
669
670         if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA)
671                 x |= BIT(14);
672         if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA)
673                 x |= BIT(15);
674         if (s->sh->slice_type == HEVC_SLICE_B &&
675             (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO))
676                 x |= BIT(16);
677
678         return x;
679 }
680
681 //////////////////////////////////////////////////////////////////////////////
682
683 #define CMDS_NEW_SLICE_SEGMENT (4 + CMDS_WRITE_SCALING_FACTORS)
/*
 * Program the per-slice-segment hw state: SPS/PPS derived registers,
 * scaling factors (first segment only, if enabled) and the slice start
 * position.  reg_slicestart persists across dependent slice segments.
 */
static void new_slice_segment(struct rpivid_dec_env *const de,
			      const struct rpivid_dec_state *const s)
{
	const struct v4l2_ctrl_hevc_sps *const sps = &s->sps;
	const struct v4l2_ctrl_hevc_pps *const pps = &s->pps;

	/* Coding block / transform block geometry and bit depths */
	p1_apb_write(de,
		     RPI_SPS0,
		     ((sps->log2_min_luma_coding_block_size_minus3 + 3) << 0) |
		     (s->log2_ctb_size << 4) |
		     ((sps->log2_min_luma_transform_block_size_minus2 + 2)
							<< 8) |
		     ((sps->log2_min_luma_transform_block_size_minus2 + 2 +
		       sps->log2_diff_max_min_luma_transform_block_size)
						<< 12) |
		     ((sps->bit_depth_luma_minus8 + 8) << 16) |
		     ((sps->bit_depth_chroma_minus8 + 8) << 20) |
		     (sps->max_transform_hierarchy_depth_intra << 24) |
		     (sps->max_transform_hierarchy_depth_inter << 28));

	/* PCM parameters, chroma format and SPS feature flags */
	p1_apb_write(de,
		     RPI_SPS1,
		     ((sps->pcm_sample_bit_depth_luma_minus1 + 1) << 0) |
		     ((sps->pcm_sample_bit_depth_chroma_minus1 + 1) << 4) |
		     ((sps->log2_min_pcm_luma_coding_block_size_minus3 + 3)
						<< 8) |
		     ((sps->log2_min_pcm_luma_coding_block_size_minus3 + 3 +
		       sps->log2_diff_max_min_pcm_luma_coding_block_size)
						<< 12) |
		     (((sps->flags & V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE) ?
				0 : sps->chroma_format_idc) << 16) |
		     ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_AMP_ENABLED)) << 18) |
		     ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_PCM_ENABLED)) << 19) |
		     ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED))
						<< 20) |
		     ((!!(sps->flags &
			   V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED))
						<< 21));

	/* QP-delta depth, PPS feature flags and chroma QP offsets */
	p1_apb_write(de,
		     RPI_PPS,
		     ((s->log2_ctb_size - pps->diff_cu_qp_delta_depth) << 0) |
		     ((!!(pps->flags & V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED))
						 << 4) |
		     ((!!(pps->flags &
				V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED))
						 << 5) |
		     ((!!(pps->flags & V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED))
						 << 6) |
		     ((!!(pps->flags &
				V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED))
						<< 7) |
		     (((pps->pps_cb_qp_offset + s->sh->slice_cb_qp_offset) & 255)
						<< 8) |
		     (((pps->pps_cr_qp_offset + s->sh->slice_cr_qp_offset) & 255)
						<< 16) |
		     ((!!(pps->flags &
				V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED))
						<< 24));

	/* Scaling lists only need (re)writing on the first segment */
	if (!s->start_ts &&
	    (sps->flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED) != 0)
		write_scaling_factors(de);

	if (!s->dependent_slice_segment_flag) {
		/* Independent segment sets the slice start CTB X,Y */
		int ctb_col = s->sh->slice_segment_addr %
							de->pic_width_in_ctbs_y;
		int ctb_row = s->sh->slice_segment_addr /
							de->pic_width_in_ctbs_y;

		de->reg_slicestart = (ctb_col << 0) + (ctb_row << 16);
	}

	p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
}
759
760 //////////////////////////////////////////////////////////////////////////////
761 // Slice messages
762
/*
 * Queue a 16-bit slice message for program_slicecmds().
 * No bounds check - callers are assumed to stay within SLICE_MSGS_MAX.
 */
static void msg_slice(struct rpivid_dec_env *const de, const u16 msg)
{
	de->slice_msgs[de->num_slice_msgs++] = msg;
}
767
768 #define CMDS_PROGRAM_SLICECMDS (1 + SLICE_MSGS_MAX)
769 static void program_slicecmds(struct rpivid_dec_env *const de,
770                               const int sliceid)
771 {
772         int i;
773
774         p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs + (sliceid << 8));
775
776         for (i = 0; i < de->num_slice_msgs; i++)
777                 p1_apb_write(de, 0x4000 + 4 * i, de->slice_msgs[i] & 0xffff);
778 }
779
780 // NoBackwardPredictionFlag 8.3.5
781 // Simply checks POCs
782 static int has_backward(const struct v4l2_hevc_dpb_entry *const dpb,
783                         const __u8 *const idx, const unsigned int n,
784                         const unsigned int cur_poc)
785 {
786         unsigned int i;
787
788         for (i = 0; i < n; ++i) {
789                 // Compare mod 2^16
790                 // We only get u16 pocs & 8.3.1 says
791                 // "The bitstream shall not contain data that result in values
792                 //  of DiffPicOrderCnt( picA, picB ) used in the decoding
793                 //  process that are not in the range of âˆ’2^15 to 2^15 âˆ’ 1,
794                 //  inclusive."
795                 if (((cur_poc - dpb[idx[i]].pic_order_cnt[0]) & 0x8000) != 0)
796                         return 0;
797         }
798         return 1;
799 }
800
/*
 * Build the per-slice message list (appended via msg_slice()) that
 * program_slicecmds() later writes to the hardware: the slice command word,
 * reference picture descriptions for L0/L1 (with weighted-prediction
 * parameters when enabled), deblocking offsets and chroma QP offsets.
 * Also records the collocated DPB slot for TMVP in de->dpbno_col.
 */
static void pre_slice_decode(struct rpivid_dec_env *const de,
			     const struct rpivid_dec_state *const s)
{
	const struct v4l2_ctrl_hevc_slice_params *const sh = s->sh;
	const struct v4l2_ctrl_hevc_decode_params *const dec = s->dec;
	int weighted_pred_flag, idx;
	u16 cmd_slice;
	unsigned int collocated_from_l0_flag;

	/* Restart message accumulation for this slice */
	de->num_slice_msgs = 0;

	/* Bits 0-1: slice type (1=I, 2=P, 3=B) */
	cmd_slice = 0;
	if (sh->slice_type == HEVC_SLICE_I)
		cmd_slice = 1;
	if (sh->slice_type == HEVC_SLICE_P)
		cmd_slice = 2;
	if (sh->slice_type == HEVC_SLICE_B)
		cmd_slice = 3;

	/* Active ref counts for both lists plus max merge candidates */
	cmd_slice |= (s->nb_refs[L0] << 2) | (s->nb_refs[L1] << 6) |
		     (s->max_num_merge_cand << 11);

	/* Inferred to 1 when TMVP is off or the slice is not a B slice */
	collocated_from_l0_flag =
		!s->slice_temporal_mvp ||
		sh->slice_type != HEVC_SLICE_B ||
		(sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0);
	cmd_slice |= collocated_from_l0_flag << 14;

	if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
		// Flag to say all reference pictures are from the past
		const int no_backward_pred_flag =
			has_backward(dec->dpb, sh->ref_idx_l0, s->nb_refs[L0],
				     sh->slice_pic_order_cnt) &&
			has_backward(dec->dpb, sh->ref_idx_l1, s->nb_refs[L1],
				     sh->slice_pic_order_cnt);
		cmd_slice |= no_backward_pred_flag << 10;
		msg_slice(de, cmd_slice);

		if (s->slice_temporal_mvp) {
			/* DPB slot of the collocated picture used for TMVP */
			const __u8 *const rpl = collocated_from_l0_flag ?
						sh->ref_idx_l0 : sh->ref_idx_l1;
			de->dpbno_col = rpl[sh->collocated_ref_idx];
			//v4l2_info(&de->ctx->dev->v4l2_dev,
			//          "L0=%d col_ref_idx=%d,
			//          dpb_no=%d\n", collocated_from_l0_flag,
			//          sh->collocated_ref_idx, de->dpbno_col);
		}

		// Write reference picture descriptions
		weighted_pred_flag =
			sh->slice_type == HEVC_SLICE_P ?
				!!(s->pps.flags & V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED) :
				!!(s->pps.flags & V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED);

		for (idx = 0; idx < s->nb_refs[L0]; ++idx) {
			unsigned int dpb_no = sh->ref_idx_l0[idx];
			//v4l2_info(&de->ctx->dev->v4l2_dev,
			//        "L0[%d]=dpb[%d]\n", idx, dpb_no);

			/* DPB slot, long-term flag (bit 4), weight flags */
			msg_slice(de,
				  dpb_no |
				  (dec->dpb[dpb_no].rps ==
					V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR ?
						 (1 << 4) : 0) |
				  (weighted_pred_flag ? (3 << 5) : 0));
			msg_slice(de, dec->dpb[dpb_no].pic_order_cnt[0]);

			if (weighted_pred_flag) {
				/*
				 * Weight words: log2 denom in bits 0-2,
				 * (delta + denom) biased into 9 bits above;
				 * order is luma, Cb, Cr with an offset word
				 * after each weight word
				 */
				const struct v4l2_hevc_pred_weight_table
					*const w = &sh->pred_weight_table;
				const int luma_weight_denom =
					(1 << w->luma_log2_weight_denom);
				const unsigned int chroma_log2_weight_denom =
					(w->luma_log2_weight_denom +
					 w->delta_chroma_log2_weight_denom);
				const int chroma_weight_denom =
					(1 << chroma_log2_weight_denom);

				msg_slice(de,
					  w->luma_log2_weight_denom |
					  (((w->delta_luma_weight_l0[idx] +
					     luma_weight_denom) & 0x1ff)
						 << 3));
				msg_slice(de, w->luma_offset_l0[idx] & 0xff);
				msg_slice(de,
					  chroma_log2_weight_denom |
					  (((w->delta_chroma_weight_l0[idx][0] +
					     chroma_weight_denom) & 0x1ff)
						   << 3));
				msg_slice(de,
					  w->chroma_offset_l0[idx][0] & 0xff);
				msg_slice(de,
					  chroma_log2_weight_denom |
					  (((w->delta_chroma_weight_l0[idx][1] +
					     chroma_weight_denom) & 0x1ff)
						   << 3));
				msg_slice(de,
					  w->chroma_offset_l0[idx][1] & 0xff);
			}
		}

		/* Same reference descriptions for list L1 */
		for (idx = 0; idx < s->nb_refs[L1]; ++idx) {
			unsigned int dpb_no = sh->ref_idx_l1[idx];
			//v4l2_info(&de->ctx->dev->v4l2_dev,
			//          "L1[%d]=dpb[%d]\n", idx, dpb_no);
			msg_slice(de,
				  dpb_no |
				  (dec->dpb[dpb_no].rps ==
					 V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR ?
						 (1 << 4) : 0) |
					(weighted_pred_flag ? (3 << 5) : 0));
			msg_slice(de, dec->dpb[dpb_no].pic_order_cnt[0]);
			if (weighted_pred_flag) {
				const struct v4l2_hevc_pred_weight_table
					*const w = &sh->pred_weight_table;
				const int luma_weight_denom =
					(1 << w->luma_log2_weight_denom);
				const unsigned int chroma_log2_weight_denom =
					(w->luma_log2_weight_denom +
					 w->delta_chroma_log2_weight_denom);
				const int chroma_weight_denom =
					(1 << chroma_log2_weight_denom);

				msg_slice(de,
					  w->luma_log2_weight_denom |
					  (((w->delta_luma_weight_l1[idx] +
					     luma_weight_denom) & 0x1ff) << 3));
				msg_slice(de, w->luma_offset_l1[idx] & 0xff);
				msg_slice(de,
					  chroma_log2_weight_denom |
					  (((w->delta_chroma_weight_l1[idx][0] +
					     chroma_weight_denom) & 0x1ff)
							<< 3));
				msg_slice(de,
					  w->chroma_offset_l1[idx][0] & 0xff);
				msg_slice(de,
					  chroma_log2_weight_denom |
					  (((w->delta_chroma_weight_l1[idx][1] +
					     chroma_weight_denom) & 0x1ff)
						   << 3));
				msg_slice(de,
					  w->chroma_offset_l1[idx][1] & 0xff);
			}
		}
	} else {
		/* I slice: just the command word, no reference pictures */
		msg_slice(de, cmd_slice);
	}

	/* Deblocking offsets and filter-control flags */
	msg_slice(de,
		  (sh->slice_beta_offset_div2 & 15) |
		  ((sh->slice_tc_offset_div2 & 15) << 4) |
		  ((sh->flags &
		    V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED) ?
						1 << 8 : 0) |
		  ((sh->flags &
			  V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED) ?
						1 << 9 : 0) |
		  ((s->pps.flags &
			  V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED) ?
						1 << 10 : 0));

	/* Chroma QP offsets, 5 bits each */
	msg_slice(de, ((sh->slice_cr_qp_offset & 31) << 5) +
		       (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF
}
965
966 #define CMDS_WRITE_SLICE 1
967 static void write_slice(struct rpivid_dec_env *const de,
968                         const struct rpivid_dec_state *const s,
969                         const u32 slice_const,
970                         const unsigned int ctb_col,
971                         const unsigned int ctb_row)
972 {
973         const unsigned int cs = (1 << s->log2_ctb_size);
974         const unsigned int w_last = s->sps.pic_width_in_luma_samples & (cs - 1);
975         const unsigned int h_last = s->sps.pic_height_in_luma_samples & (cs - 1);
976
977         p1_apb_write(de, RPI_SLICE,
978                      slice_const |
979                      ((ctb_col + 1 < s->ctb_width || !w_last ?
980                                 cs : w_last) << 17) |
981                      ((ctb_row + 1 < s->ctb_height || !h_last ?
982                                 cs : h_last) << 24));
983 }
984
#define PAUSE_MODE_WPP  1
#define PAUSE_MODE_TILE 0xffff

/*
 * N.B. This can be called to fill in data from the previous slice so must not
 * use any state data that may change from slice to slice (e.g. qp)
 */
#define CMDS_NEW_ENTRY_POINT (6 + CMDS_WRITE_SLICE)
/*
 * Program a new entry point (start of a WPP row or of a tile) at CTB
 * (ctb_col, ctb_row) in tile (tile_x, tile_y), then record the entry state
 * in the dec env so the *_entry_fill() helpers can continue from here later.
 */
static void new_entry_point(struct rpivid_dec_env *const de,
			    const struct rpivid_dec_state *const s,
			    const bool do_bte,
			    const bool reset_qp_y,
			    const u32 pause_mode,
			    const unsigned int tile_x,
			    const unsigned int tile_y,
			    const unsigned int ctb_col,
			    const unsigned int ctb_row,
			    const unsigned int slice_qp,
			    const u32 slice_const)
{
	/* Segment end: current row for WPP, tile boundary otherwise */
	const unsigned int endx = s->col_bd[tile_x + 1] - 1;
	const unsigned int endy = (pause_mode == PAUSE_MODE_WPP) ?
		ctb_row : s->row_bd[tile_y + 1] - 1;

	p1_apb_write(de, RPI_TILESTART,
		     s->col_bd[tile_x] | (s->row_bd[tile_y] << 16));
	p1_apb_write(de, RPI_TILEEND, endx | (endy << 16));

	if (do_bte)
		p1_apb_write(de, RPI_BEGINTILEEND, endx | (endy << 16));

	write_slice(de, s, slice_const, endx, endy);

	if (reset_qp_y) {
		/* Restart QP at the slice QP biased by the bit-depth offset */
		unsigned int sps_qp_bd_offset =
			6 * s->sps.bit_depth_luma_minus8;

		p1_apb_write(de, RPI_QP, sps_qp_bd_offset + slice_qp);
	}

	/* Bits 17/18 flag a segment ending in the last CTB column/row */
	p1_apb_write(de, RPI_MODE,
		     pause_mode |
			((endx == s->ctb_width - 1) << 17) |
			((endy == s->ctb_height - 1) << 18));

	p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) | (ctb_row << 16));

	/* Remember this entry point for later fill-in of following rows/tiles */
	de->entry_tile_x = tile_x;
	de->entry_tile_y = tile_y;
	de->entry_ctb_x = ctb_col;
	de->entry_ctb_y = ctb_row;
	de->entry_qp = slice_qp;
	de->entry_slice = slice_const;
}
1039
1040 //////////////////////////////////////////////////////////////////////////////
1041 // Wavefront mode
1042
#define CMDS_WPP_PAUSE 4
/*
 * Emit the command sequence pausing WPP decode at the end of CTB row
 * ctb_row: a STATUS word tagged with the row, a CABAC probability backup
 * (presumably to seed the start of the next row -- not visible here),
 * then MODE/CONTROL for the pause point.  The final picture row gets a
 * different MODE value (0x70000 vs 0x30000).
 */
static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row)
{
	p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25);
	p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
	p1_apb_write(de, RPI_MODE,
		     ctb_row == de->pic_height_in_ctbs_y - 1 ?
							0x70000 : 0x30000);
	p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
}
1053
#define CMDS_WPP_ENTRY_FILL_1 (CMDS_WPP_PAUSE + 2 + CMDS_NEW_ENTRY_POINT)
/*
 * Emit the commands for complete WPP CTB rows from the recorded entry row
 * (de->entry_ctb_y) up to and including last_y.  Used to fill in rows
 * belonging to the previous slice once its extent is known, and the tail of
 * the frame after the last slice.  Returns 0 or -ve if cmd space runs out.
 */
static int wpp_entry_fill(struct rpivid_dec_env *const de,
			  const struct rpivid_dec_state *const s,
			  const unsigned int last_y)
{
	int rv;
	const unsigned int last_x = s->ctb_width - 1;

	/* Reserve worst-case command space for all rows up front */
	rv = cmds_check_space(de, CMDS_WPP_ENTRY_FILL_1 *
				  (last_y - de->entry_ctb_y));
	if (rv)
		return rv;

	while (de->entry_ctb_y < last_y) {
		/* wpp_entry_x/y set by wpp_entry_point */
		if (s->ctb_width > 2)
			wpp_pause(de, de->entry_ctb_y);
		/* Expected end-of-row CTB address */
		p1_apb_write(de, RPI_STATUS,
			     (de->entry_ctb_y << 18) | (last_x << 5) | 2);

		/* if width == 1 then the saved state is the init one */
		if (s->ctb_width == 2)
			p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
		else
			p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);

		/* Advances de->entry_ctb_y to the next row */
		new_entry_point(de, s, false, true, PAUSE_MODE_WPP,
				0, 0, 0, de->entry_ctb_y + 1,
				de->entry_qp, de->entry_slice);
	}
	return 0;
}
1086
/*
 * Complete the commands for the previous WPP slice now that its true end
 * CTB (s->prev_ctb_x/y) is known: fill any whole rows it covered, then
 * write the expected-end STATUS word.  Returns 0 or -ve on cmd-space
 * exhaustion.
 */
static int wpp_end_previous_slice(struct rpivid_dec_env *const de,
				  const struct rpivid_dec_state *const s)
{
	int rv;

	/* Rows between the recorded entry point and the slice's last row */
	rv = wpp_entry_fill(de, s, s->prev_ctb_y);
	if (rv)
		return rv;

	rv = cmds_check_space(de, CMDS_WPP_PAUSE + 2);
	if (rv)
		return rv;

	/*
	 * NOTE(review): the pause and PROB_BACKUP conditions below encode the
	 * WPP probability save rules for narrow (<= 2 CTB wide) pictures and
	 * row starts; derived from the code only -- confirm against HW docs
	 */
	if (de->entry_ctb_x < 2 &&
	    (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) &&
	    s->ctb_width > 2)
		wpp_pause(de, s->prev_ctb_y);
	p1_apb_write(de, RPI_STATUS,
		     1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
	if (s->start_ctb_x == 2 ||
	    (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y))
		p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
	return 0;
}
1111
/* Only main profile supported so WPP => !Tiles which makes some of the
 * next chunk code simpler
 */
/*
 * Program the decode of one WPP slice:
 *  - complete the previous slice (its end CTB is only known now)
 *  - emit bitstream, CABAC probabilities, slice commands, slice segment
 *    and the new entry point
 *  - for the last slice of the frame, also fill in the remaining rows and
 *    the expected frame-end STATUS
 * Returns 0 or a -ve error (e.g. out of command/bitstream space).
 */
static int wpp_decode_slice(struct rpivid_dec_env *const de,
			    const struct rpivid_dec_state *const s,
			    bool last_slice)
{
	bool reset_qp_y = true;
	const bool indep = !s->dependent_slice_segment_flag;
	int rv;

	/* Non-zero start => there is a previous slice to finish */
	if (s->start_ts) {
		rv = wpp_end_previous_slice(de, s);
		if (rv)
			return rv;
	}
	pre_slice_decode(de, s);

	/* Reserve worst-case command space for everything emitted below */
	rv = cmds_check_space(de,
			      CMDS_WRITE_BITSTREAM +
				CMDS_WRITE_PROB +
				CMDS_PROGRAM_SLICECMDS +
				CMDS_NEW_SLICE_SEGMENT +
				CMDS_NEW_ENTRY_POINT);
	if (rv)
		return rv;

	rv = write_bitstream(de, s);
	if (rv)
		return rv;

	/*
	 * CABAC probabilities: fresh for the first/independent slice (or a
	 * 1-CTB-wide picture), reloaded at a row start, otherwise carried
	 * over (in which case QP is not reset either)
	 */
	if (!s->start_ts || indep || s->ctb_width == 1)
		write_prob(de, s);
	else if (!s->start_ctb_x)
		p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
	else
		reset_qp_y = false;

	program_slicecmds(de, s->slice_idx);
	new_slice_segment(de, s);
	new_entry_point(de, s, indep, reset_qp_y, PAUSE_MODE_WPP,
			0, 0, s->start_ctb_x, s->start_ctb_y,
			s->slice_qp, slice_reg_const(s));

	if (last_slice) {
		/* All remaining rows of the frame belong to this slice */
		rv = wpp_entry_fill(de, s, s->ctb_height - 1);
		if (rv)
			return rv;

		rv = cmds_check_space(de, CMDS_WPP_PAUSE + 1);
		if (rv)
			return rv;

		if (de->entry_ctb_x < 2 && s->ctb_width > 2)
			wpp_pause(de, s->ctb_height - 1);

		/* Expected end address of the frame */
		p1_apb_write(de, RPI_STATUS,
			     1 | ((s->ctb_width - 1) << 5) |
				((s->ctb_height - 1) << 18));
	}
	return 0;
}
1174
1175 //////////////////////////////////////////////////////////////////////////////
1176 // Tiles mode
1177
// Guarantees 1 cmd entry free on exit
/*
 * Emit the commands for complete tiles from the recorded entry tile
 * (de->entry_tile_x/y) up to, but not including, tile (last_tile_x,
 * last_tile_y): each gets an end-of-tile STATUS word, a probability reload
 * and a new entry point.  Returns 0 or -ve on cmd-space exhaustion.
 */
static int tile_entry_fill(struct rpivid_dec_env *const de,
			   const struct rpivid_dec_state *const s,
			   const unsigned int last_tile_x,
			   const unsigned int last_tile_y)
{
	/* Walk tiles in raster order until the target tile is reached */
	while (de->entry_tile_y < last_tile_y ||
	       (de->entry_tile_y == last_tile_y &&
		de->entry_tile_x < last_tile_x)) {
		int rv;
		unsigned int t_x = de->entry_tile_x;
		unsigned int t_y = de->entry_tile_y;
		/* Last CTB of the current tile, from the boundary tables */
		const unsigned int last_x = s->col_bd[t_x + 1] - 1;
		const unsigned int last_y = s->row_bd[t_y + 1] - 1;

		// One more than needed here
		rv = cmds_check_space(de, CMDS_NEW_ENTRY_POINT + 3);
		if (rv)
			return rv;

		p1_apb_write(de, RPI_STATUS,
			     2 | (last_x << 5) | (last_y << 18));
		p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);

		// Inc tile
		if (++t_x >= s->tile_width) {
			t_x = 0;
			++t_y;
		}

		/* Advances de->entry_tile_x/y to t_x/t_y */
		new_entry_point(de, s, false, true, PAUSE_MODE_TILE,
				t_x, t_y, s->col_bd[t_x], s->row_bd[t_y],
				de->entry_qp, de->entry_slice);
	}
	return 0;
}
1214
1215 /*
1216  * Write STATUS register with expected end CTU address of previous slice
1217  */
1218 static int end_previous_slice(struct rpivid_dec_env *const de,
1219                               const struct rpivid_dec_state *const s)
1220 {
1221         int rv;
1222
1223         rv = tile_entry_fill(de, s,
1224                              ctb_to_tile_x(s, s->prev_ctb_x),
1225                              ctb_to_tile_y(s, s->prev_ctb_y));
1226         if (rv)
1227                 return rv;
1228
1229         p1_apb_write(de, RPI_STATUS,
1230                      1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
1231         return 0;
1232 }
1233
/*
 * Program the decode of one slice in tiles mode: end the previous slice if
 * any, then emit bitstream, probabilities, slice commands, slice segment
 * and the new entry point.  For the last slice of the frame also fill in
 * the remaining tile entries and the expected frame-end STATUS word.
 * Returns 0 or a -ve error (e.g. out of command/bitstream space).
 */
static int decode_slice(struct rpivid_dec_env *const de,
			const struct rpivid_dec_state *const s,
			bool last_slice)
{
	bool reset_qp_y;
	unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x);
	unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y);
	int rv;

	/* Non-zero start => there is a previous slice to finish */
	if (s->start_ts) {
		rv = end_previous_slice(de, s);
		if (rv)
			return rv;
	}

	/* Reserve worst-case command space for everything emitted below */
	rv = cmds_check_space(de,
			      CMDS_WRITE_BITSTREAM +
				CMDS_WRITE_PROB +
				CMDS_PROGRAM_SLICECMDS +
				CMDS_NEW_SLICE_SEGMENT +
				CMDS_NEW_ENTRY_POINT);
	if (rv)
		return rv;

	pre_slice_decode(de, s);
	rv = write_bitstream(de, s);
	if (rv)
		return rv;

	/*
	 * QP and probabilities restart for the first/independent slice and
	 * whenever the slice begins in a different tile to the previous one
	 */
	reset_qp_y = !s->start_ts ||
		!s->dependent_slice_segment_flag ||
		tile_x != ctb_to_tile_x(s, s->prev_ctb_x) ||
		tile_y != ctb_to_tile_y(s, s->prev_ctb_y);
	if (reset_qp_y)
		write_prob(de, s);

	program_slicecmds(de, s->slice_idx);
	new_slice_segment(de, s);
	new_entry_point(de, s, !s->dependent_slice_segment_flag, reset_qp_y,
			PAUSE_MODE_TILE,
			tile_x, tile_y, s->start_ctb_x, s->start_ctb_y,
			s->slice_qp, slice_reg_const(s));

	/*
	 * If this is the last slice then fill in the other tile entries
	 * now, otherwise this will be done at the start of the next slice
	 * when it will be known where this slice finishes
	 */
	if (last_slice) {
		rv = tile_entry_fill(de, s,
				     s->tile_width - 1,
				     s->tile_height - 1);
		if (rv)
			return rv;
		/* Expected end address of the frame */
		p1_apb_write(de, RPI_STATUS,
			     1 | ((s->ctb_width - 1) << 5) |
				((s->ctb_height - 1) << 18));
	}
	return 0;
}
1294
1295 //////////////////////////////////////////////////////////////////////////////
1296 // Scaling factors
1297
/*
 * Expand an HEVC scaling list into the full matrix layout the HW expects.
 * 4x4 and 8x8 lists (size_id 0/1) are copied as-is; 16x16 and 32x32 lists
 * (size_id 2/3+) are up-sampled from an 8x8 list by duplicating each entry
 * 2x2 or 4x4, after which element [0] is overwritten with the separately
 * signalled DC coefficient.
 */
static void expand_scaling_list(const unsigned int size_id,
				u8 *const dst0,
				const u8 *const src0, uint8_t dc)
{
	unsigned int x, y;
	u8 *d = dst0;

	switch (size_id) {
	case 0:
		memcpy(dst0, src0, 16);
		return;
	case 1:
		memcpy(dst0, src0, 64);
		return;
	case 2:
		/* 16x16: each 8x8 entry duplicated 2x2 */
		for (y = 0; y != 16; y++) {
			const u8 *s = src0 + (y >> 1) * 8;

			for (x = 0; x != 8; ++x) {
				const u8 v = *s++;

				*d++ = v;
				*d++ = v;
			}
		}
		break;
	default:
		/* 32x32: each 8x8 entry duplicated 4x4 */
		for (y = 0; y != 32; y++) {
			const u8 *s = src0 + (y >> 2) * 8;

			for (x = 0; x != 8; ++x) {
				const u8 v = *s++;

				*d++ = v;
				*d++ = v;
				*d++ = v;
				*d++ = v;
			}
		}
		break;
	}
	/* DC coefficient is signalled separately for 16x16 & 32x32 */
	dst0[0] = dc;
}
1342
1343 static void populate_scaling_factors(const struct rpivid_run *const run,
1344                                      struct rpivid_dec_env *const de,
1345                                      const struct rpivid_dec_state *const s)
1346 {
1347         const struct v4l2_ctrl_hevc_scaling_matrix *const sl =
1348                 run->h265.scaling_matrix;
1349         // Array of constants for scaling factors
1350         static const u32 scaling_factor_offsets[4][6] = {
1351                 // MID0    MID1    MID2    MID3    MID4    MID5
1352                 // SID0 (4x4)
1353                 { 0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050 },
1354                 // SID1 (8x8)
1355                 { 0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0 },
1356                 // SID2 (16x16)
1357                 { 0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0 },
1358                 // SID3 (32x32)
1359                 { 0x07E0, 0x0BE0, 0x0000, 0x0000, 0x0000, 0x0000 }
1360         };
1361
1362         unsigned int mid;
1363
1364         for (mid = 0; mid < 6; mid++)
1365                 expand_scaling_list(0, de->scaling_factors +
1366                                             scaling_factor_offsets[0][mid],
1367                                     sl->scaling_list_4x4[mid], 0);
1368         for (mid = 0; mid < 6; mid++)
1369                 expand_scaling_list(1, de->scaling_factors +
1370                                             scaling_factor_offsets[1][mid],
1371                                     sl->scaling_list_8x8[mid], 0);
1372         for (mid = 0; mid < 6; mid++)
1373                 expand_scaling_list(2, de->scaling_factors +
1374                                             scaling_factor_offsets[2][mid],
1375                                     sl->scaling_list_16x16[mid],
1376                                     sl->scaling_list_dc_coef_16x16[mid]);
1377         for (mid = 0; mid < 2; mid++)
1378                 expand_scaling_list(3, de->scaling_factors +
1379                                             scaling_factor_offsets[3][mid],
1380                                     sl->scaling_list_32x32[mid],
1381                                     sl->scaling_list_dc_coef_32x32[mid]);
1382 }
1383
1384 static void free_ps_info(struct rpivid_dec_state *const s)
1385 {
1386         kfree(s->ctb_addr_rs_to_ts);
1387         s->ctb_addr_rs_to_ts = NULL;
1388         kfree(s->ctb_addr_ts_to_rs);
1389         s->ctb_addr_ts_to_rs = NULL;
1390
1391         kfree(s->col_bd);
1392         s->col_bd = NULL;
1393         kfree(s->row_bd);
1394         s->row_bd = NULL;
1395 }
1396
/* Width in CTBs of tile column t_x (from the column boundary table) */
static unsigned int tile_width(const struct rpivid_dec_state *const s,
                               const unsigned int t_x)
{
	return s->col_bd[t_x + 1] - s->col_bd[t_x];
}
1402
/* Height in CTBs of tile row t_y (from the row boundary table) */
static unsigned int tile_height(const struct rpivid_dec_state *const s,
                                const unsigned int t_y)
{
	return s->row_bd[t_y + 1] - s->row_bd[t_y];
}
1408
/*
 * Build the CTB address maps between raster-scan (rs) and tile-scan (ts)
 * order: tiles are walked in raster order and, within each tile, its CTBs
 * in raster order, handing out consecutive ts addresses.
 */
static void fill_rs_to_ts(struct rpivid_dec_state *const s)
{
	unsigned int ts = 0;		/* next tile-scan address */
	unsigned int t_y;
	unsigned int tr_rs = 0;		/* rs address of current tile row */

	for (t_y = 0; t_y != s->tile_height; ++t_y) {
		const unsigned int t_h = tile_height(s, t_y);
		unsigned int t_x;
		unsigned int tc_rs = tr_rs;	/* rs addr of tile top-left */

		for (t_x = 0; t_x != s->tile_width; ++t_x) {
			const unsigned int t_w = tile_width(s, t_x);
			unsigned int y;
			unsigned int rs = tc_rs;

			for (y = 0; y != t_h; ++y) {
				unsigned int x;

				for (x = 0; x != t_w; ++x) {
					s->ctb_addr_rs_to_ts[rs + x] = ts;
					s->ctb_addr_ts_to_rs[ts] = rs + x;
					++ts;
				}
				/* next CTB row within this tile */
				rs += s->ctb_width;
			}
			tc_rs += t_w;
		}
		tr_rs += t_h * s->ctb_width;
	}
}
1440
1441 static int updated_ps(struct rpivid_dec_state *const s)
1442 {
1443         unsigned int i;
1444
1445         free_ps_info(s);
1446
1447         // Inferred parameters
1448         s->log2_ctb_size = s->sps.log2_min_luma_coding_block_size_minus3 + 3 +
1449                            s->sps.log2_diff_max_min_luma_coding_block_size;
1450
1451         s->ctb_width = (s->sps.pic_width_in_luma_samples +
1452                         (1 << s->log2_ctb_size) - 1) >>
1453                        s->log2_ctb_size;
1454         s->ctb_height = (s->sps.pic_height_in_luma_samples +
1455                          (1 << s->log2_ctb_size) - 1) >>
1456                         s->log2_ctb_size;
1457         s->ctb_size = s->ctb_width * s->ctb_height;
1458
1459         // Inferred parameters
1460
1461         s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size,
1462                                              sizeof(*s->ctb_addr_rs_to_ts),
1463                                              GFP_KERNEL);
1464         if (!s->ctb_addr_rs_to_ts)
1465                 goto fail;
1466         s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size,
1467                                              sizeof(*s->ctb_addr_ts_to_rs),
1468                                              GFP_KERNEL);
1469         if (!s->ctb_addr_ts_to_rs)
1470                 goto fail;
1471
1472         if (!(s->pps.flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) {
1473                 s->tile_width = 1;
1474                 s->tile_height = 1;
1475         } else {
1476                 s->tile_width = s->pps.num_tile_columns_minus1 + 1;
1477                 s->tile_height = s->pps.num_tile_rows_minus1 + 1;
1478         }
1479
1480         s->col_bd = kmalloc((s->tile_width + 1) * sizeof(*s->col_bd),
1481                             GFP_KERNEL);
1482         if (!s->col_bd)
1483                 goto fail;
1484         s->row_bd = kmalloc((s->tile_height + 1) * sizeof(*s->row_bd),
1485                             GFP_KERNEL);
1486         if (!s->row_bd)
1487                 goto fail;
1488
1489         s->col_bd[0] = 0;
1490         for (i = 1; i < s->tile_width; i++)
1491                 s->col_bd[i] = s->col_bd[i - 1] +
1492                         s->pps.column_width_minus1[i - 1] + 1;
1493         s->col_bd[s->tile_width] = s->ctb_width;
1494
1495         s->row_bd[0] = 0;
1496         for (i = 1; i < s->tile_height; i++)
1497                 s->row_bd[i] = s->row_bd[i - 1] +
1498                         s->pps.row_height_minus1[i - 1] + 1;
1499         s->row_bd[s->tile_height] = s->ctb_height;
1500
1501         fill_rs_to_ts(s);
1502         return 0;
1503
1504 fail:
1505         free_ps_info(s);
1506         /* Set invalid to force reload */
1507         s->sps.pic_width_in_luma_samples = 0;
1508         return -ENOMEM;
1509 }
1510
/*
 * DMA-map the accumulated phase-1 command FIFO for the hardware, with the
 * length rounded up to the device cache-line granularity.  On success the
 * mapping is recorded in de->cmd_addr/cmd_size (unmapped again in
 * dec_env_delete()); returns -ENOMEM if the mapping fails.
 */
static int write_cmd_buffer(struct rpivid_dev *const dev,
                            struct rpivid_dec_env *const de,
                            const struct rpivid_dec_state *const s)
{
	const size_t cmd_size = ALIGN(de->cmd_len * sizeof(de->cmd_fifo[0]),
				      dev->cache_align);

	de->cmd_addr = dma_map_single(dev->dev, de->cmd_fifo,
				      cmd_size, DMA_TO_DEVICE);
	if (dma_mapping_error(dev->dev, de->cmd_addr)) {
		v4l2_err(&dev->v4l2_dev,
			 "Map cmd buffer (%zu): FAILED\n", cmd_size);
		return -ENOMEM;
	}
	de->cmd_size = cmd_size;
	return 0;
}
1528
/*
 * Size the collocated motion-vector buffer for the current sequence.
 * Stride is the luma width aligned up to 64; total size scales with
 * aligned_height / 16 -- NOTE(review): presumably one MV record strip per
 * 16 luma rows, but the exact HW layout is not visible here; confirm
 * against the HEVC block docs.
 */
static void setup_colmv(struct rpivid_ctx *const ctx, struct rpivid_run *run,
                        struct rpivid_dec_state *const s)
{
	ctx->colmv_stride = ALIGN(s->sps.pic_width_in_luma_samples, 64);
	ctx->colmv_picsize = ctx->colmv_stride *
		(ALIGN(s->sps.pic_height_in_luma_samples, 64) >> 4);
}
1536
1537 // Can be called from irq context
1538 static struct rpivid_dec_env *dec_env_new(struct rpivid_ctx *const ctx)
1539 {
1540         struct rpivid_dec_env *de;
1541         unsigned long lock_flags;
1542
1543         spin_lock_irqsave(&ctx->dec_lock, lock_flags);
1544
1545         de = ctx->dec_free;
1546         if (de) {
1547                 ctx->dec_free = de->next;
1548                 de->next = NULL;
1549                 de->state = RPIVID_DECODE_SLICE_START;
1550         }
1551
1552         spin_unlock_irqrestore(&ctx->dec_lock, lock_flags);
1553         return de;
1554 }
1555
1556 // Can be called from irq context
1557 static void dec_env_delete(struct rpivid_dec_env *const de)
1558 {
1559         struct rpivid_ctx * const ctx = de->ctx;
1560         unsigned long lock_flags;
1561
1562         if (de->cmd_size) {
1563                 dma_unmap_single(ctx->dev->dev, de->cmd_addr, de->cmd_size,
1564                                  DMA_TO_DEVICE);
1565                 de->cmd_size = 0;
1566         }
1567
1568         aux_q_release(ctx, &de->frame_aux);
1569         aux_q_release(ctx, &de->col_aux);
1570
1571         spin_lock_irqsave(&ctx->dec_lock, lock_flags);
1572
1573         de->state = RPIVID_DECODE_END;
1574         de->next = ctx->dec_free;
1575         ctx->dec_free = de;
1576
1577         spin_unlock_irqrestore(&ctx->dec_lock, lock_flags);
1578 }
1579
1580 static void dec_env_uninit(struct rpivid_ctx *const ctx)
1581 {
1582         unsigned int i;
1583
1584         if (ctx->dec_pool) {
1585                 for (i = 0; i != RPIVID_DEC_ENV_COUNT; ++i) {
1586                         struct rpivid_dec_env *const de = ctx->dec_pool + i;
1587
1588                         kfree(de->cmd_fifo);
1589                 }
1590
1591                 kfree(ctx->dec_pool);
1592         }
1593
1594         ctx->dec_pool = NULL;
1595         ctx->dec_free = NULL;
1596 }
1597
1598 static int dec_env_init(struct rpivid_ctx *const ctx)
1599 {
1600         unsigned int i;
1601
1602         ctx->dec_pool = kzalloc(sizeof(*ctx->dec_pool) * RPIVID_DEC_ENV_COUNT,
1603                                 GFP_KERNEL);
1604         if (!ctx->dec_pool)
1605                 return -1;
1606
1607         spin_lock_init(&ctx->dec_lock);
1608
1609         // Build free chain
1610         ctx->dec_free = ctx->dec_pool;
1611         for (i = 0; i != RPIVID_DEC_ENV_COUNT - 1; ++i)
1612                 ctx->dec_pool[i].next = ctx->dec_pool + i + 1;
1613
1614         // Fill in other bits
1615         for (i = 0; i != RPIVID_DEC_ENV_COUNT; ++i) {
1616                 struct rpivid_dec_env *const de = ctx->dec_pool + i;
1617
1618                 de->ctx = ctx;
1619                 de->decode_order = i;
1620 //              de->cmd_max = 1024;
1621                 de->cmd_max = 8096;
1622                 de->cmd_fifo = kmalloc_array(de->cmd_max,
1623                                              sizeof(struct rpi_cmd),
1624                                              GFP_KERNEL);
1625                 if (!de->cmd_fifo)
1626                         goto fail;
1627         }
1628
1629         return 0;
1630
1631 fail:
1632         dec_env_uninit(ctx);
1633         return -1;
1634 }
1635
1636 // Assume that we get exactly the same DPB for every slice
1637 // it makes no real sense otherwise
1638 #if V4L2_HEVC_DPB_ENTRIES_NUM_MAX > 16
1639 #error HEVC_DPB_ENTRIES > h/w slots
1640 #endif
1641
1642 static u32 mk_config2(const struct rpivid_dec_state *const s)
1643 {
1644         const struct v4l2_ctrl_hevc_sps *const sps = &s->sps;
1645         const struct v4l2_ctrl_hevc_pps *const pps = &s->pps;
1646         u32 c;
1647         // BitDepthY
1648         c = (sps->bit_depth_luma_minus8 + 8) << 0;
1649          // BitDepthC
1650         c |= (sps->bit_depth_chroma_minus8 + 8) << 4;
1651          // BitDepthY
1652         if (sps->bit_depth_luma_minus8)
1653                 c |= BIT(8);
1654         // BitDepthC
1655         if (sps->bit_depth_chroma_minus8)
1656                 c |= BIT(9);
1657         c |= s->log2_ctb_size << 10;
1658         if (pps->flags & V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED)
1659                 c |= BIT(13);
1660         if (sps->flags & V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED)
1661                 c |= BIT(14);
1662         if (s->mk_aux)
1663                 c |= BIT(15); /* Write motion vectors to external memory */
1664         c |= (pps->log2_parallel_merge_level_minus2 + 2) << 16;
1665         if (s->slice_temporal_mvp)
1666                 c |= BIT(19);
1667         if (sps->flags & V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED)
1668                 c |= BIT(20);
1669         c |= (pps->pps_cb_qp_offset & 31) << 21;
1670         c |= (pps->pps_cr_qp_offset & 31) << 26;
1671         return c;
1672 }
1673
/*
 * True when the NAL unit type is a reference type.
 * Per H.265 Table 7-1 the odd-numbered types 1, 3, 5, ..., 15 are the
 * reference variants of each VCL pair; values >= 16 also test true.
 */
static inline bool is_ref_unit_type(const unsigned int nal_unit_type)
{
	/* Odd type, or any type above the 0..15 paired range */
	return (nal_unit_type & 1) != 0 || nal_unit_type > 15;
}
1681
/*
 * Set up decode state for one or more slices of an HEVC frame.
 *
 * Called once per src (OUTPUT) buffer.  On the first slice of a frame
 * it validates SPS/PPS against the capture format, claims a decode env
 * (stashed in ctx->dec0) and pre-computes phase-2 register values; on
 * continuation slices it only checks state consistency.  It then
 * generates phase-1 commands for every slice in the run and, at frame
 * end, resolves DPB reference addresses and aux (collocated MV)
 * buffers and maps the command buffer.
 *
 * On any failure the env is marked in error; actual error reporting
 * happens in the trigger.
 */
static void rpivid_h265_setup(struct rpivid_ctx *ctx, struct rpivid_run *run)
{
	struct rpivid_dev *const dev = ctx->dev;
	const struct v4l2_ctrl_hevc_decode_params *const dec =
						run->h265.dec;
	/* sh0 used where slice header contents should be constant over all
	 * slices, or first slice of frame
	 */
	const struct v4l2_ctrl_hevc_slice_params *const sh0 =
					run->h265.slice_params;
	struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
	struct rpivid_dec_state *const s = ctx->state;
	struct vb2_queue *vq;
	struct rpivid_dec_env *de = ctx->dec0;
	unsigned int prev_rs;
	unsigned int i;
	int rv;
	bool slice_temporal_mvp;
	bool frame_end;

	xtrace_in(dev, de);
	s->sh = NULL;  // Avoid use until in the slice loop

	/* No HOLD flag => this src buffer completes the frame */
	frame_end =
		((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0);

	slice_temporal_mvp = (sh0->flags &
		   V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED);

	if (de && de->state != RPIVID_DECODE_END) {
		/* Continuation slice of a frame already in progress */
		switch (de->state) {
		case RPIVID_DECODE_SLICE_CONTINUE:
			// Expected state
			break;
		default:
			v4l2_err(&dev->v4l2_dev, "%s: Unexpected state: %d\n",
				 __func__, de->state);
		/* FALLTHRU */
		case RPIVID_DECODE_ERROR_CONTINUE:
			// Uncleared error - fail now
			goto fail;
		}

		if (s->slice_temporal_mvp != slice_temporal_mvp) {
			v4l2_warn(&dev->v4l2_dev,
				  "Slice Temporal MVP non-constant\n");
			goto fail;
		}
	} else {
		/* Frame start */
		unsigned int ctb_size_y;
		bool sps_changed = false;

		if (memcmp(&s->sps, run->h265.sps, sizeof(s->sps)) != 0) {
			/* SPS changed */
			v4l2_info(&dev->v4l2_dev, "SPS changed\n");
			memcpy(&s->sps, run->h265.sps, sizeof(s->sps));
			sps_changed = true;
		}
		if (sps_changed ||
		    memcmp(&s->pps, run->h265.pps, sizeof(s->pps)) != 0) {
			/* PPS changed */
			v4l2_info(&dev->v4l2_dev, "PPS changed\n");
			memcpy(&s->pps, run->h265.pps, sizeof(s->pps));

			/* Recalc stuff as required */
			rv = updated_ps(s);
			if (rv)
				goto fail;
		}

		de = dec_env_new(ctx);
		if (!de) {
			v4l2_err(&dev->v4l2_dev,
				 "Failed to find free decode env\n");
			goto fail;
		}
		ctx->dec0 = de;

		ctb_size_y =
			1U << (s->sps.log2_min_luma_coding_block_size_minus3 +
			       3 +
			       s->sps.log2_diff_max_min_luma_coding_block_size);

		de->pic_width_in_ctbs_y =
			(s->sps.pic_width_in_luma_samples + ctb_size_y - 1) /
				ctb_size_y; // 7-15
		de->pic_height_in_ctbs_y =
			(s->sps.pic_height_in_luma_samples + ctb_size_y - 1) /
				ctb_size_y; // 7-17
		de->cmd_len = 0;
		de->dpbno_col = ~0U;

		de->bit_copy_gptr = ctx->bitbufs + ctx->p1idx;
		de->bit_copy_len = 0;

		/* *128: offsets/strides in column units for COL128 formats */
		de->frame_c_offset = ctx->dst_fmt.height * 128;
		de->frame_stride = ctx->dst_fmt.plane_fmt[0].bytesperline * 128;
		de->frame_addr =
			vb2_dma_contig_plane_dma_addr(&run->dst->vb2_buf, 0);
		de->frame_aux = NULL;

		/* h/w only supports equal luma/chroma depth of 8 or 10 bits */
		if (s->sps.bit_depth_luma_minus8 !=
		    s->sps.bit_depth_chroma_minus8) {
			v4l2_warn(&dev->v4l2_dev,
				  "Chroma depth (%d) != Luma depth (%d)\n",
				  s->sps.bit_depth_chroma_minus8 + 8,
				  s->sps.bit_depth_luma_minus8 + 8);
			goto fail;
		}
		if (s->sps.bit_depth_luma_minus8 == 0) {
			if (ctx->dst_fmt.pixelformat !=
						V4L2_PIX_FMT_NV12_COL128) {
				v4l2_err(&dev->v4l2_dev,
					 "Pixel format %#x != NV12_COL128 for 8-bit output",
					 ctx->dst_fmt.pixelformat);
				goto fail;
			}
		} else if (s->sps.bit_depth_luma_minus8 == 2) {
			if (ctx->dst_fmt.pixelformat !=
						V4L2_PIX_FMT_NV12_10_COL128) {
				v4l2_err(&dev->v4l2_dev,
					 "Pixel format %#x != NV12_10_COL128 for 10-bit output",
					 ctx->dst_fmt.pixelformat);
				goto fail;
			}
		} else {
			v4l2_warn(&dev->v4l2_dev,
				  "Luma depth (%d) unsupported\n",
				  s->sps.bit_depth_luma_minus8 + 8);
			goto fail;
		}
		if (run->dst->vb2_buf.num_planes != 1) {
			v4l2_warn(&dev->v4l2_dev, "Capture planes (%d) != 1\n",
				  run->dst->vb2_buf.num_planes);
			goto fail;
		}
		if (run->dst->planes[0].length <
		    ctx->dst_fmt.plane_fmt[0].sizeimage) {
			v4l2_warn(&dev->v4l2_dev,
				  "Capture plane[0] length (%d) < sizeimage (%d)\n",
				  run->dst->planes[0].length,
				  ctx->dst_fmt.plane_fmt[0].sizeimage);
			goto fail;
		}

		// Fill in ref planes with our address s.t. if we mess
		// up refs somehow then we still have a valid address
		// entry
		for (i = 0; i != 16; ++i)
			de->ref_addrs[i] = de->frame_addr;

		/*
		 * Stash initial temporal_mvp flag
		 * This must be the same for all pic slices (7.4.7.1)
		 */
		s->slice_temporal_mvp = slice_temporal_mvp;

		/*
		 * Need Aux ents for all (ref) DPB ents if temporal MV could
		 * be enabled for any pic
		 */
		s->use_aux = ((s->sps.flags &
			       V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) != 0);
		s->mk_aux = s->use_aux &&
			    (s->sps.sps_max_sub_layers_minus1 >= sh0->nuh_temporal_id_plus1 ||
			     is_ref_unit_type(sh0->nal_unit_type));

		// Phase 2 reg pre-calc
		de->rpi_config2 = mk_config2(s);
		de->rpi_framesize = (s->sps.pic_height_in_luma_samples << 16) |
				    s->sps.pic_width_in_luma_samples;
		de->rpi_currpoc = sh0->slice_pic_order_cnt;

		if (s->sps.flags &
		    V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) {
			setup_colmv(ctx, run, s);
		}

		s->slice_idx = 0;

		/* First slice of a frame must start at CTB 0 */
		if (sh0->slice_segment_addr != 0) {
			v4l2_warn(&dev->v4l2_dev,
				  "New frame but segment_addr=%d\n",
				  sh0->slice_segment_addr);
			goto fail;
		}

		/* Allocate a bitbuf if we need one - don't need one if single
		 * slice as we can use the src buf directly
		 */
		if (!frame_end && !de->bit_copy_gptr->ptr) {
			size_t bits_alloc;
			bits_alloc = rpivid_bit_buf_size(s->sps.pic_width_in_luma_samples,
							 s->sps.pic_height_in_luma_samples,
							 s->sps.bit_depth_luma_minus8);

			if (gptr_alloc(dev, de->bit_copy_gptr,
				       bits_alloc,
				       DMA_ATTR_FORCE_CONTIGUOUS) != 0) {
				v4l2_err(&dev->v4l2_dev,
					 "Unable to alloc buf (%zu) for bit copy\n",
					 bits_alloc);
				goto fail;
			}
			v4l2_info(&dev->v4l2_dev,
				  "Alloc buf (%zu) for bit copy OK\n",
				  bits_alloc);
		}
	}

	// Either map src buffer or use directly
	s->src_addr = 0;
	s->src_buf = NULL;

	if (frame_end)
		s->src_addr = vb2_dma_contig_plane_dma_addr(&run->src->vb2_buf,
							    0);
	if (!s->src_addr)
		s->src_buf = vb2_plane_vaddr(&run->src->vb2_buf, 0);
	if (!s->src_addr && !s->src_buf) {
		v4l2_err(&dev->v4l2_dev, "Failed to map src buffer\n");
		goto fail;
	}

	// Pre calc a few things
	s->dec = dec;
	/* Generate phase-1 commands for every slice in this run */
	for (i = 0; i != run->h265.slice_ents; ++i) {
		const struct v4l2_ctrl_hevc_slice_params *const sh = sh0 + i;
		const bool last_slice = frame_end && i + 1 == run->h265.slice_ents;

		s->sh = sh;

		/* Sanity-check the (untrusted) slice header sizes */
		if (run->src->planes[0].bytesused < (sh->bit_size + 7) / 8) {
			v4l2_warn(&dev->v4l2_dev,
				  "Bit size %d > bytesused %d\n",
				  sh->bit_size, run->src->planes[0].bytesused);
			goto fail;
		}
		if (sh->data_bit_offset >= sh->bit_size ||
		    sh->bit_size - sh->data_bit_offset < 8) {
			v4l2_warn(&dev->v4l2_dev,
				  "Bit size %d < Bit offset %d + 8\n",
				  sh->bit_size, sh->data_bit_offset);
			goto fail;
		}

		s->slice_qp = 26 + s->pps.init_qp_minus26 + sh->slice_qp_delta;
		s->max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ?
						0 :
						(5 - sh->five_minus_max_num_merge_cand);
		s->dependent_slice_segment_flag =
			((sh->flags &
			  V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT) != 0);

		s->nb_refs[0] = (sh->slice_type == HEVC_SLICE_I) ?
					0 :
					sh->num_ref_idx_l0_active_minus1 + 1;
		s->nb_refs[1] = (sh->slice_type != HEVC_SLICE_B) ?
					0 :
					sh->num_ref_idx_l1_active_minus1 + 1;

		if (s->sps.flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED)
			populate_scaling_factors(run, de, s);

		/* Calc all the random coord info to avoid repeated conversion in/out */
		s->start_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr];
		s->start_ctb_x = sh->slice_segment_addr % de->pic_width_in_ctbs_y;
		s->start_ctb_y = sh->slice_segment_addr / de->pic_width_in_ctbs_y;
		/* Last CTB of previous slice */
		prev_rs = !s->start_ts ? 0 : s->ctb_addr_ts_to_rs[s->start_ts - 1];
		s->prev_ctb_x = prev_rs % de->pic_width_in_ctbs_y;
		s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y;

		if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED))
			rv = wpp_decode_slice(de, s, last_slice);
		else
			rv = decode_slice(de, s, last_slice);
		if (rv)
			goto fail;

		++s->slice_idx;
	}

	if (!frame_end) {
		xtrace_ok(dev, de);
		return;
	}

	// Frame end
	memset(dpb_q_aux, 0,
	       sizeof(*dpb_q_aux) * V4L2_HEVC_DPB_ENTRIES_NUM_MAX);

	// Locate ref frames
	// At least in the current implementation this is constant across all
	// slices. If this changes we will need idx mapping code.
	// Uses sh so here rather than trigger

	vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx,
			     V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);

	if (!vq) {
		v4l2_err(&dev->v4l2_dev, "VQ gone!\n");
		goto fail;
	}

	//        v4l2_info(&dev->v4l2_dev, "rpivid_h265_end of frame\n");
	if (write_cmd_buffer(dev, de, s))
		goto fail;

	/* Resolve DPB timestamps to capture buffers and their DMA addrs */
	for (i = 0; i < dec->num_active_dpb_entries; ++i) {
		int buffer_index =
			vb2_find_timestamp(vq, dec->dpb[i].timestamp, 0);
		struct vb2_buffer *buf = buffer_index < 0 ?
					NULL :
					vb2_get_buffer(vq, buffer_index);

		if (!buf) {
			v4l2_warn(&dev->v4l2_dev,
				  "Missing DPB ent %d, timestamp=%lld, index=%d\n",
				  i, (long long)dec->dpb[i].timestamp,
				  buffer_index);
			continue;
		}

		if (s->use_aux) {
			dpb_q_aux[i] = aux_q_ref_idx(ctx, buffer_index);
			if (!dpb_q_aux[i])
				v4l2_warn(&dev->v4l2_dev,
					  "Missing DPB AUX ent %d, timestamp=%lld, index=%d\n",
					  i, (long long)dec->dpb[i].timestamp,
					  buffer_index);
		}

		de->ref_addrs[i] =
			vb2_dma_contig_plane_dma_addr(buf, 0);
	}

	// Move DPB from temp
	for (i = 0; i != V4L2_HEVC_DPB_ENTRIES_NUM_MAX; ++i) {
		aux_q_release(ctx, &s->ref_aux[i]);
		s->ref_aux[i] = dpb_q_aux[i];
	}
	// Unref the old frame aux too - it is either in the DPB or not
	// now
	aux_q_release(ctx, &s->frame_aux);

	if (s->mk_aux) {
		s->frame_aux = aux_q_new(ctx, run->dst->vb2_buf.index);

		if (!s->frame_aux) {
			v4l2_err(&dev->v4l2_dev,
				 "Failed to obtain aux storage for frame\n");
			goto fail;
		}

		de->frame_aux = aux_q_ref(ctx, s->frame_aux);
	}

	if (de->dpbno_col != ~0U) {
		if (de->dpbno_col >= dec->num_active_dpb_entries) {
			v4l2_err(&dev->v4l2_dev,
				 "Col ref index %d >= %d\n",
				 de->dpbno_col,
				 dec->num_active_dpb_entries);
		} else {
			// Standard requires that the col pic is
			// constant for the duration of the pic
			// (text of collocated_ref_idx in H265-2 2018
			// 7.4.7.1)

			// Spot the collocated ref in passing
			de->col_aux = aux_q_ref(ctx,
						dpb_q_aux[de->dpbno_col]);

			if (!de->col_aux) {
				v4l2_warn(&dev->v4l2_dev,
					  "Missing DPB ent for col\n");
				// Probably need to abort if this fails
				// as P2 may explode on bad data
				goto fail;
			}
		}
	}

	de->state = RPIVID_DECODE_PHASE1;
	xtrace_ok(dev, de);
	return;

fail:
	if (de)
		// Actual error reporting happens in Trigger
		de->state = frame_end ? RPIVID_DECODE_ERROR_DONE :
					RPIVID_DECODE_ERROR_CONTINUE;
	xtrace_fail(dev, de);
}
2078
2079 //////////////////////////////////////////////////////////////////////////////
2080 // Handle PU and COEFF stream overflow
2081
2082 // Returns:
2083 // -1  Phase 1 decode error
2084 //  0  OK
2085 // >0  Out of space (bitmask)
2086
2087 #define STATUS_COEFF_EXHAUSTED  8
2088 #define STATUS_PU_EXHAUSTED     16
2089
2090 static int check_status(const struct rpivid_dev *const dev)
2091 {
2092         const u32 cfstatus = apb_read(dev, RPI_CFSTATUS);
2093         const u32 cfnum = apb_read(dev, RPI_CFNUM);
2094         u32 status = apb_read(dev, RPI_STATUS);
2095
2096         // Handle PU and COEFF stream overflow
2097
2098         // this is the definition of successful completion of phase 1
2099         // it assures that status register is zero and all blocks in each tile
2100         // have completed
2101         if (cfstatus == cfnum)
2102                 return 0;       //No error
2103
2104         status &= (STATUS_PU_EXHAUSTED | STATUS_COEFF_EXHAUSTED);
2105         if (status)
2106                 return status;
2107
2108         return -1;
2109 }
2110
/*
 * Phase 2 completion callback (irq context).
 * Releases the P1/P2 claim, marks the capture buffer done, releases the
 * media request and recycles the decode env.
 */
static void phase2_cb(struct rpivid_dev *const dev, void *v)
{
	struct rpivid_dec_env *const de = v;

	xtrace_in(dev, de);

	/* Done with buffers - allow new P1 */
	rpivid_hw_irq_active1_enable_claim(dev, 1);

	v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_DONE);
	de->frame_buf = NULL;

	/* Release the media request by whichever mechanism is configured */
#if USE_REQUEST_PIN
	media_request_unpin(de->req_pin);
	de->req_pin = NULL;
#else
	media_request_object_complete(de->req_obj);
	de->req_obj = NULL;
#endif

	xtrace_ok(dev, de);
	dec_env_delete(de);
}
2134
/*
 * Phase 2 h/w claimed callback: program the phase-2 registers for this
 * decode env and kick it off.
 * NOTE(review): the final write to RPI_NUMROWS via apb_write_final
 * appears to be what starts the h/w - confirm against rpivid_hw.
 */
static void phase2_claimed(struct rpivid_dev *const dev, void *v)
{
	struct rpivid_dec_env *const de = v;
	unsigned int i;

	xtrace_in(dev, de);

	/* PU and coefficient streams produced by phase 1 */
	apb_write_vc_addr(dev, RPI_PURBASE, de->pu_base_vc);
	apb_write_vc_len(dev, RPI_PURSTRIDE, de->pu_stride);
	apb_write_vc_addr(dev, RPI_COEFFRBASE, de->coeff_base_vc);
	apb_write_vc_len(dev, RPI_COEFFRSTRIDE, de->coeff_stride);

	/* Output (capture) frame: luma base, then chroma at frame_c_offset */
	apb_write_vc_addr(dev, RPI_OUTYBASE, de->frame_addr);
	apb_write_vc_addr(dev, RPI_OUTCBASE,
			  de->frame_addr + de->frame_c_offset);
	apb_write_vc_len(dev, RPI_OUTYSTRIDE, de->frame_stride);
	apb_write_vc_len(dev, RPI_OUTCSTRIDE, de->frame_stride);

	//    v4l2_info(&dev->v4l2_dev, "Frame: Y=%llx, C=%llx, Stride=%x\n",
	//              de->frame_addr, de->frame_addr + de->frame_c_offset,
	//              de->frame_stride);

	/* 16 reference picture slots: Y base/stride then C base/stride */
	for (i = 0; i < 16; i++) {
		// Strides are in fact unused but fill in anyway
		apb_write_vc_addr(dev, 0x9000 + 16 * i, de->ref_addrs[i]);
		apb_write_vc_len(dev, 0x9004 + 16 * i, de->frame_stride);
		apb_write_vc_addr(dev, 0x9008 + 16 * i,
				  de->ref_addrs[i] + de->frame_c_offset);
		apb_write_vc_len(dev, 0x900C + 16 * i, de->frame_stride);
	}

	/* Values pre-computed at frame setup */
	apb_write(dev, RPI_CONFIG2, de->rpi_config2);
	apb_write(dev, RPI_FRAMESIZE, de->rpi_framesize);
	apb_write(dev, RPI_CURRPOC, de->rpi_currpoc);
	//    v4l2_info(&dev->v4l2_dev, "Config2=%#x, FrameSize=%#x, POC=%#x\n",
	//    de->rpi_config2, de->rpi_framesize, de->rpi_currpoc);

	// collocated reads/writes
	apb_write_vc_len(dev, RPI_COLSTRIDE,
			 de->ctx->colmv_stride); // Read vals
	apb_write_vc_len(dev, RPI_MVSTRIDE,
			 de->ctx->colmv_stride); // Write vals
	apb_write_vc_addr(dev, RPI_MVBASE,
			  !de->frame_aux ? 0 : de->frame_aux->col.addr);
	apb_write_vc_addr(dev, RPI_COLBASE,
			  !de->col_aux ? 0 : de->col_aux->col.addr);

	//v4l2_info(&dev->v4l2_dev,
	//         "Mv=%llx, Col=%llx, Stride=%x, Buf=%llx->%llx\n",
	//         de->rpi_mvbase, de->rpi_colbase, de->ctx->colmv_stride,
	//         de->ctx->colmvbuf.addr, de->ctx->colmvbuf.addr +
	//         de->ctx->colmvbuf.size);

	/* Register the completion callback before the final (kick) write */
	rpivid_hw_irq_active2_irq(dev, &de->irq_ent, phase2_cb, de);

	apb_write_final(dev, RPI_NUMROWS, de->pic_height_in_ctbs_y);

	xtrace_ok(dev, de);
}
2194
2195 static void phase1_claimed(struct rpivid_dev *const dev, void *v);
2196
// release any and all objects associated with de
// and reenable phase 1 if required
static void phase1_err_fin(struct rpivid_dev *const dev,
			   struct rpivid_ctx *const ctx,
			   struct rpivid_dec_env *const de)
{
	/* Return all detached buffers */
	if (de->src_buf)
		v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_ERROR);
	de->src_buf = NULL;
	if (de->frame_buf)
		v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_ERROR);
	de->frame_buf = NULL;
	/* Release the media request by whichever mechanism is configured */
#if USE_REQUEST_PIN
	if (de->req_pin)
		media_request_unpin(de->req_pin);
	de->req_pin = NULL;
#else
	if (de->req_obj)
		media_request_object_complete(de->req_obj);
	de->req_obj = NULL;
#endif

	/* Recycle the env (unmaps cmd buffer, drops aux refs) */
	dec_env_delete(de);

	/* Reenable phase 0 if we were blocking */
	if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1)
		v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);

	/* Done with P1-P2 buffers - allow new P1 */
	rpivid_hw_irq_active1_enable_claim(dev, 1);
}
2229
/*
 * Thread-context continuation of phase 1 after the h/w reported
 * out-of-space: grow whichever intermediate buffer(s) were exhausted
 * (per de->p1_status) and retry via phase1_claimed.  Runs in thread
 * rather than irq context as the realloc may sleep.
 */
static void phase1_thread(struct rpivid_dev *const dev, void *v)
{
	struct rpivid_dec_env *const de = v;
	struct rpivid_ctx *const ctx = de->ctx;

	struct rpivid_gptr *const pu_gptr = ctx->pu_bufs + ctx->p2idx;
	struct rpivid_gptr *const coeff_gptr = ctx->coeff_bufs + ctx->p2idx;

	xtrace_in(dev, de);

	/* Grow the PU buffer if phase 1 ran out of PU space */
	if (de->p1_status & STATUS_PU_EXHAUSTED) {
		if (gptr_realloc_new(dev, pu_gptr, next_size(pu_gptr->size))) {
			v4l2_err(&dev->v4l2_dev,
				 "%s: PU realloc (%zx) failed\n",
				 __func__, pu_gptr->size);
			goto fail;
		}
		v4l2_info(&dev->v4l2_dev, "%s: PU realloc (%zx) OK\n",
			  __func__, pu_gptr->size);
	}

	/* Likewise the coefficient buffer */
	if (de->p1_status & STATUS_COEFF_EXHAUSTED) {
		if (gptr_realloc_new(dev, coeff_gptr,
				     next_size(coeff_gptr->size))) {
			v4l2_err(&dev->v4l2_dev,
				 "%s: Coeff realloc (%zx) failed\n",
				 __func__, coeff_gptr->size);
			goto fail;
		}
		v4l2_info(&dev->v4l2_dev, "%s: Coeff realloc (%zx) OK\n",
			  __func__, coeff_gptr->size);
	}

	/* Retry phase 1 with the enlarged buffers */
	phase1_claimed(dev, de);
	xtrace_ok(dev, de);
	return;

fail:
	/* If the failed realloc also lost the old buffer we cannot recover */
	if (!pu_gptr->addr || !coeff_gptr->addr) {
		v4l2_err(&dev->v4l2_dev,
			 "%s: Fatal: failed to reclaim old alloc\n",
			 __func__);
		ctx->fatal_err = 1;
	}
	xtrace_fail(dev, de);
	phase1_err_fin(dev, ctx, de);
}
2277
/* Phase1 completion handler - always called in irq context (this is good).
 *
 * Status == 0: release the src buffer, advance p2idx, unthrottle phase0
 * setup if it was blocking and queue the phase2 claim.
 * Status > 0 (buffer exhausted): push recovery onto phase1_thread().
 * Status < 0: hard error - finish via phase1_err_fin().
 */
static void phase1_cb(struct rpivid_dev *const dev, void *v)
{
	struct rpivid_dec_env *const de = v;
	struct rpivid_ctx *const ctx = de->ctx;

	xtrace_in(dev, de);

	de->p1_status = check_status(dev);

	if (de->p1_status != 0) {
		v4l2_info(&dev->v4l2_dev, "%s: Post wait: %#x\n",
			  __func__, de->p1_status);

		/* Negative status is unrecoverable */
		if (de->p1_status < 0)
			goto fail;

		/* Need to realloc - push onto a thread rather than IRQ */
		rpivid_hw_irq_active1_thread(dev, &de->irq_ent,
					     phase1_thread, de);
		return;
	}

	/* Phase1 succeeded - return the src buffer now */
	v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_DONE);
	de->src_buf = NULL;

	/* All phase1 error paths done - it is safe to inc p2idx */
	ctx->p2idx =
		(ctx->p2idx + 1 >= RPIVID_P2BUF_COUNT) ? 0 : ctx->p2idx + 1;

	/* Renable the next setup if we were blocking */
	if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1) {
		xtrace_fin(dev, de);
		v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
	}

	rpivid_hw_irq_active2_claim(dev, &de->irq_ent, phase2_claimed, de);

	xtrace_ok(dev, de);
	return;

fail:
	xtrace_fail(dev, de);
	phase1_err_fin(dev, ctx, de);
}
2323
/* Phase1 claim callback: the phase1 h/w is now ours.  Programs the PU and
 * coefficient output buffers for the current p2idx, registers phase1_cb()
 * as the completion IRQ handler and starts the command FIFO.
 */
static void phase1_claimed(struct rpivid_dev *const dev, void *v)
{
	struct rpivid_dec_env *const de = v;
	struct rpivid_ctx *const ctx = de->ctx;

	const struct rpivid_gptr * const pu_gptr = ctx->pu_bufs + ctx->p2idx;
	const struct rpivid_gptr * const coeff_gptr = ctx->coeff_bufs +
						      ctx->p2idx;

	xtrace_in(dev, de);

	/* A previous fatal error (e.g. lost realloc) poisons the context */
	if (ctx->fatal_err)
		goto fail;

	/* Stride is the buffer divided over CTB rows, 64-byte aligned */
	de->pu_base_vc = pu_gptr->addr;
	de->pu_stride =
		ALIGN_DOWN(pu_gptr->size / de->pic_height_in_ctbs_y, 64);

	de->coeff_base_vc = coeff_gptr->addr;
	de->coeff_stride =
		ALIGN_DOWN(coeff_gptr->size / de->pic_height_in_ctbs_y, 64);

	/* phase1_claimed blocked until cb_phase1 completed so p2idx inc
	 * in cb_phase1 after error detection
	 */

	apb_write_vc_addr(dev, RPI_PUWBASE, de->pu_base_vc);
	apb_write_vc_len(dev, RPI_PUWSTRIDE, de->pu_stride);
	apb_write_vc_addr(dev, RPI_COEFFWBASE, de->coeff_base_vc);
	apb_write_vc_len(dev, RPI_COEFFWSTRIDE, de->coeff_stride);

	// Trigger command FIFO
	apb_write(dev, RPI_CFNUM, de->cmd_len);

	// Claim irq
	rpivid_hw_irq_active1_irq(dev, &de->irq_ent, phase1_cb, de);

	// And start the h/w - this write must come last
	apb_write_vc_addr_final(dev, RPI_CFBASE, de->cmd_addr);

	xtrace_ok(dev, de);
	return;

fail:
	xtrace_fail(dev, de);
	phase1_err_fin(dev, ctx, de);
}
2371
2372 static void dec_state_delete(struct rpivid_ctx *const ctx)
2373 {
2374         unsigned int i;
2375         struct rpivid_dec_state *const s = ctx->state;
2376
2377         if (!s)
2378                 return;
2379         ctx->state = NULL;
2380
2381         free_ps_info(s);
2382
2383         for (i = 0; i != HEVC_MAX_REFS; ++i)
2384                 aux_q_release(ctx, &s->ref_aux[i]);
2385         aux_q_release(ctx, &s->frame_aux);
2386
2387         kfree(s);
2388 }
2389
/* Waiter state for irq_sync(): completion is signalled from the phase2
 * claim callback once both IRQ phases have been claimed in turn.
 */
struct irq_sync {
	atomic_t done;			/* set to 1 by phase2_sync_claimed() */
	wait_queue_head_t wq;		/* irq_sync() sleeps here */
	struct rpivid_hw_irq_ent irq_ent;	/* claim bookkeeping */
};
2395
/* Phase2 claim callback for irq_sync(): mark done then wake the waiter.
 * done must be set before the wake-up so the wait condition holds.
 */
static void phase2_sync_claimed(struct rpivid_dev *const dev, void *v)
{
	struct irq_sync *const sync = v;

	atomic_set(&sync->done, 1);
	wake_up(&sync->wq);
}
2403
/* Phase1 claim callback for irq_sync(): phase1 enables are counted, so
 * re-enable the one we consumed, then chain the phase2 claim.
 */
static void phase1_sync_claimed(struct rpivid_dev *const dev, void *v)
{
	struct irq_sync *const sync = v;

	rpivid_hw_irq_active1_enable_claim(dev, 1);
	rpivid_hw_irq_active2_claim(dev, &sync->irq_ent, phase2_sync_claimed, sync);
}
2411
/* Sync with IRQ operations
 *
 * Claims phase1 and phase2 in turn and waits for the phase2 claim so any
 * pending IRQ ops will have completed by the time this returns
 *
 * phase1 has counted enables so must reenable once claimed
 * phase2 has unlimited enables
 *
 * Sleeps on a waitqueue, so must not be called from IRQ/atomic context.
 */
static void irq_sync(struct rpivid_dev *const dev)
{
	struct irq_sync sync;

	atomic_set(&sync.done, 0);
	init_waitqueue_head(&sync.wq);

	/* phase1_sync_claimed chains to phase2_sync_claimed which sets
	 * done and wakes us
	 */
	rpivid_hw_irq_active1_claim(dev, &sync.irq_ent, phase1_sync_claimed, &sync);
	wait_event(sync.wq, atomic_read(&sync.done));
}
2430
2431 static void h265_ctx_uninit(struct rpivid_dev *const dev, struct rpivid_ctx *ctx)
2432 {
2433         unsigned int i;
2434
2435         dec_env_uninit(ctx);
2436         dec_state_delete(ctx);
2437
2438         // dec_env & state must be killed before this to release the buffer to
2439         // the free pool
2440         aux_q_uninit(ctx);
2441
2442         for (i = 0; i != ARRAY_SIZE(ctx->bitbufs); ++i)
2443                 gptr_free(dev, ctx->bitbufs + i);
2444         for (i = 0; i != ARRAY_SIZE(ctx->pu_bufs); ++i)
2445                 gptr_free(dev, ctx->pu_bufs + i);
2446         for (i = 0; i != ARRAY_SIZE(ctx->coeff_bufs); ++i)
2447                 gptr_free(dev, ctx->coeff_bufs + i);
2448 }
2449
/* .stop hook: wait for any in-flight IRQ work to drain, then free all
 * per-context decode resources.
 */
static void rpivid_h265_stop(struct rpivid_ctx *ctx)
{
	struct rpivid_dev *const dev = ctx->dev;

	v4l2_info(&dev->v4l2_dev, "%s\n", __func__);

	/* Sync must precede uninit so IRQ callbacks can't touch freed state */
	irq_sync(dev);
	h265_ctx_uninit(dev, ctx);
}
2459
2460 static int rpivid_h265_start(struct rpivid_ctx *ctx)
2461 {
2462         struct rpivid_dev *const dev = ctx->dev;
2463         unsigned int i;
2464
2465         unsigned int w = ctx->dst_fmt.width;
2466         unsigned int h = ctx->dst_fmt.height;
2467         unsigned int wxh;
2468         size_t pu_alloc;
2469         size_t coeff_alloc;
2470
2471 #if DEBUG_TRACE_P1_CMD
2472         p1_z = 0;
2473 #endif
2474
2475         // Generate a sanitised WxH for memory alloc
2476         // Assume HD if unset
2477         if (w == 0)
2478                 w = 1920;
2479         if (w > 4096)
2480                 w = 4096;
2481         if (h == 0)
2482                 h = 1088;
2483         if (h > 4096)
2484                 h = 4096;
2485         wxh = w * h;
2486
2487         v4l2_info(&dev->v4l2_dev, "%s: (%dx%d)\n", __func__,
2488                   ctx->dst_fmt.width, ctx->dst_fmt.height);
2489
2490         ctx->fatal_err = 0;
2491         ctx->dec0 = NULL;
2492         ctx->state = kzalloc(sizeof(*ctx->state), GFP_KERNEL);
2493         if (!ctx->state) {
2494                 v4l2_err(&dev->v4l2_dev, "Failed to allocate decode state\n");
2495                 goto fail;
2496         }
2497
2498         if (dec_env_init(ctx) != 0) {
2499                 v4l2_err(&dev->v4l2_dev, "Failed to allocate decode envs\n");
2500                 goto fail;
2501         }
2502
2503         // Finger in the air PU & Coeff alloc
2504         // Will be realloced if too small
2505         coeff_alloc = rpivid_round_up_size(wxh);
2506         pu_alloc = rpivid_round_up_size(wxh / 4);
2507         for (i = 0; i != ARRAY_SIZE(ctx->pu_bufs); ++i) {
2508                 // Don't actually need a kernel mapping here
2509                 if (gptr_alloc(dev, ctx->pu_bufs + i, pu_alloc,
2510                                DMA_ATTR_NO_KERNEL_MAPPING))
2511                         goto fail;
2512                 if (gptr_alloc(dev, ctx->coeff_bufs + i, coeff_alloc,
2513                                DMA_ATTR_NO_KERNEL_MAPPING))
2514                         goto fail;
2515         }
2516         aux_q_init(ctx);
2517
2518         return 0;
2519
2520 fail:
2521         h265_ctx_uninit(dev, ctx);
2522         return -ENOMEM;
2523 }
2524
/* .trigger hook: entered once the request's controls have been applied.
 * Dispatches on the dec_env state left by setup: slice continuations just
 * finish the m2m job, error states clean up, and PHASE1 hands the dec_env
 * off to the phase1 claim/IRQ chain.
 */
static void rpivid_h265_trigger(struct rpivid_ctx *ctx)
{
	struct rpivid_dev *const dev = ctx->dev;
	struct rpivid_dec_env *const de = ctx->dec0;

	xtrace_in(dev, de);

	/* A NULL dec_env is handled as a continuing error state */
	switch (!de ? RPIVID_DECODE_ERROR_CONTINUE : de->state) {
	case RPIVID_DECODE_SLICE_START:
		de->state = RPIVID_DECODE_SLICE_CONTINUE;
		fallthrough;
	case RPIVID_DECODE_SLICE_CONTINUE:
		/* More slices expected - just complete this src buffer */
		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
						 VB2_BUF_STATE_DONE);
		xtrace_ok(dev, de);
		break;

	default:
		v4l2_err(&dev->v4l2_dev, "%s: Unexpected state: %d\n", __func__,
			 de->state);
		fallthrough;
	case RPIVID_DECODE_ERROR_DONE:
		ctx->dec0 = NULL;
		dec_env_delete(de);
		fallthrough;
	case RPIVID_DECODE_ERROR_CONTINUE:
		xtrace_fin(dev, de);
		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
						 VB2_BUF_STATE_ERROR);
		break;

	case RPIVID_DECODE_PHASE1:
		/* dec_env leaves the ctx here; phase callbacks own it now */
		ctx->dec0 = NULL;

#if !USE_REQUEST_PIN
		/* Alloc a new request object - needs to be alloced dynamically
		 * as the media request will release it some random time after
		 * it is completed
		 */
		de->req_obj = kmalloc(sizeof(*de->req_obj), GFP_KERNEL);
		if (!de->req_obj) {
			xtrace_fail(dev, de);
			dec_env_delete(de);
			v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev,
							 ctx->fh.m2m_ctx,
							 VB2_BUF_STATE_ERROR);
			break;
		}
		media_request_object_init(de->req_obj);
#warning probably needs to _get the req obj too
#endif
		ctx->p1idx = (ctx->p1idx + 1 >= RPIVID_P1BUF_COUNT) ?
							0 : ctx->p1idx + 1;

		/* We know we have src & dst so no need to test */
		de->src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
		de->frame_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);

#if USE_REQUEST_PIN
		/* Pin the media request over the h/w decode */
		de->req_pin = de->src_buf->vb2_buf.req_obj.req;
		media_request_pin(de->req_pin);
#else
		media_request_object_bind(de->src_buf->vb2_buf.req_obj.req,
					  &dst_req_obj_ops, de, false,
					  de->req_obj);
#endif

		/* We could get rid of the src buffer here if we've already
		 * copied it, but we don't copy the last buffer unless it
		 * didn't return a contig dma addr and that shouldn't happen
		 */

		/* Enable the next setup if our Q isn't too big */
		if (atomic_add_return(1, &ctx->p1out) < RPIVID_P1BUF_COUNT) {
			xtrace_fin(dev, de);
			v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
		}

		rpivid_hw_irq_active1_claim(dev, &de->irq_ent, phase1_claimed,
					    de);
		xtrace_ok(dev, de);
		break;
	}
}
2609
/* HEVC decode hooks registered with the rpivid core */
const struct rpivid_dec_ops rpivid_dec_ops_h265 = {
	.setup = rpivid_h265_setup,
	.start = rpivid_h265_start,
	.stop = rpivid_h265_stop,
	.trigger = rpivid_h265_trigger,
};
2616
2617 static int try_ctrl_sps(struct v4l2_ctrl *ctrl)
2618 {
2619         const struct v4l2_ctrl_hevc_sps *const sps = ctrl->p_new.p_hevc_sps;
2620         struct rpivid_ctx *const ctx = ctrl->priv;
2621         struct rpivid_dev *const dev = ctx->dev;
2622
2623         if (sps->chroma_format_idc != 1) {
2624                 v4l2_warn(&dev->v4l2_dev,
2625                           "Chroma format (%d) unsupported\n",
2626                           sps->chroma_format_idc);
2627                 return -EINVAL;
2628         }
2629
2630         if (sps->bit_depth_luma_minus8 != 0 &&
2631             sps->bit_depth_luma_minus8 != 2) {
2632                 v4l2_warn(&dev->v4l2_dev,
2633                           "Luma depth (%d) unsupported\n",
2634                           sps->bit_depth_luma_minus8 + 8);
2635                 return -EINVAL;
2636         }
2637
2638         if (sps->bit_depth_luma_minus8 != sps->bit_depth_chroma_minus8) {
2639                 v4l2_warn(&dev->v4l2_dev,
2640                           "Chroma depth (%d) != Luma depth (%d)\n",
2641                           sps->bit_depth_chroma_minus8 + 8,
2642                           sps->bit_depth_luma_minus8 + 8);
2643                 return -EINVAL;
2644         }
2645
2646         if (!sps->pic_width_in_luma_samples ||
2647             !sps->pic_height_in_luma_samples ||
2648             sps->pic_width_in_luma_samples > 4096 ||
2649             sps->pic_height_in_luma_samples > 4096) {
2650                 v4l2_warn(&dev->v4l2_dev,
2651                           "Bad sps width (%u) x height (%u)\n",
2652                           sps->pic_width_in_luma_samples,
2653                           sps->pic_height_in_luma_samples);
2654                 return -EINVAL;
2655         }
2656
2657         if (!ctx->dst_fmt_set)
2658                 return 0;
2659
2660         if ((sps->bit_depth_luma_minus8 == 0 &&
2661              ctx->dst_fmt.pixelformat != V4L2_PIX_FMT_NV12_COL128) ||
2662             (sps->bit_depth_luma_minus8 == 2 &&
2663              ctx->dst_fmt.pixelformat != V4L2_PIX_FMT_NV12_10_COL128)) {
2664                 v4l2_warn(&dev->v4l2_dev,
2665                           "SPS luma depth %d does not match capture format\n",
2666                           sps->bit_depth_luma_minus8 + 8);
2667                 return -EINVAL;
2668         }
2669
2670         if (sps->pic_width_in_luma_samples > ctx->dst_fmt.width ||
2671             sps->pic_height_in_luma_samples > ctx->dst_fmt.height) {
2672                 v4l2_warn(&dev->v4l2_dev,
2673                           "SPS size (%dx%d) > capture size (%d,%d)\n",
2674                           sps->pic_width_in_luma_samples,
2675                           sps->pic_height_in_luma_samples,
2676                           ctx->dst_fmt.width,
2677                           ctx->dst_fmt.height);
2678                 return -EINVAL;
2679         }
2680
2681         return 0;
2682 }
2683
/* Control ops for the stateless HEVC SPS control: validation only */
const struct v4l2_ctrl_ops rpivid_hevc_sps_ctrl_ops = {
	.try_ctrl = try_ctrl_sps,
};
2687
2688 static int try_ctrl_pps(struct v4l2_ctrl *ctrl)
2689 {
2690         const struct v4l2_ctrl_hevc_pps *const pps = ctrl->p_new.p_hevc_pps;
2691         struct rpivid_ctx *const ctx = ctrl->priv;
2692         struct rpivid_dev *const dev = ctx->dev;
2693
2694         if ((pps->flags &
2695              V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED) &&
2696             (pps->flags &
2697              V4L2_HEVC_PPS_FLAG_TILES_ENABLED) &&
2698             (pps->num_tile_columns_minus1 || pps->num_tile_rows_minus1)) {
2699                 v4l2_warn(&dev->v4l2_dev,
2700                           "WPP + Tiles not supported\n");
2701                 return -EINVAL;
2702         }
2703
2704         return 0;
2705 }
2706
/* Control ops for the stateless HEVC PPS control: validation only */
const struct v4l2_ctrl_ops rpivid_hevc_pps_ctrl_ops = {
	.try_ctrl = try_ctrl_pps,
};
2710