1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Raspberry Pi HEVC driver
5 * Copyright (C) 2020 Raspberry Pi (Trading) Ltd
7 * Based on the Cedrus VPU driver, that is:
9 * Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
10 * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
11 * Copyright (C) 2018 Bootlin
14 #include <linux/delay.h>
15 #include <linux/types.h>
17 #include <media/videobuf2-dma-contig.h>
20 #include "rpivid_hw.h"
21 #include "rpivid_video.h"
/* Compile-time debug/config switches (0 = disabled) */
23 #define DEBUG_TRACE_P1_CMD 0
24 #define DEBUG_TRACE_EXECUTION 0
/* NOTE(review): appears to control pinning of the media request during
 * decode (see req_pin member below) -- confirm against elided code. */
26 #define USE_REQUEST_PIN 1
/*
 * Execution-trace helpers: log entry/success/finish/failure of a decode
 * phase together with the decode_order of the dec_env (or -1 if none).
 * Compiled to nothing when DEBUG_TRACE_EXECUTION is 0.
 */
28 #if DEBUG_TRACE_EXECUTION
29 #define xtrace_in(dev_, de_)\
30 v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: in\n", __func__,\
31 (de_) == NULL ? -1 : (de_)->decode_order)
32 #define xtrace_ok(dev_, de_)\
33 v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: ok\n", __func__,\
34 (de_) == NULL ? -1 : (de_)->decode_order)
35 #define xtrace_fin(dev_, de_)\
36 v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: finish\n", __func__,\
37 (de_) == NULL ? -1 : (de_)->decode_order)
38 #define xtrace_fail(dev_, de_)\
39 v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: FAIL\n", __func__,\
40 (de_) == NULL ? -1 : (de_)->decode_order)
/* No-op stubs (NOTE(review): the #else line is elided in this excerpt) */
42 #define xtrace_in(dev_, de_)
43 #define xtrace_ok(dev_, de_)
44 #define xtrace_fin(dev_, de_)
45 #define xtrace_fail(dev_, de_)
/* Slice types per H.265; enumerator values elided in this excerpt */
48 enum hevc_slice_type {
/* Reference picture list indices: L0 = forward list, L1 = backward list */
54 enum hevc_layer { L0 = 0, L1 = 1 };
/*
 * Allocate a DMA buffer described by gptr.
 * Returns 0 on success, -ENOMEM on allocation failure.
 * NOTE(review): the elided lines presumably store size/attrs into gptr
 * before the dma_alloc_attrs() call -- confirm against full source.
 */
56 static int gptr_alloc(struct rpivid_dev *const dev, struct rpivid_gptr *gptr,
57 size_t size, unsigned long attrs)
62 gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size, &gptr->addr,
63 GFP_KERNEL, gptr->attrs);
64 return !gptr->ptr ? -ENOMEM : 0;
/* Free a DMA buffer previously obtained via gptr_alloc()/gptr_realloc_new() */
67 static void gptr_free(struct rpivid_dev *const dev,
68 struct rpivid_gptr *const gptr)
71 dma_free_attrs(dev->dev, gptr->size, gptr->ptr, gptr->addr,
79 /* Realloc but do not copy
 *
82 * If the alloc fails then it attempts to re-allocate the old size
83 * On error then check gptr->ptr to determine if anything is currently
 * allocated.
 */
86 static int gptr_realloc_new(struct rpivid_dev * const dev,
87 struct rpivid_gptr * const gptr, size_t size)
89 const size_t old_size = gptr->size;
/* Same size: nothing to do */
91 if (size == gptr->size)
/* Drop the old buffer before allocating the new one (no copy wanted) */
95 dma_free_attrs(dev->dev, gptr->size, gptr->ptr,
96 gptr->addr, gptr->attrs);
100 gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size,
101 &gptr->addr, GFP_KERNEL, gptr->attrs);
/* New size failed: fall back to re-allocating the original size */
105 gptr->size = old_size;
106 gptr->ptr = dma_alloc_attrs(dev->dev, gptr->size,
107 &gptr->addr, GFP_KERNEL, gptr->attrs);
/* Next buffer-size step strictly greater than x (growth policy helper) */
119 static size_t next_size(const size_t x)
121 return rpivid_round_up_size(x + 1);
124 #define NUM_SCALING_FACTORS 4064 /* Not a typo = 0xbe0 + 0x400 */
/* Command words written to RPI_TRANSFER to save/restore CABAC prob state */
128 #define PROB_BACKUP ((20 << 12) + (20 << 6) + (0 << 0))
129 #define PROB_RELOAD ((20 << 12) + (20 << 0) + (0 << 6))
131 #define HEVC_MAX_REFS V4L2_HEVC_DPB_ENTRIES_NUM_MAX
133 //////////////////////////////////////////////////////////////////////////////
/*
 * Per-capture-queue-slot auxiliary data: refcounted colocated-MV buffer
 * shared between frames via the ctx aux free list (see aux_q_* below).
 */
140 struct rpivid_q_aux {
141 unsigned int refcount;
142 unsigned int q_index;
143 struct rpivid_q_aux *next;
144 struct rpivid_gptr col;	/* DMA buffer for colocated motion vectors */
147 //////////////////////////////////////////////////////////////////////////////
/* Progress of a dec_env through slice parsing and phase 1 (trailing
 * enumerators elided in this excerpt) */
149 enum rpivid_decode_state {
150 RPIVID_DECODE_SLICE_START,
151 RPIVID_DECODE_SLICE_CONTINUE,
152 RPIVID_DECODE_ERROR_CONTINUE,
153 RPIVID_DECODE_ERROR_DONE,
154 RPIVID_DECODE_PHASE1,
/*
 * Per-decode working environment: the phase-1 command FIFO being built,
 * the slice message buffer, frame/reference addresses and the detached
 * src/dst buffers for the frame currently being decoded.
 */
158 struct rpivid_dec_env {
159 struct rpivid_ctx *ctx;
160 struct rpivid_dec_env *next;
162 enum rpivid_decode_state state;
163 unsigned int decode_order;
164 int p1_status; /* P1 status - what to realloc */
/* Phase-1 APB command FIFO; grown on demand by cmds_check_space() */
166 struct rpi_cmd *cmd_fifo;
167 unsigned int cmd_len, cmd_max;
168 unsigned int num_slice_msgs;	/* Valid entries in slice_msgs[] */
169 unsigned int pic_width_in_ctbs_y;
170 unsigned int pic_height_in_ctbs_y;
171 unsigned int dpbno_col;	/* DPB slot of the collocated ref picture */
173 int collocated_from_l0_flag;
175 * Last CTB/Tile X,Y processed by (wpp_)entry_point
176 * Could be in _state as P0 only but needs updating where _state
 */
179 unsigned int entry_ctb_x;
180 unsigned int entry_ctb_y;
181 unsigned int entry_tile_x;
182 unsigned int entry_tile_y;
183 unsigned int entry_qp;
190 struct vb2_v4l2_buffer *frame_buf; // Detached dest buffer
191 struct vb2_v4l2_buffer *src_buf; // Detached src buffer
192 unsigned int frame_c_offset;
193 unsigned int frame_stride;
194 dma_addr_t frame_addr;
195 dma_addr_t ref_addrs[16];
196 struct rpivid_q_aux *frame_aux;
197 struct rpivid_q_aux *col_aux;
202 dma_addr_t pu_base_vc;
203 dma_addr_t coeff_base_vc;
207 struct rpivid_gptr *bit_copy_gptr;	/* Bounce buffer for bitstream copy */
210 #define SLICE_MSGS_MAX (2 * HEVC_MAX_REFS * 8 + 3)
211 u16 slice_msgs[SLICE_MSGS_MAX]
212 u8 scaling_factors[NUM_SCALING_FACTORS];
215 struct media_request *req_pin;	/* Held media request (USE_REQUEST_PIN) */
217 struct media_request_object *req_obj;
219 struct rpivid_hw_irq_ent irq_ent;
/* sizeof a struct member without needing an instance */
222 #define member_size(type, member) sizeof(((type *)0)->member)
/*
 * Per-frame decode state derived from the V4L2 SPS/PPS/slice controls,
 * plus per-run scratch variables that do not persist between frames.
 */
224 struct rpivid_dec_state {
225 struct v4l2_ctrl_hevc_sps sps;
226 struct v4l2_ctrl_hevc_pps pps;
228 // Helper vars & tables derived from sps/pps
229 unsigned int log2_ctb_size; /* log2 width of a CTB */
230 unsigned int ctb_width; /* Width in CTBs */
231 unsigned int ctb_height; /* Height in CTBs */
232 unsigned int ctb_size; /* Pic area in CTBs */
233 unsigned int tile_width; /* Width in tiles */
234 unsigned int tile_height; /* Height in tiles */
// Raster-scan <-> tile-scan CTB address maps
238 int *ctb_addr_rs_to_ts;
239 int *ctb_addr_ts_to_rs;
241 // Aux storage for DPB
243 struct rpivid_q_aux *ref_aux[HEVC_MAX_REFS];
244 struct rpivid_q_aux *frame_aux;
247 unsigned int slice_idx;
248 bool slice_temporal_mvp; /* Slice flag but constant for frame */
252 // Temp vars per run - don't actually need to persist
255 const struct v4l2_ctrl_hevc_slice_params *sh;
256 const struct v4l2_ctrl_hevc_decode_params *dec;
257 unsigned int nb_refs[2];	/* Active ref count per list (L0/L1) */
258 unsigned int slice_qp;
259 unsigned int max_num_merge_cand; // 0 if I-slice
260 bool dependent_slice_segment_flag;
262 unsigned int start_ts; /* slice_segment_addr -> ts */
263 unsigned int start_ctb_x; /* CTB X,Y of start_ts */
264 unsigned int start_ctb_y;
265 unsigned int prev_ctb_x; /* CTB X,Y of start_ts - 1 */
266 unsigned int prev_ctb_y;
/* Release callback for the request object attached to the dst buffer
 * (body elided in this excerpt) */
270 static void dst_req_obj_release(struct media_request_object *object)
275 static const struct media_request_object_ops dst_req_obj_ops = {
276 .release = dst_req_obj_release,
/* Clamp x to the inclusive range [lo, hi] */
280 static inline int clip_int(const int x, const int lo, const int hi)
282 return x < lo ? lo : x > hi ? hi : x;
285 //////////////////////////////////////////////////////////////////////////////
286 // Phase 1 command and bit FIFOs
288 #if DEBUG_TRACE_P1_CMD
/*
 * Ensure there is room for n more commands in de->cmd_fifo, growing it
 * (krealloc to next power of two) if required.  Returns 0 on success,
 * negative errno style on failure (elided lines carry the returns).
 */
292 static int cmds_check_space(struct rpivid_dec_env *const de, unsigned int n)
/* Sanity-check n before trusting it for a realloc size */
298 v4l2_err(&de->ctx->dev->v4l2_dev,
299 "%s: n %u implausible\n", __func__, n);
303 if (de->cmd_len + n <= de->cmd_max)
306 newmax = roundup_pow_of_two(de->cmd_len + n);
308 a = krealloc(de->cmd_fifo, newmax * sizeof(struct rpi_cmd),
311 v4l2_err(&de->ctx->dev->v4l2_dev,
312 "Failed cmd buffer realloc from %u to %u\n",
313 de->cmd_max, newmax);
316 v4l2_info(&de->ctx->dev->v4l2_dev,
317 "cmd buffer realloc from %u to %u\n", de->cmd_max, newmax);
320 de->cmd_max = newmax;
324 // ???? u16 addr - put in u32
/* Append one APB register write (addr, data) to the phase-1 command FIFO.
 * Space must have been reserved beforehand via cmds_check_space(). */
325 static void p1_apb_write(struct rpivid_dec_env *const de, const u16 addr,
328 if (de->cmd_len >= de->cmd_max) {
329 v4l2_err(&de->ctx->dev->v4l2_dev,
330 "%s: Overflow @ %d\n", __func__, de->cmd_len);
334 de->cmd_fifo[de->cmd_len].addr = addr;
335 de->cmd_fifo[de->cmd_len].data = data;
337 #if DEBUG_TRACE_P1_CMD
339 v4l2_info(&de->ctx->dev->v4l2_dev, "[%02x] %x %x\n",
340 de->cmd_len, addr, data);
/* Map a CTB coordinate to its tile index via the boundary array bd[] */
346 static int ctb_to_tile(unsigned int ctb, unsigned int *bd, int num)
350 for (i = 1; ctb >= bd[i]; i++)
351 ; // bd[] has num+1 elements; bd[0]=0;
/* Tile column containing CTB column ctb_x */
355 static unsigned int ctb_to_tile_x(const struct rpivid_dec_state *const s,
356 const unsigned int ctb_x)
358 return ctb_to_tile(ctb_x, s->col_bd, s->tile_width);
/* Tile row containing CTB row ctb_y */
361 static unsigned int ctb_to_tile_y(const struct rpivid_dec_state *const s,
362 const unsigned int ctb_y)
364 return ctb_to_tile(ctb_y, s->row_bd, s->tile_height);
/* Free an aux entry's colocated-MV buffer (and, presumably, the entry
 * itself in the elided lines) */
367 static void aux_q_free(struct rpivid_ctx *const ctx,
368 struct rpivid_q_aux *const aq)
370 struct rpivid_dev *const dev = ctx->dev;
372 gptr_free(dev, &aq->col);
/*
 * Allocate a fresh aux entry with a colocated-MV DMA buffer sized
 * for the current picture and install it in slot q_index.
 * Returns NULL on allocation failure (elided).
 */
376 static struct rpivid_q_aux *aux_q_alloc(struct rpivid_ctx *const ctx,
377 const unsigned int q_index)
379 struct rpivid_dev *const dev = ctx->dev;
380 struct rpivid_q_aux *const aq = kzalloc(sizeof(*aq), GFP_KERNEL);
385 if (gptr_alloc(dev, &aq->col, ctx->colmv_picsize,
386 DMA_ATTR_FORCE_CONTIGUOUS | DMA_ATTR_NO_KERNEL_MAPPING))
390 * Spinlock not required as called in P0 only and
391 * aux checks done by _new
 */
394 aq->q_index = q_index;
395 ctx->aux_ents[q_index] = aq;
/*
 * Get an aux entry for slot q_index: reuse an existing entry for that
 * slot, else recycle one from the free list, else allocate a new one.
 */
403 static struct rpivid_q_aux *aux_q_new(struct rpivid_ctx *const ctx,
404 const unsigned int q_index)
406 struct rpivid_q_aux *aq;
407 unsigned long lockflags;
409 spin_lock_irqsave(&ctx->aux_lock, lockflags);
411 * If we already have this allocated to a slot then use that
412 * and assume that it will all work itself out in the pipeline
 */
414 if ((aq = ctx->aux_ents[q_index]) != NULL) {
416 } else if ((aq = ctx->aux_free) != NULL) {
417 ctx->aux_free = aq->next;
420 aq->q_index = q_index;
421 ctx->aux_ents[q_index] = aq;
423 spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
/* Nothing reusable: allocate outside the spinlock */
426 aq = aux_q_alloc(ctx, q_index);
/* Take a reference on the aux entry in slot q_index, if any */
431 static struct rpivid_q_aux *aux_q_ref_idx(struct rpivid_ctx *const ctx,
434 unsigned long lockflags;
435 struct rpivid_q_aux *aq;
437 spin_lock_irqsave(&ctx->aux_lock, lockflags);
438 if ((aq = ctx->aux_ents[q_index]) != NULL)
440 spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
/* Take a reference on an already-held aux entry (refcount bump elided) */
445 static struct rpivid_q_aux *aux_q_ref(struct rpivid_ctx *const ctx,
446 struct rpivid_q_aux *const aq)
449 unsigned long lockflags;
451 spin_lock_irqsave(&ctx->aux_lock, lockflags);
455 spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
/*
 * Drop a reference on *paq; when the refcount hits zero the entry is
 * detached from its slot and pushed onto the ctx free list for reuse.
 */
460 static void aux_q_release(struct rpivid_ctx *const ctx,
461 struct rpivid_q_aux **const paq)
463 struct rpivid_q_aux *const aq = *paq;
464 unsigned long lockflags;
471 spin_lock_irqsave(&ctx->aux_lock, lockflags);
472 if (--aq->refcount == 0) {
473 aq->next = ctx->aux_free;
475 ctx->aux_ents[aq->q_index] = NULL;
478 spin_unlock_irqrestore(&ctx->aux_lock, lockflags);
/* One-time init of the per-context aux machinery */
481 static void aux_q_init(struct rpivid_ctx *const ctx)
483 spin_lock_init(&ctx->aux_lock);
484 ctx->aux_free = NULL;
/* Tear down the aux free list and reset cached colocated-MV geometry */
487 static void aux_q_uninit(struct rpivid_ctx *const ctx)
489 struct rpivid_q_aux *aq;
491 ctx->colmv_picsize = 0;
492 ctx->colmv_stride = 0;
493 while ((aq = ctx->aux_free) != NULL) {
494 ctx->aux_free = aq->next;
499 //////////////////////////////////////////////////////////////////////////////
/*
502 * Initialisation process for context variables (CABAC init)
 *
505 * N.B. If comparing with FFmpeg note that this h/w uses slightly different
506 * offsets to FFmpeg's array
 */
509 /* Actual number of values */
510 #define RPI_PROB_VALS 154U
511 /* Rounded up as we copy words */
512 #define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
514 /* Initialiser values - see tables H.265 9-4 through 9-42 */
/* Indexed [init_type][value]; init_type selected in write_prob() */
515 static const u8 prob_init[3][156] = {
517 153, 200, 139, 141, 157, 154, 154, 154, 154, 154, 184, 154, 154,
518 154, 184, 63, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154,
519 154, 154, 154, 153, 138, 138, 111, 141, 94, 138, 182, 154, 154,
520 154, 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92,
521 139, 107, 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, 110,
522 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
523 79, 108, 123, 63, 110, 110, 124, 125, 140, 153, 125, 127, 140,
524 109, 111, 143, 127, 111, 79, 108, 123, 63, 91, 171, 134, 141,
525 138, 153, 136, 167, 152, 152, 139, 139, 111, 111, 125, 110, 110,
526 94, 124, 108, 124, 107, 125, 141, 179, 153, 125, 107, 125, 141,
527 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, 139, 182, 182,
528 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, 0, 0,
531 153, 185, 107, 139, 126, 197, 185, 201, 154, 149, 154, 139, 154,
532 154, 154, 152, 110, 122, 95, 79, 63, 31, 31, 153, 153, 168,
533 140, 198, 79, 124, 138, 94, 153, 111, 149, 107, 167, 154, 154,
534 154, 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136,
535 153, 121, 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, 125,
536 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
537 94, 108, 123, 108, 125, 110, 94, 110, 95, 79, 125, 111, 110,
538 78, 110, 111, 111, 95, 94, 108, 123, 108, 121, 140, 61, 154,
539 107, 167, 91, 122, 107, 167, 139, 139, 155, 154, 139, 153, 139,
540 123, 123, 63, 153, 166, 183, 140, 136, 153, 154, 166, 183, 140,
541 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 123, 123,
542 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, 0, 0,
545 153, 160, 107, 139, 126, 197, 185, 201, 154, 134, 154, 139, 154,
546 154, 183, 152, 154, 137, 95, 79, 63, 31, 31, 153, 153, 168,
547 169, 198, 79, 224, 167, 122, 153, 111, 149, 92, 167, 154, 154,
548 154, 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136,
549 153, 121, 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, 125,
550 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
551 79, 108, 123, 93, 125, 110, 124, 110, 95, 94, 125, 111, 111,
552 79, 125, 126, 111, 111, 79, 108, 123, 93, 121, 140, 61, 154,
553 107, 167, 91, 107, 107, 167, 139, 139, 170, 154, 139, 153, 139,
554 123, 123, 63, 124, 166, 183, 140, 136, 153, 154, 166, 183, 140,
555 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 138, 138,
556 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, 0, 0,
/* Max commands emitted by write_prob(): one word per 4 prob bytes + backup */
560 #define CMDS_WRITE_PROB ((RPI_PROB_ARRAY_SIZE / 4) + 1)
/*
 * Compute the initial CABAC context state for this slice (per H.265
 * 9.3.2.2: m/n derived from the init value, clamped slice QP) and write
 * it to the h/w prob RAM at 0x1000, then back it up via RPI_TRANSFER.
 */
561 static void write_prob(struct rpivid_dec_env *const de,
562 const struct rpivid_dec_state *const s)
564 u8 dst[RPI_PROB_ARRAY_SIZE];
/* Table select: cabac_init_flag swaps P/B tables; I always uses table 2 */
566 const unsigned int init_type =
567 ((s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT) != 0 &&
568 s->sh->slice_type != HEVC_SLICE_I) ?
569 s->sh->slice_type + 1 :
570 2 - s->sh->slice_type;
571 const u8 *p = prob_init[init_type];
572 const int q = clip_int(s->slice_qp, 0, 51);
575 for (i = 0; i < RPI_PROB_VALS; i++) {
576 int init_value = p[i];
577 int m = (init_value >> 4) * 5 - 45;
578 int n = ((init_value & 15) << 3) - 16;
579 int pre = 2 * (((m * q) >> 4) + n) - 127;
583 pre = 124 + (pre & 1);
/* Pad the array so whole words can be written */
586 for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i)
589 for (i = 0; i < RPI_PROB_ARRAY_SIZE; i += 4)
590 p1_apb_write(de, 0x1000 + i,
591 dst[i] + (dst[i + 1] << 8) + (dst[i + 2] << 16) +
/*
595 * Having written the prob array back it up
596 * This is not always needed but is a small overhead that simplifies
597 * (and speeds up) some multi-tile & WPP scenarios
598 * There are no scenarios where having written a prob we ever want
599 * a previous (non-initial) state back
 */
601 p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
604 #define CMDS_WRITE_SCALING_FACTORS NUM_SCALING_FACTORS
/* Copy de->scaling_factors to the h/w scaling-factor RAM at 0x2000,
 * one little-endian 32-bit word per APB write */
605 static void write_scaling_factors(struct rpivid_dec_env *const de)
608 const u8 *p = (u8 *)de->scaling_factors;
610 for (i = 0; i < NUM_SCALING_FACTORS; i += 4, p += 4)
611 p1_apb_write(de, 0x2000 + i,
612 p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24));
/* Convert a DMA address to the h/w AXI form (64-byte granularity) */
615 static inline __u32 dma_to_axi_addr(dma_addr_t a)
617 return (__u32)(a >> 6);
620 #define CMDS_WRITE_BITSTREAM 4
/*
 * Point the h/w bitstream fetcher at this slice's data.  Uses the source
 * buffer's DMA address directly when available, otherwise copies the
 * slice into the bit-copy bounce buffer (checked for overflow).
 */
621 static int write_bitstream(struct rpivid_dec_env *const de,
622 const struct rpivid_dec_state *const s)
624 // Note that FFmpeg V4L2 does not remove emulation prevention bytes,
625 // so this is matched in the configuration here.
626 // Whether that is the correct behaviour or not is not clear in the
 // spec.
628 const int rpi_use_emu = 1;
629 unsigned int offset = s->sh->data_bit_offset / 8 + 1;
630 const unsigned int len = (s->sh->bit_size + 7) / 8 - offset;
633 if (s->src_addr != 0) {
634 addr = s->src_addr + offset;
636 if (len + de->bit_copy_len > de->bit_copy_gptr->size) {
637 v4l2_warn(&de->ctx->dev->v4l2_dev,
638 "Bit copy buffer overflow: size=%zu, offset=%zu, len=%u\n",
639 de->bit_copy_gptr->size,
640 de->bit_copy_len, len);
643 memcpy(de->bit_copy_gptr->ptr + de->bit_copy_len,
644 s->src_buf + offset, len);
645 addr = de->bit_copy_gptr->addr + de->bit_copy_len;
/* Keep each copied slice 64-byte aligned in the bounce buffer */
646 de->bit_copy_len += (len + 63) & ~63;
650 p1_apb_write(de, RPI_BFBASE, dma_to_axi_addr(addr));
651 p1_apb_write(de, RPI_BFNUM, len);
652 p1_apb_write(de, RPI_BFCONTROL, offset + (1 << 7)); // Stop
653 p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu << 6));
657 //////////////////////////////////////////////////////////////////////////////
/*
660 * The slice constant part of the slice register - width and height need to
661 * be ORed in later as they are per-tile / WPP-row
 */
663 static u32 slice_reg_const(const struct rpivid_dec_state *const s)
665 u32 x = (s->max_num_merge_cand << 0) |
666 (s->nb_refs[L0] << 4) |
667 (s->nb_refs[L1] << 8) |
668 (s->sh->slice_type << 12);
/* SAO enables and (B slices) mvd_l1_zero fold into higher bits (elided) */
670 if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA)
672 if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA)
674 if (s->sh->slice_type == HEVC_SLICE_B &&
675 (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO))
681 //////////////////////////////////////////////////////////////////////////////
683 #define CMDS_NEW_SLICE_SEGMENT (4 + CMDS_WRITE_SCALING_FACTORS)
/*
 * Program per-slice-segment h/w state from the SPS/PPS controls:
 * block-size geometry, bit depths, PCM parameters, QP offsets and
 * optional scaling factors; then set RPI_SLICESTART for independent
 * slice segments.
 */
684 static void new_slice_segment(struct rpivid_dec_env *const de,
685 const struct rpivid_dec_state *const s)
687 const struct v4l2_ctrl_hevc_sps *const sps = &s->sps;
688 const struct v4l2_ctrl_hevc_pps *const pps = &s->pps;
/* SPS geometry / bit-depth word */
692 ((sps->log2_min_luma_coding_block_size_minus3 + 3) << 0) |
693 (s->log2_ctb_size << 4) |
694 ((sps->log2_min_luma_transform_block_size_minus2 + 2)
696 ((sps->log2_min_luma_transform_block_size_minus2 + 2 +
697 sps->log2_diff_max_min_luma_transform_block_size)
699 ((sps->bit_depth_luma_minus8 + 8) << 16) |
700 ((sps->bit_depth_chroma_minus8 + 8) << 20) |
701 (sps->max_transform_hierarchy_depth_intra << 24) |
702 (sps->max_transform_hierarchy_depth_inter << 28));
/* PCM / chroma-format / SPS-flags word */
706 ((sps->pcm_sample_bit_depth_luma_minus1 + 1) << 0) |
707 ((sps->pcm_sample_bit_depth_chroma_minus1 + 1) << 4) |
708 ((sps->log2_min_pcm_luma_coding_block_size_minus3 + 3)
710 ((sps->log2_min_pcm_luma_coding_block_size_minus3 + 3 +
711 sps->log2_diff_max_min_pcm_luma_coding_block_size)
713 (((sps->flags & V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE) ?
714 0 : sps->chroma_format_idc) << 16) |
715 ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_AMP_ENABLED)) << 18) |
716 ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_PCM_ENABLED)) << 19) |
717 ((!!(sps->flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED))
720 V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED))
/* PPS QP / flags word */
725 ((s->log2_ctb_size - pps->diff_cu_qp_delta_depth) << 0) |
726 ((!!(pps->flags & V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED))
729 V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED))
731 ((!!(pps->flags & V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED))
734 V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED))
736 (((pps->pps_cb_qp_offset + s->sh->slice_cb_qp_offset) & 255)
738 (((pps->pps_cr_qp_offset + s->sh->slice_cr_qp_offset) & 255)
741 V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED))
745 (sps->flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED) != 0)
746 write_scaling_factors(de);
748 if (!s->dependent_slice_segment_flag) {
749 int ctb_col = s->sh->slice_segment_addr %
750 de->pic_width_in_ctbs_y;
751 int ctb_row = s->sh->slice_segment_addr /
752 de->pic_width_in_ctbs_y;
754 de->reg_slicestart = (ctb_col << 0) + (ctb_row << 16);
757 p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
760 //////////////////////////////////////////////////////////////////////////////
/* Append one 16-bit word to the slice message buffer (bounds assumed
 * guaranteed by SLICE_MSGS_MAX sizing) */
763 static void msg_slice(struct rpivid_dec_env *const de, const u16 msg)
765 de->slice_msgs[de->num_slice_msgs++] = msg;
768 #define CMDS_PROGRAM_SLICECMDS (1 + SLICE_MSGS_MAX)
/* Push the accumulated slice messages to the h/w at 0x4000, preceded by
 * a count+slice-id word in RPI_SLICECMDS */
769 static void program_slicecmds(struct rpivid_dec_env *const de,
774 p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs + (sliceid << 8));
776 for (i = 0; i < de->num_slice_msgs; i++)
777 p1_apb_write(de, 0x4000 + 4 * i, de->slice_msgs[i] & 0xffff);
780 // NoBackwardPredictionFlag 8.3.5
781 // Simply checks POCs
/* Returns non-zero when every listed reference POC is <= cur_poc
 * (16-bit wrapping compare; see comment below) */
782 static int has_backward(const struct v4l2_hevc_dpb_entry *const dpb,
783 const __u8 *const idx, const unsigned int n,
784 const unsigned int cur_poc)
788 for (i = 0; i < n; ++i) {
790 // We only get u16 pocs & 8.3.1 says
791 // "The bitstream shall not contain data that result in values
792 // of DiffPicOrderCnt( picA, picB ) used in the decoding
793 // process that are not in the range of -2^15 to 2^15 - 1,
 // inclusive"
795 if (((cur_poc - dpb[idx[i]].pic_order_cnt[0]) & 0x8000) != 0)
/*
 * Build the slice message list for this slice: slice-type/ref-count
 * command word, collocated-ref selection for temporal MVP, per-reference
 * descriptors (POC, RPS class, optional weighted-prediction weights and
 * offsets for L0/L1), deblocking parameters and chroma QP offsets.
 */
801 static void pre_slice_decode(struct rpivid_dec_env *const de,
802 const struct rpivid_dec_state *const s)
804 const struct v4l2_ctrl_hevc_slice_params *const sh = s->sh;
805 const struct v4l2_ctrl_hevc_decode_params *const dec = s->dec;
806 int weighted_pred_flag, idx;
808 unsigned int collocated_from_l0_flag;
810 de->num_slice_msgs = 0;
/* Slice-type bits of the command word (values elided) */
813 if (sh->slice_type == HEVC_SLICE_I)
815 if (sh->slice_type == HEVC_SLICE_P)
817 if (sh->slice_type == HEVC_SLICE_B)
820 cmd_slice |= (s->nb_refs[L0] << 2) | (s->nb_refs[L1] << 6) |
821 (s->max_num_merge_cand << 11);
823 collocated_from_l0_flag =
824 !s->slice_temporal_mvp ||
825 sh->slice_type != HEVC_SLICE_B ||
826 (sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0);
827 cmd_slice |= collocated_from_l0_flag << 14;
829 if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
830 // Flag to say all reference pictures are from the past
831 const int no_backward_pred_flag =
832 has_backward(dec->dpb, sh->ref_idx_l0, s->nb_refs[L0],
833 sh->slice_pic_order_cnt) &&
834 has_backward(dec->dpb, sh->ref_idx_l1, s->nb_refs[L1],
835 sh->slice_pic_order_cnt);
836 cmd_slice |= no_backward_pred_flag << 10;
837 msg_slice(de, cmd_slice);
839 if (s->slice_temporal_mvp) {
840 const __u8 *const rpl = collocated_from_l0_flag ?
841 sh->ref_idx_l0 : sh->ref_idx_l1;
842 de->dpbno_col = rpl[sh->collocated_ref_idx];
843 //v4l2_info(&de->ctx->dev->v4l2_dev,
844 // "L0=%d col_ref_idx=%d,
845 // dpb_no=%d\n", collocated_from_l0_flag,
846 // sh->collocated_ref_idx, de->dpbno_col);
849 // Write reference picture descriptions
851 sh->slice_type == HEVC_SLICE_P ?
852 !!(s->pps.flags & V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED) :
853 !!(s->pps.flags & V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED);
/* L0 references: RPS class + POC, then weights if weighted pred */
855 for (idx = 0; idx < s->nb_refs[L0]; ++idx) {
856 unsigned int dpb_no = sh->ref_idx_l0[idx];
857 //v4l2_info(&de->ctx->dev->v4l2_dev,
858 // "L0[%d]=dpb[%d]\n", idx, dpb_no);
862 (dec->dpb[dpb_no].rps ==
863 V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR ?
865 (weighted_pred_flag ? (3 << 5) : 0));
866 msg_slice(de, dec->dpb[dpb_no].pic_order_cnt[0]);
868 if (weighted_pred_flag) {
869 const struct v4l2_hevc_pred_weight_table
870 *const w = &sh->pred_weight_table;
871 const int luma_weight_denom =
872 (1 << w->luma_log2_weight_denom);
873 const unsigned int chroma_log2_weight_denom =
874 (w->luma_log2_weight_denom +
875 w->delta_chroma_log2_weight_denom);
876 const int chroma_weight_denom =
877 (1 << chroma_log2_weight_denom);
880 w->luma_log2_weight_denom |
881 (((w->delta_luma_weight_l0[idx] +
882 luma_weight_denom) & 0x1ff)
884 msg_slice(de, w->luma_offset_l0[idx] & 0xff);
886 chroma_log2_weight_denom |
887 (((w->delta_chroma_weight_l0[idx][0] +
888 chroma_weight_denom) & 0x1ff)
891 w->chroma_offset_l0[idx][0] & 0xff);
893 chroma_log2_weight_denom |
894 (((w->delta_chroma_weight_l0[idx][1] +
895 chroma_weight_denom) & 0x1ff)
898 w->chroma_offset_l0[idx][1] & 0xff);
/* L1 references: same encoding as L0 above */
902 for (idx = 0; idx < s->nb_refs[L1]; ++idx) {
903 unsigned int dpb_no = sh->ref_idx_l1[idx];
904 //v4l2_info(&de->ctx->dev->v4l2_dev,
905 // "L1[%d]=dpb[%d]\n", idx, dpb_no);
908 (dec->dpb[dpb_no].rps ==
909 V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR ?
911 (weighted_pred_flag ? (3 << 5) : 0));
912 msg_slice(de, dec->dpb[dpb_no].pic_order_cnt[0]);
913 if (weighted_pred_flag) {
914 const struct v4l2_hevc_pred_weight_table
915 *const w = &sh->pred_weight_table;
916 const int luma_weight_denom =
917 (1 << w->luma_log2_weight_denom);
918 const unsigned int chroma_log2_weight_denom =
919 (w->luma_log2_weight_denom +
920 w->delta_chroma_log2_weight_denom);
921 const int chroma_weight_denom =
922 (1 << chroma_log2_weight_denom);
925 w->luma_log2_weight_denom |
926 (((w->delta_luma_weight_l1[idx] +
927 luma_weight_denom) & 0x1ff) << 3));
928 msg_slice(de, w->luma_offset_l1[idx] & 0xff);
930 chroma_log2_weight_denom |
931 (((w->delta_chroma_weight_l1[idx][0] +
932 chroma_weight_denom) & 0x1ff)
935 w->chroma_offset_l1[idx][0] & 0xff);
937 chroma_log2_weight_denom |
938 (((w->delta_chroma_weight_l1[idx][1] +
939 chroma_weight_denom) & 0x1ff)
942 w->chroma_offset_l1[idx][1] & 0xff);
946 msg_slice(de, cmd_slice);
/* Deblocking filter parameters */
950 (sh->slice_beta_offset_div2 & 15) |
951 ((sh->slice_tc_offset_div2 & 15) << 4) |
953 V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED) ?
956 V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED) ?
959 V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED) ?
962 msg_slice(de, ((sh->slice_cr_qp_offset & 31) << 5) +
963 (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF
966 #define CMDS_WRITE_SLICE 1
/*
 * Write the RPI_SLICE register: slice_const plus the width/height of the
 * current CTB - a partial CTB size at the right/bottom picture edge,
 * the full CTB size elsewhere.
 */
967 static void write_slice(struct rpivid_dec_env *const de,
968 const struct rpivid_dec_state *const s,
969 const u32 slice_const,
970 const unsigned int ctb_col,
971 const unsigned int ctb_row)
973 const unsigned int cs = (1 << s->log2_ctb_size);
974 const unsigned int w_last = s->sps.pic_width_in_luma_samples & (cs - 1);
975 const unsigned int h_last = s->sps.pic_height_in_luma_samples & (cs - 1);
977 p1_apb_write(de, RPI_SLICE,
979 ((ctb_col + 1 < s->ctb_width || !w_last ?
980 cs : w_last) << 17) |
981 ((ctb_row + 1 < s->ctb_height || !h_last ?
982 cs : h_last) << 24));
/* Pause-mode selector for new_entry_point(): WPP row vs whole tile */
985 #define PAUSE_MODE_WPP 1
986 #define PAUSE_MODE_TILE 0xffff
/*
989 * N.B. This can be called to fill in data from the previous slice so must not
990 * use any state data that may change from slice to slice (e.g. qp)
 */
992 #define CMDS_NEW_ENTRY_POINT (6 + CMDS_WRITE_SLICE)
/*
 * Program the h/w for a new WPP-row or tile entry point: tile start/end
 * registers, slice register, optional QP reset and the control/mode
 * words.  Records the entry position in de->entry_* so entry filling
 * can resume from here.
 */
993 static void new_entry_point(struct rpivid_dec_env *const de,
994 const struct rpivid_dec_state *const s,
996 const bool reset_qp_y,
997 const u32 pause_mode,
998 const unsigned int tile_x,
999 const unsigned int tile_y,
1000 const unsigned int ctb_col,
1001 const unsigned int ctb_row,
1002 const unsigned int slice_qp,
1003 const u32 slice_const)
/* End CTB: for WPP the row itself, for tiles the tile's last row */
1005 const unsigned int endx = s->col_bd[tile_x + 1] - 1;
1006 const unsigned int endy = (pause_mode == PAUSE_MODE_WPP) ?
1007 ctb_row : s->row_bd[tile_y + 1] - 1;
1009 p1_apb_write(de, RPI_TILESTART,
1010 s->col_bd[tile_x] | (s->row_bd[tile_y] << 16));
1011 p1_apb_write(de, RPI_TILEEND, endx | (endy << 16));
1014 p1_apb_write(de, RPI_BEGINTILEEND, endx | (endy << 16));
1016 write_slice(de, s, slice_const, endx, endy);
1019 unsigned int sps_qp_bd_offset =
1020 6 * s->sps.bit_depth_luma_minus8;
1022 p1_apb_write(de, RPI_QP, sps_qp_bd_offset + slice_qp);
1025 p1_apb_write(de, RPI_MODE,
1027 ((endx == s->ctb_width - 1) << 17) |
1028 ((endy == s->ctb_height - 1) << 18));
1030 p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) | (ctb_row << 16));
/* Remember where this entry point is for later fill-in */
1032 de->entry_tile_x = tile_x;
1033 de->entry_tile_y = tile_y;
1034 de->entry_ctb_x = ctb_col;
1035 de->entry_ctb_y = ctb_row;
1036 de->entry_qp = slice_qp;
1037 de->entry_slice = slice_const;
1040 //////////////////////////////////////////////////////////////////////////////
1043 #define CMDS_WPP_PAUSE 4
/* Pause WPP processing at the end of ctb_row: save CABAC state and
 * reconfigure mode/control for the row handoff */
1044 static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row)
1046 p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25);
1047 p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
1048 p1_apb_write(de, RPI_MODE,
1049 ctb_row == de->pic_height_in_ctbs_y - 1 ?
1051 p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
1054 #define CMDS_WPP_ENTRY_FILL_1 (CMDS_WPP_PAUSE + 2 + CMDS_NEW_ENTRY_POINT)
/*
 * Emit pause/status/prob-transfer commands plus a new entry point for
 * every WPP row between the current entry row and last_y.
 * Returns cmds_check_space() error if the FIFO cannot be grown.
 */
1055 static int wpp_entry_fill(struct rpivid_dec_env *const de,
1056 const struct rpivid_dec_state *const s,
1057 const unsigned int last_y)
1060 const unsigned int last_x = s->ctb_width - 1;
1062 rv = cmds_check_space(de, CMDS_WPP_ENTRY_FILL_1 *
1063 (last_y - de->entry_ctb_y));
1067 while (de->entry_ctb_y < last_y) {
1068 /* wpp_entry_x/y set by wpp_entry_point */
1069 if (s->ctb_width > 2)
1070 wpp_pause(de, de->entry_ctb_y)
1071 p1_apb_write(de, RPI_STATUS,
1072 (de->entry_ctb_y << 18) | (last_x << 5) | 2);
1074 /* if width == 1 then the saved state is the init one */
1075 if (s->ctb_width == 2)
1076 p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
1078 p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
1080 new_entry_point(de, s, false, true, PAUSE_MODE_WPP,
1081 0, 0, 0, de->entry_ctb_y + 1,
1082 de->entry_qp, de->entry_slice);
/*
 * Finish off the previous slice in WPP mode: fill rows up to its last
 * CTB, pause if needed, write the expected end-CTB status word, and
 * back up the CABAC state when the next slice starts where a saved
 * state is required.
 */
1087 static int wpp_end_previous_slice(struct rpivid_dec_env *const de,
1088 const struct rpivid_dec_state *const s)
1092 rv = wpp_entry_fill(de, s, s->prev_ctb_y);
1096 rv = cmds_check_space(de, CMDS_WPP_PAUSE + 2);
1100 if (de->entry_ctb_x < 2 &&
1101 (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) &&
1103 wpp_pause(de, s->prev_ctb_y);
1104 p1_apb_write(de, RPI_STATUS,
1105 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
1106 if (s->start_ctb_x == 2 ||
1107 (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y))
1108 p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
1112 /* Only main profile supported so WPP => !Tiles which makes some of the
1113 * next chunk code simpler
 */
/*
 * Decode one slice in WPP mode: close out the previous slice, build
 * slice messages, program bitstream and slice segment, open the entry
 * point, then (for the last slice) fill the remaining rows and write
 * the final status word.
 */
1115 static int wpp_decode_slice(struct rpivid_dec_env *const de,
1116 const struct rpivid_dec_state *const s,
1119 bool reset_qp_y = true;
1120 const bool indep = !s->dependent_slice_segment_flag;
1124 rv = wpp_end_previous_slice(de, s);
1128 pre_slice_decode(de, s);
1130 rv = cmds_check_space(de,
1131 CMDS_WRITE_BITSTREAM +
1133 CMDS_PROGRAM_SLICECMDS +
1134 CMDS_NEW_SLICE_SEGMENT +
1135 CMDS_NEW_ENTRY_POINT);
1139 rv = write_bitstream(de, s);
/* CABAC state: fresh init for independent slices, reload otherwise */
1143 if (!s->start_ts || indep || s->ctb_width == 1)
1145 else if (!s->start_ctb_x)
1146 p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
1150 program_slicecmds(de, s->slice_idx);
1151 new_slice_segment(de, s);
1152 new_entry_point(de, s, indep, reset_qp_y, PAUSE_MODE_WPP,
1153 0, 0, s->start_ctb_x, s->start_ctb_y,
1154 s->slice_qp, slice_reg_const(s));
/* Last slice of the frame: complete all remaining rows */
1157 rv = wpp_entry_fill(de, s, s->ctb_height - 1);
1161 rv = cmds_check_space(de, CMDS_WPP_PAUSE + 1);
1165 if (de->entry_ctb_x < 2 && s->ctb_width > 2)
1166 wpp_pause(de, s->ctb_height - 1);
1168 p1_apb_write(de, RPI_STATUS,
1169 1 | ((s->ctb_width - 1) << 5) |
1170 ((s->ctb_height - 1) << 18));
1175 //////////////////////////////////////////////////////////////////////////////
1178 // Guarantees 1 cmd entry free on exit
/*
 * Emit status/prob-reload commands plus a new entry point for every
 * tile between the current entry tile and (last_tile_x, last_tile_y),
 * in raster order.  Returns cmds_check_space() error on failure.
 */
1179 static int tile_entry_fill(struct rpivid_dec_env *const de,
1180 const struct rpivid_dec_state *const s,
1181 const unsigned int last_tile_x,
1182 const unsigned int last_tile_y)
1184 while (de->entry_tile_y < last_tile_y ||
1185 (de->entry_tile_y == last_tile_y &&
1186 de->entry_tile_x < last_tile_x)) {
1188 unsigned int t_x = de->entry_tile_x;
1189 unsigned int t_y = de->entry_tile_y;
1190 const unsigned int last_x = s->col_bd[t_x + 1] - 1;
1191 const unsigned int last_y = s->row_bd[t_y + 1] - 1;
1193 // One more than needed here
1194 rv = cmds_check_space(de, CMDS_NEW_ENTRY_POINT + 3);
1198 p1_apb_write(de, RPI_STATUS,
1199 2 | (last_x << 5) | (last_y << 18));
1200 p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
/* Advance to the next tile in raster order */
1203 if (++t_x >= s->tile_width) {
1208 new_entry_point(de, s, false, true, PAUSE_MODE_TILE,
1209 t_x, t_y, s->col_bd[t_x], s->row_bd[t_y],
1210 de->entry_qp, de->entry_slice);
/*
1216 * Write STATUS register with expected end CTU address of previous slice
 */
1218 static int end_previous_slice(struct rpivid_dec_env *const de,
1219 const struct rpivid_dec_state *const s)
1223 rv = tile_entry_fill(de, s,
1224 ctb_to_tile_x(s, s->prev_ctb_x),
1225 ctb_to_tile_y(s, s->prev_ctb_y));
1229 p1_apb_write(de, RPI_STATUS,
1230 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
/*
 * Decode one slice in tile mode (non-WPP counterpart of
 * wpp_decode_slice): close out the previous slice, build slice
 * messages, program bitstream and slice segment, open the entry point,
 * and for the frame's last slice fill the remaining tiles and write
 * the final status word.
 */
1234 static int decode_slice(struct rpivid_dec_env *const de,
1235 const struct rpivid_dec_state *const s,
1239 unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x);
1240 unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y);
1244 rv = end_previous_slice(de, s);
1249 rv = cmds_check_space(de,
1250 CMDS_WRITE_BITSTREAM +
1252 CMDS_PROGRAM_SLICECMDS +
1253 CMDS_NEW_SLICE_SEGMENT +
1254 CMDS_NEW_ENTRY_POINT);
1258 pre_slice_decode(de, s);
1259 rv = write_bitstream(de, s);
/* QP reset needed at slice starts and on tile boundaries */
1263 reset_qp_y = !s->start_ts ||
1264 !s->dependent_slice_segment_flag ||
1265 tile_x != ctb_to_tile_x(s, s->prev_ctb_x) ||
1266 tile_y != ctb_to_tile_y(s, s->prev_ctb_y);
1270 program_slicecmds(de, s->slice_idx);
1271 new_slice_segment(de, s);
1272 new_entry_point(de, s, !s->dependent_slice_segment_flag, reset_qp_y,
1274 tile_x, tile_y, s->start_ctb_x, s->start_ctb_y,
1275 s->slice_qp, slice_reg_const(s));
/*
1278 * If this is the last slice then fill in the other tile entries
1279 * now, otherwise this will be done at the start of the next slice
1280 * when it will be known where this slice finishes
 */
1283 rv = tile_entry_fill(de, s,
1285 s->tile_height - 1);
1288 p1_apb_write(de, RPI_STATUS,
1289 1 | ((s->ctb_width - 1) << 5) |
1290 ((s->ctb_height - 1) << 18));
1295 //////////////////////////////////////////////////////////////////////////////
/*
 * Expand an HEVC scaling list into the full matrix the hardware wants.
 * size_id 0/1: direct copies (16 / 64 bytes).  size_id 2/3: each source
 * byte is replicated over a 2x2 / 4x4 block (src row picked by y>>1 /
 * y>>2).  The `dc` argument carries the DC coefficient for the larger
 * sizes.  NOTE(review): extract elides the switch/branch scaffolding,
 * the dst writes inside the loops and the DC insertion — verify against
 * full source.
 */
1298 static void expand_scaling_list(const unsigned int size_id,
1300 const u8 *const src0, uint8_t dc)
1307 memcpy(dst0, src0, 16);
1310 memcpy(dst0, src0, 64);
/* 16x16: each 8x8 source entry covers a 2x2 output block */
1315 for (y = 0; y != 16; y++) {
1316 const u8 *s = src0 + (y >> 1) * 8;
1318 for (x = 0; x != 8; ++x) {
/* 32x32: each 8x8 source entry covers a 4x4 output block */
1328 for (y = 0; y != 32; y++) {
1329 const u8 *s = src0 + (y >> 2) * 8;
1331 for (x = 0; x != 8; ++x) {
/*
 * Fill de->scaling_factors from the V4L2 scaling-matrix control: expand
 * all six 4x4 and 8x8 lists, six 16x16 lists (with DC coefficients) and
 * the two 32x32 lists (with DC), each at its fixed hardware offset from
 * the table below.
 */
1343 static void populate_scaling_factors(const struct rpivid_run *const run,
1344 struct rpivid_dec_env *const de,
1345 const struct rpivid_dec_state *const s)
1347 const struct v4l2_ctrl_hevc_scaling_matrix *const sl =
1348 run->h265.scaling_matrix;
1349 // Array of constants for scaling factors
1350 static const u32 scaling_factor_offsets[4][6] = {
1351 // MID0 MID1 MID2 MID3 MID4 MID5
1353 { 0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050 },
1355 { 0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0 },
1357 { 0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0 },
1359 { 0x07E0, 0x0BE0, 0x0000, 0x0000, 0x0000, 0x0000 }
/* size_id 0 (4x4) and 1 (8x8): no DC coefficient, pass 0 */
1364 for (mid = 0; mid < 6; mid++)
1365 expand_scaling_list(0, de->scaling_factors +
1366 scaling_factor_offsets[0][mid],
1367 sl->scaling_list_4x4[mid], 0);
1368 for (mid = 0; mid < 6; mid++)
1369 expand_scaling_list(1, de->scaling_factors +
1370 scaling_factor_offsets[1][mid],
1371 sl->scaling_list_8x8[mid], 0);
/* size_id 2 (16x16): six lists, each with its own DC coefficient */
1372 for (mid = 0; mid < 6; mid++)
1373 expand_scaling_list(2, de->scaling_factors +
1374 scaling_factor_offsets[2][mid],
1375 sl->scaling_list_16x16[mid],
1376 sl->scaling_list_dc_coef_16x16[mid]);
/* size_id 3 (32x32): only two lists exist */
1377 for (mid = 0; mid < 2; mid++)
1378 expand_scaling_list(3, de->scaling_factors +
1379 scaling_factor_offsets[3][mid],
1380 sl->scaling_list_32x32[mid],
1381 sl->scaling_list_dc_coef_32x32[mid]);
/*
 * Free the derived parameter-set tables (CTB raster<->tile-scan maps) and
 * NULL the pointers so a re-free or reuse is safe.  NOTE(review): extract
 * elides following lines which may also free col_bd/row_bd — verify.
 */
1384 static void free_ps_info(struct rpivid_dec_state *const s)
1386 kfree(s->ctb_addr_rs_to_ts);
1387 s->ctb_addr_rs_to_ts = NULL;
1388 kfree(s->ctb_addr_ts_to_rs);
1389 s->ctb_addr_ts_to_rs = NULL;
1397 static unsigned int tile_width(const struct rpivid_dec_state *const s,
1398 const unsigned int t_x)
1400 return s->col_bd[t_x + 1] - s->col_bd[t_x];
1403 static unsigned int tile_height(const struct rpivid_dec_state *const s,
1404 const unsigned int t_y)
1406 return s->row_bd[t_y + 1] - s->row_bd[t_y];
/*
 * Build the CTB address maps between raster-scan (rs) and tile-scan (ts)
 * order: walk tiles row-major, and within each tile walk CTBs row-major,
 * assigning consecutive ts indices.  NOTE(review): extract elides the
 * inner-loop increments (ts/rs advance, tc_rs advance) — verify against
 * full source.
 */
1409 static void fill_rs_to_ts(struct rpivid_dec_state *const s)
1411 unsigned int ts = 0;
/* rs address of the top-left CTB of the current tile row */
1413 unsigned int tr_rs = 0;
1415 for (t_y = 0; t_y != s->tile_height; ++t_y) {
1416 const unsigned int t_h = tile_height(s, t_y);
/* rs address of the top-left CTB of the current tile */
1418 unsigned int tc_rs = tr_rs;
1420 for (t_x = 0; t_x != s->tile_width; ++t_x) {
1421 const unsigned int t_w = tile_width(s, t_x);
1423 unsigned int rs = tc_rs;
1425 for (y = 0; y != t_h; ++y) {
1428 for (x = 0; x != t_w; ++x) {
1429 s->ctb_addr_rs_to_ts[rs + x] = ts;
1430 s->ctb_addr_ts_to_rs[ts] = rs + x;
/* Step down one full picture row of tiles */
1437 tr_rs += t_h * s->ctb_width;
/*
 * Recompute everything derived from a changed SPS/PPS: CTB geometry, the
 * rs<->ts address maps, and the tile column/row boundary tables (col_bd /
 * row_bd).  Returns 0 on success, negative errno on allocation failure.
 * On error the SPS width is zeroed (see tail) to force a reload next time.
 * NOTE(review): extract elides frees of the previous tables, -ENOMEM
 * returns, allocation of col_bd/row_bd in the non-tiles case and the
 * fill_rs_to_ts() call — verify against full source.
 */
1441 static int updated_ps(struct rpivid_dec_state *const s)
1447 // Inferred parameters
1448 s->log2_ctb_size = s->sps.log2_min_luma_coding_block_size_minus3 + 3 +
1449 s->sps.log2_diff_max_min_luma_coding_block_size;
/* Picture size in CTBs, rounding partial CTBs up */
1451 s->ctb_width = (s->sps.pic_width_in_luma_samples +
1452 (1 << s->log2_ctb_size) - 1) >>
1454 s->ctb_height = (s->sps.pic_height_in_luma_samples +
1455 (1 << s->log2_ctb_size) - 1) >>
1457 s->ctb_size = s->ctb_width * s->ctb_height;
1459 // Inferred parameters
1461 s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size,
1462 sizeof(*s->ctb_addr_rs_to_ts),
1464 if (!s->ctb_addr_rs_to_ts)
1466 s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size,
1467 sizeof(*s->ctb_addr_ts_to_rs),
1469 if (!s->ctb_addr_ts_to_rs)
/* Without tiles the picture is a single 1x1 "tile" */
1472 if (!(s->pps.flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) {
1476 s->tile_width = s->pps.num_tile_columns_minus1 + 1;
1477 s->tile_height = s->pps.num_tile_rows_minus1 + 1;
/* Boundary tables have one extra entry so width/height lookups
 * can read bd[i + 1]
 */
1480 s->col_bd = kmalloc((s->tile_width + 1) * sizeof(*s->col_bd),
1484 s->row_bd = kmalloc((s->tile_height + 1) * sizeof(*s->row_bd),
1490 for (i = 1; i < s->tile_width; i++)
1491 s->col_bd[i] = s->col_bd[i - 1] +
1492 s->pps.column_width_minus1[i - 1] + 1;
1493 s->col_bd[s->tile_width] = s->ctb_width;
1496 for (i = 1; i < s->tile_height; i++)
1497 s->row_bd[i] = s->row_bd[i - 1] +
1498 s->pps.row_height_minus1[i - 1] + 1;
1499 s->row_bd[s->tile_height] = s->ctb_height;
1506 /* Set invalid to force reload */
1507 s->sps.pic_width_in_luma_samples = 0;
/*
 * DMA-map the accumulated phase-1 command FIFO for the hardware.
 * Stores the mapped address/size in the decode env (unmapped later in
 * dec_env_delete()).  Returns 0 on success; on mapping failure logs and
 * returns an error (return value line elided in this extract).
 */
1511 static int write_cmd_buffer(struct rpivid_dev *const dev,
1512 struct rpivid_dec_env *const de,
1513 const struct rpivid_dec_state *const s)
/* Round the used portion of the FIFO up to the DMA alignment */
1515 const size_t cmd_size = ALIGN(de->cmd_len * sizeof(de->cmd_fifo[0]),
1518 de->cmd_addr = dma_map_single(dev->dev, de->cmd_fifo,
1519 cmd_size, DMA_TO_DEVICE);
1520 if (dma_mapping_error(dev->dev, de->cmd_addr)) {
1521 v4l2_err(&dev->v4l2_dev,
1522 "Map cmd buffer (%zu): FAILED\n", cmd_size);
1525 de->cmd_size = cmd_size;
/*
 * Compute collocated-MV buffer geometry from the SPS picture size:
 * stride rounded to 64, picture allocation covering height/16 rows
 * (one MV record per 16-sample row of 64-wide columns).
 */
1529 static void setup_colmv(struct rpivid_ctx *const ctx, struct rpivid_run *run,
1530 struct rpivid_dec_state *const s)
1532 ctx->colmv_stride = ALIGN(s->sps.pic_width_in_luma_samples, 64);
1533 ctx->colmv_picsize = ctx->colmv_stride *
1534 (ALIGN(s->sps.pic_height_in_luma_samples, 64) >> 4);
1537 // Can be called from irq context
/*
 * Pop a decode env from the context free list and mark it in use.
 * irq-safe: uses spin_lock_irqsave on ctx->dec_lock.  Returns the env or
 * (per the elided branch) NULL when the free list is empty.
 */
1538 static struct rpivid_dec_env *dec_env_new(struct rpivid_ctx *const ctx)
1540 struct rpivid_dec_env *de;
1541 unsigned long lock_flags;
1543 spin_lock_irqsave(&ctx->dec_lock, lock_flags);
1547 ctx->dec_free = de->next;
1549 de->state = RPIVID_DECODE_SLICE_START;
1552 spin_unlock_irqrestore(&ctx->dec_lock, lock_flags);
1556 // Can be called from irq context
/*
 * Return a decode env to the free list: unmap the phase-1 cmd buffer,
 * drop frame/collocated aux refs, then push onto ctx->dec_free under the
 * irq-safe dec_lock.  Callable from irq context (per header comment).
 */
1557 static void dec_env_delete(struct rpivid_dec_env *const de)
1559 struct rpivid_ctx * const ctx = de->ctx;
1560 unsigned long lock_flags;
/* Undo the dma_map_single() done in write_cmd_buffer() */
1563 dma_unmap_single(ctx->dev->dev, de->cmd_addr, de->cmd_size,
1568 aux_q_release(ctx, &de->frame_aux);
1569 aux_q_release(ctx, &de->col_aux);
1571 spin_lock_irqsave(&ctx->dec_lock, lock_flags);
1573 de->state = RPIVID_DECODE_END;
1574 de->next = ctx->dec_free;
1577 spin_unlock_irqrestore(&ctx->dec_lock, lock_flags);
/*
 * Free the decode-env pool: each env's command FIFO, then the pool array
 * itself; clear both pool and free-list pointers so re-init is safe.
 */
1580 static void dec_env_uninit(struct rpivid_ctx *const ctx)
1584 if (ctx->dec_pool) {
1585 for (i = 0; i != RPIVID_DEC_ENV_COUNT; ++i) {
1586 struct rpivid_dec_env *const de = ctx->dec_pool + i;
1588 kfree(de->cmd_fifo);
1591 kfree(ctx->dec_pool);
1594 ctx->dec_pool = NULL;
1595 ctx->dec_free = NULL;
/*
 * Allocate the decode-env pool, chain all envs onto the free list, and
 * give each one a command FIFO of cmd_max entries.  On any allocation
 * failure falls through to dec_env_uninit() for cleanup (error return
 * lines elided in this extract).
 */
1598 static int dec_env_init(struct rpivid_ctx *const ctx)
1602 ctx->dec_pool = kzalloc(sizeof(*ctx->dec_pool) * RPIVID_DEC_ENV_COUNT,
1607 spin_lock_init(&ctx->dec_lock);
/* Chain: pool[0] -> pool[1] -> ... ; last one's next stays NULL
 * from kzalloc
 */
1610 ctx->dec_free = ctx->dec_pool;
1611 for (i = 0; i != RPIVID_DEC_ENV_COUNT - 1; ++i)
1612 ctx->dec_pool[i].next = ctx->dec_pool + i + 1;
1614 // Fill in other bits
1615 for (i = 0; i != RPIVID_DEC_ENV_COUNT; ++i) {
1616 struct rpivid_dec_env *const de = ctx->dec_pool + i;
1619 de->decode_order = i;
1620 // de->cmd_max = 1024;
1622 de->cmd_fifo = kmalloc_array(de->cmd_max,
1623 sizeof(struct rpi_cmd),
/* Cleanup path: release anything allocated so far */
1632 dec_env_uninit(ctx);
1636 // Assume that we get exactly the same DPB for every slice
1637 // it makes no real sense otherwise
1638 #if V4L2_HEVC_DPB_ENTRIES_NUM_MAX > 16
1639 #error HEVC_DPB_ENTRIES > h/w slots
/*
 * Pack the phase-2 CONFIG2 register from SPS/PPS state.  Bit layout as
 * coded below: [3:0] luma depth, [7:4] chroma depth, depth!=8 flags,
 * [12:10] log2 CTB size, constrained-intra / strong-intra-smoothing
 * flags, bit 15 forces MV write-out, [18:16] parallel merge level,
 * temporal-MVP and PCM-loop-filter-disable flags, [25:21]/[30:26]
 * cb/cr QP offsets (5-bit two's complement via & 31).
 */
1642 static u32 mk_config2(const struct rpivid_dec_state *const s)
1644 const struct v4l2_ctrl_hevc_sps *const sps = &s->sps;
1645 const struct v4l2_ctrl_hevc_pps *const pps = &s->pps;
1648 c = (sps->bit_depth_luma_minus8 + 8) << 0;
1650 c |= (sps->bit_depth_chroma_minus8 + 8) << 4;
1652 if (sps->bit_depth_luma_minus8)
1655 if (sps->bit_depth_chroma_minus8)
1657 c |= s->log2_ctb_size << 10;
1658 if (pps->flags & V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED)
1660 if (sps->flags & V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED)
1663 c |= BIT(15); /* Write motion vectors to external memory */
1664 c |= (pps->log2_parallel_merge_level_minus2 + 2) << 16;
1665 if (s->slice_temporal_mvp)
1667 if (sps->flags & V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED)
1669 c |= (pps->pps_cb_qp_offset & 31) << 21;
1670 c |= (pps->pps_cr_qp_offset & 31) << 26;
/*
 * A NAL unit type counts as a reference unless it is one of the even
 * sub-layer-non-reference VCL types 0,2,...,14: i.e. true for
 * 1, 3, 5, 7, 9, 11, 13, 15 and for every type >= 16.
 */
static inline bool is_ref_unit_type(const unsigned int nal_unit_type)
{
	/* Equivalent to (nal_unit_type & ~0xe) != 0 */
	return nal_unit_type != (nal_unit_type & 0xe);
}
/*
 * Per-job setup: validate controls and buffers, (re)derive SPS/PPS state,
 * claim a decode env, build the phase-1 command list for every slice in
 * the run, resolve DPB references to dma addresses/aux entries, and leave
 * the env in RPIVID_DECODE_PHASE1 for rpivid_h265_trigger().  On any
 * failure jumps (elided labels) to the tail which sets an ERROR state.
 * NOTE(review): this extract elides many lines (gotos, else-arms, braces,
 * frame_end computation) — treat the visible flow as a skeleton and
 * verify against full source before editing.
 */
1682 static void rpivid_h265_setup(struct rpivid_ctx *ctx, struct rpivid_run *run)
1684 struct rpivid_dev *const dev = ctx->dev;
1685 const struct v4l2_ctrl_hevc_decode_params *const dec =
1687 /* sh0 used where slice header contents should be constant over all
1688 * slices, or first slice of frame
1690 const struct v4l2_ctrl_hevc_slice_params *const sh0 =
1691 run->h265.slice_params;
1692 struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
1693 struct rpivid_dec_state *const s = ctx->state;
1694 struct vb2_queue *vq;
1695 struct rpivid_dec_env *de = ctx->dec0;
1696 unsigned int prev_rs;
1699 bool slice_temporal_mvp;
1703 s->sh = NULL; // Avoid use until in the slice loop
/* frame_end: last src buffer of the frame (no HOLD flag) */
1706 ((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0);
1708 slice_temporal_mvp = (sh0->flags &
1709 V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED);
/* Continuing an in-flight frame: only CONTINUE states are legal */
1711 if (de && de->state != RPIVID_DECODE_END) {
1712 switch (de->state) {
1713 case RPIVID_DECODE_SLICE_CONTINUE:
1717 v4l2_err(&dev->v4l2_dev, "%s: Unexpected state: %d\n",
1718 __func__, de->state);
1720 case RPIVID_DECODE_ERROR_CONTINUE:
1721 // Uncleared error - fail now
/* 7.4.7.1: temporal MVP flag must not change mid-frame */
1725 if (s->slice_temporal_mvp != slice_temporal_mvp) {
1726 v4l2_warn(&dev->v4l2_dev,
1727 "Slice Temporal MVP non-constant\n");
/* First slice of a new frame: refresh parameter sets if changed */
1732 unsigned int ctb_size_y;
1733 bool sps_changed = false;
1735 if (memcmp(&s->sps, run->h265.sps, sizeof(s->sps)) != 0) {
1737 v4l2_info(&dev->v4l2_dev, "SPS changed\n");
1738 memcpy(&s->sps, run->h265.sps, sizeof(s->sps));
1742 memcmp(&s->pps, run->h265.pps, sizeof(s->pps)) != 0) {
1744 v4l2_info(&dev->v4l2_dev, "PPS changed\n");
1745 memcpy(&s->pps, run->h265.pps, sizeof(s->pps));
1747 /* Recalc stuff as required */
1753 de = dec_env_new(ctx);
1755 v4l2_err(&dev->v4l2_dev,
1756 "Failed to find free decode env\n");
/* CTB size and picture dimensions in CTBs from the SPS */
1762 1U << (s->sps.log2_min_luma_coding_block_size_minus3 +
1764 s->sps.log2_diff_max_min_luma_coding_block_size);
1766 de->pic_width_in_ctbs_y =
1767 (s->sps.pic_width_in_luma_samples + ctb_size_y - 1) /
1769 de->pic_height_in_ctbs_y =
1770 (s->sps.pic_height_in_luma_samples + ctb_size_y - 1) /
/* ~0U = "no collocated ref picked yet" (tested near the tail) */
1773 de->dpbno_col = ~0U;
1775 de->bit_copy_gptr = ctx->bitbufs + ctx->p1idx;
1776 de->bit_copy_len = 0;
/* Column-format output: offsets/strides scale by the 128-wide column */
1778 de->frame_c_offset = ctx->dst_fmt.height * 128;
1779 de->frame_stride = ctx->dst_fmt.plane_fmt[0].bytesperline * 128;
1781 vb2_dma_contig_plane_dma_addr(&run->dst->vb2_buf, 0);
1782 de->frame_aux = NULL;
1784 if (s->sps.bit_depth_luma_minus8 !=
1785 s->sps.bit_depth_chroma_minus8) {
1786 v4l2_warn(&dev->v4l2_dev,
1787 "Chroma depth (%d) != Luma depth (%d)\n",
1788 s->sps.bit_depth_chroma_minus8 + 8,
1789 s->sps.bit_depth_luma_minus8 + 8);
/* Only 8-bit NV12_COL128 and 10-bit NV12_10_COL128 are accepted */
1792 if (s->sps.bit_depth_luma_minus8 == 0) {
1793 if (ctx->dst_fmt.pixelformat !=
1794 V4L2_PIX_FMT_NV12_COL128) {
1795 v4l2_err(&dev->v4l2_dev,
1796 "Pixel format %#x != NV12_COL128 for 8-bit output",
1797 ctx->dst_fmt.pixelformat);
1800 } else if (s->sps.bit_depth_luma_minus8 == 2) {
1801 if (ctx->dst_fmt.pixelformat !=
1802 V4L2_PIX_FMT_NV12_10_COL128) {
1803 v4l2_err(&dev->v4l2_dev,
1804 "Pixel format %#x != NV12_10_COL128 for 10-bit output",
1805 ctx->dst_fmt.pixelformat);
1809 v4l2_warn(&dev->v4l2_dev,
1810 "Luma depth (%d) unsupported\n",
1811 s->sps.bit_depth_luma_minus8 + 8);
1814 if (run->dst->vb2_buf.num_planes != 1) {
1815 v4l2_warn(&dev->v4l2_dev, "Capture planes (%d) != 1\n",
1816 run->dst->vb2_buf.num_planes);
1819 if (run->dst->planes[0].length <
1820 ctx->dst_fmt.plane_fmt[0].sizeimage) {
1821 v4l2_warn(&dev->v4l2_dev,
1822 "Capture plane[0] length (%d) < sizeimage (%d)\n",
1823 run->dst->planes[0].length,
1824 ctx->dst_fmt.plane_fmt[0].sizeimage);
1828 // Fill in ref planes with our address s.t. if we mess
1829 // up refs somehow then we still have a valid address
1831 for (i = 0; i != 16; ++i)
1832 de->ref_addrs[i] = de->frame_addr;
1835 * Stash initial temporal_mvp flag
1836 * This must be the same for all pic slices (7.4.7.1)
1838 s->slice_temporal_mvp = slice_temporal_mvp;
1841 * Need Aux ents for all (ref) DPB ents if temporal MV could
1842 * be enabled for any pic
1844 s->use_aux = ((s->sps.flags &
1845 V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) != 0);
/* Only create aux storage if this pic can itself be referenced */
1846 s->mk_aux = s->use_aux &&
1847 (s->sps.sps_max_sub_layers_minus1 >= sh0->nuh_temporal_id_plus1 ||
1848 is_ref_unit_type(sh0->nal_unit_type));
1850 // Phase 2 reg pre-calc
1851 de->rpi_config2 = mk_config2(s);
1852 de->rpi_framesize = (s->sps.pic_height_in_luma_samples << 16) |
1853 s->sps.pic_width_in_luma_samples;
1854 de->rpi_currpoc = sh0->slice_pic_order_cnt;
1857 V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) {
1858 setup_colmv(ctx, run, s);
/* First slice of a frame must start at CTB 0 */
1863 if (sh0->slice_segment_addr != 0) {
1864 v4l2_warn(&dev->v4l2_dev,
1865 "New frame but segment_addr=%d\n",
1866 sh0->slice_segment_addr);
1870 /* Allocate a bitbuf if we need one - don't need one if single
1871 * slice as we can use the src buf directly
1873 if (!frame_end && !de->bit_copy_gptr->ptr) {
1875 bits_alloc = rpivid_bit_buf_size(s->sps.pic_width_in_luma_samples,
1876 s->sps.pic_height_in_luma_samples,
1877 s->sps.bit_depth_luma_minus8);
1879 if (gptr_alloc(dev, de->bit_copy_gptr,
1881 DMA_ATTR_FORCE_CONTIGUOUS) != 0) {
1882 v4l2_err(&dev->v4l2_dev,
1883 "Unable to alloc buf (%zu) for bit copy\n",
1887 v4l2_info(&dev->v4l2_dev,
1888 "Alloc buf (%zu) for bit copy OK\n",
1893 // Either map src buffer or use directly
1898 s->src_addr = vb2_dma_contig_plane_dma_addr(&run->src->vb2_buf,
1901 s->src_buf = vb2_plane_vaddr(&run->src->vb2_buf, 0);
1902 if (!s->src_addr && !s->src_buf) {
1903 v4l2_err(&dev->v4l2_dev, "Failed to map src buffer\n");
1907 // Pre calc a few things
/* Per-slice loop: validate headers then emit commands for each slice */
1909 for (i = 0; i != run->h265.slice_ents; ++i) {
1910 const struct v4l2_ctrl_hevc_slice_params *const sh = sh0 + i;
1911 const bool last_slice = frame_end && i + 1 == run->h265.slice_ents;
1915 if (run->src->planes[0].bytesused < (sh->bit_size + 7) / 8) {
1916 v4l2_warn(&dev->v4l2_dev,
1917 "Bit size %d > bytesused %d\n",
1918 sh->bit_size, run->src->planes[0].bytesused);
1921 if (sh->data_bit_offset >= sh->bit_size ||
1922 sh->bit_size - sh->data_bit_offset < 8) {
1923 v4l2_warn(&dev->v4l2_dev,
1924 "Bit size %d < Bit offset %d + 8\n",
1925 sh->bit_size, sh->data_bit_offset);
1929 s->slice_qp = 26 + s->pps.init_qp_minus26 + sh->slice_qp_delta;
1930 s->max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ?
1932 (5 - sh->five_minus_max_num_merge_cand);
1933 s->dependent_slice_segment_flag =
1935 V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT) != 0);
/* I slices use no ref lists; P uses L0 only; B uses both */
1937 s->nb_refs[0] = (sh->slice_type == HEVC_SLICE_I) ?
1939 sh->num_ref_idx_l0_active_minus1 + 1;
1940 s->nb_refs[1] = (sh->slice_type != HEVC_SLICE_B) ?
1942 sh->num_ref_idx_l1_active_minus1 + 1;
1944 if (s->sps.flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED)
1945 populate_scaling_factors(run, de, s);
1947 /* Calc all the random coord info to avoid repeated conversion in/out */
1948 s->start_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr];
1949 s->start_ctb_x = sh->slice_segment_addr % de->pic_width_in_ctbs_y;
1950 s->start_ctb_y = sh->slice_segment_addr / de->pic_width_in_ctbs_y;
1951 /* Last CTB of previous slice */
1952 prev_rs = !s->start_ts ? 0 : s->ctb_addr_ts_to_rs[s->start_ts - 1];
1953 s->prev_ctb_x = prev_rs % de->pic_width_in_ctbs_y;
1954 s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y;
/* WPP (entropy coding sync) and plain/tiled slices take
 * different command-emission paths
 */
1956 if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED))
1957 rv = wpp_decode_slice(de, s, last_slice);
1959 rv = decode_slice(de, s, last_slice);
1972 memset(dpb_q_aux, 0,
1973 sizeof(*dpb_q_aux) * V4L2_HEVC_DPB_ENTRIES_NUM_MAX);
1975 // Locate ref frames
1976 // At least in the current implementation this is constant across all
1977 // slices. If this changes we will need idx mapping code.
1978 // Uses sh so here rather than trigger
1980 vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx,
1981 V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
1984 v4l2_err(&dev->v4l2_dev, "VQ gone!\n");
1988 // v4l2_info(&dev->v4l2_dev, "rpivid_h265_end of frame\n");
1989 if (write_cmd_buffer(dev, de, s))
/* Resolve each DPB entry to its capture buffer by timestamp */
1992 for (i = 0; i < dec->num_active_dpb_entries; ++i) {
1994 vb2_find_timestamp(vq, dec->dpb[i].timestamp, 0);
1995 struct vb2_buffer *buf = buffer_index < 0 ?
1997 vb2_get_buffer(vq, buffer_index);
2000 v4l2_warn(&dev->v4l2_dev,
2001 "Missing DPB ent %d, timestamp=%lld, index=%d\n",
2002 i, (long long)dec->dpb[i].timestamp,
2008 dpb_q_aux[i] = aux_q_ref_idx(ctx, buffer_index);
2010 v4l2_warn(&dev->v4l2_dev,
2011 "Missing DPB AUX ent %d, timestamp=%lld, index=%d\n",
2012 i, (long long)dec->dpb[i].timestamp,
2017 vb2_dma_contig_plane_dma_addr(buf, 0);
2020 // Move DPB from temp
2021 for (i = 0; i != V4L2_HEVC_DPB_ENTRIES_NUM_MAX; ++i) {
2022 aux_q_release(ctx, &s->ref_aux[i]);
2023 s->ref_aux[i] = dpb_q_aux[i];
2025 // Unref the old frame aux too - it is either in the DPB or not
2027 aux_q_release(ctx, &s->frame_aux);
2030 s->frame_aux = aux_q_new(ctx, run->dst->vb2_buf.index);
2032 if (!s->frame_aux) {
2033 v4l2_err(&dev->v4l2_dev,
2034 "Failed to obtain aux storage for frame\n");
2038 de->frame_aux = aux_q_ref(ctx, s->frame_aux);
2041 if (de->dpbno_col != ~0U) {
2042 if (de->dpbno_col >= dec->num_active_dpb_entries) {
2043 v4l2_err(&dev->v4l2_dev,
2044 "Col ref index %d >= %d\n",
2046 dec->num_active_dpb_entries);
2048 // Standard requires that the col pic is
2049 // constant for the duration of the pic
2050 // (text of collocated_ref_idx in H265-2 2018
2053 // Spot the collocated ref in passing
2054 de->col_aux = aux_q_ref(ctx,
2055 dpb_q_aux[de->dpbno_col]);
2058 v4l2_warn(&dev->v4l2_dev,
2059 "Missing DPB ent for col\n");
2060 // Probably need to abort if this fails
2061 // as P2 may explode on bad data
/* Success: hand off to trigger for phase-1 submission */
2067 de->state = RPIVID_DECODE_PHASE1;
2073 // Actual error reporting happens in Trigger
2074 de->state = frame_end ? RPIVID_DECODE_ERROR_DONE :
2075 RPIVID_DECODE_ERROR_CONTINUE;
2076 xtrace_fail(dev, de);
2079 //////////////////////////////////////////////////////////////////////////////
2080 // Handle PU and COEFF stream overflow
2083 // -1 Phase 1 decode error
2085 // >0 Out of space (bitmask)
2087 #define STATUS_COEFF_EXHAUSTED 8
2088 #define STATUS_PU_EXHAUSTED 16
/*
 * Read back phase-1 completion status.  Returns 0 when the command FIFO
 * status equals the number of commands submitted (all consumed), otherwise
 * a mask of STATUS_PU_EXHAUSTED/STATUS_COEFF_EXHAUSTED for recoverable
 * buffer overflows (per the header comment block above: -1 = decode error;
 * that return path is elided in this extract).
 */
2090 static int check_status(const struct rpivid_dev *const dev)
2092 const u32 cfstatus = apb_read(dev, RPI_CFSTATUS);
2093 const u32 cfnum = apb_read(dev, RPI_CFNUM);
2094 u32 status = apb_read(dev, RPI_STATUS);
2096 // Handle PU and COEFF stream overflow
2098 // this is the definition of successful completion of phase 1
2099 // it assures that status register is zero and all blocks in each tile
2101 if (cfstatus == cfnum)
2102 return 0; //No error
2104 status &= (STATUS_PU_EXHAUSTED | STATUS_COEFF_EXHAUSTED);
/*
 * Phase-2 completion IRQ callback: release the phase-1 claim credit,
 * complete the destination (frame) buffer, and unpin/complete the media
 * request (USE_REQUEST_PIN selects which of the two mechanisms is
 * compiled in; the #if lines are elided in this extract).
 */
2111 static void phase2_cb(struct rpivid_dev *const dev, void *v)
2113 struct rpivid_dec_env *const de = v;
2117 /* Done with buffers - allow new P1 */
2118 rpivid_hw_irq_active1_enable_claim(dev, 1);
2120 v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_DONE);
2121 de->frame_buf = NULL;
2124 media_request_unpin(de->req_pin);
2127 media_request_object_complete(de->req_obj);
/*
 * Runs once the phase-2 hardware is claimed: program all phase-2
 * registers (PU/coeff read buffers, output frame planes, 16 reference
 * plane slots, config/framesize/POC, collocated MV buffers), install the
 * phase-2 completion IRQ handler, then kick the hardware by writing
 * NUMROWS last (apb_write_final).
 */
2135 static void phase2_claimed(struct rpivid_dev *const dev, void *v)
2137 struct rpivid_dec_env *const de = v;
/* Intermediate PU/coeff streams produced by phase 1 */
2142 apb_write_vc_addr(dev, RPI_PURBASE, de->pu_base_vc);
2143 apb_write_vc_len(dev, RPI_PURSTRIDE, de->pu_stride);
2144 apb_write_vc_addr(dev, RPI_COEFFRBASE, de->coeff_base_vc);
2145 apb_write_vc_len(dev, RPI_COEFFRSTRIDE, de->coeff_stride);
/* Output frame: luma at frame_addr, chroma at frame_c_offset */
2147 apb_write_vc_addr(dev, RPI_OUTYBASE, de->frame_addr);
2148 apb_write_vc_addr(dev, RPI_OUTCBASE,
2149 de->frame_addr + de->frame_c_offset);
2150 apb_write_vc_len(dev, RPI_OUTYSTRIDE, de->frame_stride);
2151 apb_write_vc_len(dev, RPI_OUTCSTRIDE, de->frame_stride);
2153 // v4l2_info(&dev->v4l2_dev, "Frame: Y=%llx, C=%llx, Stride=%x\n",
2154 // de->frame_addr, de->frame_addr + de->frame_c_offset,
2155 // de->frame_stride);
/* 16 hardware reference slots at fixed register offsets */
2157 for (i = 0; i < 16; i++) {
2158 // Strides are in fact unused but fill in anyway
2159 apb_write_vc_addr(dev, 0x9000 + 16 * i, de->ref_addrs[i]);
2160 apb_write_vc_len(dev, 0x9004 + 16 * i, de->frame_stride);
2161 apb_write_vc_addr(dev, 0x9008 + 16 * i,
2162 de->ref_addrs[i] + de->frame_c_offset);
2163 apb_write_vc_len(dev, 0x900C + 16 * i, de->frame_stride);
/* Pre-calculated in rpivid_h265_setup() */
2166 apb_write(dev, RPI_CONFIG2, de->rpi_config2);
2167 apb_write(dev, RPI_FRAMESIZE, de->rpi_framesize);
2168 apb_write(dev, RPI_CURRPOC, de->rpi_currpoc);
2169 // v4l2_info(&dev->v4l2_dev, "Config2=%#x, FrameSize=%#x, POC=%#x\n",
2170 // de->rpi_config2, de->rpi_framesize, de->rpi_currpoc);
2172 // collocated reads/writes
2173 apb_write_vc_len(dev, RPI_COLSTRIDE,
2174 de->ctx->colmv_stride); // Read vals
2175 apb_write_vc_len(dev, RPI_MVSTRIDE,
2176 de->ctx->colmv_stride); // Write vals
2177 apb_write_vc_addr(dev, RPI_MVBASE,
2178 !de->frame_aux ? 0 : de->frame_aux->col.addr);
2179 apb_write_vc_addr(dev, RPI_COLBASE,
2180 !de->col_aux ? 0 : de->col_aux->col.addr);
2182 //v4l2_info(&dev->v4l2_dev,
2183 // "Mv=%llx, Col=%llx, Stride=%x, Buf=%llx->%llx\n",
2184 // de->rpi_mvbase, de->rpi_colbase, de->ctx->colmv_stride,
2185 // de->ctx->colmvbuf.addr, de->ctx->colmvbuf.addr +
2186 // de->ctx->colmvbuf.size);
2188 rpivid_hw_irq_active2_irq(dev, &de->irq_ent, phase2_cb, de);
/* Final write starts the hardware */
2190 apb_write_final(dev, RPI_NUMROWS, de->pic_height_in_ctbs_y);
2195 static void phase1_claimed(struct rpivid_dev *const dev, void *v);
2197 // release any and all objects associated with de
2198 // and reenable phase 1 if required
/*
 * Error path cleanup for phase 1: return src/dst buffers as ERROR,
 * unpin/complete the media request, unblock the m2m job queue if we were
 * the blocking entry, and release the phase-1 claim credit.  (dec env
 * deletion is in elided lines — verify against full source.)
 */
2199 static void phase1_err_fin(struct rpivid_dev *const dev,
2200 struct rpivid_ctx *const ctx,
2201 struct rpivid_dec_env *const de)
2203 /* Return all detached buffers */
2205 v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_ERROR);
2208 v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_ERROR);
2209 de->frame_buf = NULL;
2212 media_request_unpin(de->req_pin);
2216 media_request_object_complete(de->req_obj);
2222 /* Reenable phase 0 if we were blocking */
2223 if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1)
2224 v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
2226 /* Done with P1-P2 buffers - allow new P1 */
2227 rpivid_hw_irq_active1_enable_claim(dev, 1);
/*
 * Threaded (non-IRQ) recovery after a phase-1 out-of-space status:
 * grow whichever intermediate buffer was exhausted (PU and/or coeff) to
 * next_size(), then retry the whole phase 1 via phase1_claimed().  On
 * realloc failure falls through to the fatal tail which fails the job
 * via phase1_err_fin().
 */
2230 static void phase1_thread(struct rpivid_dev *const dev, void *v)
2232 struct rpivid_dec_env *const de = v;
2233 struct rpivid_ctx *const ctx = de->ctx;
2235 struct rpivid_gptr *const pu_gptr = ctx->pu_bufs + ctx->p2idx;
2236 struct rpivid_gptr *const coeff_gptr = ctx->coeff_bufs + ctx->p2idx;
2240 if (de->p1_status & STATUS_PU_EXHAUSTED) {
2241 if (gptr_realloc_new(dev, pu_gptr, next_size(pu_gptr->size))) {
2242 v4l2_err(&dev->v4l2_dev,
2243 "%s: PU realloc (%zx) failed\n",
2244 __func__, pu_gptr->size);
2247 v4l2_info(&dev->v4l2_dev, "%s: PU realloc (%zx) OK\n",
2248 __func__, pu_gptr->size);
2251 if (de->p1_status & STATUS_COEFF_EXHAUSTED) {
2252 if (gptr_realloc_new(dev, coeff_gptr,
2253 next_size(coeff_gptr->size))) {
2254 v4l2_err(&dev->v4l2_dev,
2255 "%s: Coeff realloc (%zx) failed\n",
2256 __func__, coeff_gptr->size);
2259 v4l2_info(&dev->v4l2_dev, "%s: Coeff realloc (%zx) OK\n",
2260 __func__, coeff_gptr->size);
/* Retry phase 1 with the bigger buffers */
2263 phase1_claimed(dev, de);
/* Fatal: gptr_realloc_new may have freed the old alloc too */
2268 if (!pu_gptr->addr || !coeff_gptr->addr) {
2269 v4l2_err(&dev->v4l2_dev,
2270 "%s: Fatal: failed to reclaim old alloc\n",
2274 xtrace_fail(dev, de);
2275 phase1_err_fin(dev, ctx, de);
2278 /* Always called in irq context (this is good) */
/*
 * Phase-1 completion handler (always IRQ context, per comment above):
 * classify the HW status — 0: success, <0: hard error, >0: out-of-space
 * (punted to phase1_thread for realloc).  On success returns the src
 * buffer, advances p2idx, unblocks the job queue if needed, and claims
 * phase 2.  Error path (elided gotos) ends at phase1_err_fin().
 */
2279 static void phase1_cb(struct rpivid_dev *const dev, void *v)
2281 struct rpivid_dec_env *const de = v;
2282 struct rpivid_ctx *const ctx = de->ctx;
2286 de->p1_status = check_status(dev);
2288 if (de->p1_status != 0) {
2289 v4l2_info(&dev->v4l2_dev, "%s: Post wait: %#x\n",
2290 __func__, de->p1_status);
2292 if (de->p1_status < 0)
2295 /* Need to realloc - push onto a thread rather than IRQ */
2296 rpivid_hw_irq_active1_thread(dev, &de->irq_ent,
2301 v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_DONE);
2304 /* All phase1 error paths done - it is safe to inc p2idx */
2306 (ctx->p2idx + 1 >= RPIVID_P2BUF_COUNT) ? 0 : ctx->p2idx + 1;
2308 /* Renable the next setup if we were blocking */
2309 if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1) {
2310 xtrace_fin(dev, de);
2311 v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
2314 rpivid_hw_irq_active2_claim(dev, &de->irq_ent, phase2_claimed, de);
2320 xtrace_fail(dev, de);
2321 phase1_err_fin(dev, ctx, de);
/*
 * Runs once phase-1 hardware is claimed: carve per-CTB-row PU/coeff
 * write strides out of the current p2idx buffers, program the write
 * registers, set the command count, install phase1_cb and start the
 * hardware with the final CFBASE write.  Error tail (elided goto target)
 * fails the job via phase1_err_fin().
 */
2324 static void phase1_claimed(struct rpivid_dev *const dev, void *v)
2326 struct rpivid_dec_env *const de = v;
2327 struct rpivid_ctx *const ctx = de->ctx;
2329 const struct rpivid_gptr * const pu_gptr = ctx->pu_bufs + ctx->p2idx;
2330 const struct rpivid_gptr * const coeff_gptr = ctx->coeff_bufs +
/* Split each buffer evenly across picture CTB rows, 64-aligned */
2338 de->pu_base_vc = pu_gptr->addr;
2340 ALIGN_DOWN(pu_gptr->size / de->pic_height_in_ctbs_y, 64);
2342 de->coeff_base_vc = coeff_gptr->addr;
2344 ALIGN_DOWN(coeff_gptr->size / de->pic_height_in_ctbs_y, 64);
2346 /* phase1_claimed blocked until cb_phase1 completed so p2idx inc
2347 * in cb_phase1 after error detection
2350 apb_write_vc_addr(dev, RPI_PUWBASE, de->pu_base_vc);
2351 apb_write_vc_len(dev, RPI_PUWSTRIDE, de->pu_stride);
2352 apb_write_vc_addr(dev, RPI_COEFFWBASE, de->coeff_base_vc);
2353 apb_write_vc_len(dev, RPI_COEFFWSTRIDE, de->coeff_stride);
2355 // Trigger command FIFO
2356 apb_write(dev, RPI_CFNUM, de->cmd_len);
2359 rpivid_hw_irq_active1_irq(dev, &de->irq_ent, phase1_cb, de);
2361 // And start the h/w
2362 apb_write_vc_addr_final(dev, RPI_CFBASE, de->cmd_addr);
2368 xtrace_fail(dev, de);
2369 phase1_err_fin(dev, ctx, de);
/*
 * Tear down the per-context decode state: drop all DPB aux references
 * and the current frame aux.  (NULL check, free_ps_info() and kfree of
 * the state struct are in elided lines — verify against full source.)
 */
2372 static void dec_state_delete(struct rpivid_ctx *const ctx)
2375 struct rpivid_dec_state *const s = ctx->state;
2383 for (i = 0; i != HEVC_MAX_REFS; ++i)
2384 aux_q_release(ctx, &s->ref_aux[i]);
2385 aux_q_release(ctx, &s->frame_aux);
2392 wait_queue_head_t wq;
2393 struct rpivid_hw_irq_ent irq_ent;
/*
 * Final step of irq_sync(): phase 2 claimed means all previously queued
 * IRQ work has drained — flag done and wake the waiter.
 */
2396 static void phase2_sync_claimed(struct rpivid_dev *const dev, void *v)
2398 struct irq_sync *const sync = v;
2400 atomic_set(&sync->done, 1);
/*
 * First step of irq_sync(): once phase 1 is claimed, give back the claim
 * credit (phase 1 enables are counted — see irq_sync comment below) and
 * chain a claim on phase 2.
 */
2404 static void phase1_sync_claimed(struct rpivid_dev *const dev, void *v)
2406 struct irq_sync *const sync = v;
2408 rpivid_hw_irq_active1_enable_claim(dev, 1);
2409 rpivid_hw_irq_active2_claim(dev, &sync->irq_ent, phase2_sync_claimed, sync);
2412 /* Sync with IRQ operations
2414 * Claims phase1 and phase2 in turn and waits for the phase2 claim so any
2415 * pending IRQ ops will have completed by the time this returns
2417 * phase1 has counted enables so must reenable once claimed
2418 * phase2 has unlimited enables
/*
 * Drain pending IRQ work by claiming phase 1 then phase 2 in turn (see
 * the comment block above) and blocking until the phase-2 claim callback
 * fires.  `sync` lives on the stack — safe because we wait here.
 */
2420 static void irq_sync(struct rpivid_dev *const dev)
2422 struct irq_sync sync;
2424 atomic_set(&sync.done, 0);
2425 init_waitqueue_head(&sync.wq);
2427 rpivid_hw_irq_active1_claim(dev, &sync.irq_ent, phase1_sync_claimed, &sync);
2428 wait_event(sync.wq, atomic_read(&sync.done));
/*
 * Release all per-context HEVC resources.  Order matters: decode envs
 * and state hold references into the gptr buffers, so they are destroyed
 * first (per the in-body comment), then every bitbuf/PU/coeff buffer.
 */
2431 static void h265_ctx_uninit(struct rpivid_dev *const dev, struct rpivid_ctx *ctx)
2435 dec_env_uninit(ctx);
2436 dec_state_delete(ctx);
2438 // dec_env & state must be killed before this to release the buffer to
2442 for (i = 0; i != ARRAY_SIZE(ctx->bitbufs); ++i)
2443 gptr_free(dev, ctx->bitbufs + i);
2444 for (i = 0; i != ARRAY_SIZE(ctx->pu_bufs); ++i)
2445 gptr_free(dev, ctx->pu_bufs + i);
2446 for (i = 0; i != ARRAY_SIZE(ctx->coeff_bufs); ++i)
2447 gptr_free(dev, ctx->coeff_bufs + i);
/*
 * .stop op: tear down the context's HEVC decode resources.
 * (An irq_sync() call likely sits in the elided line between the log and
 * h265_ctx_uninit() — verify against full source.)
 */
2450 static void rpivid_h265_stop(struct rpivid_ctx *ctx)
2452 struct rpivid_dev *const dev = ctx->dev;
2454 v4l2_info(&dev->v4l2_dev, "%s\n", __func__);
2457 h265_ctx_uninit(dev, ctx);
/*
 * .start op: allocate the decode state, the decode-env pool, and
 * initial ("finger in the air") PU/coeff buffers sized from the capture
 * format — these grow on demand in phase1_thread().  Returns 0 or a
 * negative errno; any failure unwinds through h265_ctx_uninit().
 */
2460 static int rpivid_h265_start(struct rpivid_ctx *ctx)
2462 struct rpivid_dev *const dev = ctx->dev;
2465 unsigned int w = ctx->dst_fmt.width;
2466 unsigned int h = ctx->dst_fmt.height;
2471 #if DEBUG_TRACE_P1_CMD
2475 // Generate a sanitised WxH for memory alloc
2476 // Assume HD if unset
2487 v4l2_info(&dev->v4l2_dev, "%s: (%dx%d)\n", __func__,
2488 ctx->dst_fmt.width, ctx->dst_fmt.height);
2492 ctx->state = kzalloc(sizeof(*ctx->state), GFP_KERNEL);
2494 v4l2_err(&dev->v4l2_dev, "Failed to allocate decode state\n");
2498 if (dec_env_init(ctx) != 0) {
2499 v4l2_err(&dev->v4l2_dev, "Failed to allocate decode envs\n");
2503 // Finger in the air PU & Coeff alloc
2504 // Will be realloced if too small
2505 coeff_alloc = rpivid_round_up_size(wxh);
2506 pu_alloc = rpivid_round_up_size(wxh / 4);
2507 for (i = 0; i != ARRAY_SIZE(ctx->pu_bufs); ++i) {
2508 // Don't actually need a kernel mapping here
2509 if (gptr_alloc(dev, ctx->pu_bufs + i, pu_alloc,
2510 DMA_ATTR_NO_KERNEL_MAPPING))
2512 if (gptr_alloc(dev, ctx->coeff_bufs + i, coeff_alloc,
2513 DMA_ATTR_NO_KERNEL_MAPPING))
/* Error unwind: free whatever was allocated */
2521 h265_ctx_uninit(dev, ctx);
/*
 * .trigger op: act on the state left by rpivid_h265_setup().
 * SLICE_START/CONTINUE: frame not finished — complete the src buffer and
 * job now.  ERROR states: complete with VB2_BUF_STATE_ERROR.  PHASE1:
 * detach src/dst buffers, pin (or bind, if !USE_REQUEST_PIN) the media
 * request so controls stay valid across IRQ phases, throttle via p1out,
 * and claim phase-1 hardware with phase1_claimed.
 */
2525 static void rpivid_h265_trigger(struct rpivid_ctx *ctx)
2527 struct rpivid_dev *const dev = ctx->dev;
2528 struct rpivid_dec_env *const de = ctx->dec0;
/* NULL de is treated as an uncleared continuing error */
2532 switch (!de ? RPIVID_DECODE_ERROR_CONTINUE : de->state) {
2533 case RPIVID_DECODE_SLICE_START:
2534 de->state = RPIVID_DECODE_SLICE_CONTINUE;
2536 case RPIVID_DECODE_SLICE_CONTINUE:
2537 v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
2538 VB2_BUF_STATE_DONE);
2543 v4l2_err(&dev->v4l2_dev, "%s: Unexpected state: %d\n", __func__,
2546 case RPIVID_DECODE_ERROR_DONE:
2550 case RPIVID_DECODE_ERROR_CONTINUE:
2551 xtrace_fin(dev, de);
2552 v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
2553 VB2_BUF_STATE_ERROR);
2556 case RPIVID_DECODE_PHASE1:
2559 #if !USE_REQUEST_PIN
2560 /* Alloc a new request object - needs to be alloced dynamically
2561 * as the media request will release it some random time after
2564 de->req_obj = kmalloc(sizeof(*de->req_obj), GFP_KERNEL);
2566 xtrace_fail(dev, de);
2568 v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev,
2570 VB2_BUF_STATE_ERROR);
2573 media_request_object_init(de->req_obj);
2574 #warning probably needs to _get the req obj too
/* Advance the round-robin phase-1 bitbuf index */
2576 ctx->p1idx = (ctx->p1idx + 1 >= RPIVID_P1BUF_COUNT) ?
2579 /* We know we have src & dst so no need to test */
2580 de->src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
2581 de->frame_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);
2584 de->req_pin = de->src_buf->vb2_buf.req_obj.req;
2585 media_request_pin(de->req_pin);
2587 media_request_object_bind(de->src_buf->vb2_buf.req_obj.req,
2588 &dst_req_obj_ops, de, false,
2592 /* We could get rid of the src buffer here if we've already
2593 * copied it, but we don't copy the last buffer unless it
2594 * didn't return a contig dma addr and that shouldn't happen
2597 /* Enable the next setup if our Q isn't too big */
2598 if (atomic_add_return(1, &ctx->p1out) < RPIVID_P1BUF_COUNT) {
2599 xtrace_fin(dev, de);
2600 v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
2603 rpivid_hw_irq_active1_claim(dev, &de->irq_ent, phase1_claimed,
/* Decode op table registered for HEVC: setup/start/stop/trigger hooks */
2610 const struct rpivid_dec_ops rpivid_dec_ops_h265 = {
2611 .setup = rpivid_h265_setup,
2612 .start = rpivid_h265_start,
2613 .stop = rpivid_h265_stop,
2614 .trigger = rpivid_h265_trigger,
/*
 * .try_ctrl validator for the HEVC SPS control: reject anything the
 * hardware cannot decode — non-4:2:0 chroma, depths other than 8/10-bit,
 * zero or >4096 dimensions — and, once the capture format is set, any
 * SPS whose depth or size disagrees with it.  Returns 0 if acceptable
 * (-EINVAL returns are in elided lines).
 */
2617 static int try_ctrl_sps(struct v4l2_ctrl *ctrl)
2619 const struct v4l2_ctrl_hevc_sps *const sps = ctrl->p_new.p_hevc_sps;
2620 struct rpivid_ctx *const ctx = ctrl->priv;
2621 struct rpivid_dev *const dev = ctx->dev;
/* chroma_format_idc 1 == 4:2:0, the only supported sampling */
2623 if (sps->chroma_format_idc != 1) {
2624 v4l2_warn(&dev->v4l2_dev,
2625 "Chroma format (%d) unsupported\n",
2626 sps->chroma_format_idc);
2630 if (sps->bit_depth_luma_minus8 != 0 &&
2631 sps->bit_depth_luma_minus8 != 2) {
2632 v4l2_warn(&dev->v4l2_dev,
2633 "Luma depth (%d) unsupported\n",
2634 sps->bit_depth_luma_minus8 + 8);
2638 if (sps->bit_depth_luma_minus8 != sps->bit_depth_chroma_minus8) {
2639 v4l2_warn(&dev->v4l2_dev,
2640 "Chroma depth (%d) != Luma depth (%d)\n",
2641 sps->bit_depth_chroma_minus8 + 8,
2642 sps->bit_depth_luma_minus8 + 8);
2646 if (!sps->pic_width_in_luma_samples ||
2647 !sps->pic_height_in_luma_samples ||
2648 sps->pic_width_in_luma_samples > 4096 ||
2649 sps->pic_height_in_luma_samples > 4096) {
2650 v4l2_warn(&dev->v4l2_dev,
2651 "Bad sps width (%u) x height (%u)\n",
2652 sps->pic_width_in_luma_samples,
2653 sps->pic_height_in_luma_samples);
/* No capture format yet: cross-checks below don't apply */
2657 if (!ctx->dst_fmt_set)
2660 if ((sps->bit_depth_luma_minus8 == 0 &&
2661 ctx->dst_fmt.pixelformat != V4L2_PIX_FMT_NV12_COL128) ||
2662 (sps->bit_depth_luma_minus8 == 2 &&
2663 ctx->dst_fmt.pixelformat != V4L2_PIX_FMT_NV12_10_COL128)) {
2664 v4l2_warn(&dev->v4l2_dev,
2665 "SPS luma depth %d does not match capture format\n",
2666 sps->bit_depth_luma_minus8 + 8);
2670 if (sps->pic_width_in_luma_samples > ctx->dst_fmt.width ||
2671 sps->pic_height_in_luma_samples > ctx->dst_fmt.height) {
2672 v4l2_warn(&dev->v4l2_dev,
2673 "SPS size (%dx%d) > capture size (%d,%d)\n",
2674 sps->pic_width_in_luma_samples,
2675 sps->pic_height_in_luma_samples,
2677 ctx->dst_fmt.height);
/* Control ops hooking SPS validation into the V4L2 control framework */
2684 const struct v4l2_ctrl_ops rpivid_hevc_sps_ctrl_ops = {
2685 .try_ctrl = try_ctrl_sps,
/*
 * .try_ctrl validator for the HEVC PPS control: reject the unsupported
 * combination of WPP (entropy coding sync) with a multi-tile layout.
 * Returns 0 otherwise (the error return is in an elided line).
 */
2688 static int try_ctrl_pps(struct v4l2_ctrl *ctrl)
2690 const struct v4l2_ctrl_hevc_pps *const pps = ctrl->p_new.p_hevc_pps;
2691 struct rpivid_ctx *const ctx = ctrl->priv;
2692 struct rpivid_dev *const dev = ctx->dev;
2695 V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED) &&
2697 V4L2_HEVC_PPS_FLAG_TILES_ENABLED) &&
2698 (pps->num_tile_columns_minus1 || pps->num_tile_rows_minus1)) {
2699 v4l2_warn(&dev->v4l2_dev,
2700 "WPP + Tiles not supported\n");
/* Control ops hooking PPS validation into the V4L2 control framework */
2707 const struct v4l2_ctrl_ops rpivid_hevc_pps_ctrl_ops = {
2708 .try_ctrl = try_ctrl_pps,