va: enable multi tile support for H265 encoder.
author He Junyan <junyan.he@intel.com>
Mon, 25 Apr 2022 07:49:31 +0000 (15:49 +0800)
committer He Junyan <junyan.he@intel.com>
Thu, 1 Dec 2022 01:45:07 +0000 (09:45 +0800)
Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/2036>

subprojects/gst-plugins-bad/sys/va/gstvaencoder.c
subprojects/gst-plugins-bad/sys/va/gstvaencoder.h
subprojects/gst-plugins-bad/sys/va/gstvah265enc.c

index 12ecafa..9e08df7 100644 (file)
@@ -603,6 +603,35 @@ gst_va_encoder_get_max_slice_num (GstVaEncoder * self,
   return attrib.value;
 }
 
+/* Queries the driver's VAConfigAttribEncSliceStructure attribute for the
+ * given @profile / @entrypoint pair.
+ *
+ * Returns the raw attribute value, which callers test against the
+ * VA_ENC_SLICE_STRUCTURE_* bits. Returns 0 (no bit set) when @self is not
+ * a GstVaEncoder, when @profile is VAProfileNone, when the query fails or
+ * when the driver reports the attribute as unsupported. */
+gint32
+gst_va_encoder_get_slice_structure (GstVaEncoder * self,
+    VAProfile profile, VAEntrypoint entrypoint)
+{
+  VAStatus status;
+  VADisplay dpy;
+  VAConfigAttrib attrib = {.type = VAConfigAttribEncSliceStructure };
+
+  g_return_val_if_fail (GST_IS_VA_ENCODER (self), 0);
+
+  /* The result is used as a bit mask: returning -1 here would set every
+     bit and make callers believe all slice structures are supported.
+     Report "nothing supported" like the other failure paths do. */
+  if (profile == VAProfileNone)
+    return 0;
+
+  dpy = gst_va_display_get_va_dpy (self->display);
+  status = vaGetConfigAttributes (dpy, profile, entrypoint, &attrib, 1);
+  if (status != VA_STATUS_SUCCESS) {
+    GST_WARNING_OBJECT (self, "Failed to query encoding slice structure: %s",
+        vaErrorStr (status));
+    return 0;
+  }
+
+  if (attrib.value == VA_ATTRIB_NOT_SUPPORTED) {
+    GST_WARNING_OBJECT (self, "Driver does not support slice structure");
+    return 0;
+  }
+
+  return attrib.value;
+}
+
 gboolean
 gst_va_encoder_get_max_num_reference (GstVaEncoder * self,
     VAProfile profile, VAEntrypoint entrypoint,
@@ -767,6 +796,35 @@ gst_va_encoder_has_trellis (GstVaEncoder * self,
   return attrib.value & VA_ENC_QUANTIZATION_TRELLIS_SUPPORTED;
 }
 
+/* Tells whether the driver advertises tile encoding
+ * (VAConfigAttribEncTileSupport) for the given @profile / @entrypoint.
+ *
+ * Returns FALSE when @self is not a GstVaEncoder, when @profile is
+ * VAProfileNone, when the attribute query fails, or when the driver
+ * reports the attribute as unsupported or as zero; TRUE otherwise. */
+gboolean
+gst_va_encoder_has_tile (GstVaEncoder * self,
+    VAProfile profile, VAEntrypoint entrypoint)
+{
+  VAConfigAttrib tile_attr = {.type = VAConfigAttribEncTileSupport };
+  VADisplay va_dpy;
+  VAStatus ret;
+
+  g_return_val_if_fail (GST_IS_VA_ENCODER (self), FALSE);
+
+  if (profile == VAProfileNone)
+    return FALSE;
+
+  va_dpy = gst_va_display_get_va_dpy (self->display);
+  ret = vaGetConfigAttributes (va_dpy, profile, entrypoint, &tile_attr, 1);
+
+  if (ret != VA_STATUS_SUCCESS) {
+    GST_WARNING_OBJECT (self, "Failed to query the tile: %s",
+        vaErrorStr (ret));
+    return FALSE;
+  }
+
+  if (tile_attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+    GST_WARNING_OBJECT (self, "Driver does not support tile");
+    return FALSE;
+  }
+
+  /* Any non-zero value counts as "tiles supported". */
+  return tile_attr.value != 0;
+}
+
 guint32
 gst_va_encoder_get_rtformat (GstVaEncoder * self,
     VAProfile profile, VAEntrypoint entrypoint)
index 3f13291..290fb35 100644 (file)
@@ -64,6 +64,9 @@ gboolean              gst_va_encoder_has_profile          (GstVaEncoder * self,
 gint                  gst_va_encoder_get_max_slice_num    (GstVaEncoder * self,
                                                            VAProfile profile,
                                                            VAEntrypoint entrypoint);
+gint32                gst_va_encoder_get_slice_structure  (GstVaEncoder * self,
+                                                           VAProfile profile,
+                                                           VAEntrypoint entrypoint);
 gboolean              gst_va_encoder_get_max_num_reference (GstVaEncoder * self,
                                                             VAProfile profile,
                                                             VAEntrypoint entrypoint,
@@ -81,6 +84,9 @@ guint32               gst_va_encoder_get_quality_level    (GstVaEncoder * self,
 gboolean              gst_va_encoder_has_trellis          (GstVaEncoder * self,
                                                            VAProfile profile,
                                                            VAEntrypoint entrypoint);
+gboolean              gst_va_encoder_has_tile             (GstVaEncoder * self,
+                                                           VAProfile profile,
+                                                           VAEntrypoint entrypoint);
 guint32               gst_va_encoder_get_rtformat         (GstVaEncoder * self,
                                                            VAProfile profile,
                                                            VAEntrypoint entrypoint);
index 09eda53..bd29586 100644 (file)
@@ -90,6 +90,8 @@ enum
   PROP_RATE_CONTROL,
   PROP_CPB_SIZE,
   PROP_AUD,
+  PROP_NUM_TILE_COLS,
+  PROP_NUM_TILE_ROWS,
   N_PROPERTIES
 };
 
@@ -113,6 +115,11 @@ static GstObjectClass *parent_class = NULL;
 
 #define MAX_GOP_SIZE  1024
 
+/* Upper bound on the number of tile columns. Per the original note this is
+   taken from the level limits in annex A of the H.265 spec. It sizes the
+   per-column tile arrays and caps the "num-tile-cols" property. */
+#define MAX_COL_TILES 20
+/* Upper bound on the number of tile rows. Per the original note this is
+   taken from the level limits in annex A of the H.265 spec. It sizes the
+   per-row tile arrays and caps the "num-tile-rows" property. */
+#define MAX_ROW_TILES 22
+
 /* *INDENT-OFF* */
 struct _GstVaH265EncClass
 {
@@ -150,6 +157,8 @@ struct _GstVaH265Enc
     gboolean aud;
     guint32 mbbrc;
     guint32 num_slices;
+    guint32 num_tile_cols;
+    guint32 num_tile_rows;
     guint32 cpb_size;
     guint32 target_percentage;
     guint32 target_usage;
@@ -180,11 +189,27 @@ struct _GstVaH265Enc
   guint min_cr;
 
   gboolean aud;
-  guint32 num_slices;
   guint32 packed_headers;
 
   struct
   {
+    guint32 num_slices;
+    /* start address in CTUs */
+    guint32 *slice_segment_address;
+    /* CTUs in this slice */
+    guint32 *num_ctu_in_slice;
+
+    gboolean slice_span_tiles;
+    guint32 num_tile_cols;
+    guint32 num_tile_rows;
+    /* CTUs in each tile column */
+    guint32 *tile_ctu_cols;
+    /* CTUs in each tile row */
+    guint32 *tile_ctu_rows;
+  } partition;
+
+  struct
+  {
     guint8 log2_min_luma_coding_block_size_minus3;
     guint8 log2_diff_max_min_luma_coding_block_size;
     guint8 log2_min_transform_block_size_minus2;
@@ -447,6 +472,12 @@ _enc_frame (GstVideoCodecFrame * frame)
   return enc_frame;
 }
 
+/* Tile encoding is considered enabled as soon as the configured grid
+   (columns x rows) holds more than one tile. */
+static inline gboolean
+_is_tile_enabled (GstVaH265Enc * self)
+{
+  const guint32 total_tiles =
+      self->partition.num_tile_cols * self->partition.num_tile_rows;
+
+  return total_tiles > 1;
+}
+
 static GstH265NalUnitType
 _h265_nal_unit_type (GstVaH265EncFrame * frame)
 {
@@ -1290,12 +1321,12 @@ _h265_fill_picture_parameter (GstVaH265Enc * self, GstVaH265EncFrame * frame,
     VAEncPictureParameterBufferHEVC * pic_param, gint collocated_poc)
 {
   GstVaBaseEnc *base = GST_VA_BASE_ENC (self);
-  gboolean tiles_enabled_flag;
   guint8 num_ref_idx_l0_default_active_minus1 = 0;
   guint8 num_ref_idx_l1_default_active_minus1 = 0;
   guint hierarchical_level_plus1 = 0;
   guint i;
 
+  /* *INDENT-OFF* */
   if (self->gop.b_pyramid) {
     /* I/P is the base hierarchical level 0, L0 level B is 1, and so on. */
     hierarchical_level_plus1 = 1;
@@ -1315,10 +1346,6 @@ _h265_fill_picture_parameter (GstVaH265Enc * self, GstVaH265EncFrame * frame,
         (self->gop.backward_ref_num > 0 ? self->gop.backward_ref_num - 1 : 0);
   }
 
-  /* TODO: multi tile support. */
-  tiles_enabled_flag = 0;
-
-  /* *INDENT-OFF* */
   *pic_param = (VAEncPictureParameterBufferHEVC) {
     .decoded_curr_pic.picture_id =
         gst_va_encode_picture_get_reconstruct_surface (frame->picture),
@@ -1357,10 +1384,10 @@ _h265_fill_picture_parameter (GstVaH265Enc * self, GstVaH265EncFrame * frame,
       .weighted_bipred_flag = self->features.weighted_bipred_flag,
       .transquant_bypass_enabled_flag =
           self->features.transquant_bypass_enabled_flag,
-      .tiles_enabled_flag = tiles_enabled_flag,
+      .tiles_enabled_flag = _is_tile_enabled (self),
       .entropy_coding_sync_enabled_flag = 0,
       /* When we enable multi tiles, enable this. */
-      .loop_filter_across_tiles_enabled_flag = tiles_enabled_flag,
+      .loop_filter_across_tiles_enabled_flag = _is_tile_enabled (self),
       .pps_loop_filter_across_slices_enabled_flag = 1,
       /* Should not change the scaling list, not used now */
       .scaling_list_data_present_flag =
@@ -1426,6 +1453,24 @@ _h265_fill_picture_parameter (GstVaH265Enc * self, GstVaH265EncFrame * frame,
     pic_param->collocated_ref_pic_index = 0xFF;
   }
 
+  /* Setup tile info */
+  if (pic_param->pic_fields.bits.tiles_enabled_flag) {
+    /* Always set loop filter across tiles enabled now */
+    pic_param->pic_fields.bits.loop_filter_across_tiles_enabled_flag = 1;
+
+    pic_param->num_tile_columns_minus1 = self->partition.num_tile_cols - 1;
+    pic_param->num_tile_rows_minus1 = self->partition.num_tile_rows - 1;
+
+    /* The VA row_height_minus1 and column_width_minus1 size is 1 smaller
+       than the MAX_COL_TILES and MAX_ROW_TILES, which means the driver
+       can deduce the last tile's size based on the picture info. We need
+       to take care of the array size here. */
+    for (i = 0; i < MIN (self->partition.num_tile_cols, 19); i++)
+      pic_param->column_width_minus1[i] = self->partition.tile_ctu_cols[i] - 1;
+    for (i = 0; i < MIN (self->partition.num_tile_rows, 21); i++)
+      pic_param->row_height_minus1[i] = self->partition.tile_ctu_rows[i] - 1;
+  }
+
   return TRUE;
 }
 
@@ -1615,33 +1660,16 @@ _h265_add_slices (GstVaH265Enc * self,
     gint negative_pocs[16], guint num_negative_pics,
     gint positive_pocs[16], guint num_positive_pics)
 {
-  guint ctu_size;
-  guint ctus_per_slice, ctus_mod_slice, cur_slice_ctus;
-  guint last_ctu_index;
   guint i_slice;
   VAEncSliceParameterBufferHEVC slice;
   GstH265SliceHdr slice_hdr;
 
-  ctu_size = self->ctu_width * self->ctu_height;
-
-  g_assert (self->num_slices && self->num_slices < ctu_size);
-
-  ctus_per_slice = ctu_size / self->num_slices;
-  ctus_mod_slice = ctu_size % self->num_slices;
-  last_ctu_index = 0;
-
-  for (i_slice = 0; i_slice < self->num_slices; i_slice++) {
-    cur_slice_ctus = ctus_per_slice;
-    /* Scatter the remainder to each slice */
-    if (ctus_mod_slice) {
-      ++cur_slice_ctus;
-      --ctus_mod_slice;
-    }
-
-    if (!_h265_fill_slice_parameter (self, frame, last_ctu_index,
-            cur_slice_ctus, (i_slice == self->num_slices - 1),
-            list_forward, list_forward_num,
-            list_backward, list_backward_num, &slice))
+  for (i_slice = 0; i_slice < self->partition.num_slices; i_slice++) {
+    if (!_h265_fill_slice_parameter (self, frame,
+            self->partition.slice_segment_address[i_slice],
+            self->partition.num_ctu_in_slice[i_slice],
+            (i_slice == self->partition.num_slices - 1), list_forward,
+            list_forward_num, list_backward, list_backward_num, &slice))
       return FALSE;
 
     if (!_h265_add_slice_parameter (self, frame, &slice))
@@ -1656,10 +1684,6 @@ _h265_add_slices (GstVaH265Enc * self,
       if (!_h265_add_slice_header (self, frame, &slice_hdr))
         return FALSE;
     }
-
-    /* set calculation for next slice */
-    last_ctu_index += cur_slice_ctus;
-    g_assert (last_ctu_index <= ctu_size);
   }
 
   return TRUE;
@@ -2258,7 +2282,9 @@ gst_va_h265_enc_reset_state (GstVaBaseEnc * base)
   GST_OBJECT_LOCK (self);
   self->features.use_trellis = self->prop.use_trellis;
   self->aud = self->prop.aud;
-  self->num_slices = self->prop.num_slices;
+  self->partition.num_slices = self->prop.num_slices;
+  self->partition.num_tile_cols = self->prop.num_tile_cols;
+  self->partition.num_tile_rows = self->prop.num_tile_rows;
   self->gop.idr_period = self->prop.key_int_max;
   self->gop.num_bframes = self->prop.num_bframes;
   self->gop.b_pyramid = self->prop.b_pyramid;
@@ -2297,6 +2323,12 @@ gst_va_h265_enc_reset_state (GstVaBaseEnc * base)
 
   self->packed_headers = 0;
 
+  self->partition.slice_span_tiles = FALSE;
+  g_clear_pointer (&self->partition.slice_segment_address, g_free);
+  g_clear_pointer (&self->partition.num_ctu_in_slice, g_free);
+  g_clear_pointer (&self->partition.tile_ctu_cols, g_free);
+  g_clear_pointer (&self->partition.tile_ctu_rows, g_free);
+
   self->features.log2_min_luma_coding_block_size_minus3 = 0;
   self->features.log2_diff_max_min_luma_coding_block_size = 0;
   self->features.log2_diff_max_min_luma_coding_block_size = 0;
@@ -2489,37 +2521,375 @@ out:
   update_property (bool, obj, old_val, new_val, prop_id)
 
+/* Computes the tile layout and the per-slice CTU ranges used when tile
+ * encoding is enabled.
+ *
+ * Reads self->partition.num_slices, num_tile_cols, num_tile_rows and
+ * slice_span_tiles together with self->ctu_width / ctu_height, and
+ * allocates and fills:
+ *  - partition.tile_ctu_cols[] / tile_ctu_rows[]: width / height in CTUs
+ *    of every tile column / row (uniform spacing);
+ *  - partition.num_ctu_in_slice[]: CTU count of every slice;
+ *  - partition.slice_segment_address[]: first CTU of every slice, converted
+ *    from tile-scan order to picture raster-scan order.
+ * The previous values of those four pointers are overwritten without being
+ * freed here.
+ *
+ * NOTE(review): in the !slice_span_tiles branch the unsigned left_slices is
+ * decremented once per tile, so this assumes num_slices >= num_tile_cols *
+ * num_tile_rows -- confirm every caller guarantees it. */
 static void
-_h265_validate_parameters (GstVaH265Enc * self)
+_h265_calculate_tile_partition (GstVaH265Enc * self)
+{
+  guint32 ctu_per_slice;
+  guint32 left_slices;
+  gint32 i, j, k;
+  guint32 ctu_tile_width_accu[MAX_COL_TILES + 1];
+  guint32 ctu_tile_height_accu[MAX_ROW_TILES + 1];
+  /* CTB address in tile scan.
+     Add one as sentinel, hold val to calculate ctu_num */
+  guint32 *tile_slice_address =
+      g_malloc ((self->partition.num_slices + 1) * sizeof (guint32));
+  /* map the CTB address in tile scan to CTB raster scan of a picture. */
+  guint32 *tile_slice_address_map =
+      g_malloc (self->ctu_width * self->ctu_height * sizeof (guint32));
+
+  self->partition.slice_segment_address =
+      g_malloc (self->partition.num_slices * sizeof (guint32));
+  self->partition.num_ctu_in_slice =
+      g_malloc (self->partition.num_slices * sizeof (guint32));
+  self->partition.tile_ctu_cols = g_malloc (MAX_COL_TILES * sizeof (guint32));
+  self->partition.tile_ctu_rows = g_malloc (MAX_ROW_TILES * sizeof (guint32));
+
+  /* firstly uniformly separate CTUs into tiles, as the spec 6.5.1 define */
+  for (i = 0; i < self->partition.num_tile_cols; i++)
+    self->partition.tile_ctu_cols[i] =
+        ((i + 1) * self->ctu_width) / self->partition.num_tile_cols -
+        (i * self->ctu_width) / self->partition.num_tile_cols;
+  for (i = 0; i < self->partition.num_tile_rows; i++)
+    self->partition.tile_ctu_rows[i] =
+        ((i + 1) * self->ctu_height) / self->partition.num_tile_rows -
+        (i * self->ctu_height) / self->partition.num_tile_rows;
+
+  /* The requirement that the slice should not span tiles. Firstly we
+     should scatter slices uniformly into each tile, bigger tile gets
+     more slices. Then we should assign CTUs within one tile uniformly
+     to each slice in that tile. */
+  if (!self->partition.slice_span_tiles) {
+    guint32 *slices_per_tile = g_malloc (self->partition.num_tile_cols *
+        self->partition.num_tile_rows * sizeof (guint32));
+
+    ctu_per_slice = (self->ctu_width * self->ctu_height +
+        self->partition.num_slices - 1) / self->partition.num_slices;
+    g_assert (ctu_per_slice > 0);
+    left_slices = self->partition.num_slices;
+
+    /* Every tile gets one slice first; the rest are handed out below. */
+    for (i = 0;
+        i < self->partition.num_tile_cols * self->partition.num_tile_rows;
+        i++) {
+      slices_per_tile[i] = 1;
+      left_slices--;
+    }
+    while (left_slices) {
+      /* Find the biggest CTUs/slices, and assign more. */
+      gfloat largest = 0.0f;
+      k = -1;
+      for (i = 0;
+          i < self->partition.num_tile_cols * self->partition.num_tile_rows;
+          i++) {
+        gfloat f;
+        f = ((gfloat)
+            (self->partition.tile_ctu_cols[i % self->partition.num_tile_cols] *
+                self->partition.tile_ctu_rows
+                [i / self->partition.num_tile_cols])) /
+            (gfloat) slices_per_tile[i];
+        g_assert (f >= 1.0f);
+        if (f > largest) {
+          k = i;
+          largest = f;
+        }
+      }
+
+      g_assert (k >= 0);
+      slices_per_tile[k]++;
+      left_slices--;
+    }
+
+    /* Assign CTUs in one tile uniformly to each slice. Note: the slice start
+       address is CTB address in tile scan(see spec 6.5), that is, we accumulate
+       all CTUs in tile0, then tile1, and tile2..., not from the picture's
+       perspective. */
+    tile_slice_address[0] = 0;
+    k = 1;
+    for (i = 0; i < self->partition.num_tile_rows; i++) {
+      for (j = 0; j < self->partition.num_tile_cols; j++) {
+        guint32 s_num = slices_per_tile[i * self->partition.num_tile_cols + j];
+        guint32 one_tile_ctus =
+            self->partition.tile_ctu_cols[j] * self->partition.tile_ctu_rows[i];
+        guint32 s;
+
+        GST_LOG_OBJECT (self, "Tile(row %d col %d), has CTU in col %d,"
+            " CTU in row is %d, total CTU %d, assigned %d slices", i, j,
+            self->partition.tile_ctu_cols[j], self->partition.tile_ctu_rows[i],
+            one_tile_ctus, s_num);
+
+        g_assert (s_num > 0);
+        for (s = 0; s < s_num; s++) {
+          tile_slice_address[k] = tile_slice_address[k - 1] +
+              ((s + 1) * one_tile_ctus) / s_num - (s * one_tile_ctus) / s_num;
+          self->partition.num_ctu_in_slice[k - 1] =
+              tile_slice_address[k] - tile_slice_address[k - 1];
+          k++;
+        }
+      }
+    }
+
+    g_assert (k == self->partition.num_slices + 1);
+    /* Calculate the last one */
+    self->partition.num_ctu_in_slice[self->partition.num_slices - 1] =
+        self->ctu_width * self->ctu_height -
+        tile_slice_address[self->partition.num_slices - 1];
+
+    g_free (slices_per_tile);
+  }
+  /* The easy way, just assign CTUs to each slice uniformly */
+  else {
+    guint ctu_size, ctu_mod_slice, cur_slice_ctu, last_ctu_index;
+
+    ctu_size = self->ctu_width * self->ctu_height;
+
+    ctu_per_slice = ctu_size / self->partition.num_slices;
+    ctu_mod_slice = ctu_size % self->partition.num_slices;
+    last_ctu_index = 0;
+
+    for (i = 0; i < self->partition.num_slices; i++) {
+      cur_slice_ctu = ctu_per_slice;
+      /* Scatter the remainder to each slice */
+      if (ctu_mod_slice) {
+        ++cur_slice_ctu;
+        --ctu_mod_slice;
+      }
+
+      tile_slice_address[i] = last_ctu_index;
+      self->partition.num_ctu_in_slice[i] = cur_slice_ctu;
+
+      /* set calculation for next slice */
+      last_ctu_index += cur_slice_ctu;
+      g_assert (last_ctu_index <= ctu_size);
+    }
+  }
+
+  /* Build the map to specifying the conversion between a CTB address in CTB
+     raster scan of a picture and a CTB address in tile scan(see spec 6.5.1
+     for details). */
+  /* ctu_tile_width_accu[i] / ctu_tile_height_accu[i] hold the CTU offset at
+     which tile column / row i starts (prefix sums of the tile sizes). */
+  ctu_tile_width_accu[0] = 0;
+  for (i = 1; i <= self->partition.num_tile_cols; i++)
+    ctu_tile_width_accu[i] =
+        ctu_tile_width_accu[i - 1] + self->partition.tile_ctu_cols[i - 1];
+
+  ctu_tile_height_accu[0] = 0;
+  for (i = 1; i <= self->partition.num_tile_rows; i++)
+    ctu_tile_height_accu[i] =
+        ctu_tile_height_accu[i - 1] + self->partition.tile_ctu_rows[i - 1];
+
+  for (k = 0; k < self->ctu_width * self->ctu_height; k++) {
+    /* The ctu coordinate in the picture. */
+    guint32 x = k % self->ctu_width;
+    guint32 y = k / self->ctu_width;
+    /* The ctu coordinate in the tile mode. */
+    guint32 tile_x = 0;
+    guint32 tile_y = 0;
+    /* The index of the CTU in the tile mode. */
+    guint32 tso = 0;
+
+    for (i = 0; i < self->partition.num_tile_cols; i++)
+      if (x >= ctu_tile_width_accu[i])
+        tile_x = i;
+    g_assert (tile_x <= self->partition.num_tile_cols - 1);
+
+    for (j = 0; j < self->partition.num_tile_rows; j++)
+      if (y >= ctu_tile_height_accu[j])
+        tile_y = j;
+    g_assert (tile_y <= self->partition.num_tile_rows - 1);
+
+    /* add all ctus in the tiles the same line before us */
+    for (i = 0; i < tile_x; i++)
+      tso += self->partition.tile_ctu_rows[tile_y] *
+          self->partition.tile_ctu_cols[i];
+
+    /* add all ctus in the tiles above us */
+    for (j = 0; j < tile_y; j++)
+      tso += self->ctu_width * self->partition.tile_ctu_rows[j];
+
+    /* add the ctus inside the same tile before us */
+    tso += (y - ctu_tile_height_accu[tile_y]) *
+        self->partition.tile_ctu_cols[tile_x]
+        + x - ctu_tile_width_accu[tile_x];
+
+    g_assert (tso < self->ctu_width * self->ctu_height);
+
+    tile_slice_address_map[tso] = k;
+  }
+
+  /* Translate every slice start from tile-scan order to raster order. */
+  for (i = 0; i < self->partition.num_slices; i++)
+    self->partition.slice_segment_address[i] =
+        tile_slice_address_map[tile_slice_address[i]];
+
+  g_free (tile_slice_address);
+  g_free (tile_slice_address_map);
+}
+
+/* Computes the slice layout used when tile encoding is disabled.
+ *
+ * Allocates and fills self->partition.slice_segment_address[] (index of
+ * the first CTU of each slice) and self->partition.num_ctu_in_slice[] by
+ * spreading the picture's CTUs over the slices as evenly as possible.
+ *
+ * @slice_structure is the driver's VAConfigAttribEncSliceStructure value:
+ *  - neither ARBITRARY_MACROBLOCKS nor ARBITRARY_ROWS set: fall back to a
+ *    single slice;
+ *  - ARBITRARY_ROWS set: every slice is grown so that it ends on a CTU row
+ *    boundary, which makes the next slice start at the beginning of a row.
+ *
+ * self->partition.num_slices may be lowered here (single-slice fallback,
+ * or row alignment consuming the picture early), so callers must read it
+ * back after this returns. */
+static void
+_h265_calculate_slice_partition (GstVaH265Enc * self, gint32 slice_structure)
+{
+  guint ctu_size;
+  guint ctus_per_slice, ctus_mod_slice, cur_slice_ctus;
+  guint last_ctu_index;
+  guint i_slice;
+
+  /* TODO: consider other slice structure modes */
+  if (!(slice_structure & VA_ENC_SLICE_STRUCTURE_ARBITRARY_MACROBLOCKS) &&
+      !(slice_structure & VA_ENC_SLICE_STRUCTURE_ARBITRARY_ROWS)) {
+    GST_INFO_OBJECT (self, "Driver slice structure is %x, does not support"
+        " ARBITRARY_MACROBLOCKS mode, fallback to no slice partition",
+        slice_structure);
+    self->partition.num_slices = 1;
+  }
+
+  ctu_size = self->ctu_width * self->ctu_height;
+
+  /* Checked before allocating so that a zero slice count never reaches the
+     allocator. A picture made of a single CTU is still a valid one-slice
+     picture, hence "<=" rather than "<". */
+  g_assert (self->partition.num_slices &&
+      self->partition.num_slices <= ctu_size);
+
+  self->partition.slice_segment_address =
+      g_malloc (self->partition.num_slices * sizeof (guint32));
+  self->partition.num_ctu_in_slice =
+      g_malloc (self->partition.num_slices * sizeof (guint32));
+
+  ctus_per_slice = ctu_size / self->partition.num_slices;
+  ctus_mod_slice = ctu_size % self->partition.num_slices;
+  last_ctu_index = 0;
+
+  for (i_slice = 0; i_slice < self->partition.num_slices; i_slice++) {
+    cur_slice_ctus = ctus_per_slice;
+    /* Scatter the remainder to each slice */
+    if (ctus_mod_slice) {
+      ++cur_slice_ctus;
+      --ctus_mod_slice;
+    }
+
+    /* Align start address to the row begin */
+    if (slice_structure & VA_ENC_SLICE_STRUCTURE_ARBITRARY_ROWS) {
+      guint ctus_past_row_start = cur_slice_ctus % self->ctu_width;
+
+      /* Only pad when the slice does not already end on a row boundary,
+         otherwise a whole superfluous row would be added. */
+      if (ctus_past_row_start)
+        cur_slice_ctus += self->ctu_width - ctus_past_row_start;
+      if (cur_slice_ctus > ctu_size - last_ctu_index)
+        cur_slice_ctus = ctu_size - last_ctu_index;
+    }
+
+    self->partition.slice_segment_address[i_slice] = last_ctu_index;
+    self->partition.num_ctu_in_slice[i_slice] = cur_slice_ctus;
+
+    /* set calculation for next slice */
+    last_ctu_index += cur_slice_ctus;
+    g_assert (last_ctu_index <= ctu_size);
+
+    /* Row alignment can consume the whole picture before every requested
+       slice got its share. Drop the remaining slices instead of emitting
+       empty ones that start past the last CTU. */
+    if (last_ctu_index == ctu_size &&
+        i_slice + 1 < self->partition.num_slices) {
+      GST_INFO_OBJECT (self, "Row alignment used up all CTUs after %u"
+          " slices, reduce the slice number from %u", i_slice + 1,
+          self->partition.num_slices);
+      self->partition.num_slices = i_slice + 1;
+      break;
+    }
+  }
+}
+
+/* Clamps the requested slice count and tile grid to what the driver and
+ * the stream allow, then computes the per-slice CTU partition (tile based
+ * when more than one tile is configured, plain slices otherwise).
+ *
+ * Returns FALSE only when slices may not span tiles and the tile count
+ * exceeds the driver's maximum slice number; TRUE otherwise. The final
+ * num_slices / num_tile_cols / num_tile_rows are written back to the
+ * matching properties. */
+static gboolean
+_h265_setup_slice_and_tile_partition (GstVaH265Enc * self)
 {
   GstVaBaseEnc *base = GST_VA_BASE_ENC (self);
   gint32 max_slices;
+  gint32 slice_structure;
 
   /* Ensure the num_slices provided by the user not exceed the limit
    * of the number of slices permitted by the stream and by the
    * hardware. */
-  g_assert (self->num_slices >= 1);
+  g_assert (self->partition.num_slices >= 1);
   max_slices = gst_va_encoder_get_max_slice_num (base->encoder,
       base->profile, GST_VA_BASE_ENC_ENTRYPOINT (base));
-  if (self->num_slices > max_slices)
-    self->num_slices = max_slices;
+  if (self->partition.num_slices > max_slices)
+    self->partition.num_slices = max_slices;
 
   /* The stream size limit. */
-  if (self->num_slices > ((self->ctu_width * self->ctu_height + 1) / 2))
-    self->num_slices = ((self->ctu_width * self->ctu_height + 1) / 2);
+  if (self->partition.num_slices >
+      ((self->ctu_width * self->ctu_height + 1) / 2))
+    self->partition.num_slices = ((self->ctu_width * self->ctu_height + 1) / 2);
 
-  update_property_uint (base, &self->prop.num_slices, self->num_slices,
-      PROP_NUM_SLICES);
+  slice_structure = gst_va_encoder_get_slice_structure (base->encoder,
+      base->profile, GST_VA_BASE_ENC_ENTRYPOINT (base));
 
-  /* Ensure trellis. */
-  if (self->features.use_trellis &&
-      !gst_va_encoder_has_trellis (base->encoder, base->profile,
-          GST_VA_BASE_ENC_ENTRYPOINT (base))) {
-    GST_INFO_OBJECT (self, "The trellis is not supported");
-    self->features.use_trellis = FALSE;
+  if (_is_tile_enabled (self)) {
+    const GstVaH265LevelLimits *level_limits;
+    guint i;
+
+    /* NOTE(review): when the driver has no tile support the grid is forced
+       to 1x1 here, but the code still goes through the tile partition
+       path below and never consults slice_structure -- confirm this is
+       intended. */
+    if (!gst_va_encoder_has_tile (base->encoder,
+            base->profile, GST_VA_BASE_ENC_ENTRYPOINT (base))) {
+      self->partition.num_tile_cols = 1;
+      self->partition.num_tile_rows = 1;
+    }
+
+    /* Look up the limits of the selected level to cap the tile grid. */
+    level_limits = NULL;
+    for (i = 0; i < G_N_ELEMENTS (_va_h265_level_limits); i++) {
+      if (_va_h265_level_limits[i].level_idc == self->level_idc) {
+        level_limits = &_va_h265_level_limits[i];
+        break;
+      }
+    }
+    g_assert (level_limits);
+
+    if (self->partition.num_tile_cols > level_limits->MaxTileColumns) {
+      GST_INFO_OBJECT (self, "num_tile_cols:%d exceeds MaxTileColumns:%d"
+          " of level %s", self->partition.num_tile_cols,
+          level_limits->MaxTileColumns, self->level_str);
+      self->partition.num_tile_cols = level_limits->MaxTileColumns;
+    }
+    if (self->partition.num_tile_rows > level_limits->MaxTileRows) {
+      GST_INFO_OBJECT (self, "num_tile_rows:%d exceeds MaxTileRows:%d"
+          " of level %s", self->partition.num_tile_rows,
+          level_limits->MaxTileRows, self->level_str);
+      self->partition.num_tile_rows = level_limits->MaxTileRows;
+    }
+
+    /* A tile column / row needs at least one CTU. */
+    if (self->partition.num_tile_cols > self->ctu_width) {
+      GST_INFO_OBJECT (self,
+          "Only %d CTUs in width, not enough to split into %d tile columns",
+          self->ctu_width, self->partition.num_tile_cols);
+      self->partition.num_tile_cols = self->ctu_width;
+    }
+    if (self->partition.num_tile_rows > self->ctu_height) {
+      GST_INFO_OBJECT (self,
+          "Only %d CTUs in height, not enough to split into %d tile rows",
+          self->ctu_height, self->partition.num_tile_rows);
+      self->partition.num_tile_rows = self->ctu_height;
+    }
+
+    /* Some driver require that the slice should not span tiles,
+       we need to increase slice number if needed. */
+    if (gst_va_display_is_implementation (base->display,
+            GST_VA_IMPLEMENTATION_INTEL_IHD)) {
+      if (self->partition.num_slices <
+          self->partition.num_tile_cols * self->partition.num_tile_rows) {
+        if (self->partition.num_tile_cols * self->partition.num_tile_rows >
+            max_slices) {
+          GST_ERROR_OBJECT (self, "The slice can not span tiles, but total"
+              " tile num %d is bigger than max_slices %d",
+              self->partition.num_tile_cols * self->partition.num_tile_rows,
+              max_slices);
+          return FALSE;
+        } else {
+          GST_INFO_OBJECT (self, "The num_slices %d is smaller than tile"
+              " num %d. The slice can not span tiles, so set the num-slices"
+              " to tile num.", self->partition.num_slices,
+              self->partition.num_tile_cols * self->partition.num_tile_rows);
+          self->partition.num_slices =
+              self->partition.num_tile_cols * self->partition.num_tile_rows;
+        }
+      }
+
+      self->partition.slice_span_tiles = FALSE;
+    } else {
+      self->partition.slice_span_tiles = TRUE;
+    }
+
+    _h265_calculate_tile_partition (self);
+  } else {
+    _h265_calculate_slice_partition (self, slice_structure);
   }
 
-  update_property_bool (base, &self->prop.use_trellis,
-      self->features.use_trellis, PROP_TRELLIS);
+  /* Publish the values that were actually applied. */
+  update_property_uint (base, &self->prop.num_slices,
+      self->partition.num_slices, PROP_NUM_SLICES);
+  update_property_uint (base, &self->prop.num_tile_cols,
+      self->partition.num_tile_cols, PROP_NUM_TILE_COLS);
+  update_property_uint (base, &self->prop.num_tile_rows,
+      self->partition.num_tile_rows, PROP_NUM_TILE_ROWS);
+
+  return TRUE;
 }
 
 /* Normalizes bitrate (and CPB size) for HRD conformance */
@@ -3021,7 +3391,7 @@ _h265_calculate_coded_size (GstVaH265Enc * self)
   codedbuf_size += 4 + GST_ROUND_UP_8 (MAX_PPS_HDR_SIZE) / 8;
 
   /* Account for slice header */
-  codedbuf_size += self->num_slices * (4 +
+  codedbuf_size += self->partition.num_slices * (4 +
       GST_ROUND_UP_8 (MAX_SLICE_HDR_SIZE + MAX_SHORT_TERM_REFPICSET_SIZE) / 8);
 
   /* TODO: Only YUV 4:2:0 formats are supported for now.
@@ -3575,6 +3945,19 @@ print_options:
       self->features.weighted_pred_flag,
       self->features.weighted_bipred_flag,
       self->features.transquant_bypass_enabled_flag);
+
+  /* Ensure trellis. */
+  if (self->features.use_trellis &&
+      !gst_va_encoder_has_trellis (base->encoder, base->profile,
+          GST_VA_BASE_ENC_ENTRYPOINT (base))) {
+    GST_INFO_OBJECT (self, "The trellis is not supported");
+    self->features.use_trellis = FALSE;
+  }
+
+  if (self->prop.use_trellis != self->features.use_trellis) {
+    self->prop.use_trellis = self->features.use_trellis;
+    g_object_notify_by_pspec (G_OBJECT (self), properties[PROP_TRELLIS]);
+  }
 }
 
 /* We need to decide the profile and entrypoint before call this.
@@ -3822,8 +4205,6 @@ gst_va_h265_enc_reconfig (GstVaBaseEnc * base)
       base->width, base->height, self->ctu_width, self->ctu_height,
       GST_TIME_ARGS (base->frame_duration));
 
-  _h265_validate_parameters (self);
-
   if (!_h265_ensure_rate_control (self))
     return FALSE;
 
@@ -3837,6 +4218,9 @@ gst_va_h265_enc_reconfig (GstVaBaseEnc * base)
 
   _h265_calculate_coded_size (self);
 
+  if (!_h265_setup_slice_and_tile_partition (self))
+    return FALSE;
+
   if (!_h265_init_packed_headers (self))
     return FALSE;
 
@@ -4086,6 +4470,12 @@ gst_va_h265_enc_set_property (GObject * object, guint prop_id,
       g_atomic_int_set (&GST_VA_BASE_ENC (self)->reconf, TRUE);
       already_effect = TRUE;
       break;
+    case PROP_NUM_TILE_COLS:
+      self->prop.num_tile_cols = g_value_get_uint (value);
+      break;
+    case PROP_NUM_TILE_ROWS:
+      self->prop.num_tile_rows = g_value_get_uint (value);
+      break;
     case PROP_RATE_CONTROL:
       self->prop.rc_ctrl = g_value_get_enum (value);
       g_atomic_int_set (&GST_VA_BASE_ENC (self)->reconf, TRUE);
@@ -4169,6 +4559,12 @@ gst_va_h265_enc_get_property (GObject * object, guint prop_id,
     case PROP_TARGET_USAGE:
       g_value_set_uint (value, self->prop.target_usage);
       break;
+    case PROP_NUM_TILE_COLS:
+      g_value_set_uint (value, self->prop.num_tile_cols);
+      break;
+    case PROP_NUM_TILE_ROWS:
+      g_value_set_uint (value, self->prop.num_tile_rows);
+      break;
     case PROP_RATE_CONTROL:
       g_value_set_enum (value, self->prop.rc_ctrl);
       break;
@@ -4466,6 +4862,24 @@ gst_va_h265_enc_class_init (gpointer g_klass, gpointer class_data)
       "The desired max CPB size in Kb (0: auto-calculate)", 0, 2000 * 1024, 0,
       param_flags);
 
+  /**
+   * GstVaH265Enc:num-tile-cols:
+   *
+   * The number of tile columns when tile encoding is enabled.
+   */
+  properties[PROP_NUM_TILE_COLS] = g_param_spec_uint ("num-tile-cols",
+      "number of tile columns", "The number of columns for tile encoding",
+      1, MAX_COL_TILES, 1, param_flags);
+
+  /**
+   * GstVaH265Enc:num-tile-rows:
+   *
+   * The number of tile rows when tile encoding is enabled.
+   */
+  properties[PROP_NUM_TILE_ROWS] = g_param_spec_uint ("num-tile-rows",
+      "number of tile rows", "The number of rows for tile encoding",
+      1, MAX_ROW_TILES, 1, param_flags);
+
   if (vah265enc_class->rate_control_type > 0) {
     properties[PROP_RATE_CONTROL] = g_param_spec_enum ("rate-control",
         "rate control mode", "The desired rate control mode for the encoder",