* © Copyright 2017-2018 Alyssa Rosenzweig
* © Copyright 2017-2018 Connor Abbott
* © Copyright 2017-2018 Lyude Paul
+ * © Copyright 2019 Collabora
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
u32 zero6[7];
/* Very weird format, see generation code in trans_builder.c */
- u32 resolution_check;
-
+ u32 tiler_resolution_check;
u32 tiler_flags;
- u64 unknown_address_1; /* Pointing towards... a zero buffer? */
- u64 unknown_address_2;
+ /* Guesses? */
+ mali_ptr tiler_scratch_start; /* Pointing towards... a zero buffer? */
+ mali_ptr tiler_scratch_middle;
/* See mali_kbase_replay.c */
- u64 tiler_heap_free;
- u64 tiler_heap_end;
+ mali_ptr tiler_heap_free;
+ mali_ptr tiler_heap_end;
/* More below this, maybe */
} __attribute__((packed));
u32 clear_stencil : 8;
u32 unk3 : 24; // = 0x100
float clear_depth;
- mali_ptr tiler_meta;
- /* 0x40 */
+
+
+ /* Tiler section begins here */
+ u32 tiler_unknown;
+
+ /* Name known from the replay workaround in the kernel. What exactly is
+ * flagged here is less known. We do know that (tiler_flags & 0x1ff)
+ * specifies a mask of hierarchy weights, which explains some of the
+ * performance mysteries around setting it. We also know (1 << 16)
+ * should be set, but there's no explanation in the kernel why. */
+ u32 tiler_flags;
/* Note: these are guesses! */
mali_ptr tiler_scratch_start;
mali_ptr tiler_scratch_middle;
- /* These are not, since we see symmetry with replay jobs which name these explicitly */
- mali_ptr tiler_heap_start;
+ /* These are not guesses, since we see symmetry with replay
+ * jobs which name these explicitly */
+
+ mali_ptr tiler_heap_start; /* tiler heap_free_address */
mali_ptr tiler_heap_end;
- u64 zero9, zero10, zero11, zero12;
+ u32 tiler_weights[8];
/* optional: struct bifrost_fb_extra extra */
/* struct bifrost_render_target rts[] */
* The formula itself was discovered mostly by manual bruteforce and
* aggressive algebraic simplification. */
- fb->resolution_check = ((w + h) / 3) << 4;
+ fb->tiler_resolution_check = ((w + h) / 3) << 4;
}
struct mali_single_framebuffer
.format = 0x30000000,
.clear_flags = 0x1000,
.unknown_address_0 = ctx->scratchpad.gpu,
- .unknown_address_1 = ctx->misc_0.gpu,
- .unknown_address_2 = ctx->misc_0.gpu + 40960,
+ .tiler_scratch_start = ctx->misc_0.gpu,
+ .tiler_scratch_middle = ctx->misc_0.gpu + 40960,
.tiler_flags = 0xf0,
.tiler_heap_free = ctx->tiler_heap.gpu,
.tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size,
panfrost_emit_mfbd(struct panfrost_context *ctx)
{
struct bifrost_framebuffer framebuffer = {
- /* It is not yet clear what tiler_meta means or how it's
- * calculated, but we can tell the lower 32-bits are a
- * (monotonically increasing?) function of tile count and
- * geometry complexity; I suspect it defines a memory size of
- * some kind? for the tiler. It's really unclear at the
- * moment... but to add to the confusion, the hardware is happy
- * enough to accept a zero in this field, so we don't even have
- * to worry about it right now.
- *
- * The byte (just after the 32-bit mark) is much more
- * interesting. The higher nibble I've only ever seen as 0xF,
- * but the lower one I've seen as 0x0 or 0xF, and it's not
- * obvious what the difference is. But what -is- obvious is
- * that when the lower nibble is zero, performance is severely
- * degraded compared to when the lower nibble is set.
- * Evidently, that nibble enables some sort of fast path,
- * perhaps relating to caching or tile flush? Regardless, at
- * this point there's no clear reason not to set it, aside from
- * substantially increased memory requirements (of the misc_0
- * buffer) */
-
- .tiler_meta = ((uint64_t) 0xff << 32) | 0x0,
+ /* It is not yet clear what this means or how it's
+ * calculated, but we can tell it is a (monotonically
+ * increasing?) function of tile count and geometry complexity;
+ * I suspect it defines a memory size of some kind? for the
+ * tiler. It's really unclear at the moment... but to add to
+ * the confusion, the hardware is happy enough to accept a zero
+ * in this field, so we don't even have to worry about it right
+ * now. */
+
+ .tiler_unknown = 0x0,
+
+ /* The lower 0xff controls the hierarchy mask. Set more bits
+ * on for more tile granularity (which can be a performance win
+ * on some scenes, at memory bandwidth costs). For now, be lazy
+ * and enable everything. This might be a terrible idea. */
+ .tiler_flags = 0xff,
.width1 = MALI_POSITIVE(ctx->pipe_framebuffer.width),
.height1 = MALI_POSITIVE(ctx->pipe_framebuffer.height),
}
MEMORY_PROP(s, unknown_address_0);
- MEMORY_PROP(s, unknown_address_1);
- MEMORY_PROP(s, unknown_address_2);
+ MEMORY_PROP(s, tiler_scratch_start);
+ MEMORY_PROP(s, tiler_scratch_middle);
- pandecode_prop("resolution_check = 0x%" PRIx32, s->resolution_check);
+ pandecode_prop("tiler_resolution_check = 0x%" PRIx32, s->tiler_resolution_check);
pandecode_prop("tiler_flags = 0x%" PRIx32, s->tiler_flags);
MEMORY_PROP(s, tiler_heap_free);
if (fb->sample_locations)
pandecode_prop("sample_locations = sample_locations_%d", job_no);
- /* Assume that unknown1 and tiler_meta were emitted in the last job for
+ /* Assume that unknown1 was emitted in the last job for
* now */
- /*pandecode_prop("unknown1 = unknown1_%d_p", job_no - 1);
- pandecode_prop("tiler_meta = tiler_meta_%d_p", job_no - 1);*/
MEMORY_PROP(fb, unknown1);
- MEMORY_PROP(fb, tiler_meta);
+
+ pandecode_prop("tiler_unknown = 0x%x", fb->tiler_unknown);
+ pandecode_prop("tiler_flags = 0x%x", fb->tiler_flags);
pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1);
pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1);
MEMORY_PROP(fb, tiler_heap_start);
MEMORY_PROP(fb, tiler_heap_end);
- if (fb->zero3 || fb->zero4 || fb->zero9 || fb->zero10 || fb->zero11 || fb->zero12) {
+ if (fb->zero3 || fb->zero4) {
pandecode_msg("framebuffer zeros tripped\n");
pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3);
pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4);
- pandecode_prop("zero9 = 0x%" PRIx64, fb->zero9);
- pandecode_prop("zero10 = 0x%" PRIx64, fb->zero10);
- pandecode_prop("zero11 = 0x%" PRIx64, fb->zero11);
- pandecode_prop("zero12 = 0x%" PRIx64, fb->zero12);
+ }
+
+ bool nonzero_weights = false;
+
+ for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
+ nonzero_weights |= fb->tiler_weights[w] != 0x0;
+ }
+
+ if (nonzero_weights) {
+ pandecode_log(".tiler_weights = {");
+
+ for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
+ pandecode_log("%d, ", fb->tiler_weights[w]);
+ }
+
+ pandecode_log("},");
}
pandecode_indent--;