panfrost_batch_get_scratchpad(batch,
ss->info.tls_size,
dev->thread_tls_alloc,
- dev->core_count);
+ dev->core_id_range);
info.tls.ptr = bo->ptr.gpu;
}
unsigned size =
pan_wls_adjust_size(info.wls.size) *
pan_wls_instances(&info.wls.dim) *
- dev->core_count;
+ dev->core_id_range;
struct panfrost_bo *bo =
panfrost_batch_get_shared_memory(batch, size, 1);
panfrost_batch_get_scratchpad(batch,
batch->stack_size,
dev->thread_tls_alloc,
- dev->core_count):
+ dev->core_id_range):
NULL;
struct pan_tls_info tls = {
.tls = {
panfrost_batch_get_scratchpad(batch,
batch->stack_size,
dev->thread_tls_alloc,
- dev->core_count):
+ dev->core_id_range):
NULL;
struct pan_tls_info tls = {
.tls = {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
- unsigned size = sizeof(uint64_t) * dev->core_count;
+ unsigned size = sizeof(uint64_t) * dev->core_id_range;
/* Allocate a resource for the query results to be stored */
if (!query->rsrc) {
if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
uint64_t passed = 0;
- for (int i = 0; i < dev->core_count; ++i)
+ for (int i = 0; i < dev->core_id_range; ++i)
passed += result[i];
if (dev->arch <= 5 && !query->msaa)
panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
unsigned size_per_thread,
unsigned thread_tls_alloc,
- unsigned core_count)
+ unsigned core_id_range)
{
unsigned size = panfrost_get_total_stack_size(size_per_thread,
thread_tls_alloc,
- core_count);
+ core_id_range);
if (batch->scratchpad) {
assert(batch->scratchpad->size >= size);
panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_id_range);
struct panfrost_bo *
panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);
{
unsigned instances = pan_wls_instances(dim);
- return pan_wls_adjust_size(wls_size) * instances * dev->core_count;
+ return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range;
}
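The same pattern with purely illustrative numbers, assuming an adjusted per-instance WLS size of 4096 bytes, 16 instances and core_id_range = 4:

/*   4096 * 16 * 4 = 262144 bytes
 *
 * Sizing by core_count (3 in the illustration) instead would come up short
 * if the hardware indexes the buffer by core ID. */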
#ifdef PAN_ARCH
unsigned arch;
unsigned gpu_id;
unsigned revision;
+
+ /* Number of shader cores */
unsigned core_count;
+
+ /* Range of core IDs, equal to the maximum core ID + 1. Satisfies
+ * core_id_range >= core_count.
+ */
+ unsigned core_id_range;
+
unsigned thread_tls_alloc;
struct panfrost_tiler_features tiler_features;
const struct panfrost_model *model;
panfrost_get_total_stack_size(
unsigned thread_size,
unsigned threads_per_core,
- unsigned core_count);
+ unsigned core_id_range);
/* Attributes / instancing */
}
static unsigned
-panfrost_query_core_count(int fd)
+panfrost_query_core_count(int fd, unsigned *core_id_range)
{
/* On older kernels, worst-case to 16 cores */
unsigned mask = panfrost_query_raw(fd,
DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
- /* Some cores might be absent. For TLS computation purposes, we care
- * about the greatest ID + 1, which equals the core count if all cores
- * are present, but allocates space for absent cores if needed.
- * util_last_bit is defined to return the greatest bit set + 1, which
- * is exactly what we need. */
- return util_last_bit(mask);
+ /* Some cores might be absent. In some cases, we care
+ * about the range of core IDs (that is, the greatest core ID + 1). If
+ * the core mask is contiguous, this equals the core count.
+ */
+ *core_id_range = util_last_bit(mask);
+ /* The actual core count skips over the gaps */
+ return util_bitcount(mask);
}
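For reference (values purely illustrative, not from a real GPU): with a sparse SHADER_PRESENT mask such as 0xd, the two values returned above differ, and per-core buffers indexed by core ID need the larger one.

unsigned mask = 0xd;                           /* 0b1101: core 1 is fused off */
unsigned core_count = util_bitcount(mask);     /* 3 cores actually present */
unsigned core_id_range = util_last_bit(mask);  /* 4 = greatest core ID (3) + 1 */
/* A buffer with one slot per core ID must hold core_id_range entries,
 * otherwise core 3 would index past the end of a core_count-sized array. */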
/* Architectural maximums, since this register may be not implemented
dev->memctx = memctx;
dev->gpu_id = panfrost_query_gpu_version(fd);
dev->arch = pan_arch(dev->gpu_id);
- dev->core_count = panfrost_query_core_count(fd);
+ dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range);
dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch);
dev->kernel_version = drmGetVersion(fd);
dev->revision = panfrost_query_gpu_revision(fd);
panfrost_get_total_stack_size(
unsigned thread_size,
unsigned threads_per_core,
- unsigned core_count)
+ unsigned core_id_range)
{
unsigned size_per_thread = (thread_size == 0) ? 0 :
util_next_power_of_two(ALIGN_POT(thread_size, 16));
- return size_per_thread * threads_per_core * core_count;
+ return size_per_thread * threads_per_core * core_id_range;
}
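A worked example of this sizing, with purely illustrative numbers (thread_size = 100, thread_tls_alloc = 256, core_id_range = 4):

/* ALIGN_POT(100, 16) = 112, util_next_power_of_two(112) = 128, so:
 *
 *   128 bytes/thread * 256 threads/core * 4 core IDs = 131072 bytes
 *
 * Space is reserved for every core ID up to the maximum, including any
 * absent cores, so the highest core ID stays in bounds. */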
// If counter belongs to shader core, accumulate values for all other cores
if (counter->category_index == PAN_SHADER_CORE_INDEX) {
- for (uint32_t core = 1; core < perf->dev->core_count; ++core) {
+ for (uint32_t core = 1; core < perf->dev->core_id_range; ++core) {
ret += perf->counter_values[offset + PAN_COUNTERS_PER_CATEGORY * core];
}
}
// Generally counter blocks are laid out in the following order:
// Job manager, tiler, one or more L2 caches, and one or more shader cores.
unsigned l2_slices = panfrost_query_l2_slices(dev);
- uint32_t n_blocks = 2 + l2_slices + dev->core_count;
+ uint32_t n_blocks = 2 + l2_slices + dev->core_id_range;
perf->n_counter_values = PAN_COUNTERS_PER_CATEGORY * n_blocks;
perf->counter_values = ralloc_array(perf, uint32_t, perf->n_counter_values);
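A sketch of the block layout this accounts for, assuming (for illustration only) two L2 slices and the sparse core mask from the example above:

/* block 0      : job manager
 * block 1      : tiler
 * blocks 2..3  : L2 cache slices
 * blocks 4..7  : shader cores, one block per core ID 0..3; core 1 is
 *                absent but presumably still occupies a slot in the dump,
 *                which is why core_id_range is used here
 *
 * n_blocks = 2 + 2 + 4 = 8, each PAN_COUNTERS_PER_CATEGORY values wide. */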
if (batch->tlsinfo.tls.size) {
unsigned size = panfrost_get_total_stack_size(batch->tlsinfo.tls.size,
pdev->thread_tls_alloc,
- pdev->core_count);
+ pdev->core_id_range);
batch->tlsinfo.tls.ptr =
pan_pool_alloc_aligned(&cmdbuf->tls_pool.base, size, 4096).gpu;
}