* store the task payload which is passed to mesh shaders.
*
* The driver only needs to create this BO once,
- * and it will always be able to accomodate the maximum needed
+ * and it will always be able to accommodate the maximum needed
* task payload size.
*
* The following memory layout is used:
* it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
* because it doesn't enter any diamond and thus can't exit any diamond.
*
- * The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding
+ * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
* box test is used to determine whether a line is entirely inside any square (diamond).
*
* The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
v1[chan] = nir_ffma(b, pos[1][chan], vp_scale, vp_translate);
}
- /* Rotate the viewport by 45 degress, so that diamonds become squares. */
+ /* Rotate the viewport by 45 degrees, so that diamonds become squares. */
rotate_45degrees(b, v0);
rotate_45degrees(b, v1);
unsigned full_dwords = total_bytes / 4u;
unsigned remaining_bytes = total_bytes - full_dwords * 4u;
- /* Accomodate max number of split 64-bit loads */
+ /* Accommodate max number of split 64-bit loads */
nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];
/* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
/* Run culling algorithms if culling is enabled.
*
* NGG culling can be enabled or disabled in runtime.
- * This is determined by a SGPR shader argument which is acccessed
+ * This is determined by a SGPR shader argument which is accessed
* by the following NIR intrinsic.
*/
/* We want to export primitives to streamout buffer in sequence,
* but not all vertices are alive or mark end of a primitive, so
- * there're "holes". We don't need continous invocations to write
+ * there are "holes". We don't need continuous invocations to write
* primitives to streamout buffer like final vertex export, so
* just repack to get the sequence (export_seq) is enough, no need
* to do compaction.
nir_ssa_def *zero = nir_imm_int(b, 0);
u_foreach_bit64(slot, mask) {
- /* Should not occour here, handled separately. */
+ /* Should not occur here, handled separately. */
assert(slot != VARYING_SLOT_PRIMITIVE_COUNT && slot != VARYING_SLOT_PRIMITIVE_INDICES);
unsigned component_mask = s->output_info[slot].components_mask;
uint64_t outputs_written = b->shader->info.outputs_written;
/* use outputs_written to determine export format as we use it to set
- * R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store ouput,
- * because store ouput may be optimized out.
+ * R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store output,
+ * because store output may be optimized out.
*/
unsigned format =
ac_get_spi_shader_z_format(outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH),
pack_op = nir_op_pack_snorm_2x16;
break;
default:
- unreachable("unsupport color export format");
+ unreachable("unsupported color export format");
break;
}
if (os_get_total_physical_memory(&system_ram_size))
chunk->system_ram_size = system_ram_size / (1024 * 1024);
- /* Parse cpuinfo to get more detailled information. */
+ /* Parse cpuinfo to get more detailed information. */
f = fopen("/proc/cpuinfo", "r");
if (!f)
return;
// Use the regularity properties of the combined format enum.
//
// Note: float is incompatible with 8-bit data formats,
- // [us]{norm,scaled} are incomparible with 32-bit data formats.
+ // [us]{norm,scaled} are incompatible with 32-bit data formats.
// [us]scaled are not writable.
switch (nfmt) {
case V_008F0C_BUF_NUM_FORMAT_UNORM:
unsigned end_reg_offset = reg_offset + count * 4;
unsigned end_range_offset = ranges[i].offset + ranges[i].size;
- /* Test if the ranges interect. */
+ /* Test if the ranges intersect. */
if (MAX2(ranges[i].offset, reg_offset) < MIN2(end_range_offset, end_reg_offset)) {
/* Assertion: A register can be listed only once. */
assert(!found);
if (ret == ADDR_OK) {
/* If the DCC memory isn't properly
* aligned, the data are interleaved
- * accross slices.
+ * across slices.
*/
if (AddrDccOut->dccRamSizeAligned)
dcc_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
### VALUTransUseHazard
Triggered by:
-A VALU instrction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
+A VALU instruction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
transcendental instructions in-between.
Mitigated by:
#### Resolve hazards and insert NOPs
Some instructions require wait states or other instructions to resolve hazards which are not handled by the hardware.
-This pass makes sure that no known hazards occour.
+This pass makes sure that no known hazards occur.
#### Emit program - Assembler
An important difference is that VS is always the first stage to run in SW models,
whereas HW VS refers to the last HW stage before fragment shading in GCN/RDNA terminology.
-That's why, among other things, the HW VS is no longer used to execute the SW VS when tesselation or geometry shading are used.
+That's why, among other things, the HW VS is no longer used to execute the SW VS when tessellation or geometry shading are used.
#### Glossary of software stages
/* extract a full dword if possible */
if (tmp.bytes() >= (dword + 1) * 4) {
- /* if the source is splitted into components, use p_create_vector */
+ /* if the source is split into components, use p_create_vector */
auto it = ctx->allocated_vec.find(tmp.id());
if (it != ctx->allocated_vec.end()) {
unsigned index = dword << 1;
ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
alignment, max_fetched_components);
const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
- /* Adjust bytes needed in case we need to do a smaller load due to aligment.
+ /* Adjust bytes needed in case we need to do a smaller load due to alignment.
* If a larger format is selected, it's still OK to load a smaller amount from it.
*/
bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
-static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
- SWStage::TES); /* tesselation evaluation before geometry */
+ SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
/* Raytracing */
static constexpr Stage raytracing_cs(HWStage::CS, SWStage::RT);
get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
{
/* Because some 16-bit instructions are already VOP3 on GFX10, we use the
- * 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use
+ * 32-bit opcodes (VOP2) which allows to remove the temporary VGPR and to use
* DPP with the arithmetic instructions. This requires to sign-extend.
*/
switch (op) {
for (unsigned i = 0; i < src.size(); i++) {
if (!identity[i].isConstant() ||
- identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */
+ identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
if (ctx->program->gfx_level < GFX10)
assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
identity[i].physReg() == PhysReg{sitmp + i});
- name is the name of the opcode (prepend nir_op_ for the enum name)
- all types are strings that get nir_type_ prepended to them
- input_types is a list of types
- - algebraic_properties is a space-seperated string, where nir_op_is_ is
+ - algebraic_properties is a space-separated string, where nir_op_is_ is
prepended before each entry
- const_expr is an expression or series of statements that computes the
constant value of the opcode given the constant values of its inputs.
sel.offset() == 0 &&
((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
(sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
- /* The undesireable upper bits are already shifted out. */
+ /* The undesirable upper bits are already shifted out. */
return;
} else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
!instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
unsigned sign_ext = extract->operands[3].constantValue();
unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
- /* TODO: These are doable, but probably don't occour too often. */
+ /* TODO: These are doable, but probably don't occur too often. */
if (extract_idx || sign_ext || dst_bitsize != 32)
return false;
break;
/* don't use LDS/GDS instructions to hide latency since it can
- * significanly worsen LDS scheduling */
+ * significantly worsen LDS scheduling */
if (candidate->isDS() || !can_move_down) {
add_to_hazard_query(&hq, candidate.get());
ctx.mv.downwards_skip(cursor);
Temp var = phi->operands[i].getTemp();
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
- /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+ /* prevent the defining instruction from being DCE'd if it could be rematerialized */
if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
ctx.unused_remats.erase(ctx.remat[var].instr);
ctx.renames[pred_idx].find(phi->operands[i].getTemp());
if (it != ctx.renames[pred_idx].end()) {
phi->operands[i].setTemp(it->second);
- /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+ /* prevent the defining instruction from being DCE'd if it could be rematerialized */
} else {
auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
if (remat_it != ctx.remat.end()) {
tmp = rename;
} else {
tmp = pair.first;
- /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+ /* prevent the defining instruction from being DCE'd if it could be rematerialized */
if (ctx.remat.count(tmp))
ctx.unused_remats.erase(ctx.remat[tmp].instr);
}
std::vector<aco_ptr<Instruction>> instructions;
unsigned idx = 0;
- /* phis are handled separetely */
+ /* phis are handled separately */
while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
instructions.emplace_back(std::move(block->instructions[idx++]));
if (rename_it != ctx.renames[block_idx].end()) {
op.setTemp(rename_it->second);
} else {
- /* prevent its definining instruction from being DCE'd if it could be rematerialized */
+ /* prevent its defining instruction from being DCE'd if it could be rematerialized */
auto remat_it = ctx.remat.find(op.getTemp());
if (remat_it != ctx.remat.end()) {
ctx.unused_remats.erase(remat_it->second.instr);
free(buffer);
}
- /* Delete copy-constructor and -assigment to avoid double free() */
+ /* Delete copy-constructor and -assignment to avoid double free() */
monotonic_buffer_resource(const monotonic_buffer_resource&) = delete;
monotonic_buffer_resource& operator=(const monotonic_buffer_resource&) = delete;
Export `MESA_LOADER_DRIVER_OVERRIDE=r300
LD_PRELOAD=$prefix/lib/libradeon_noop_drm_shim.so`. (or r600 for r600-class HW)
-By default, rv515 is exposed. The chip can be selected an enviornment
+By default, rv515 is exposed. The chip can be selected with an environment
variable like `RADEON_GPU_ID=CAYMAN` or `RADEON_GPU_ID=0x6740`.
*
* where d is the depth of the texture array and layer
* comes from the component indicated in the tables below.
- * Workaroudn for an issue where the layer is taken from a
+ * Workaround for an issue where the layer is taken from a
* helper invocation which happens to fall on a different
* layer due to extrapolation."
*
if (atomic) {
data_type = LLVMTypeOf(a->data[0]);
} else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
- /* Image stores might have been shrinked using the format. */
+ /* Image stores might have been shrunk using the format. */
data_type = LLVMTypeOf(a->data[0]);
dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
} else {
* to implement the body.
*
* params:
- * - ctx is the usal nir context
+ * - ctx is the usual nir context
* - wctx is a temporary struct containing some loop info. Can be left uninitialized.
* - value is the possibly divergent value for which we built the loop
* - divergent is whether value is actually divergent. If false we just pass
* tree depth for internal nodes)
*
* Dividing area by both relative costs will make it more likely that we merge nodes with
- * a hight child cost.
+ * a high child cost.
*/
float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
nir_ssa_def *load = loads[0];
/* Extract the channels we actually need when we couldn't skip starting
- * components or had to emit more than one load instrinsic.
+ * components or had to emit more than one load intrinsic.
*/
if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
// must be honored. All alignments are power of 2.
//
// Input:
-// count : Maximum number of keyvals
+// count : Maximum number of keyvals
//
// Outputs:
-// keyval_size : Size of a single keyval
+// keyval_size : Size of a single keyval
//
-// keyvals_size : Minimum size of the even and odd keyval buffers
-// keyvals_alignment : Alignment of each keyval buffer
+// keyvals_size : Minimum size of the even and odd keyval buffers
+// keyvals_alignment : Alignment of each keyval buffer
//
-// internal_size : Minimum size of internal buffer
-// internal_aligment : Alignment of the internal buffer
+// internal_size : Minimum size of internal buffer
+// internal_alignment : Alignment of the internal buffer
//
-// indirect_size : Minimum size of indirect buffer
-// indirect_aligment : Alignment of the indirect buffer
+// indirect_size : Minimum size of indirect buffer
+// indirect_alignment : Alignment of the indirect buffer
//
// .keyvals_even/odd
// -----------------
size += ycbcr_sampler_count * sizeof(struct vk_ycbcr_conversion_state);
}
- /* We need to allocate decriptor set layouts off the device allocator with DEVICE scope because
+ /* We need to allocate descriptor set layouts off the device allocator with DEVICE scope because
* they are reference counted and may not be destroyed when vkDestroyDescriptorSetLayout is
* called.
*/
struct ac_compiler_passes *passes;
};
-/* we have to store a linked list per thread due to the possiblity of multiple gpus being required */
+/* we have to store a linked list per thread due to the possibility of multiple gpus being required */
static thread_local std::list<radv_llvm_per_thread_info> radv_llvm_per_thread_list;
bool
if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
device->cs_wave_size = 32;
- /* For pixel shaders, wave64 is recommanded. */
+ /* For pixel shaders, wave64 is recommended. */
if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
device->ps_wave_size = 32;
device->ws->query_value(device->ws, RADEON_GTT_USAGE);
uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
- /* Compute the total free space that can be allocated for this process accross all heaps. */
+ /* Compute the total free space that can be allocated for this process across all heaps. */
uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
memoryBudget->heapBudget[vram_vis_heap_idx] = total_free_space + total_internal_usage;
uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
- /* Compute the total free space that can be allocated for this process accross all heaps. */
+ /* Compute the total free space that can be allocated for this process across all heaps. */
uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
/* Compute the remaining visible VRAM size for this process. */
if (enable_mrt_compaction) {
blend.spi_shader_col_format = radv_compact_spi_shader_col_format(ps, &blend);
- /* In presense of MRT holes (ie. the FS exports MRT1 but not MRT0), the compiler will remap
+ * In the presence of MRT holes (i.e. the FS exports MRT1 but not MRT0), the compiler will remap
* them, so that only MRT0 is exported and the driver will compact SPI_SHADER_COL_FORMAT to
* match what the FS actually exports. Though, to make sure the hw remapping works as
* expected, we should also clear color attachments without exports in CB_SHADER_MASK.
} ace_internal;
/**
- * Whether a query pool has been resetted and we have to flush caches.
+ * Whether a query pool has been reset and we have to flush caches.
*/
bool pending_reset_query;
if (cmd_buffer->pending_reset_query) {
if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
/* Only need to flush caches if the query pool size is
- * large enough to be resetted using the compute shader
+ * large enough to be reset using the compute shader
* path. Small pools don't need any cache flushes
* because we use a CP dma clear.
*/
rra_dump_chunk_description(accel_struct_offsets[i],
sizeof(struct rra_accel_struct_chunk_header), accel_struct_size,
- "RawAccelStruc", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
+ "RawAccelStruct", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
}
uint64_t file_end = (uint64_t)ftell(file);
nir_store_var(b, child_indices,
nir_imm_ivec4(b, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu), 0xf);
- /* Need to remove infinities here because otherwise we get nasty NaN propogation
+ /* Need to remove infinities here because otherwise we get nasty NaN propagation
* if the direction has 0s in it. */
/* inv_dir = clamp(inv_dir, -FLT_MAX, FLT_MAX); */
inv_dir = nir_fclamp(b, inv_dir, nir_imm_float(b, -FLT_MAX), nir_imm_float(b, FLT_MAX));
nir_ssa_def *k_indices[3] = {kx, ky, kz};
nir_ssa_def *k = nir_vec(b, k_indices, 3);
- /* Swap kx and ky dimensions to preseve winding order */
+ /* Swap kx and ky dimensions to preserve winding order */
unsigned swap_xy_swizzle[4] = {1, 0, 2, 3};
k = nir_bcsel(b, nir_flt(b, nir_vector_extract(b, dir, kz), nir_imm_float(b, 0.0f)),
nir_swizzle(b, k, swap_xy_swizzle, 3), k);
size += code_dw * sizeof(uint32_t) + sizeof(struct radv_shader_binary_legacy);
- /* We need to calloc to prevent unintialized data because this will be used
+ /* We need to calloc to prevent uninitialized data because this will be used
* directly for the disk cache. Uninitialized data can appear because of
* padding in the struct or because legacy_binary->data can be at an offset
* from the start less than sizeof(radv_shader_binary_legacy). */
if (pdevice->rad_info.gfx_level >= GFX11) {
/* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
* To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
- * substract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
+ * subtract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follows:
*
* 1) get the current buffer base address for this SE
* 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned