outputs[location][j] =
nir_load_buffer_amd(&b, 1, 32, gsvs_ring, vtx_offset, zero, zero,
- .base = offset, .is_swizzled = false,
+ .base = offset,
.access = ACCESS_COHERENT | ACCESS_STREAM_CACHE_POLICY);
offset += gs_nir->info.gs.vertices_out * 16 * 4;
nir_ssa_def *data = nir_u2uN(b, output, 32);
nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
- .is_swizzled = true,
- .access = ACCESS_COHERENT | ACCESS_STREAM_CACHE_POLICY,
+ .access = ACCESS_COHERENT | ACCESS_STREAM_CACHE_POLICY |
+ ACCESS_IS_SWIZZLED_AMD,
/* For ACO to not reorder this store around EmitVertex/EndPrimitive */
.memory_modes = nir_var_shader_out);
}
nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
- .is_swizzled = true,
- .access = ACCESS_COHERENT | ACCESS_STREAM_CACHE_POLICY,
+ .access = ACCESS_COHERENT | ACCESS_STREAM_CACHE_POLICY |
+ ACCESS_IS_SWIZZLED_AMD,
/* For ACO to not reorder this store around EmitVertex/EndPrimitive */
.memory_modes = nir_var_shader_out);
}
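Worth spelling out: the memory_modes index is what gives these ring stores their ordering guarantee. Tagging them with nir_var_shader_out makes backends treat them like shader-output accesses, so they cannot be scheduled across the EmitVertex/EndPrimitive barriers. A minimal sketch of the kind of check a backend could apply (the helper name is hypothetical, not part of this patch):

```c
#include "nir.h"

/* Hypothetical backend-side check: a buffer store tagged with
 * nir_var_shader_out must stay ordered against barriers that
 * synchronize shader outputs (EmitVertex/EndPrimitive). */
static bool
store_ordered_with_vertex_emit(const nir_intrinsic_instr *store)
{
   return nir_intrinsic_has_memory_modes(store) &&
          (nir_intrinsic_memory_modes(store) & nir_var_shader_out);
}
```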
store_bytes = MIN2(store_bytes, 2);
nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
- nir_store_buffer_amd(b, store_val, desc, v_off, s_off, zero, .is_swizzled = swizzled,
+ nir_store_buffer_amd(b, store_val, desc, v_off, s_off, zero,
.base = start_byte, .memory_modes = nir_var_shader_out,
- .access = ACCESS_COHERENT | (slc ? ACCESS_STREAM_CACHE_POLICY : 0));
+ .access = ACCESS_COHERENT |
+ (slc ? ACCESS_STREAM_CACHE_POLICY : 0) |
+ (swizzled ? ACCESS_IS_SWIZZLED_AMD : 0));
start_byte += store_bytes;
bytes -= store_bytes;
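For illustration (not from the patch): the clamp above splits an unaligned store into chunks of at most 2 bytes, so e.g. 5 remaining bytes become stores of 2, 2 and 1 bytes. A simplified standalone sketch of the loop shape, assuming Mesa's MIN2 macro; the surrounding code computes store_bytes before the clamp, which this sketch reduces to the remaining byte count:

```c
#include "util/macros.h" /* MIN2 */

/* Simplified sketch of the chunking loop: each iteration stores at
 * most 2 bytes at the current cursor, then advances. */
static void
split_unaligned_store(unsigned start_byte, unsigned bytes)
{
   while (bytes > 0) {
      unsigned store_bytes = MIN2(bytes, 2);
      /* ...emit a store of store_bytes bytes at offset start_byte... */
      start_byte += store_bytes;
      bytes -= store_bytes;
   }
}
```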
for (unsigned j = 0; j < 4; j++)
comp[j] = outputs[i].chan[j] ? outputs[i].chan[j] : undef;
nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, soffset, vindex,
- .is_swizzled = true, .memory_modes = nir_var_shader_out,
- .access = ACCESS_COHERENT);
+ .memory_modes = nir_var_shader_out,
+ .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
}
nir_pop_if(b, NULL);
Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
- bool swizzled = nir_intrinsic_is_swizzled(intrin);
+ bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(intrin) & ACCESS_STREAM_CACHE_POLICY;
Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
- bool swizzled = nir_intrinsic_is_swizzled(intrin);
+ bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(intrin) & ACCESS_STREAM_CACHE_POLICY;
LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[3]) : NULL;
unsigned num_components = instr->dest.ssa.num_components;
unsigned const_offset = nir_intrinsic_base(instr);
- bool swizzled = nir_intrinsic_is_swizzled(instr);
+ bool swizzled = nir_intrinsic_access(instr) & ACCESS_IS_SWIZZLED_AMD;
bool reorder = nir_intrinsic_can_reorder(instr);
bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(instr) & ACCESS_STREAM_CACHE_POLICY;
LLVMValueRef addr_soffset = get_src(ctx, instr->src[3]);
LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[4]) : NULL;
unsigned const_offset = nir_intrinsic_base(instr);
- bool swizzled = nir_intrinsic_is_swizzled(instr);
+ bool swizzled = nir_intrinsic_access(instr) & ACCESS_IS_SWIZZLED_AMD;
bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(instr) & ACCESS_STREAM_CACHE_POLICY;
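All four backend hunks decode the same bits, so the pattern is worth one consolidated look. A hypothetical helper (not part of the patch) covering what the ACO and LLVM paths each do inline:

```c
#include "nir.h"

/* Hypothetical helper: decode the swizzle and cache-policy bits that
 * the backend code above reads from the ACCESS index. ACCESS_COHERENT
 * maps to the GLC cache bit, ACCESS_STREAM_CACHE_POLICY to SLC. */
static void
parse_buffer_amd_access(const nir_intrinsic_instr *intrin,
                        bool *swizzled, bool *glc, bool *slc)
{
   enum gl_access_qualifier access = nir_intrinsic_access(intrin);

   *swizzled = access & ACCESS_IS_SWIZZLED_AMD;
   *glc = access & ACCESS_COHERENT;
   *slc = access & ACCESS_STREAM_CACHE_POLICY;
}
```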
# The swizzle mask for quad_swizzle_amd & masked_swizzle_amd
index("unsigned", "swizzle_mask")
-# Whether the load_buffer_amd/store_buffer_amd is swizzled
-index("bool", "is_swizzled")
-
# Offsets for load_shared2_amd/store_shared2_amd
index("uint8_t", "offset0")
index("uint8_t", "offset1")
# src[] = { descriptor, vector byte offset, scalar byte offset, index offset }
# The index offset is multiplied by the stride in the descriptor. The vector/scalar offsets
# are in bytes.
-intrinsic("load_buffer_amd", src_comp=[4, 1, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
+intrinsic("load_buffer_amd", src_comp=[4, 1, 1, 1], dest_comp=0, indices=[BASE, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
# src[] = { store value, descriptor, vector byte offset, scalar byte offset, index offset }
-intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, MEMORY_MODES, ACCESS])
+intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1, 1], indices=[BASE, WRITE_MASK, MEMORY_MODES, ACCESS])
# src[] = { address, unsigned 32-bit offset }.
load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
/** Execute instruction also in helpers. */
ACCESS_INCLUDE_HELPERS = (1 << 8),
+
+ /**
+ * Whether the address bits are swizzled by the hw. This practically means
+ * that loads can't be vectorized and must be exactly 32 bits on some chips.
+ * The swizzle amount is determined by the descriptor.
+ */
+ ACCESS_IS_SWIZZLED_AMD = (1 << 9),
};
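One practical consequence of the new flag's doc comment: since swizzled accesses can't be vectorized, a load/store vectorizer callback has to reject pairs carrying it. A hedged sketch (the signature is simplified relative to the real nir_opt_load_store_vectorize callback):

```c
#include "nir.h"

/* Simplified vectorizer filter: never merge buffer accesses whose
 * address bits the hw swizzles per element; other alignment and
 * size checks are elided. */
static bool
can_vectorize_pair(const nir_intrinsic_instr *low,
                   const nir_intrinsic_instr *high)
{
   if ((nir_intrinsic_has_access(low) &&
        (nir_intrinsic_access(low) & ACCESS_IS_SWIZZLED_AMD)) ||
       (nir_intrinsic_has_access(high) &&
        (nir_intrinsic_access(high) & ACCESS_IS_SWIZZLED_AMD)))
      return false;

   return true;
}
```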
/**