From a09af8669396aebe280d959af695dc68307217cd Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 5 May 2022 15:44:16 -0700 Subject: [PATCH] [AMDGPU] Enable FLAT LDS DMA on gfx9/10 before gfx940 We always had global and scratch loads to LDS in the gfx9, but did not handle it. These were available via the 'lds' encoding bit. In gfx940 this bit was reused as 'svs' which resulted in new '_lds' opcodes effectively pushing this bit into the opcode, but functionally it is the same. These instructions are also available on gfx10. Differential Revision: https://reviews.llvm.org/D125126 --- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 31 ++++++++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 92 ++++++++++++++++------ llvm/test/MC/AMDGPU/gfx1030_new.s | 15 ++++ llvm/test/MC/AMDGPU/gfx10_asm_flat.s | 63 +++++++++++++++ llvm/test/MC/AMDGPU/gfx8_unsupported.s | 12 +++ llvm/test/MC/AMDGPU/gfx9-asm-err.s | 9 +++ llvm/test/MC/AMDGPU/gfx90a_err.s | 6 ++ llvm/test/MC/AMDGPU/gfx940_err.s | 6 ++ llvm/test/MC/AMDGPU/gfx9_asm_flat.s | 69 ++++++++++++++++ .../MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt | 15 ++++ .../test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt | 63 +++++++++++++++ llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt | 69 ++++++++++++++++ 12 files changed, 428 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 7b2233b..5837d38 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1624,6 +1624,8 @@ private: bool validateDivScale(const MCInst &Inst); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + bool validateFlatLdsDMA(const MCInst &Inst, const OperandVector &Operands, + const SMLoc &IDLoc); Optional validateLdsDirect(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst 
&Inst, unsigned OpIdx); @@ -4417,6 +4419,31 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateFlatLdsDMA(const MCInst &Inst, + const OperandVector &Operands, + const SMLoc &IDLoc) { + if (isGFX940()) + return true; + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::VALU | SIInstrFlags::FLAT)) != + (SIInstrFlags::VALU | SIInstrFlags::FLAT)) + return true; + // This is FLAT LDS DMA. + + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyLDS, Operands); + StringRef CStr(S.getPointer()); + if (!CStr.startswith("lds")) { + // This is incorrectly selected LDS DMA version of a FLAT load opcode. + // And LDS version should have 'lds' modifier, but it follows optional + // operands so its absence is ignored by the matcher. + Error(IDLoc, "invalid operands for instruction"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { @@ -4532,6 +4559,10 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, return false; } + if (!validateFlatLdsDMA(Inst, Operands, IDLoc)) { + return false; + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index a4ac72d..78f1cbf 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -58,6 +58,7 @@ class FLAT_Pseudo has_sccb = 1; bits<1> sccbValue = 0; bits<1> has_sve = 0; // Scratch VGPR Enable + bits<1> lds = 0; bits<1> sve = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, @@ -110,7 +111,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : bits<5> cpol; // Only valid on gfx9 - bits<1> lds = 0; // XXX - What does this actually do? 
+ bits<1> lds = ps.lds; // LDS DMA for global and scratch // Segment, 00=flat, 01=scratch, 10=global, 11=reserved bits<2> seg = !if(ps.is_flat_global, 0b10, @@ -253,6 +254,7 @@ class FLAT_Global_Load_LDS_Pseudo : FLAT_Ps " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let LGKM_CNT = 1; let is_flat_global = 1; + let lds = 1; let has_data = 0; let has_vdst = 0; let mayLoad = 1; @@ -411,6 +413,7 @@ class FLAT_Scratch_Load_LDS_Pseudo ; -let SubtargetPredicate = isGFX940Plus in { - defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">; defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; -} // End let SubtargetPredicate = isGFX940Plus } // End is_flat_global = 1 @@ -873,16 +873,12 @@ defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; -let SubtargetPredicate = isGFX940Plus in { - defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">; defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">; defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ushort">; defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; -} // End let SubtargetPredicate = isGFX940Plus - } // End SubtargetPredicate = HasFlatScratchInsts let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { @@ -1655,6 +1651,33 @@ multiclass 
FLAT_Real_AllAddr_SVE_vi op> { } } +multiclass FLAT_Real_AllAddr_LDS op, bits<7> pre_gfx940_op, + string pre_gfx940_name = !subst("_lds", "", !cast(NAME).PseudoInstr), + bit has_sccb = !cast(NAME).has_sccb> { + + let OtherPredicates = [isGFX8GFX9NotGFX940] in { + def _vi : FLAT_Real_vi(NAME), has_sccb> { + let AsmString = pre_gfx940_name # !cast(NAME).AsmOperands # " lds"; + } + def _SADDR_vi : FLAT_Real_vi(NAME#"_SADDR"), has_sccb> { + let AsmString = pre_gfx940_name # !cast(NAME#"_SADDR").AsmOperands # " lds"; + } + } + + let SubtargetPredicate = isGFX940Plus in { + def _gfx940 : FLAT_Real_gfx940(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940(NAME#"_SADDR")>; + } +} + +multiclass FLAT_Real_AllAddr_SVE_LDS op, bits<7> pre_gfx940_op> { + defm "" : FLAT_Real_AllAddr_LDS; + let SubtargetPredicate = isGFX940Plus in { + def _SVS_gfx940 : FLAT_Real_gfx940(NAME#"_SVS")>; + def _ST_gfx940 : FLAT_Real_gfx940(NAME#"_ST")>; + } +} + def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; @@ -1746,13 +1769,11 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; -let AssemblerPredicate = isGFX940Plus in { -defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_vi <0x026>; -defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_vi <0x027>; -defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_vi <0x028>; -defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_vi <0x029>; -defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_vi <0x02a>; -} // End let AssemblerPredicate = isGFX940Plus +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS <0x026, 0x10>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS <0x027, 0x11>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>; +defm 
GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>; @@ -1781,13 +1802,11 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>; -let AssemblerPredicate = isGFX940Plus in { -defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_SVE_vi <0x026>; -defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_SVE_vi <0x027>; -defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_AllAddr_SVE_vi <0x028>; -defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_SVE_vi <0x029>; -defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_AllAddr_SVE_vi <0x02a>; -} // End let AssemblerPredicate = isGFX940Plus +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_SVE_LDS <0x026, 0x10>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_SVE_LDS <0x027, 0x11>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_AllAddr_SVE_LDS <0x028, 0x12>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_SVE_LDS <0x029, 0x13>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_AllAddr_SVE_LDS <0x02a, 0x14>; defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_SVE_vi <0x10>; defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_SVE_vi <0x11>; @@ -1926,6 +1945,23 @@ multiclass FLAT_Real_ScratchAllAddr_gfx10 op> : FLAT_Real_SADDR_gfx10, FLAT_Real_ST_gfx10; +multiclass FLAT_Real_AllAddr_LDS_gfx10 op, + string opname = !subst("_lds", "", !cast(NAME).PseudoInstr)> { + let AsmString = opname # !cast(NAME).AsmOperands # " lds" in + defm "" : FLAT_Real_Base_gfx10; + + let AsmString = opname # !cast(NAME#"_SADDR").AsmOperands # " lds" in + defm "" : FLAT_Real_SADDR_gfx10; +} + +multiclass FLAT_Real_ScratchAllAddr_LDS_gfx10 op, + string opname = !subst("_lds", "", !cast(NAME).PseudoInstr)> { + defm "" : FLAT_Real_AllAddr_LDS_gfx10; + + let AsmString = opname # !cast(NAME#"_ST").AsmOperands # " lds" in + defm "" : FLAT_Real_ST_gfx10; +} 
+ // ENC_FLAT. defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>; @@ -2042,6 +2078,12 @@ defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>; defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x008>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x009>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00a>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00b>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS_gfx10 <0x00c>; + // ENC_FLAT_SCRATCH. defm SCRATCH_LOAD_UBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x008>; defm SCRATCH_LOAD_SBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x009>; @@ -2065,3 +2107,9 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>; + +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x008>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x009>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00a>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00b>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>; diff --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s index 08501d9..c6eb05b 100644 --- a/llvm/test/MC/AMDGPU/gfx1030_new.s +++ b/llvm/test/MC/AMDGPU/gfx1030_new.s @@ -204,3 +204,18 @@ s_waitcnt_depctr depctr_hold_cnt(1) & depctr_sa_sdst(1) & depctr_va_vdst(1) & de s_waitcnt_depctr depctr_hold_cnt(1), depctr_sa_sdst(1), depctr_va_vdst(14), depctr_va_sdst(6), depctr_va_ssrc(1), depctr_va_vcc(1), depctr_vm_vsrc(6) // GFX10: encoding: [0x9b,0xed,0xa3,0xbf] + 
+scratch_load_dword off, off offset:1024 lds +// GFX10: [0x00,0x64,0x30,0xdc,0x00,0x00,0x7f,0x00] + +scratch_load_ubyte off, off offset:1024 lds +// GFX10: [0x00,0x64,0x20,0xdc,0x00,0x00,0x7f,0x00] + +scratch_load_sbyte off, off offset:1024 lds +// GFX10: [0x00,0x64,0x24,0xdc,0x00,0x00,0x7f,0x00] + +scratch_load_ushort off, off offset:1024 lds +// GFX10: [0x00,0x64,0x28,0xdc,0x00,0x00,0x7f,0x00] + +scratch_load_sshort off, off offset:1024 lds +// GFX10: [0x00,0x64,0x2c,0xdc,0x00,0x00,0x7f,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_flat.s b/llvm/test/MC/AMDGPU/gfx10_asm_flat.s index a3c666b..daf0608 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_flat.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_flat.s @@ -359,6 +359,69 @@ global_atomic_xor v[1:2], v2, off dlc global_atomic_xor_x2 v[1:2], v[2:3], off dlc // GFX10: [0x00,0x90,0x6c,0xdd,0x01,0x02,0x7d,0x00] +global_load_dword v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00] + +global_load_dword v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00] + +global_load_ubyte v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x20,0xdc,0x02,0x00,0x04,0x00] + +global_load_sbyte v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x24,0xdc,0x02,0x00,0x04,0x00] + +global_load_ushort v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x28,0xdc,0x02,0x00,0x04,0x00] + +global_load_sshort v2, s[4:5] offset:1024 lds +// GFX10: [0x00,0xa4,0x2c,0xdc,0x02,0x00,0x04,0x00] + +global_load_dword v[2:3], off offset:1024 lds +// GFX10: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x7d,0x00] + +global_load_ubyte v[2:3], off offset:1024 lds +// GFX10: [0x00,0xa4,0x20,0xdc,0x02,0x00,0x7d,0x00] + +global_load_sbyte v[2:3], off offset:1024 lds +// GFX10: [0x00,0xa4,0x24,0xdc,0x02,0x00,0x7d,0x00] + +global_load_ushort v[2:3], off offset:1024 lds +// GFX10: [0x00,0xa4,0x28,0xdc,0x02,0x00,0x7d,0x00] + +global_load_sshort v[2:3], off offset:1024 lds +// GFX10: [0x00,0xa4,0x2c,0xdc,0x02,0x00,0x7d,0x00] + 
+scratch_load_dword v2, off offset:1024 lds +// GFX10: [0x00,0x64,0x30,0xdc,0x02,0x00,0x7d,0x00] + +scratch_load_ubyte v2, off offset:1024 lds +// GFX10: [0x00,0x64,0x20,0xdc,0x02,0x00,0x7d,0x00] + +scratch_load_sbyte v2, off offset:1024 lds +// GFX10: [0x00,0x64,0x24,0xdc,0x02,0x00,0x7d,0x00] + +scratch_load_ushort v2, off offset:1024 lds +// GFX10: [0x00,0x64,0x28,0xdc,0x02,0x00,0x7d,0x00] + +scratch_load_sshort v2, off offset:1024 lds +// GFX10: [0x00,0x64,0x2c,0xdc,0x02,0x00,0x7d,0x00] + +scratch_load_dword off, s4 offset:1024 lds +// GFX10: [0x00,0x64,0x30,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_ubyte off, s4 offset:1024 lds +// GFX10: [0x00,0x64,0x20,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_sbyte off, s4 offset:1024 lds +// GFX10: [0x00,0x64,0x24,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_ushort off, s4 offset:1024 lds +// GFX10: [0x00,0x64,0x28,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_sshort off, s4 offset:1024 lds +// GFX10: [0x00,0x64,0x2c,0xdc,0x00,0x00,0x04,0x00] + //===----------------------------------------------------------------------===// // Also see flat-gfx10.s, flat-global.s, flat-scratch-instructions.s. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/AMDGPU/gfx8_unsupported.s b/llvm/test/MC/AMDGPU/gfx8_unsupported.s index ede155d..1e06a1d 100644 --- a/llvm/test/MC/AMDGPU/gfx8_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx8_unsupported.s @@ -1828,6 +1828,18 @@ v_xnor_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD v_xor3_b32 v255, v1, v2, v3 // CHECK: error: instruction not supported on this GPU +global_load_lds_dword v[2:3], off +// CHECK: error: instruction not supported on this GPU + +global_load_dword v[2:3], off lds +// CHECK: error: instruction not supported on this GPU + +scratch_load_dword v2, off lds +// CHECK: error: instruction not supported on this GPU + +scratch_load_dword off, s2 lds +// CHECK: error: instruction not supported on this GPU + //===----------------------------------------------------------------------===// // Unsupported e32 variants. //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/AMDGPU/gfx9-asm-err.s b/llvm/test/MC/AMDGPU/gfx9-asm-err.s index c1fc9f4..8c666cf 100644 --- a/llvm/test/MC/AMDGPU/gfx9-asm-err.s +++ b/llvm/test/MC/AMDGPU/gfx9-asm-err.s @@ -32,3 +32,12 @@ v_subrev_u16_e64 v5, v1, -4.0 v_cvt_u32_f64 v5, v[0:1] quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf // GFX9ERR: error: not a valid operand. 
+ +global_load_lds_dword v[2:3], off +// GFX9ERR: error: instruction not supported on this GPU + +global_load_dword v[2:3], off +// GFX9ERR: error: invalid operands for instruction + +scratch_load_dword v2, off, offset:256 +// GFX9ERR: error: invalid operands for instruction diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s index 69937d9..a69e952 100644 --- a/llvm/test/MC/AMDGPU/gfx90a_err.s +++ b/llvm/test/MC/AMDGPU/gfx90a_err.s @@ -284,3 +284,9 @@ ds_ordered_count v5, v1 offset:65535 gds exp pos0 v3, v2, v1, v0 // GFX90A: error: instruction not supported on this GPU + +global_load_lds_dword v[2:3], off +// GFX90A: error: instruction not supported on this GPU + +scratch_load_lds_dword v2, off +// GFX90A: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s index 3fc776f..e425f23 100644 --- a/llvm/test/MC/AMDGPU/gfx940_err.s +++ b/llvm/test/MC/AMDGPU/gfx940_err.s @@ -84,3 +84,9 @@ ds_ordered_count v5, v1 offset:65535 gds exp pos0 v3, v2, v1, v0 // GFX940: error: instruction not supported on this GPU + +global_load_dword v[2:3], off lds +// GFX940: error: operands are not valid for this GPU or mode + +scratch_load_dword v2, off lds +// GFX940: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s index 7612308..d038067 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s @@ -4301,3 +4301,72 @@ scratch_load_short_d16_hi v5, off, s2 offset:-1 glc scratch_load_short_d16_hi v5, off, s2 offset:-1 slc // CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05] + +global_load_dword v[2:3], off lds +// CHECK: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00] + +global_load_dword v[2:3], off offset:2048 lds +// CHECK: [0x00,0xa8,0x50,0xdc,0x02,0x00,0x7f,0x00] + +global_load_ubyte v[2:3], off offset:2048 lds +// CHECK: [0x00,0xa8,0x40,0xdc,0x02,0x00,0x7f,0x00] + 
+global_load_sbyte v[2:3], off offset:2048 lds +// CHECK: [0x00,0xa8,0x44,0xdc,0x02,0x00,0x7f,0x00] + +global_load_ushort v[2:3], off offset:2048 lds +// CHECK: [0x00,0xa8,0x48,0xdc,0x02,0x00,0x7f,0x00] + +global_load_sshort v[2:3], off offset:2048 lds +// CHECK: [0x00,0xa8,0x4c,0xdc,0x02,0x00,0x7f,0x00] + +global_load_dword v2, s[4:5] offset:2048 lds +// CHECK: [0x00,0xa8,0x50,0xdc,0x02,0x00,0x04,0x00] + +global_load_ubyte v2, s[4:5] offset:2048 lds +// CHECK: [0x00,0xa8,0x40,0xdc,0x02,0x00,0x04,0x00] + +global_load_sbyte v2, s[4:5] offset:2048 lds +// CHECK: [0x00,0xa8,0x44,0xdc,0x02,0x00,0x04,0x00] + +global_load_ushort v2, s[4:5] offset:2048 lds +// CHECK: [0x00,0xa8,0x48,0xdc,0x02,0x00,0x04,0x00] + +global_load_sshort v2, s[4:5] offset:2048 lds +// CHECK: [0x00,0xa8,0x4c,0xdc,0x02,0x00,0x04,0x00] + +scratch_load_dword v2, off lds +// CHECK: [0x00,0x60,0x50,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_dword v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_dword v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_ubyte v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x40,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_sbyte v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x44,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_ushort v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x48,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_sshort v2, off offset:2048 lds +// CHECK: [0x00,0x68,0x4c,0xdc,0x02,0x00,0x7f,0x00] + +scratch_load_dword off, s4 offset:2048 lds +// CHECK: [0x00,0x68,0x50,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_ubyte off, s4 offset:2048 lds +// CHECK: [0x00,0x68,0x40,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_sbyte off, s4 offset:2048 lds +// CHECK: [0x00,0x68,0x44,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_ushort off, s4 offset:2048 lds +// CHECK: [0x00,0x68,0x48,0xdc,0x00,0x00,0x04,0x00] + +scratch_load_sshort off, s4 offset:2048 lds +// CHECK: [0x00,0x68,0x4c,0xdc,0x00,0x00,0x04,0x00] diff 
--git a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt index 92a0b0c..1162710a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt @@ -246,3 +246,18 @@ # GFX10: s_waitcnt_depctr depctr_va_vdst(1) depctr_va_sdst(1) depctr_vm_vsrc(1) ; encoding: [0x87,0x13,0xa3,0xbf] 0x87,0x13,0xa3,0xbf + +# GFX10: scratch_load_dword off, off offset:1024 lds ; encoding: [0x00,0x64,0x30,0xdc,0x00,0x00,0x7f,0x00] +0x00,0x64,0x30,0xdc,0x00,0x00,0x7f,0x00 + +# GFX10: scratch_load_ubyte off, off offset:1024 lds ; encoding: [0x00,0x64,0x20,0xdc,0x00,0x00,0x7f,0x00] +0x00,0x64,0x20,0xdc,0x00,0x00,0x7f,0x00 + +# GFX10: scratch_load_sbyte off, off offset:1024 lds ; encoding: [0x00,0x64,0x24,0xdc,0x00,0x00,0x7f,0x00] +0x00,0x64,0x24,0xdc,0x00,0x00,0x7f,0x00 + +# GFX10: scratch_load_ushort off, off offset:1024 lds ; encoding: [0x00,0x64,0x28,0xdc,0x00,0x00,0x7f,0x00] +0x00,0x64,0x28,0xdc,0x00,0x00,0x7f,0x00 + +# GFX10: scratch_load_sshort off, off offset:1024 lds ; encoding: [0x00,0x64,0x2c,0xdc,0x00,0x00,0x7f,0x00] +0x00,0x64,0x2c,0xdc,0x00,0x00,0x7f,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt index 4d57f30..3ac99f4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt @@ -98886,3 +98886,66 @@ # GFX10: s_waitcnt_depctr depctr_va_vdst(14) depctr_va_sdst(6) depctr_vm_vsrc(6) ; encoding: [0x1b,0xed,0xa3,0xbf] 0x1b,0xed,0xa3,0xbf + +# GFX10: global_load_dword v2, s[4:5] offset:1024 lds ; encoding: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_dword v2, s[4:5] offset:1024 lds ; encoding: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x30,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_ubyte v2, s[4:5] offset:1024 lds ; encoding: 
[0x00,0xa4,0x20,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x20,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_sbyte v2, s[4:5] offset:1024 lds ; encoding: [0x00,0xa4,0x24,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x24,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_ushort v2, s[4:5] offset:1024 lds ; encoding: [0x00,0xa4,0x28,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x28,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_sshort v2, s[4:5] offset:1024 lds ; encoding: [0x00,0xa4,0x2c,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa4,0x2c,0xdc,0x02,0x00,0x04,0x00 + +# GFX10: global_load_dword v[2:3], off offset:1024 lds ; encoding: [0x00,0xa4,0x30,0xdc,0x02,0x00,0x7d,0x00] +0x00,0xa4,0x30,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: global_load_ubyte v[2:3], off offset:1024 lds ; encoding: [0x00,0xa4,0x20,0xdc,0x02,0x00,0x7d,0x00] +0x00,0xa4,0x20,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: global_load_sbyte v[2:3], off offset:1024 lds ; encoding: [0x00,0xa4,0x24,0xdc,0x02,0x00,0x7d,0x00] +0x00,0xa4,0x24,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: global_load_ushort v[2:3], off offset:1024 lds ; encoding: [0x00,0xa4,0x28,0xdc,0x02,0x00,0x7d,0x00] +0x00,0xa4,0x28,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: global_load_sshort v[2:3], off offset:1024 lds ; encoding: [0x00,0xa4,0x2c,0xdc,0x02,0x00,0x7d,0x00] +0x00,0xa4,0x2c,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_dword v2, off offset:1024 lds ; encoding: [0x00,0x64,0x30,0xdc,0x02,0x00,0x7d,0x00] +0x00,0x64,0x30,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_ubyte v2, off offset:1024 lds ; encoding: [0x00,0x64,0x20,0xdc,0x02,0x00,0x7d,0x00] +0x00,0x64,0x20,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_sbyte v2, off offset:1024 lds ; encoding: [0x00,0x64,0x24,0xdc,0x02,0x00,0x7d,0x00] +0x00,0x64,0x24,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_ushort v2, off offset:1024 lds ; encoding: [0x00,0x64,0x28,0xdc,0x02,0x00,0x7d,0x00] +0x00,0x64,0x28,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_sshort v2, off offset:1024 lds ; encoding: 
[0x00,0x64,0x2c,0xdc,0x02,0x00,0x7d,0x00] +0x00,0x64,0x2c,0xdc,0x02,0x00,0x7d,0x00 + +# GFX10: scratch_load_dword off, s4 offset:1024 lds ; encoding: [0x00,0x64,0x30,0xdc,0x00,0x00,0x04,0x00] +0x00,0x64,0x30,0xdc,0x00,0x00,0x04,0x00 + +# GFX10: scratch_load_ubyte off, s4 offset:1024 lds ; encoding: [0x00,0x64,0x20,0xdc,0x00,0x00,0x04,0x00] +0x00,0x64,0x20,0xdc,0x00,0x00,0x04,0x00 + +# GFX10: scratch_load_sbyte off, s4 offset:1024 lds ; encoding: [0x00,0x64,0x24,0xdc,0x00,0x00,0x04,0x00] +0x00,0x64,0x24,0xdc,0x00,0x00,0x04,0x00 + +# GFX10: scratch_load_ushort off, s4 offset:1024 lds ; encoding: [0x00,0x64,0x28,0xdc,0x00,0x00,0x04,0x00] +0x00,0x64,0x28,0xdc,0x00,0x00,0x04,0x00 + +# GFX10: scratch_load_sshort off, s4 offset:1024 lds ; encoding: [0x00,0x64,0x2c,0xdc,0x00,0x00,0x04,0x00] +0x00,0x64,0x2c,0xdc,0x00,0x00,0x04,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt index 326f50a..ae27897 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt @@ -115550,3 +115550,72 @@ # CHECK: v_cmpx_t_u32_sdwa s[6:7], v1, sext(v2) src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0xbe,0x7d,0x01,0x86,0x06,0x0e] 0xf9,0x04,0xbe,0x7d,0x01,0x86,0x06,0x0e + +# CHECK: global_load_dword v[2:3], off lds ; encoding: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_dword v[2:3], off offset:2048 lds ; encoding: [0x00,0xa8,0x50,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa8,0x50,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_ubyte v[2:3], off offset:2048 lds ; encoding: [0x00,0xa8,0x40,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa8,0x40,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_sbyte v[2:3], off offset:2048 lds ; encoding: [0x00,0xa8,0x44,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa8,0x44,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_ushort v[2:3], off offset:2048 lds ; encoding: 
[0x00,0xa8,0x48,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa8,0x48,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_sshort v[2:3], off offset:2048 lds ; encoding: [0x00,0xa8,0x4c,0xdc,0x02,0x00,0x7f,0x00] +0x00,0xa8,0x4c,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: global_load_dword v2, s[4:5] offset:2048 lds ; encoding: [0x00,0xa8,0x50,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa8,0x50,0xdc,0x02,0x00,0x04,0x00 + +# CHECK: global_load_ubyte v2, s[4:5] offset:2048 lds ; encoding: [0x00,0xa8,0x40,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa8,0x40,0xdc,0x02,0x00,0x04,0x00 + +# CHECK: global_load_sbyte v2, s[4:5] offset:2048 lds ; encoding: [0x00,0xa8,0x44,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa8,0x44,0xdc,0x02,0x00,0x04,0x00 + +# CHECK: global_load_ushort v2, s[4:5] offset:2048 lds ; encoding: [0x00,0xa8,0x48,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa8,0x48,0xdc,0x02,0x00,0x04,0x00 + +# CHECK: global_load_sshort v2, s[4:5] offset:2048 lds ; encoding: [0x00,0xa8,0x4c,0xdc,0x02,0x00,0x04,0x00] +0x00,0xa8,0x4c,0xdc,0x02,0x00,0x04,0x00 + +# CHECK: scratch_load_dword v2, off lds ; encoding: [0x00,0x60,0x50,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x60,0x50,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_dword v2, off offset:2048 lds ; encoding: [0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_dword v2, off offset:2048 lds ; encoding: [0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x68,0x50,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_ubyte v2, off offset:2048 lds ; encoding: [0x00,0x68,0x40,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x68,0x40,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_sbyte v2, off offset:2048 lds ; encoding: [0x00,0x68,0x44,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x68,0x44,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_ushort v2, off offset:2048 lds ; encoding: [0x00,0x68,0x48,0xdc,0x02,0x00,0x7f,0x00] +0x00,0x68,0x48,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_sshort v2, off offset:2048 lds ; encoding: [0x00,0x68,0x4c,0xdc,0x02,0x00,0x7f,0x00] 
+0x00,0x68,0x4c,0xdc,0x02,0x00,0x7f,0x00 + +# CHECK: scratch_load_dword off, s4 offset:2048 lds ; encoding: [0x00,0x68,0x50,0xdc,0x00,0x00,0x04,0x00] +0x00,0x68,0x50,0xdc,0x00,0x00,0x04,0x00 + +# CHECK: scratch_load_ubyte off, s4 offset:2048 lds ; encoding: [0x00,0x68,0x40,0xdc,0x00,0x00,0x04,0x00] +0x00,0x68,0x40,0xdc,0x00,0x00,0x04,0x00 + +# CHECK: scratch_load_sbyte off, s4 offset:2048 lds ; encoding: [0x00,0x68,0x44,0xdc,0x00,0x00,0x04,0x00] +0x00,0x68,0x44,0xdc,0x00,0x00,0x04,0x00 + +# CHECK: scratch_load_ushort off, s4 offset:2048 lds ; encoding: [0x00,0x68,0x48,0xdc,0x00,0x00,0x04,0x00] +0x00,0x68,0x48,0xdc,0x00,0x00,0x04,0x00 + +# CHECK: scratch_load_sshort off, s4 offset:2048 lds ; encoding: [0x00,0x68,0x4c,0xdc,0x00,0x00,0x04,0x00] +0x00,0x68,0x4c,0xdc,0x00,0x00,0x04,0x00 -- 2.7.4