AMDGPUDimProps dim,
AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
let IsSample = 1;
- let RetTypes = [llvm_anyfloat_ty];
+ let RetTypes = [llvm_any_ty];
let ExtraAddrArgs = sample.ExtraAddrArgs;
let Gradients = sample.Gradients;
let LodClampMip = sample.LodOrClamp;
}
defm int_amdgcn_image_load
- : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
+ : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
- : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
+ : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem], [SDNPMemOperand], 1>,
AMDGPUImageDMaskIntrinsic;
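+
+// Note: with llvm_any_ty results, these loads can also be overloaded to
+// return an aggregate such as {<4 x float>, i32}, where the extra i32 holds
+// the TFE/LWE error/residency code (see the llvm.amdgcn.image.load tests):
+//   declare {<4 x float>, i32}
+//     @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32)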
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIFixupVectorISelPass();
+FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeSIAddIMGInitPass(PassRegistry &);
+extern char &SIAddIMGInitID;
+
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
"Use ds_{read|write}_b128"
>;
+// Sparse texture support requires that all result registers are zeroed when
+// PRTStrictNull is set to true. This feature is turned on for all
+// architectures but is exposed as a subtarget feature so that it can be
+// turned off for drivers where PRTStrictNull is disabled.
+def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
+ "EnablePRTStrictNull",
+ "true",
+ "Enable zeroing of result registers for sparse texture fetches"
+>;
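+// The feature can be disabled on the command line with
+// -mattr=-enable-prt-strict-null; in that case only the error/residency
+// dword is zero-initialized rather than the whole result (see SIAddIMGInit).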
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
+  //
+  // Similarly we want enable-prt-strict-null to be on by default and not to
+  // unset everything else if it is disabled.
SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
FullFS += "-fp32-denormals,";
}
+ FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
EnableDS128(false),
+ EnablePRTStrictNull(false),
DumpCode(false),
FP64(false),
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
bool EnableDS128;
+ bool EnablePRTStrictNull;
bool DumpCode;
// Subtarget statically properties set by tablegen
return getGeneration() < AMDGPUSubtarget::GFX9;
}
+  /// \returns true if the target requires PRT Strict NULL support (zero
+  /// result registers for sparse texture support).
+ bool usePRTStrictNull() const {
+ return EnablePRTStrictNull;
+ }
+
bool hasAutoWaitcntBeforeBarrier() const {
return AutoWaitcntBeforeBarrier;
}
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
addPass(createSIFixupVectorISelPass());
+ addPass(createSIAddIMGInitPass());
return false;
}
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
+ SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
SIFixSGPRCopies.cpp
bit Atomic = 0;
bit AtomicX2 = 0; // (f)cmpswap
bit Sampler = 0;
+ bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
bit Coordinates = 1;
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
- let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
"NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
"HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
}
}
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
+ let Gather4 = 1;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
--- /dev/null
+//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Any MIMG instructions that use tfe or lwe require an initialization of the
+/// result register that will be written in the case of a memory access
+/// failure. The required code is also added to tie this init code to the
+/// result of the image instruction.
+///
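+/// As an illustrative sketch (not exact MIR), for a four-channel load with
+/// tfe set the pass emits roughly:
+///   %init = IMPLICIT_DEF
+///   %zero = V_MOV_B32_e32 0
+///   %next = INSERT_SUBREG %prev, %zero, subN   ; once per dword to be zeroed
+///   IMAGE_LOAD ..., implicit %last             ; tied to the vdata result
+///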
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-img-init"
+
+using namespace llvm;
+
+namespace {
+
+class SIAddIMGInit : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIAddIMGInit() : MachineFunctionPass(ID) {
+ initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
+
+char SIAddIMGInit::ID = 0;
+
+char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
+
+FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
+
+bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
+ bool Changed = false;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ auto Opcode = MI.getOpcode();
+ if (TII->isMIMG(Opcode) && !MI.mayStore()) {
+ MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+ MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+ MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+        // Check for instructions that don't have tfe or lwe fields; there
+        // shouldn't be any at this point.
+        assert(TFE && LWE && "Expected tfe and lwe operands in instruction");
+
+ unsigned TFEVal = TFE->getImm();
+ unsigned LWEVal = LWE->getImm();
+ unsigned D16Val = D16 ? D16->getImm() : 0;
+
+ if (TFEVal || LWEVal) {
+          // At least one of TFE or LWE is non-zero.
+          // We have to insert a suitable initialization of the result value
+          // and tie this to the dest of the image instruction.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ int DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+ // Calculate which dword we have to initialize to 0.
+ MachineOperand *MO_Dmask =
+ TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+        // Check that the dmask operand is found.
+ assert(MO_Dmask && "Expected dmask operand in instruction");
+
+ unsigned dmask = MO_Dmask->getImm();
+        // Determine the number of active lanes taking into account the
+        // Gather4 special case.
+ unsigned ActiveLanes =
+ TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
+
+        // Subreg indices are counted from 1.
+        // When D16 is used, we want the next whole VGPR after the write data.
+        static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5,
+                      "Subreg indices different from expected");
+
+ bool Packed = !ST.hasUnpackedD16VMem();
+
+ unsigned InitIdx =
+ D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
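+        // E.g. with dmask=0b0111, ActiveLanes is 3, so InitIdx is 4 (the
+        // TFE/LWE dword follows the three data dwords); with packed D16 the
+        // three halves occupy two dwords and InitIdx is 3.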
+
+        // Abandon the attempt if the dst size isn't large enough. This is in
+        // fact an error, but it is picked up and reported correctly elsewhere.
+ uint32_t DstSize =
+ RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ if (DstSize < InitIdx)
+ continue;
+
+        // Create a register for the initialization value.
+ unsigned PrevDst =
+ MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ unsigned NewDst = 0; // Final initialized value will be in here
+
+        // If the PRTStrictNull feature is enabled (the default) then
+        // initialize all the result registers to 0, otherwise just the error
+        // indication register (VGPRn+1).
+ unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
+ unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+
+ if (DstSize == 1) {
+ // In this case we can just initialize the result directly
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
+ .addImm(0);
+ NewDst = PrevDst;
+ } else {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+ for (; SizeLeft; SizeLeft--, CurrIdx++) {
+ NewDst =
+ MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ // Initialize dword
+ unsigned SubReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+ .addImm(0);
+ // Insert into the super-reg
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+ .addReg(PrevDst)
+ .addReg(SubReg)
+ .addImm(CurrIdx);
+
+ PrevDst = NewDst;
+ }
+ }
+
+ // Add as an implicit operand
+ MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
+
+ // Tie the just added implicit operand to the dst
+ MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+
+ Changed = true;
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
+static MVT memVTFromAggregate(Type *Ty) {
+ // Only limited forms of aggregate type currently expected.
+ assert(Ty->isStructTy() && "Expected struct type");
+
+ Type *ElementType = nullptr;
+ unsigned NumElts;
+ if (Ty->getContainedType(0)->isVectorTy()) {
+ VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
+ ElementType = VecComponent->getElementType();
+ NumElts = VecComponent->getNumElements();
+ } else {
+ ElementType = Ty->getContainedType(0);
+ NumElts = 1;
+ }
+
+  assert((Ty->getContainedType(1) &&
+          Ty->getContainedType(1)->isIntegerTy(32)) &&
+         "Expected int32 type");
+
+ // Calculate the size of the memVT type from the aggregate
+ unsigned Pow2Elts = 0;
+ unsigned ElementSize;
+ switch (ElementType->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ case Type::IntegerTyID:
+ ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
+ break;
+ case Type::HalfTyID:
+ ElementSize = 16;
+ break;
+ case Type::FloatTyID:
+ ElementSize = 32;
+ break;
+ }
+ unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
+ Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
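+  // E.g. {<4 x float>, i32} gives NumElts = 4 plus one extra 32-bit element,
+  // rounded up to Pow2Elts = 8, i.e. a memVT of v8f32; {<3 x half>, i32}
+  // gives 3 + 2 halves, also rounded up to 8, i.e. v8f16.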
+
+  return MVT::getVectorVT(MVT::getVT(ElementType, false), Pow2Elts);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
+ Info.memVT = MVT::getVT(CI.getType(), true);
+ if (Info.memVT == MVT::Other) {
+      // Some intrinsics return an aggregate type - special case to work out
+      // the correct memVT.
+ Info.memVT = memVTFromAggregate(CI.getType());
+ }
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
return Value == 0;
}
+// Re-construct the required return value for an image load intrinsic.
+// This is more complicated due to the optional use of TexFailCtrl, which
+// means the required return type is an aggregate.
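+// E.g. for a {<4 x float>, i32} result with TFE, the machine node produces a
+// v8f32; dwords 0..3 are repacked into the <4 x float> data part and dword 4
+// is bitcast to the i32 TexFail result.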
+static SDValue constructRetValue(SelectionDAG &DAG,
+ MachineSDNode *Result,
+ ArrayRef<EVT> ResultTypes,
+ bool IsTexFail, bool Unpacked, bool IsD16,
+ int DMaskPop, int NumVDataDwords,
+ const SDLoc &DL, LLVMContext &Context) {
+  // Determine the required return type. This is the same regardless of the
+  // IsTexFail flag.
+ EVT ReqRetVT = ResultTypes[0];
+  EVT ReqRetEltVT =
+      ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
+  int ReqRetNumElts =
+      ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
+  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
+  EVT AdjVT = Unpacked ? ReqRetNumElts > 1
+                           ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
+                           : AdjEltVT
+                       : ReqRetVT;
+
+  // Extract the data part of the result.
+  // Bitcast the result to the same type as the required return type.
+ int NumElts;
+ if (IsD16 && !Unpacked)
+ NumElts = NumVDataDwords << 1;
+ else
+ NumElts = NumVDataDwords;
+
+ EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
+ : AdjEltVT;
+
+  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // extract the data elements.
+ bool V8F16Special = false;
+ if (CastVT == MVT::v8f16) {
+ CastVT = MVT::v4i32;
+ DMaskPop >>= 1;
+ ReqRetNumElts >>= 1;
+ V8F16Special = true;
+ AdjVT = MVT::v2i32;
+ }
+
+ SDValue N = SDValue(Result, 0);
+ SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+
+ // Iterate over the result
+ SmallVector<SDValue, 4> BVElts;
+
+ if (CastVT.isVector()) {
+ DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
+ } else {
+ BVElts.push_back(CastRes);
+ }
+ int ExtraElts = ReqRetNumElts - DMaskPop;
+  while (ExtraElts--)
+ BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+
+ SDValue PreTFCRes;
+ if (ReqRetNumElts > 1) {
+ SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
+ if (IsD16 && Unpacked)
+ PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
+ else
+ PreTFCRes = NewVec;
+ } else {
+ PreTFCRes = BVElts[0];
+ }
+
+ if (V8F16Special)
+ PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+
+ if (!IsTexFail) {
+ if (Result->getNumValues() > 1)
+ return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
+ else
+ return PreTFCRes;
+ }
+
+ // Extract the TexFail result and insert into aggregate return
+ SmallVector<SDValue, 1> TFCElt;
+ DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
+ SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
+ return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+}
+
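+// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE,
+// so e.g. a value of 3 enables both. Returns false (and lowering bails out)
+// if the operand is not a constant or has any other bits set.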
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
+ SDValue *LWE, bool &IsTexFail) {
+ auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+ if (!TexFailCtrlConst)
+ return false;
+
+ uint64_t Value = TexFailCtrlConst->getZExtValue();
+ if (Value) {
+ IsTexFail = true;
+ }
+
+ SDLoc DL(TexFailCtrlConst);
+ *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+
+ return Value == 0;
+}
+
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
+ bool AdjustRetType = false;
+
unsigned AddrIdx; // Index of first address argument
unsigned DMask;
+ unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
AddrIdx = 3;
}
} else {
- unsigned DMaskIdx;
+ unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+ DMask = DMaskConst->getZExtValue();
+ DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
if (BaseOpcode->Store) {
VData = Op.getOperand(2);
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
- DMaskIdx = 3;
} else {
- MVT LoadVT = Op.getSimpleValueType();
+ // Work out the num dwords based on the dmask popcount and underlying type
+ // and whether packing is supported.
+ MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
!BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
- if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
- ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
}
- NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
- DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
- }
+ // Confirm that the return type is large enough for the dmask specified
+ if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
+ (!LoadVT.isVector() && DMaskLanes > 1))
+ return Op;
- auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
- if (!DMaskConst)
- return Op;
+ if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
+ else
+ NumVDataDwords = DMaskLanes;
- AddrIdx = DMaskIdx + 1;
- DMask = DMaskConst->getZExtValue();
- if (!DMask && !BaseOpcode->Store) {
- // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
- // store the channels' default values.
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- if (isa<MemSDNode>(Op))
- return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
- return Undef;
+ AdjustRetType = true;
}
+
+ AddrIdx = DMaskIdx + 1;
}
unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
CtrlIdx = AddrIdx + NumVAddrs + 3;
}
+ SDValue TFE;
+ SDValue LWE;
SDValue TexFail = Op.getOperand(CtrlIdx);
- auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
- if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ bool IsTexFail = false;
+ if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
return Op;
+ if (IsTexFail) {
+ if (!DMaskLanes) {
+      // Expecting to get an error flag since TFC is on - and dmask is 0.
+      // Force dmask to be at least 1, otherwise the instruction will fail.
+ DMask = 0x1;
+ DMaskLanes = 1;
+ NumVDataDwords = 1;
+ }
+ NumVDataDwords += 1;
+ AdjustRetType = true;
+ }
+
+  // Has something earlier tagged that the return type needs adjusting?
+  // This happens if the instruction is a load or has set TexFailCtrl flags.
+ if (AdjustRetType) {
+    // NumVDataDwords reflects the true number of dwords required in the
+    // return type.
+ if (DMaskLanes == 0 && !BaseOpcode->Store) {
+      // This is a no-op load. This can be eliminated.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+
+ // Have to use a power of 2 number of dwords
+ NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
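+    // E.g. dmask=0xf with TFE gives NumVDataDwords = 4 + 1 = 5, rounded up
+    // here to 8, so the instruction ends up with a v8f32 result.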
+
+ EVT NewVT = NumVDataDwords > 1 ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
+ : MVT::f32;
+
+ ResultTypes[0] = NewVT;
+ if (ResultTypes.size() == 3) {
+      // Original result was aggregate type used for TexFailCtrl results.
+      // The actual instruction returns as a vector type which has now been
+      // created. Remove the aggregate result.
+ ResultTypes.erase(&ResultTypes[1]);
+ }
+ }
+
SDValue GLC;
SDValue SLC;
if (BaseOpcode->Atomic) {
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- Ops.push_back(False); // tfe
- Ops.push_back(False); // lwe
+ Ops.push_back(TFE); // tfe
+ Ops.push_back(LWE); // lwe
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (IsD16 && !BaseOpcode->Store) {
- MVT LoadVT = Op.getSimpleValueType();
- SDValue Adjusted = adjustLoadValueTypeImpl(
- SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
- return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ } else if (!BaseOpcode->Store) {
+ return constructRetValue(DAG, NewNode,
+ OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16,
+ DMaskLanes, NumVDataDwords, DL,
+ *DAG.getContext());
}
return SDValue(NewNode, 0);
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;
+ case AMDGPU::sub4: return 4; // Possible with TFE/LWE
}
}
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
return Node; // not implemented for D16
- SDNode *Users[4] = { nullptr };
+ SDNode *Users[5] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
+ unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
+ unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
+  bool UsesTFC = Node->getConstantOperandVal(TFEIdx) ||
+                 Node->getConstantOperandVal(LWEIdx);
+ unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
return Node;
}
+ unsigned OldBitsSet = countPopulation(OldDmask);
+ // Work out which is the TFE/LWE lane if that is enabled.
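+  // E.g. with OldDmask = 0b1010 two channels are enabled, so the TFE/LWE
+  // result is returned in the third lane (TFCLane = 2).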
+ if (UsesTFC) {
+ TFCLane = OldBitsSet;
+ }
+
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
- // Set which texture component corresponds to the lane.
- unsigned Comp;
- for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
- Comp = countTrailingZeros(Dmask);
- Dmask &= ~(1 << Comp);
- }
+ // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+ if (UsesTFC && Lane == TFCLane) {
+ Users[Lane] = *I;
+ } else {
+ // Set which texture component corresponds to the lane.
+ unsigned Comp;
+        for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0);
+             i++) {
+ Comp = countTrailingZeros(Dmask);
+ Dmask &= ~(1 << Comp);
+ }
- // Abort if we have more than one user per component
- if (Users[Lane])
- return Node;
+ // Abort if we have more than one user per component.
+ if (Users[Lane])
+ return Node;
- Users[Lane] = *I;
- NewDmask |= 1 << Comp;
+ Users[Lane] = *I;
+ NewDmask |= 1 << Comp;
+ }
}
+ // Don't allow 0 dmask, as hardware assumes one channel enabled.
+ bool NoChannels = !NewDmask;
+ if (NoChannels) {
+    // If the original dmask has one channel, there is nothing to do.
+    if (OldBitsSet == 1)
+      return Node;
+    // Use an arbitrary dmask, as one is required for the instruction to work.
+ NewDmask = 1;
+ }
// Abort if there's no change
if (NewDmask == OldDmask)
return Node;
unsigned BitsSet = countPopulation(NewDmask);
- int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  // Check for TFE or LWE, and increase the number of channels by one to
+  // account for the extra return value.
+  // This will need adjustment for D16 if this is also included in
+  // adjustWriteMask (this function) but at present D16 is excluded.
+ unsigned NewChannels = BitsSet + UsesTFC;
+
+ int NewOpcode =
+ AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
- MVT ResultVT = BitsSet == 1 ?
- SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+ MVT ResultVT = NewChannels == 1 ?
+ SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+ NewChannels == 5 ? 8 : NewChannels);
SDVTList NewVTList = HasChain ?
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
}
- if (BitsSet == 1) {
+ if (NewChannels == 1) {
assert(Node->hasNUsesOfValue(1, 0));
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
SDLoc(Node), Users[Lane]->getValueType(0),
}
// Update the users of the node with the new indices
- for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+ for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
SDNode *User = Users[i];
- if (!User)
- continue;
-
- SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ if (!User) {
+ // Handle the special case of NoChannels. We set NewDmask to 1 above, but
+ // Users[0] is still nullptr because channel 0 doesn't really have a use.
+ if (i || !NoChannels)
+ continue;
+ } else {
+ SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ }
switch (Idx) {
default: break;
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+ case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
}
}
}
}
+ // Verify MIMG
+ if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
+    // Ensure that the return type used is large enough for all the options
+    // being used. TFE/LWE require an extra result register.
+ const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
+ if (DMask) {
+ uint64_t DMaskImm = DMask->getImm();
+ uint32_t RegCount =
+ isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
+ const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
+ const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
+ const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ // Adjust for packed 16 bit values
+ if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
+ RegCount >>= 1;
+
+ // Adjust if using LWE or TFE
+ if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
+ RegCount += 1;
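+
+      // Overall, e.g. dmask=0xf with packed D16 yields RegCount = 4/2 = 2,
+      // plus one more when TFE or LWE is set.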
+
+ const uint32_t DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (Dst.isReg()) {
+ const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
+ uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
+ if (RegCount > DstSize) {
+ ErrInfo = "MIMG instruction returns too many registers for dst "
+ "register class";
+ return false;
+ }
+ }
+ }
+ }
+
// Verify VOP*. Ignore multiple sgpr operands on writelane.
if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
&& (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
bool Atomic;
bool AtomicX2;
bool Sampler;
+ bool Gather4;
uint8_t NumExtraArgs;
bool Gradients;
Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
- int DmaskIdx = -1);
+ int DmaskIdx = -1,
+ int TFCIdx = -1);
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
- int DMaskIdx) {
+ int DMaskIdx,
+ int TFCIdx) {
unsigned VWidth = II->getType()->getVectorNumElements();
if (VWidth == 1)
return nullptr;
+  // If TFE or LWE is enabled, skip the simplification; the extra TFE/LWE
+  // result is not handled here yet.
+  bool TFELWEEnabled = false;
+  if (TFCIdx > 0) {
+    // Conservatively bail out if the TexFailCtrl argument is not a constant.
+    ConstantInt *TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx));
+    if (!TFC)
+      return nullptr;
+    TFELWEEnabled = TFC->getZExtValue() & 0x1    // TFE
+                    || TFC->getZExtValue() & 0x2; // LWE
+  }
+
+ if (TFELWEEnabled)
+ return nullptr; // TFE not yet supported
+
ConstantInt *NewDMask = nullptr;
if (DMaskIdx < 0) {
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
- return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
+ return simplifyAMDGCNMemoryIntrinsicDemanded(
+ II, DemandedElts, 0, II->getNumArgOperands() - 2);
break;
}
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s
; GCN-LABEL: {{^}}load_1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_1d_tfe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_1d_lwe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_2d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_2d_tfe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_3d:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_3d_tfe_lwe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_cube:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_cube_lwe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_1darray:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_1darray_tfe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_2darray:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_2darray_lwe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_2dmsaa:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_2dmsaa_both:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_2darraymsaa:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_mip_1d:
; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_mip_1d_lwe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}load_mip_2d:
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
ret <4 x float> %v
}
+; GCN-LABEL: {{^}}load_mip_2d_tfe:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT-NOT: v_mov_b32_e32 v3
+; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; SIVI: buffer_store_dword v4, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
+define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
+; Make sure that error flag is returned even with dmask 0
+; GCN-LABEL: {{^}}load_1d_V2_tfe_dmask0:
+; GCN: v_mov_b32_e32 v1, 0
+; PRT-DAG: v_mov_b32_e32 v2, v1
+; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
+define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+ %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {<2 x float>, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; GCN-LABEL: {{^}}load_1d_V1_tfe_dmask0:
+; GCN: v_mov_b32_e32 v1, 0
+; PRT-DAG: v_mov_b32_e32 v2, v1
+; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
+define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+ %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {float, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; GCN-LABEL: {{^}}load_mip_2d_tfe_dmask0:
+; GCN: v_mov_b32_e32 v3, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
+; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; Do not make dmask 0 even if no result (other than tfe) is used.
+; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse:
+; GCN: v_mov_b32_e32 v3, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
+; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V2:
+; GCN: v_mov_b32_e32 v3, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
+; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
+define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+ %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {<2 x float>, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V1:
+; GCN: v_mov_b32_e32 v3, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
+; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
+; NOPRT-NOT: v_mov_b32_e32 v2
+; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
+define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+ %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue {float, i32} %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
+
+; Check for dmask being materially smaller than return type
+; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v3, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; NOPRT-NOT: v_mov_b32_e32 v2
+; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}}
+; SIVI: buffer_store_dword v3, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v2, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; NOPRT-NOT: v_mov_b32_e32 v1
+; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
+; SIVI: buffer_store_dword v2, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v1, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
+; SIVI: buffer_store_dword v1, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1:
+; PRT: v_mov_b32_e32 v0, 0
+; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
+; NOPRT: v_mov_b32_e32 v1, 0
+; NOPRT-NOT: v_mov_b32_e32 v0
+; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
+; SIVI: buffer_store_dword v1, off, s[8:11], 0
+; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
+define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+ %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.vec = extractvalue {<2 x float>, i32} %v, 0
+ %v.err = extractvalue {<2 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <2 x float> %v.vec
+}
+
+
; GCN-LABEL: {{^}}load_mip_3d:
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
store float 0.000000e+00, float addrspace(3)* %lds
%c0 = extractelement <2 x i32> %c, i32 0
%c1 = extractelement <2 x i32> %c, i32 1
- %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+ %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
store float 0.000000e+00, float addrspace(3)* %tmp2
ret float %tex
}
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}load.f16.1d:
-; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.v2f16.1d:
-; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.f16.2d:
-; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.v2f16.2d:
-; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.f16.3d:
-; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
}
; GCN-LABEL: {{^}}load.v2f16.3d:
-; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+; GCN: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}load.f32.1d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.v2f32.1d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.f32.2d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.v2f32.2d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
}
; GCN-LABEL: {{^}}load.f32.3d:
-; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
+; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
}
; GCN-LABEL: {{^}}load.v2f32.3d:
-; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
ret half %tex
}
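+; With tfe on a d16 sample, the status dword still occupies a full 32-bit
+; register next to the half result, so both packed and unpacked subtargets
+; use a two-register destination (v[2:3]) here.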
+; GCN-LABEL: {{^}}image_sample_2d_f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; PACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+; UNPACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
+main_body:
+ %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+ %tex.vec = extractvalue {half, i32} %tex, 0
+ %tex.err = extractvalue {half, i32} %tex, 1
+ store i32 %tex.err, i32 addrspace(1)* %out, align 4
+ ret half %tex.vec
+}
+
; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
ret float %r
}
+; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+main_body:
+ %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+ %tex.vec = extractvalue {<2 x half>, i32} %tex, 0
+ %tex.err = extractvalue {<2 x half>, i32} %tex, 1
+ %tex.vecf = bitcast <2 x half> %tex.vec to float
+ %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
+ %tex.errf = bitcast i32 %tex.err to float
+ %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
+ ret <2 x float> %r
+}
+
; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
ret <2 x float> %r
}
+; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+main_body:
+ %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+ %tex.vec = extractvalue {<4 x half>, i32} %tex, 0
+ %tex.err = extractvalue {<4 x half>, i32} %tex, 1
+ %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
+ %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
+ %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
+ %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
+ %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
+ %tex.errf = bitcast i32 %tex.err to float
+ %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
+ ret <4 x float> %r
+}
+
declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
ret <4 x float> %v
}
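+; dmask 0xf plus tfe requires five result dwords; the v_mov chain checked
+; below verifies that the result registers are zero-initialized before the
+; sample executes.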
+; GCN-LABEL: {{^}}sample_1d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: v_mov_b32_e32 v4, v0
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
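+; In the adjust_writemask tests the dmask is shrunk to the channels actually
+; extracted, but tfe still appends the status dword, so one extra result
+; register stays live (e.g. v[0:1] for a single channel).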
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f = extractelement <4 x float> %res.vec, i32 0
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+ %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
+ ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f = extractelement <4 x float> %res.vec, i32 1
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+ %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
+ ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f = extractelement <4 x float> %res.vec, i32 2
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+ %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
+ ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f = extractelement <4 x float> %res.vec, i32 3
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+ %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
+ ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f1 = extractelement <4 x float> %res.vec, i32 0
+ %res.f2 = extractelement <4 x float> %res.vec, i32 1
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+ %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+ %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
+ ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f1 = extractelement <4 x float> %res.vec, i32 1
+ %res.f2 = extractelement <4 x float> %res.vec, i32 3
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+ %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+ %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
+ ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %res.vec = extractvalue {<4 x float>,i32} %v, 0
+ %res.f1 = extractelement <4 x float> %res.vec, i32 0
+ %res.f2 = extractelement <4 x float> %res.vec, i32 2
+ %res.f3 = extractelement <4 x float> %res.vec, i32 3
+ %res.err = extractvalue {<4 x float>,i32} %v, 1
+ %res.errf = bitcast i32 %res.err to float
+ %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+ %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+ %res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2
+ %res = insertelement <4 x float> %res.tmp3, float %res.errf, i32 3
+ ret <4 x float> %res
+}
+
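+; lwe is requested via bit 1 of the texfailctrl operand (i32 2 here, versus
+; i32 1 for tfe) and likewise returns a status dword.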
+; GCN-LABEL: {{^}}sample_1d_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: v_mov_b32_e32 v4, v0
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+main_body:
+ %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
+ %v.vec = extractvalue {<4 x float>, i32} %v, 0
+ %v.err = extractvalue {<4 x float>, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret <4 x float> %v.vec
+}
+
; GCN-LABEL: {{^}}sample_2d:
; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
ret float %v
}
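+; Even a single-channel (V1) result grows to a register pair (v[9:10]) once
+; tfe adds the status dword.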
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1_tfe:
+; GCN: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da{{$}}
+define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) {
+main_body:
+ %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %v.vec = extractvalue {float, i32} %v, 0
+ %v.err = extractvalue {float, i32} %v, 1
+ store i32 %v.err, i32 addrspace(1)* %out, align 4
+ ret float %v.vec
+}
+
; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2:
; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}}
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
ret <2 x float> %v
}
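+; Two data channels plus the tfe status dword make three result dwords,
+; allocated here as a four-register tuple (v[9:12]).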
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
+; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
+define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+ %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+ %v.vec = extractvalue {<2 x float>, i32} %v, 0
+ %v.f1 = extractelement <2 x float> %v.vec, i32 0
+ %v.f2 = extractelement <2 x float> %v.vec, i32 1
+ %v.err = extractvalue {<2 x float>, i32} %v, 1
+ %v.errf = bitcast i32 %v.err to float
+ %res.0 = insertelement <4 x float> undef, float %v.f1, i32 0
+ %res.1 = insertelement <4 x float> %res.0, float %v.f2, i32 1
+ %res.2 = insertelement <4 x float> %res.1, float %v.errf, i32 2
+ ret <4 x float> %res.2
+}
+
; GCN-LABEL: {{^}}sample_1d_unorm:
; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
}
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
ret float %elt0
}
+; Check that the intrinsic remains unchanged when TFE is requested
+; (texfailctrl = 1): the dmask must not be shrunk while the status dword is
+; in use.
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
+; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
+; CHECK: ret float %elt0
+define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+ %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
+ %data.vec = extractvalue {<4 x float>,i32} %data, 0
+ %elt0 = extractelement <4 x float> %data.vec, i32 0
+ ret float %elt0
+}
+
+; Check that the intrinsic remains unchanged when LWE is requested
+; (texfailctrl = 2): the dmask must not be shrunk while the status dword is
+; in use.
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
+; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
+; CHECK: ret float %elt0
+define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+ %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
+ %data.vec = extractvalue {<4 x float>,i32} %data, 0
+ %elt0 = extractelement <4 x float> %data.vec, i32 0
+ ret float %elt0
+}
+
; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
; CHECK-NEXT: ret float %data
}
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1