From cfbdba2d0b1d4bfe3c3dee815b502740e99a0a39 Mon Sep 17 00:00:00 2001
From: Marek Olsak <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 20:29:10 +0000
Subject: [PATCH] AMDGPU: really don't commute REV opcodes if the target
 variant doesn't exist

If pseudoToMCOpcode failed for the commuted opcode, commuteOpcode would fall
through and return the original opcode, so the operands would be swapped but
the instruction itself would remain the same.
This resulted in LSHLREV a, b ---> LSHLREV b, a.

This fixes Glamor text rendering and
piglit/arb_sample_shading-builtin-gl-sample-mask on VI.

This is a candidate for stable branches.

v2: the test was simplified by Tom Stellard
llvm-svn: 240824
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp     | 20 ++++++++++--------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h       |  2 +-
 llvm/test/CodeGen/AMDGPU/commute-shifts.ll | 33 ++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/commute-shifts.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 47bc178..b517fb2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -440,22 +440,22 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   }
 }
 
-unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
+int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
   const unsigned Opcode = MI.getOpcode();
 
   int NewOpc;
 
   // Try to map original to commuted opcode
   NewOpc = AMDGPU::getCommuteRev(Opcode);
-  // Check if the commuted (REV) opcode exists on the target.
-  if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
-    return NewOpc;
+  if (NewOpc != -1)
+    // Check if the commuted (REV) opcode exists on the target.
+    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
 
   // Try to map commuted to original opcode
   NewOpc = AMDGPU::getCommuteOrig(Opcode);
-  // Check if the original (non-REV) opcode exists on the target.
-  if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
-    return NewOpc;
+  if (NewOpc != -1)
+    // Check if the original (non-REV) opcode exists on the target.
+    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
 
   return Opcode;
 }
@@ -771,6 +771,10 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
   if (MI->getNumOperands() < 3)
     return nullptr;
 
+  int CommutedOpcode = commuteOpcode(*MI);
+  if (CommutedOpcode == -1)
+    return nullptr;
+
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::src0);
   assert(Src0Idx != -1 && "Should always have src0 operand");
@@ -833,7 +837,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
   }
 
   if (MI)
-    MI->setDesc(get(commuteOpcode(*MI)));
+    MI->setDesc(get(CommutedOpcode));
 
   return MI;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6fafb94..0382272 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -117,7 +117,7 @@ public:
   // register.  If there is no hardware instruction that can store to \p
   // DstRC, then AMDGPU::COPY is returned.
   unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
-  unsigned commuteOpcode(const MachineInstr &MI) const;
+  int commuteOpcode(const MachineInstr &MI) const;
 
   MachineInstr *commuteInstruction(MachineInstr *MI,
                                    bool NewMI = false) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
new file mode 100644
index 0000000..f88cf64
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}main:
+; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
+
+define void @main() #0 {
+main_body:
+  %0 = fptosi float undef to i32
+  %1 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> undef, <32 x i8> undef, i32 2)
+  %2 = extractelement <4 x i32> %1, i32 0
+  %3 = and i32 %0, 7
+  %4 = shl i32 1, %3
+  %5 = and i32 %2, %4
+  %6 = icmp eq i32 %5, 0
+  %.10 = select i1 %6, float 0.000000e+00, float undef
+  %7 = call i32 @llvm.SI.packf16(float undef, float %.10)
+  %8 = bitcast i32 %7 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %8, float undef, float %8)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
-- 
2.7.4