From b0847b0095e10e784dc241ebb19f39edd9c6a7f8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 14 Nov 2022 15:44:22 -0800
Subject: [PATCH] AMDGPU/GlobalISel: Insert freeze when splitting vector
 G_SEXT_INREG

This transform is broken for undef or poison inputs without a freeze.
The same bug exists in many other places where shifts are split into
32-bit pieces.

Amt < 32 case:

; Broken: https://alive2.llvm.org/ce/z/7bb4vc
; Freezing the low half of the bits makes it correct
; Fixed: https://alive2.llvm.org/ce/z/zJAZFr
define i64 @src(i64 %val) {
  %shl = shl i64 %val, 55
  %shr = ashr i64 %shl, 55
  ret i64 %shr
}

define i64 @tgt(i64 %val) {
  %lo32 = trunc i64 %val to i32
  %shr.half = lshr i64 %val, 32
  %hi32 = trunc i64 %shr.half to i32
  %inreg.0 = shl i32 %lo32, 23
  %new.lo = ashr i32 %inreg.0, 23
  %new.hi = ashr i32 %new.lo, 31
  %zext.lo = zext i32 %new.lo to i64
  %zext.hi = zext i32 %new.hi to i64
  %hi.ins = shl i64 %zext.hi, 32
  %or = or i64 %hi.ins, %zext.lo
  ret i64 %or
}

Amt == 32 case:

Broken: https://alive2.llvm.org/ce/z/5f4qwQ
Fixed: https://alive2.llvm.org/ce/z/A2hWWF

This one times out in alive2; it works if the argument is made noundef
or the example is scaled down to a smaller bitwidth.

define i64 @src(i64 %val) {
  %shl = shl i64 %val, 32
  %shr = ashr i64 %shl, 32
  ret i64 %shr
}

define i64 @tgt(i64 %val) {
  %lo32 = trunc i64 %val to i32
  %shr.half = lshr i64 %val, 32
  %hi32 = trunc i64 %shr.half to i32
  %new.hi = ashr i32 %lo32, 31
  %zext.lo = zext i32 %lo32 to i64
  %zext.hi = zext i32 %new.hi to i64
  %hi.ins = shl i64 %zext.hi, 32
  %or = or i64 %hi.ins, %zext.lo
  ret i64 %or
}

Amt > 32 case:

; Correct: https://alive2.llvm.org/ce/z/tvrhPf
define i64 @src(i64 %val) {
  %shl = shl i64 %val, 9
  %shr = ashr i64 %shl, 9
  ret i64 %shr
}

define i64 @tgt(i64 %val) {
  %lo32 = trunc i64 %val to i32
  %lshr = lshr i64 %val, 32
  %hi32 = trunc i64 %lshr to i32
  %inreg.0 = shl i32 %hi32, 9
  %new.hi = ashr i32 %inreg.0, 9
  %zext.lo = zext i32 %lo32 to i64
  %zext.hi = zext i32 %new.hi to i64
  %hi.ins = shl i64 %zext.hi, 32
  %or = or i64 %hi.ins, %zext.lo
  ret i64 %or
}
---
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp          |  8 ++++++--
 .../CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir | 12 +++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d12a64c..3e0d601 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2447,17 +2447,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 
     int Amt = MI.getOperand(2).getImm();
     if (Amt <= 32) {
+      // Downstream users have expectations for the high bit behavior, so freeze
+      // incoming undefined bits.
       if (Amt == 32) {
         // The low bits are unchanged.
-        B.buildCopy(DstRegs[0], SrcRegs[0]);
+        B.buildFreeze(DstRegs[0], SrcRegs[0]);
       } else {
+        auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
         // Extend in the low bits and propagate the sign bit to the high half.
-        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
+        B.buildSExtInReg(DstRegs[0], Freeze, Amt);
       }
 
       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
     } else {
       // The low bits are unchanged, and extend in the high bits.
+      // No freeze required.
       B.buildCopy(DstRegs[0], SrcRegs[0]);
       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir
index c4f4902..0aceefe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir
@@ -135,7 +135,8 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV]], 1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 1
     ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32)
     ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32)
@@ -159,7 +160,8 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV]], 31
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 31
     ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32)
     ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32)
@@ -183,10 +185,10 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
-    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[COPY1]], [[C]](s32)
-    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[ASHR]](s32)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[FREEZE]], [[C]](s32)
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[FREEZE]](s32), [[ASHR]](s32)
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_SEXT_INREG %0, 32
-- 
2.7.4
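Note (not part of the commit): a sketch of what the fixed Amt < 32
expansion looks like with the freeze applied, reconstructed from the
description above; the alive2-verified form is behind the "Fixed" link.
The dead %shr.half/%hi32 computations from @tgt are dropped here.

; Hypothetical freeze-fixed expansion of the Amt < 32 case: the low
; half is frozen before the 32-bit sext_inreg, and the high half is
; derived from the frozen result, mirroring the buildFreeze +
; buildSExtInReg change in AMDGPURegisterBankInfo.cpp.
define i64 @tgt.fixed(i64 %val) {
  %lo32 = trunc i64 %val to i32
  %lo.fr = freeze i32 %lo32
  %inreg.0 = shl i32 %lo.fr, 23
  %new.lo = ashr i32 %inreg.0, 23
  %new.hi = ashr i32 %new.lo, 31
  %zext.lo = zext i32 %new.lo to i64
  %zext.hi = zext i32 %new.hi to i64
  %hi.ins = shl i64 %zext.hi, 32
  %or = or i64 %hi.ins, %zext.lo
  ret i64 %or
}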
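Note (not part of the commit): the Amt == 32 proof can be scaled down
to avoid the alive2 timeout, as the message suggests. A hypothetical
i16 analogue split into i8 halves, assuming the same expansion shape:

; i16 sext_inreg to 8 bits, split into i8 halves; the low half is
; frozen once and feeds both the low result and the sign bits.
define i16 @src8(i16 %val) {
  %shl = shl i16 %val, 8
  %shr = ashr i16 %shl, 8
  ret i16 %shr
}

define i16 @tgt8(i16 %val) {
  %lo8 = trunc i16 %val to i8
  %lo.fr = freeze i8 %lo8
  %new.hi = ashr i8 %lo.fr, 7
  %zext.lo = zext i8 %lo.fr to i16
  %zext.hi = zext i8 %new.hi to i16
  %hi.ins = shl i16 %zext.hi, 8
  %or = or i16 %hi.ins, %zext.lo
  ret i16 %or
}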