B.setInstrAndDebugLoc(MI);
auto Unmerge = B.buildUnmerge(S32, Src);
- Register Hi32 = Unmerge.getReg(0);
- Register Lo32 = Unmerge.getReg(1);
- MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
- Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const LLT V2S16 = LLT::vector(2, 16);
- MRI.setType(CvtDst, V2S16);
-
- B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
- {CvtDst},
- {Hi32, Lo32},
+ auto CvtPk = B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
+ {V2S16},
+ {Unmerge.getReg(0), Unmerge.getReg(1)},
MI.getFlags());
auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
-
auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
- MRI.setRegClass(MinBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);
-
auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
- MRI.setRegClass(MaxBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);
-
- Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- MRI.setType(MedDst, S32);
-
- Register CvtDst32 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- MRI.setType(CvtDst32, S32);
- B.buildBitcast(CvtDst32, CvtDst);
+ auto Bitcast = B.buildBitcast({S32}, CvtPk);
- B.buildInstr(AMDGPU::G_AMDGPU_MED3_S32,
- {MedDst},
- {MinBoundaryDst.getReg(0), CvtDst32, MaxBoundaryDst.getReg(0)},
+ auto Med3 = B.buildInstr(AMDGPU::G_AMDGPU_MED3_S32,
+ {S32},
+ {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
MI.getFlags());
- Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
- B.buildTrunc(TruncDst, MedDst);
- B.buildCopy(MI.getOperand(0).getReg(), TruncDst);
+ auto Trunc = B.buildTrunc(LLT::scalar(16), Med3);
+ B.buildCopy(MI.getOperand(0).getReg(), Trunc);
MI.eraseFromParent();
}
; GFX10-LABEL: {{^}}v_clamp_i64_i16
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
-; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
-; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
+; GFX10: v_mov_b32_e32 [[B]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
define i16 @v_clamp_i64_i16(i64 %in) #0 {
entry:
%max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
-; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
-; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
+; GFX10: v_mov_b32_e32 [[B]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
entry:
%min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0x100
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
-; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
-; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
+; GFX10: v_mov_b32_e32 [[B]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
entry:
%min = call i64 @llvm.smin.i64(i64 %in, i64 256)
; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0x100
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
-; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
-; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
+; GFX10: v_mov_b32_e32 [[B]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
entry:
%max = call i64 @llvm.smax.i64(i64 %in, i64 -255)