From 573a5de7551cd33d00e67e4653d8c4e9e886b68b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Oct 2022 19:42:17 -0700
Subject: [PATCH] llvm-reduce: Add opcode reduction pass

Try some dumb strength reductions to "simpler" opcodes.
Make the opcode substitutions I typically try by hand to get smaller
MIR out of codegen. This is somewhat target-specific, and I have a
lot of increasingly target-specific modifications that I try
during manual reduction.
---
 llvm/test/tools/llvm-reduce/reduce-opcodes.ll | 220 ++++++++++++++++++
 llvm/tools/llvm-reduce/CMakeLists.txt         |   1 +
 llvm/tools/llvm-reduce/DeltaManager.cpp       |   2 +
 .../llvm-reduce/deltas/ReduceOpcodes.cpp      | 112 +++++++++
 llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h |  18 ++
 5 files changed, 353 insertions(+)
 create mode 100644 llvm/test/tools/llvm-reduce/reduce-opcodes.ll
 create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp
 create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h

diff --git a/llvm/test/tools/llvm-reduce/reduce-opcodes.ll b/llvm/test/tools/llvm-reduce/reduce-opcodes.ll
new file mode 100644
index 000000000000..285283e013ad
--- /dev/null
+++ b/llvm/test/tools/llvm-reduce/reduce-opcodes.ll
@@ -0,0 +1,220 @@
+; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=opcodes --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t
+; RUN: FileCheck -check-prefix=RESULT %s < %t
+
+; CHECK-INTERESTINGNESS: @fdiv_fast(
+; RESULT: %op = fmul fast float %a, %b, !dbg !7, !fpmath !13
+define float @fdiv_fast(float %a, float %b) {
+  %op = fdiv fast float %a, %b, !dbg !7, !fpmath !13
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @frem_nnan(
+; RESULT: %op = fmul nnan float %a, %b, !dbg !7, !fpmath !13
+define float @frem_nnan(float %a, float %b) {
+  %op = frem nnan float %a, %b, !dbg !7, !fpmath !13
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @udiv(
+; RESULT: %op = mul i32 %a, %b, !dbg !7
+define i32 @udiv(i32 %a, i32 %b) {
+  %op = udiv i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @udiv_vec(
+; RESULT: %op = mul <2 x i32> %a, %b, !dbg !7
+define <2 x i32> @udiv_vec(<2 x i32> %a, <2 x i32> %b) {
+  %op = udiv <2 x i32> %a, %b, !dbg !7
+  ret <2 x i32> %op
+}
+
+; CHECK-INTERESTINGNESS: @sdiv(
+; RESULT: %op = mul i32 %a, %b{{$}}
+define i32 @sdiv(i32 %a, i32 %b) {
+  %op = sdiv i32 %a, %b
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @sdiv_exact(
+; RESULT: %op = mul i32 %a, %b, !dbg !7
+define i32 @sdiv_exact(i32 %a, i32 %b) {
+  %op = sdiv exact i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @urem(
+; RESULT: %op = mul i32 %a, %b, !dbg !7
+define i32 @urem(i32 %a, i32 %b) {
+  %op = urem i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @srem(
+; RESULT: %op = mul i32 %a, %b, !dbg !7
+define i32 @srem(i32 %a, i32 %b) {
+  %op = srem i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; Make sure there's no crash if the IRBuilder decided to constant fold something
+; CHECK-INTERESTINGNESS: @add_constant_fold(
+; RESULT: %op = add i32 0, 0, !dbg !7
+define i32 @add_constant_fold() {
+  %op = add i32 0, 0, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @add(
+; RESULT: %op = or i32 %a, %b, !dbg !7
+define i32 @add(i32 %a, i32 %b) {
+  %op = add i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @add_nuw(
+; RESULT: %op = or i32 %a, %b, !dbg !7
+define i32 @add_nuw(i32 %a, i32 %b) {
+  %op = add nuw i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @add_nsw(
+; RESULT: %op = or i32 %a, %b, !dbg !7
+define i32 @add_nsw(i32 %a, i32 %b) {
+  %op = add nsw i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @sub_nuw_nsw(
+; RESULT: %op = or i32 %a, %b, !dbg !7
+define i32 @sub_nuw_nsw(i32 %a, i32 %b) {
+  %op = sub nuw nsw i32 %a, %b, !dbg !7
+  ret i32 %op
+}
+
+; CHECK-INTERESTINGNESS: @workitem_id_y(
+; RESULT: %id = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
+define i32 @workitem_id_y() {
+  %id = call i32 @llvm.amdgcn.workitem.id.y(), !dbg !7
+  ret i32 %id
+}
+
+; CHECK-INTERESTINGNESS: @workitem_id_z(
+; RESULT: %id = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
+define i32 @workitem_id_z() {
+  %id = call i32 @llvm.amdgcn.workitem.id.z(), !dbg !7
+  ret i32 %id
+}
+
+; CHECK-INTERESTINGNESS: @workgroup_id_y(
+; RESULT: %id = call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !7
+define i32 @workgroup_id_y() {
+  %id = call i32 @llvm.amdgcn.workgroup.id.y(), !dbg !7
+  ret i32 %id
+}
+
+; CHECK-INTERESTINGNESS: @workgroup_id_z(
+; RESULT: %id = call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !7
+define i32 @workgroup_id_z() {
+  %id = call i32 @llvm.amdgcn.workgroup.id.z(), !dbg !7
+  ret i32 %id
+}
+
+; CHECK-INTERESTINGNESS: @minnum_nsz(
+; RESULT: %op = fmul nsz float %a, %b, !dbg !7
+define float @minnum_nsz(float %a, float %b) {
+  %op = call nsz float @llvm.minnum.f32(float %a, float %b), !dbg !7
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @maxnum_nsz(
+; RESULT: %op = fmul nsz float %a, %b, !dbg !7
+define float @maxnum_nsz(float %a, float %b) {
+  %op = call nsz float @llvm.maxnum.f32(float %a, float %b), !dbg !7
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @minimum_nsz(
+; RESULT: %op = fmul nsz float %a, %b, !dbg !7
+define float @minimum_nsz(float %a, float %b) {
+  %op = call nsz float @llvm.minimum.f32(float %a, float %b), !dbg !7
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @maximum_nsz(
+; RESULT: %op = fmul nsz float %a, %b, !dbg !7
+define float @maximum_nsz(float %a, float %b) {
+  %op = call nsz float @llvm.maximum.f32(float %a, float %b), !dbg !7
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @sqrt_ninf(
+; RESULT: %op = fmul ninf float %a, 2.000000e+00, !dbg !7
+define float @sqrt_ninf(float %a, float %b) {
+  %op = call ninf float @llvm.sqrt.f32(float %a), !dbg !7
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @sqrt_vec(
+; RESULT: %op = fmul <2 x float> %a, <float 2.000000e+00, float 2.000000e+00>, !dbg !7
+define <2 x float> @sqrt_vec(<2 x float> %a, <2 x float> %b) {
+  %op = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a), !dbg !7
+  ret <2 x float> %op
+}
+
+; CHECK-INTERESTINGNESS: @div_fixup(
+; RESULT: %op = call float @llvm.fma.f32(float %a, float %b, float %c)
+define float @div_fixup(float %a, float %b, float %c) {
+  %op = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c)
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @fma_legacy(
+; RESULT: %op = call float @llvm.fma.f32(float %a, float %b, float %c)
+define float @fma_legacy(float %a, float %b, float %c) {
+  %op = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %c)
+  ret float %op
+}
+
+; CHECK-INTERESTINGNESS: @fmul_legacy(
+; RESULT: %op = fmul float %a, %b
+define float @fmul_legacy(float %a, float %b) {
+  %op = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+  ret float %op
+}
+
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare float @llvm.amdgcn.div.fixup.f32(float, float, float)
+declare float @llvm.amdgcn.fma.legacy(float, float, float)
+declare float @llvm.amdgcn.fmul.legacy(float, float)
+
+declare float @llvm.sqrt.f32(float)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
+declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.minimum.f32(float, float)
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3, !3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "arst.c", directory: "/some/random/directory")
+!2 = !{}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 2}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!""}
+!7 = !DILocation(line: 2, column: 6, scope: !8)
+!8 = distinct !DISubprogram(name: "arst", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null, !11}
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{float 2.500000e+00}
diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt
index 0dafd8834d8e..6f70806eee0f 100644
--- a/llvm/tools/llvm-reduce/CMakeLists.txt
+++ b/llvm/tools/llvm-reduce/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_tool(llvm-reduce
   deltas/ReduceMetadata.cpp
   deltas/ReduceModuleData.cpp
   deltas/ReduceOperandBundles.cpp
+  deltas/ReduceOpcodes.cpp
   deltas/ReduceSpecialGlobals.cpp
   deltas/ReduceOperands.cpp
   deltas/ReduceOperandsSkip.cpp
diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp
index 2bd9b2850f68..e544d27878ae 100644
--- a/llvm/tools/llvm-reduce/DeltaManager.cpp
+++ b/llvm/tools/llvm-reduce/DeltaManager.cpp
@@ -32,6 +32,7 @@
 #include "deltas/ReduceInstructionsMIR.h"
 #include "deltas/ReduceMetadata.h"
 #include "deltas/ReduceModuleData.h"
+#include "deltas/ReduceOpcodes.h"
 #include "deltas/ReduceOperandBundles.h"
 #include "deltas/ReduceOperands.h"
 #include "deltas/ReduceOperandsSkip.h"
@@ -89,6 +90,7 @@ static cl::list<std::string>
     DELTA_PASS("simplify-cfg", reduceUsingSimplifyCFGDeltaPass)                \
     DELTA_PASS("attributes", reduceAttributesDeltaPass)                        \
     DELTA_PASS("module-data", reduceModuleDataDeltaPass)                       \
+    DELTA_PASS("opcodes", reduceOpcodesDeltaPass)                              \
   } while (false)
 
 #define DELTA_PASSES_MIR                                                       \
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp
new file mode 100644
index 000000000000..a83403497925
--- /dev/null
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp
@@ -0,0 +1,112 @@
+//===- ReduceOpcodes.cpp - Specialized Delta Pass -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Try to replace instructions that are likely to codegen to simpler or smaller
+// sequences. This is a fuzzy and target specific concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ReduceOpcodes.h"
+#include "Delta.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+/// Redirects \p II to call intrinsic \p NewIID (overload types \p Tys).
+static Value *replaceIntrinsic(Module &M, IntrinsicInst *II,
+                               Intrinsic::ID NewIID,
+                               ArrayRef<Type *> Tys = None) {
+  II->setCalledFunction(Intrinsic::getDeclaration(&M, NewIID, Tys));
+  return II;
+}
+
+/// Builds a cheaper stand-in for \p I in front of it, or retargets an intrinsic
+/// call in place. Returns the new value (which is \p I itself when the call was
+/// retargeted) or nullptr when no substitution applies. \p I is never erased.
+static Value *reduceInstruction(Module &M, Instruction &I) {
+  IRBuilder<> B(&I);
+  switch (I.getOpcode()) {
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    // Division tends to codegen into a long sequence or a library call.
+    return B.CreateFMul(I.getOperand(0), I.getOperand(1));
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    // Division tends to codegen into a long sequence or a library call.
+    return B.CreateMul(I.getOperand(0), I.getOperand(1));
+  case Instruction::Add:
+  case Instruction::Sub:
+    // Add/sub are more likely to codegen to instructions with carry out side
+    // effects.
+    return B.CreateOr(I.getOperand(0), I.getOperand(1));
+  case Instruction::Call: {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+    if (!II)
+      return nullptr;
+
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::sqrt:
+      return B.CreateFMul(II->getArgOperand(0),
+                          ConstantFP::get(I.getType(), 2.0));
+    case Intrinsic::minnum:
+    case Intrinsic::maxnum:
+    case Intrinsic::minimum:
+    case Intrinsic::maximum:
+    case Intrinsic::amdgcn_fmul_legacy:
+      return B.CreateFMul(II->getArgOperand(0), II->getArgOperand(1));
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::amdgcn_workitem_id_z:
+      return replaceIntrinsic(M, II, Intrinsic::amdgcn_workitem_id_x);
+    case Intrinsic::amdgcn_workgroup_id_y:
+    case Intrinsic::amdgcn_workgroup_id_z:
+      return replaceIntrinsic(M, II, Intrinsic::amdgcn_workgroup_id_x);
+    case Intrinsic::amdgcn_div_fixup:
+    case Intrinsic::amdgcn_fma_legacy:
+      return replaceIntrinsic(M, II, Intrinsic::fma, {II->getType()});
+    default:
+      break;
+    }
+    // An intrinsic with no known cheaper substitute.
+    return nullptr;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+/// Walks every instruction of \p Mod and, for each one the oracle \p O does not
+/// ask to keep, swaps it for the substitute chosen by reduceInstruction().
+static void replaceOpcodesInModule(Oracle &O, Module &Mod) {
+  for (Function &F : Mod)
+    for (BasicBlock &BB : F)
+      for (Instruction &I : make_early_inc_range(BB)) {
+        if (O.shouldKeep())
+          continue;
+        Instruction *Replacement =
+            dyn_cast_or_null<Instruction>(reduceInstruction(Mod, I));
+        // Null: nothing to do or folded to a constant. &I: retargeted in place.
+        if (Replacement && Replacement != &I) {
+          if (isa<FPMathOperator>(Replacement))
+            Replacement->copyFastMathFlags(&I);
+          Replacement->copyIRFlags(&I);
+          Replacement->copyMetadata(I);
+          Replacement->takeName(&I);
+          I.replaceAllUsesWith(Replacement);
+          I.eraseFromParent();
+        }
+      }
+}
+
+void llvm::reduceOpcodesDeltaPass(TestRunner &Test) {
+  outs() << "*** Reducing Opcodes...\n"; // Progress banner.
+  runDeltaPass(Test, replaceOpcodesInModule); // Callback applies the edits.
+}
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h
new file mode 100644
index 000000000000..79edc7f32fac
--- /dev/null
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h
@@ -0,0 +1,18 @@
+//===- ReduceOpcodes.h - Specialized Delta Pass -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPCODES_H
+#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPCODES_H
+
+#include "TestRunner.h"
+
+namespace llvm {
+void reduceOpcodesDeltaPass(TestRunner &Test); // Opcode-substitution pass.
+} // namespace llvm
+
+#endif
-- 
2.34.1