From 909e5ce47a70181dead332826e93f89b2928f0c0 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Wed, 22 Mar 2023 19:04:09 -0400
Subject: [PATCH] [mlir][arith] Add `uitofp` support to WIE

This includes standard LIT tests and integration tests with the LLVM CPU runner.

I plan to use this to implement `sitofp` in D146597.

Reviewed By: antiagainst

Differential Revision: https://reviews.llvm.org/D146606
---
 .../Dialect/Arith/Transforms/EmulateWideInt.cpp    | 69 ++++++++++++++++++-
 .../Arith/emulate-wide-int-canonicalization.mlir   | 14 ++++
 mlir/test/Dialect/Arith/emulate-wide-int.mlir      | 56 ++++++++++++++++
 .../CPU/test-wide-int-emulation-uitofp-i32.mlir    | 77 ++++++++++++++++++++++
 4 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir
 create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir

diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
index db3ddab..83f0139 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -907,6 +908,70 @@ struct ConvertShRSI final : OpConversionPattern {
 };
 
 //===----------------------------------------------------------------------===//
+// ConvertUIToFP
+//===----------------------------------------------------------------------===//
+
+/// Lowers a wide `arith.uitofp` to narrow `uitofp` ops on its low/high halves.
+struct ConvertUIToFP final : OpConversionPattern<arith::UIToFPOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::UIToFPOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    Type oldTy = op.getIn().getType();
+    auto newTy =
+        dyn_cast_or_null<VectorType>(getTypeConverter()->convertType(oldTy));
+    if (!newTy)
+      return rewriter.notifyMatchFailure(
+          loc, llvm::formatv("unsupported type: {0}", oldTy));
+    unsigned newBitWidth = newTy.getElementTypeBitWidth();
+
+    auto [low, hi] = extractLastDimHalves(rewriter, loc, adaptor.getIn());
+    Value lowInt = dropTrailingX1Dim(rewriter, loc, low);
+    Value hiInt = dropTrailingX1Dim(rewriter, loc, hi);
+    Value zeroCst =
+        createScalarOrSplatConstant(rewriter, loc, hiInt.getType(), 0);
+
+    // The final result has the following form:
+    //   if (hi == 0) return uitofp(low)
+    //   else         return uitofp(low) + uitofp(hi) * 2^BW
+    //
+    // where `BW` is the narrowed integer bitwidth. The select lets the `hi`
+    // part calculation fold away when `hi` is known to be zero.
+    //
+    // Note 1: The emulation is precise only for input values that have exact
+    // integer representation in the result floating point type, and may lead
+    // to loss of precision otherwise.
+    //
+    // Note 2: The `hi == 0` case makes constant folding easier; it also masks
+    // the NaN from `0 * 2^BW` when 2^BW overflows to inf (e.g. for f16).
+    Value hiEqZero = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::eq, hiInt, zeroCst);
+
+    Type resultTy = op.getType();
+    Type resultElemTy = getElementTypeOrSelf(resultTy);
+    Value lowFp = rewriter.create<arith::UIToFPOp>(loc, resultTy, lowInt);
+    Value hiFp = rewriter.create<arith::UIToFPOp>(loc, resultTy, hiInt);
+
+    // Use APInt for 2^BW: `int64_t(1) << BW` is undefined once BW >= 64.
+    auto pow2Int = llvm::APInt::getOneBitSet(newBitWidth + 1, newBitWidth);
+    Attribute pow2Attr =
+        rewriter.getFloatAttr(resultElemTy, pow2Int.roundToDouble());
+    if (auto vecTy = dyn_cast<VectorType>(resultTy))
+      pow2Attr = SplatElementsAttr::get(vecTy, pow2Attr);
+    Value pow2Val = rewriter.create<arith::ConstantOp>(loc, resultTy, pow2Attr);
+
+    Value hiVal = rewriter.create<arith::MulFOp>(loc, hiFp, pow2Val);
+    Value result = rewriter.create<arith::AddFOp>(loc, lowFp, hiVal);
+
+    rewriter.replaceOpWithNewOp<arith::SelectOp>(op, hiEqZero, lowFp, result);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
 // ConvertTruncI
 //===----------------------------------------------------------------------===//
 
@@ -1080,6 +1145,6 @@ void arith::populateArithWideIntEmulationPatterns(
       ConvertIndexCastIntToIndex,
       ConvertIndexCastIntToIndex,
       ConvertIndexCastIndexToInt,
-      ConvertIndexCastIndexToInt>(
-          typeConverter, patterns.getContext());
+      ConvertIndexCastIndexToInt,
+      ConvertUIToFP>(typeConverter, patterns.getContext());
 }
diff --git a/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir b/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir
new file mode 100644
index 0000000..0c95ab8
--- /dev/null
+++ b/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir
@@ -0,0 +1,14 @@
+// RUN: mlir-opt --arith-emulate-wide-int="widest-int-supported=32" --canonicalize %s | FileCheck %s
+
+// Check that we can fold away the 'hi' part calculation when it is known to be zero.
+//
+// CHECK-LABEL: func @uitofp_i16_ext_f64
+// CHECK-SAME: ([[ARG:%.+]]: i16) -> f64
+// CHECK-NEXT: [[EXT:%.+]] = arith.extui [[ARG]] : i16 to i32
+// CHECK-NEXT: [[FP:%.+]] = arith.uitofp [[EXT]] : i32 to f64
+// CHECK-NEXT: return [[FP]] : f64
+func.func @uitofp_i16_ext_f64(%a : i16) -> f64 {
+  %ext = arith.extui %a : i16 to i64
+  %r = arith.uitofp %ext : i64 to f64
+  return %r : f64
+}
diff --git a/mlir/test/Dialect/Arith/emulate-wide-int.mlir b/mlir/test/Dialect/Arith/emulate-wide-int.mlir
index 80edc6f..55b4e7f 100644
--- a/mlir/test/Dialect/Arith/emulate-wide-int.mlir
+++ b/mlir/test/Dialect/Arith/emulate-wide-int.mlir
@@ -908,3 +908,59 @@ func.func @xori_vector_a_b(%a : vector<3xi64>, %b : vector<3xi64>) -> vector<3xi
   %x = arith.xori %a, %b : vector<3xi64>
   return %x : vector<3xi64>
 }
+
+// CHECK-LABEL: func @uitofp_i64_f64
+// CHECK-SAME: ([[ARG:%.+]]: vector<2xi32>) -> f64
+// CHECK-NEXT: [[LOW:%.+]] = vector.extract [[ARG]][0] : vector<2xi32>
+// CHECK-NEXT: [[HI:%.+]] = vector.extract [[ARG]][1] : vector<2xi32>
+// CHECK-NEXT: [[CST0:%.+]] = arith.constant 0 : i32
+// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : i32
+// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : i32 to f64
+// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : i32 to f64
+// CHECK-NEXT: [[POW:%.+]] = arith.constant 0x41F0000000000000 : f64
+// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : f64
+// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : f64
+// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : f64
+// CHECK-NEXT: return [[SEL]] : f64
+func.func @uitofp_i64_f64(%a : i64) -> f64 {
+  %r = arith.uitofp %a : i64 to f64
+  return %r : f64
+}
+
+// CHECK-LABEL: func @uitofp_i64_f64_vector
+// CHECK-SAME: ([[ARG:%.+]]: vector<3x2xi32>) -> vector<3xf64>
+// CHECK-NEXT: [[EXTLOW:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 0], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32>
+// CHECK-NEXT: [[EXTHI:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 1], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32>
+// CHECK-NEXT: [[LOW:%.+]] = vector.shape_cast [[EXTLOW]] : vector<3x1xi32> to vector<3xi32>
+// CHECK-NEXT: [[HI:%.+]] = vector.shape_cast [[EXTHI]] : vector<3x1xi32> to vector<3xi32>
+// CHECK-NEXT: [[CST0:%.+]] = arith.constant dense<0> : vector<3xi32>
+// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : vector<3xi32>
+// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : vector<3xi32> to vector<3xf64>
+// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : vector<3xi32> to vector<3xf64>
+// CHECK-NEXT: [[POW:%.+]] = arith.constant dense<0x41F0000000000000> : vector<3xf64>
+// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : vector<3xf64>
+// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : vector<3xf64>
+// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : vector<3xi1>, vector<3xf64>
+// CHECK-NEXT: return [[SEL]] : vector<3xf64>
+func.func @uitofp_i64_f64_vector(%a : vector<3xi64>) -> vector<3xf64> {
+  %r = arith.uitofp %a : vector<3xi64> to vector<3xf64>
+  return %r : vector<3xf64>
+}
+
+// CHECK-LABEL: func @uitofp_i64_f16
+// CHECK-SAME: ([[ARG:%.+]]: vector<2xi32>) -> f16
+// CHECK-NEXT: [[LOW:%.+]] = vector.extract [[ARG]][0] : vector<2xi32>
+// CHECK-NEXT: [[HI:%.+]] = vector.extract [[ARG]][1] : vector<2xi32>
+// CHECK-NEXT: [[CST0:%.+]] = arith.constant 0 : i32
+// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : i32
+// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : i32 to f16
+// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : i32 to f16
+// CHECK-NEXT: [[POW:%.+]] = arith.constant 0x7C00 : f16
+// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : f16
+// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : f16
+// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : f16
+// CHECK-NEXT: return [[SEL]] : f16
+func.func @uitofp_i64_f16(%a : i64) -> f16 {
+  %r = arith.uitofp %a : i64 to f16
+  return %r : f16
+}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir
new file mode 100644
index 0000000..c3d7db0
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir
@@ -0,0 +1,77 @@
+// Check that the wide integer `arith.uitofp` emulation prints the same results as the
+// unemulated `arith.uitofp`. The second RUN pipeline emulates i32 ops with i16 ops.
+
+// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN: --shared-libs=%mlir_c_runner_utils | \
+// RUN: FileCheck %s --match-full-lines
+
+// RUN: mlir-opt %s --test-arith-emulate-wide-int="widest-int-supported=16" \
+// RUN: --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN: --shared-libs=%mlir_c_runner_utils | \
+// RUN: FileCheck %s --match-full-lines
+
+// Ops in this function *only* will be emulated using i16 types.
+func.func @emulate_uitofp(%arg: i32) -> f32 {
+  %res = arith.uitofp %arg : i32 to f32
+  return %res : f32
+}
+
+func.func @check_uitofp(%arg : i32) -> () {
+  %res = func.call @emulate_uitofp(%arg) : (i32) -> (f32)
+  vector.print %res : f32
+  return
+}
+
+func.func @entry() {
+  %cst0 = arith.constant 0 : i32
+  %cst1 = arith.constant 1 : i32
+  %cst2 = arith.constant 2 : i32
+  %cst7 = arith.constant 7 : i32
+  %cst1337 = arith.constant 1337 : i32
+  %cst_i16_max = arith.constant 65535 : i32
+  %cst_i16_overflow = arith.constant 65536 : i32
+
+  // Negative literals: `uitofp` reads these bit patterns as large unsigned values.
+  %cst_n1 = arith.constant -1 : i32
+  %cst_n13 = arith.constant -13 : i32
+  %cst_n1337 = arith.constant -1337 : i32
+  %cst_i16_min = arith.constant -32768 : i32
+
+  %cst_f32_int_max = arith.constant 16777217 : i32
+  %cst_f32_int_min = arith.constant -16777217 : i32
+
+  // CHECK: 0
+  func.call @check_uitofp(%cst0) : (i32) -> ()
+  // CHECK-NEXT: 1
+  func.call @check_uitofp(%cst1) : (i32) -> ()
+  // CHECK-NEXT: 2
+  func.call @check_uitofp(%cst2) : (i32) -> ()
+  // CHECK-NEXT: 7
+  func.call @check_uitofp(%cst7) : (i32) -> ()
+  // CHECK-NEXT: 1337
+  func.call @check_uitofp(%cst1337) : (i32) -> ()
+  // CHECK-NEXT: 65535
+  func.call @check_uitofp(%cst_i16_max) : (i32) -> ()
+  // CHECK-NEXT: 65536
+  func.call @check_uitofp(%cst_i16_overflow) : (i32) -> ()
+
+  // Negative inputs are read as unsigned values just below 2^32 (~4.29e+09).
+  // CHECK-NEXT: 4.2{{.+}}e+09
+  func.call @check_uitofp(%cst_n1) : (i32) -> ()
+  // CHECK-NEXT: 4.2{{.+}}e+09
+  func.call @check_uitofp(%cst_n13) : (i32) -> ()
+  // CHECK-NEXT: 4.2{{.+}}e+09
+  func.call @check_uitofp(%cst_n1337) : (i32) -> ()
+  // CHECK-NEXT: 4.2{{.+}}e+09
+  func.call @check_uitofp(%cst_i16_min) : (i32) -> ()
+  // CHECK-NEXT: 1.6{{.+}}e+07
+  func.call @check_uitofp(%cst_f32_int_max) : (i32) -> ()
+  // CHECK-NEXT: 4.2{{.+}}e+09
+  func.call @check_uitofp(%cst_f32_int_min) : (i32) -> ()
+
+  return
+}
-- 
2.7.4