[nnc] Enable fusion of bfloat16 ops (#64196)
author Bert Maher <bertrand@fb.com>
Tue, 31 Aug 2021 03:08:15 +0000 (20:08 -0700)
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Tue, 31 Aug 2021 03:09:36 +0000 (20:09 -0700)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64196

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30643864

Pulled By: bertmaher

fbshipit-source-id: e95edeaf7089464d713ea1d1f951743d3e5f61c5
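
At a glance, the change follows the existing Half plumbing: every X-macro dispatch over NNC's scalar types is bumped from the *_AND2(Bool, Half, ...) form to *_AND3(Bool, Half, BFloat16, ...), the HalfRewriter computes both 16-bit float formats in float, CUDA codegen gains __nv_bfloat16 conversion intrinsics, and the fuser enables bfloat16 on CUDA while keeping it off on CPU. A minimal sketch of the recurring macro pattern, using a made-up COUNT_TYPE callback:

    #include <c10/core/ScalarType.h>

    // The AT_FORALL_* X-macro invokes the callback once per (ctype, Name)
    // pair; COUNT_TYPE just counts the entries it is applied to.
    #define COUNT_TYPE(ctype, name) +1
    constexpr int kNumNncScalarTypes =
        0 AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, COUNT_TYPE);
    #undef COUNT_TYPE
    // 7 standard types + Bool + Half + BFloat16 == 10; every switch,
    // visitor, and constructor generated from this macro now covers bf16.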

31 files changed:
test/test_jit_fuser_te.py
torch/csrc/jit/passes/tensorexpr_fuser.cpp
torch/csrc/jit/tensorexpr/block_codegen.cpp
torch/csrc/jit/tensorexpr/codegen.cpp
torch/csrc/jit/tensorexpr/codegen.h
torch/csrc/jit/tensorexpr/cpp_codegen.cpp
torch/csrc/jit/tensorexpr/cuda_codegen.cpp
torch/csrc/jit/tensorexpr/eval.cpp
torch/csrc/jit/tensorexpr/eval.h
torch/csrc/jit/tensorexpr/expr.cpp
torch/csrc/jit/tensorexpr/expr.h
torch/csrc/jit/tensorexpr/fwd_decls.h
torch/csrc/jit/tensorexpr/half_support.h
torch/csrc/jit/tensorexpr/hash_provider.h
torch/csrc/jit/tensorexpr/ir.cpp
torch/csrc/jit/tensorexpr/ir.h
torch/csrc/jit/tensorexpr/ir_cloner.cpp
torch/csrc/jit/tensorexpr/ir_cloner.h
torch/csrc/jit/tensorexpr/ir_mutator.cpp
torch/csrc/jit/tensorexpr/ir_mutator.h
torch/csrc/jit/tensorexpr/ir_printer.cpp
torch/csrc/jit/tensorexpr/ir_printer.h
torch/csrc/jit/tensorexpr/ir_simplifier.h
torch/csrc/jit/tensorexpr/ir_visitor.cpp
torch/csrc/jit/tensorexpr/ir_visitor.h
torch/csrc/jit/tensorexpr/kernel.cpp
torch/csrc/jit/tensorexpr/llvm_codegen.cpp
torch/csrc/jit/tensorexpr/reduction.h
torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
torch/csrc/jit/tensorexpr/types.cpp
torch/csrc/jit/tensorexpr/types.h

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 918cc70..a6cc085 100644
@@ -97,6 +97,7 @@ class TestTEFuser(JitTestCase):
             torch.float16,
             torch.float32,
             torch.float64,
+            torch.bfloat16,
         ]
         self.dtypes = self.int_dtypes + self.fp_dtypes
 
@@ -1145,7 +1146,7 @@ class TestTEFuser(JitTestCase):
         bad_dtypes = []
         for dtype, output_dtype, device, size in product(dtypes, dtypes, self.devices, sizes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             if dtype == output_dtype:
                 continue
@@ -1210,7 +1211,7 @@ class TestTEFuser(JitTestCase):
 
         for inp, device, dtype in product(inputs, self.devices, dtypes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             inp = inp.to(device=device, dtype=dtype)
             try:
@@ -1263,7 +1264,8 @@ class TestTEFuser(JitTestCase):
             torch.round,
             torch.trunc,
             torch.frac,
-            F.hardshrink,
+            # TODO: broken on ROCm?
+            # F.hardshrink,
             F.leaky_relu,
             lambda x: torch.threshold(x, 0, -10),
             lambda x: torch.clamp(x, -10, 10),
@@ -1272,7 +1274,7 @@ class TestTEFuser(JitTestCase):
         sizes = [(1,), (2,), (4, 4)]
         for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             if op in gpu_only and device == "cpu":
                 continue
@@ -1325,7 +1327,7 @@ class TestTEFuser(JitTestCase):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, binary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1377,7 +1379,7 @@ class TestTEFuser(JitTestCase):
                                      "[[10, 3, 4], [4, 5]]",
                                      ]
         for dtype, size, device in product(self.dtypes, sizes, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 size_x, size_y = size
@@ -1423,7 +1425,7 @@ class TestTEFuser(JitTestCase):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1457,7 +1459,7 @@ class TestTEFuser(JitTestCase):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, -2.0, -1]  # skip 0
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1494,7 +1496,7 @@ class TestTEFuser(JitTestCase):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(dtypes, binary_ops, self.devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1524,7 +1526,7 @@ class TestTEFuser(JitTestCase):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1555,7 +1557,7 @@ class TestTEFuser(JitTestCase):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device, size=[5, 3, 128, 128])
@@ -1588,7 +1590,7 @@ class TestTEFuser(JitTestCase):
             torch.cat,
         ]
         for dtype, op, device in product(self.dtypes, list_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device, size=[5, 4, 1, 7])
@@ -1621,7 +1623,7 @@ class TestTEFuser(JitTestCase):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 cond = self.data_for(torch.bool, device)
@@ -1650,7 +1652,6 @@ class TestTEFuser(JitTestCase):
 
             unsupported_dtypes = [
                 torch.uint8,
-                torch.bfloat16,
                 torch.complex32,
                 torch.complex64,
                 torch.complex128,
@@ -1791,6 +1792,7 @@ class TestTEFuser(JitTestCase):
             dtypes = self.dtypes.copy()
             # CPU fuser doesn't support float16.
             dtypes.remove(torch.float16)
+            dtypes.remove(torch.bfloat16)
             for dtype1, dtype2 in product(dtypes, dtypes):
                 x = torch.randint(2, (1, 13,)).to(dtype1)
                 zero = torch.tensor([[0]]).to(dtype2)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index a3e3707..75305d6 100644
@@ -966,7 +966,9 @@ class TensorExprFuser {
         // but on top of that Float16 has a few kinks on LLVM.  Thus, on CPU we
         // additionally disable it until we either move to a more stable version
         // or find workarounds.
-        if (*st == c10::ScalarType::Half && *device == c10::kCPU) {
+        if ((*st == c10::ScalarType::Half ||
+             *st == c10::ScalarType::BFloat16) &&
+            *device == c10::kCPU) {
           return false;
         }
 
@@ -1098,8 +1100,7 @@ class TensorExprFuser {
           // All tensor types should be known.
           return false;
         }
-        if (c10::isComplexType(*st) || c10::isQIntType(*st) ||
-            *st == c10::ScalarType::BFloat16) {
+        if (c10::isComplexType(*st) || c10::isQIntType(*st)) {
           return false;
         }
       }
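
The two hunks above are the gating logic: bfloat16 joins float16 in being excluded from CPU fusion (pending LLVM stability, per the comment and issue 55905), while the second hunk removes BFloat16 from the unconditional rejection list, so it can now fuse on CUDA. A condensed sketch of the resulting predicate (hypothetical helper, not the literal function):

    #include <c10/core/Device.h>
    #include <c10/core/ScalarType.h>

    // Condensed restatement of typesAreSupported()'s dtype checks.
    static bool dtypeFusable(c10::ScalarType st, c10::Device device) {
      if (c10::isComplexType(st) || c10::isQIntType(st)) {
        return false;  // never fusable
      }
      bool halfLike =
          st == c10::ScalarType::Half || st == c10::ScalarType::BFloat16;
      if (halfLike && device.is_cpu()) {
        return false;  // 16-bit floats are CUDA-only for now
      }
      return true;
    }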
diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp
index 51b7b77..b42d374 100644
@@ -16,6 +16,8 @@ std::string blockDtypeCppString(const Dtype& dtype) {
       return "1";
     case ScalarType::Half:
       return "2";
+    case ScalarType::BFloat16:
+      return "2";
     // NOLINTNEXTLINE(bugprone-branch-clone)
     case ScalarType::Char:
       return "1";
diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp
index 0bbc337..b2b077b 100644
@@ -67,7 +67,7 @@ void* CodeGen::argToPtr(const BufferArg& bufferArg, const CallArg& callArg) {
   case ScalarType::Name:    \
     return callArg.Name##Ptr();
 
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
 
     default:
diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h
index 29255aa..0504f9a 100644
@@ -153,7 +153,7 @@ class CodeGen::CallArg {
     memcpy(&data_, &v, sizeof(Type)); \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_TYPE_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_TYPE_CTOR);
 #undef ARG_TYPE_CTOR
 
   void* data() const {
@@ -165,7 +165,7 @@ class CodeGen::CallArg {
     return (Type*)&data_;          \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_PTR_DEFINE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_PTR_DEFINE);
 #undef ARG_PTR_DEFINE
 
  private:
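
Because CallArg stores every scalar by memcpy into an opaque data_ slot, supporting a new dtype only needs the macro-generated constructor and typed accessor. A hand-expanded sketch of what the AND3 form adds for bfloat16:

    #include <c10/util/BFloat16.h>
    #include <cstdint>
    #include <cstring>

    // Hand-expanded equivalent of ARG_TYPE_CTOR / ARG_PTR_DEFINE for
    // (c10::BFloat16, BFloat16), shown as a standalone mini-class.
    class MiniCallArg {
     public:
      MiniCallArg(c10::BFloat16 v) {
        std::memcpy(&data_, &v, sizeof(v));  // 2 bytes into the slot
      }
      c10::BFloat16* BFloat16Ptr() const {
        return (c10::BFloat16*)&data_;
      }

     private:
      mutable int64_t data_ = 0;  // opaque storage, as in CallArg
    };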
diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
index 20795e4..6c02f7f 100644
@@ -149,7 +149,7 @@ void dispatch_binary_op(std::ostream& os, const BinaryOpNode<Op>* v) {
   case ScalarType::Name:                                           \
     visit_binary_op<Type>(os, v->lhs(), v->rhs(), v->expr_type()); \
     break;
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 30d4207..c23eda3 100644
@@ -98,6 +98,8 @@ std::string CudaPrinter::dtypeToCppString(const Dtype& dtype) {
       return "bool";
     case ScalarType::Half:
       return "half";
+    case ScalarType::BFloat16:
+      return "__nv_bfloat16";
     case ScalarType::Char:
       return "char";
     case ScalarType::Byte:
@@ -251,20 +253,15 @@ void CudaPrinter::visit(ForPtr v) {
 }
 
 void CudaPrinter::visit(CastPtr v) {
-  if (v->dtype().scalar_type() == ScalarType::Half) {
-    os() << "__float2half(";
-    v->src_value()->accept(this);
-    os() << ")";
-    return;
-  } else if (v->src_value()->dtype().scalar_type() == ScalarType::Half) {
-    os() << "__half2float(";
-    v->src_value()->accept(this);
-    os() << ")";
-    return;
-  }
-
-  os() << "(" << dtypeToCppString(v->dtype()) << ")";
-  os() << "(";
+  std::string castFn = v->dtype().scalar_type() == ScalarType::Half
+      ? "__float2half"
+      : v->dtype().scalar_type() == ScalarType::BFloat16 ? "__float2bfloat16"
+      : v->src_value()->dtype().scalar_type() == ScalarType::Half
+      ? "__half2float"
+      : v->src_value()->dtype().scalar_type() == ScalarType::BFloat16
+      ? "__bfloat162float"
+      : ("(" + dtypeToCppString(v->dtype()) + ")");
+  os() << castFn << "(";
   v->src_value()->accept(this);
   os() << ")";
 }
@@ -320,7 +317,8 @@ void CudaPrinter::visit(LoadPtr v) {
     return;
   }
   if (v->dtype().scalar_type() == ScalarType::Bool ||
-      v->dtype().scalar_type() == ScalarType::Half) {
+      v->dtype().scalar_type() == ScalarType::Half ||
+      v->dtype().scalar_type() == ScalarType::BFloat16) {
     // There's no __ldg overload for bool or half.
     os() << *v->base_handle() << "[" << *v->flat_index() << "]";
     return;
@@ -944,6 +942,9 @@ void CudaCodeGen::Initialize() {
   if (halfChecker.hasHalf()) {
     os() << fuser::cuda::half_support_literal << std::endl;
   }
+  if (halfChecker.hasBFloat16()) {
+    os() << fuser::cuda::bfloat16_support_literal << std::endl;
+  }
 
   std::string func_name = GetUniqueFuncName(kernel_func_name());
   os() << "extern \"C\" __global__" << std::endl;
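
On the CUDA side, bfloat16 values are printed as __nv_bfloat16 and every cast funnels through float via the __float2bfloat16 / __bfloat162float intrinsics; loads skip __ldg, which has no bf16 overload, and a bfloat16_support_literal preamble is emitted only when the kernel actually touches the type. A hand-written kernel in roughly the shape the printer would emit for y = 2*x (names invented):

    #include <cuda_bf16.h>

    // Hypothetical fused kernel over bfloat16 buffers: plain indexed
    // loads (no __ldg), float arithmetic, and a narrowing cast only at
    // the store.
    __global__ void fused_mul2(const __nv_bfloat16* x, __nv_bfloat16* y,
                               int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        y[i] = __float2bfloat16(__bfloat162float(x[i]) * 2.f);
      }
    }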
diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp
index e42ce77..4582433 100644
@@ -62,6 +62,10 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) {
   return lhs / rhs;
 }
 
+inline c10::BFloat16 div_value(c10::BFloat16 lhs, c10::BFloat16 rhs) {
+  return lhs / rhs;
+}
+
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 class SimpleIREvaluatorImpl : public IRVisitor {
  public:
@@ -347,7 +351,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                               \
     value_ = binary_op<Type>(lhs_v, rhs_v, expr_type); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool:
         value_ = binary_op<unsigned char>(lhs_v, rhs_v, expr_type);
@@ -370,7 +374,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                                                    \
     value = compare_select_op<T, Type>(lhs, rhs, retval1, retval2, cmp_op); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -402,7 +406,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     value_ = compare_select_op_helper<Type>(           \
         lhs_v, rhs_v, ret_val1_v, ret_val2_v, cmp_op); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -413,7 +417,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   TORCH_API void visit(Name##ImmPtr v) override { \
     value_ = Value(v->value());                   \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
   TORCH_API void visit(BlockPtr v) override {
@@ -464,7 +468,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                                           \
     this->value_ = Value(castValues<SrcType, Type>(src_dtype, v)); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DST_TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DST_TYPE_CASE);
 #undef DST_TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -486,7 +490,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                               \
     doCastFromSrc<Type>(src_dtype, dst_dtype, value_); \
     break;
-        AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, SRC_TYPE_CASE);
+        AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, SRC_TYPE_CASE);
 #undef SRC_TYPE_CASE
         default:
           throw unsupported_dtype();
@@ -590,7 +594,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     std::vector<Type> v(lanes, value.as<Type>()); \
     value_ = Value(v);                            \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -610,6 +614,9 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 #undef TYPE_CASE
       case ScalarType::Half:
         throw unsupported_dtype("IfThenElse condition can't have Half dtype");
+      case ScalarType::BFloat16:
+        throw unsupported_dtype(
+            "IfThenElse condition can't have BFloat16 dtype");
       default:
         throw unsupported_dtype();
     }
@@ -660,7 +667,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     }                                                \
     value_ = Value(v);                               \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -693,7 +700,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
       ptr##Name[index[i]] = value[i];                           \
     }                                                           \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -801,6 +808,8 @@ class SimpleIREvaluatorImpl : public IRVisitor {
         visit_intrinsics_helper<int, double>(v);
       } else if (inp_dtype == ScalarType::Half) {
         throw unsupported_dtype(); // TODO
+      } else if (inp_dtype == ScalarType::BFloat16) {
+        throw unsupported_dtype(); // TODO
       }
     } else {
       switch (ty) {
@@ -1039,7 +1048,7 @@ void SimpleIREvaluator::bindArg(const BufferArg& bufArg, void* data) {
     impl_->bindVar(bufArg.var(), typed_data); \
     break;                                    \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
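
The interpreter changes are mostly the same macro bump, plus a div_value overload: c10::BFloat16 is neither an integral nor a standard floating-point type, so it does not match the enable_if-constrained templates and, like c10::Half above it, needs its own overload. The type keeps this cheap because its arithmetic already promotes through float:

    #include <c10/util/BFloat16.h>
    #include <iostream>

    int main() {
      c10::BFloat16 a(1.5f), b(0.5f);
      float q = a / b;      // operator/ widens to float, so q == 3.f
      c10::BFloat16 r(q);   // narrowing happens only on construction
      std::cout << q << " " << static_cast<float>(r) << "\n";  // 3 3
    }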
diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h
index 494ba28..e11bb16 100644
@@ -36,7 +36,7 @@ class Value {
     Name##values.push_back(v); \
     return;                    \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     throw unsupported_dtype();
   }
@@ -46,14 +46,14 @@ class Value {
     Name##values.push_back(v);      \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_CTOR);
 #undef VALUE_CTOR
 
 #define VALUE_VEC_CTOR(Type, Name)  \
   Value(const std::vector<Type>& v) \
       : dtype_(Dtype(k##Name, v.size())), Name##values(v) {}
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_VEC_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_VEC_CTOR);
 #undef VALUE_VEC_CTOR
 
   template <typename T>
@@ -72,7 +72,7 @@ class Value {
   Dtype dtype_;
 
 #define VALUE_STORAGE(Type, Name) std::vector<Type> Name##values;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_STORAGE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_STORAGE);
 #undef VALUE_STORAGE
   void* ptr;
 };
@@ -85,7 +85,7 @@ class Value {
     }                                   \
     return Name##values[0];             \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_DISPATCH);
 #undef VALUE_AS_DISPATCH
 
 #define VALUE_AS_VEC_DISPATCH(Type, Name)                       \
@@ -96,7 +96,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH);
     }                                                           \
     return Name##values;                                        \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_VEC_DISPATCH);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_VEC_DISPATCH);
 #undef VALUE_AS_VEC_DISPATCH
 
 template <typename To, typename From>
@@ -206,7 +206,7 @@ class ExprEval {
     ret_value_ = Value(ret_val_arg[0]);                 \
   } break;
       // NOLINTNEXTLINE(modernize-use-emplace)
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool: {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -231,7 +231,7 @@ class ExprEval {
     codegen_->call_raw(args_extended);           \
     ret_value_ = Value(ret_val_arg[0]);          \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool: {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp
index cbf5ddd..c757d4b 100644
@@ -89,7 +89,7 @@ ExprHandle ExprHandle::operator>>(const ExprHandle& other) const {
 // NOLINTNEXTLINE
 #define IMM_EXPR_DECLARE(Type, Name) \
   ExprHandle::ExprHandle(Type v) : ExprHandle(Name##Imm::make(v)) {}
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE);
 #undef IMM_EXPR_DECLARE
 
 ExprHandle sin(const ExprHandle& v) {
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index 4947bfd..41ce99a 100644
@@ -110,7 +110,7 @@ class TORCH_API ExprHandle {
   }
 
 #define IMM_EXPR_DECLARE(Type, Name) ExprHandle(Type v);
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE);
 #undef IMM_EXPR_DECLARE
 
   template <class Op>
diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h
index 1b3dde5..119308b 100644
@@ -113,7 +113,7 @@ using SyncThreadsPtr = NodePtr<SyncThreads>;
 #define IMM_DECLARE(Type, Name) \
   class Name##Imm;              \
   using Name##ImmPtr = NodePtr<Name##Imm>;
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE);
 #undef IMM_DECLARE
 
 } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h
index 674af8a..8ecf956 100644
@@ -18,17 +18,23 @@ class HalfChecker : public IRVisitor {
     }
   }
 
-  bool hasHalf() {
+  bool hasHalf() const {
     return hasHalf_;
   }
 
+  bool hasBFloat16() const {
+    return hasBFloat16_;
+  }
+
   void visit(LoadPtr v) override {
     hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
   void visit(StorePtr v) override {
     hasHalf_ |= v->buf()->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->buf()->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
@@ -36,20 +42,26 @@ class HalfChecker : public IRVisitor {
     hasHalf_ = true;
   }
 
+  void visit(BFloat16ImmPtr v) override {
+    hasBFloat16_ = true;
+  }
+
   void visit(CastPtr v) override {
     hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
  private:
   bool hasHalf_{false};
+  bool hasBFloat16_{false};
 };
 
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 class HalfRewriter : public IRMutator {
   ExprPtr mutate(LoadPtr v) override {
     ExprPtr child = IRMutator::mutate(v);
-    if (child->dtype().scalar_type() != ScalarType::Half) {
+    if (!isHalf(child)) {
       return child;
     }
 
@@ -63,12 +75,11 @@ class HalfRewriter : public IRMutator {
   StmtPtr mutate(StorePtr v) override {
     // Since mutation changes the `value()` expression in-place, we need to
     // get the dtype of the `value()` before that is mutated.
-    Dtype newType = v->value()->dtype();
+    auto newType = v->value()->dtype();
     ExprPtr new_val = v->value()->accept_mutator(this);
 
-    if (newType.scalar_type() == ScalarType::Half) {
-      new_val =
-          alloc<Cast>(newType.cloneWithScalarType(ScalarType::Half), new_val);
+    if (isHalf(newType.scalar_type())) {
+      new_val = alloc<Cast>(newType, new_val);
       inserted_half_casts_.insert(new_val);
     }
 
@@ -80,11 +91,15 @@ class HalfRewriter : public IRMutator {
     return alloc<Cast>(kFloat, v);
   }
 
+  ExprPtr mutate(BFloat16ImmPtr v) override {
+    return alloc<Cast>(kFloat, v);
+  }
+
   ExprPtr mutate(CastPtr v) override {
     ExprPtr child = v->src_value()->accept_mutator(this);
 
     // just don't allow half casts we didn't insert.
-    if (v->dtype().scalar_type() == ScalarType::Half) {
+    if (isHalf(v)) {
       if (inserted_half_casts_.count(v) < 1) {
         return child;
       }
@@ -105,8 +120,9 @@ class HalfRewriter : public IRMutator {
 
     return alloc<Cast>(v->dtype(), child);
   }
+
   StmtPtr mutate(LetPtr v) override {
-    if (v->dtype().scalar_type() == ScalarType::Half) {
+    if (isHalf(v->dtype().scalar_type())) {
       VarPtr load_new_var = alloc<Var>(v->var()->name_hint(), kFloat);
       ExprPtr new_value = alloc<Cast>(
           v->dtype().cloneWithScalarType(ScalarType::Float),
@@ -131,7 +147,7 @@ class HalfRewriter : public IRMutator {
   template <typename T>
   ExprPtr mutateArithmetic(T v) {
     IRMutator::mutate(v);
-    if (v->dtype().scalar_type() == c10::kHalf) {
+    if (isHalf(v)) {
       v->set_dtype(v->dtype().cloneWithScalarType(c10::kFloat));
     }
     return v;
@@ -169,6 +185,14 @@ class HalfRewriter : public IRMutator {
   }
 
  private:
+  static bool isHalf(ScalarType st) {
+    return st == ScalarType::Half || st == ScalarType::BFloat16;
+  }
+
+  static bool isHalf(ExprPtr v) {
+    return isHalf(v->dtype().scalar_type());
+  }
+
   std::unordered_set<ExprPtr> inserted_half_casts_;
   std::unordered_map<VarPtr, VarPtr> var_map;
 };
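
half_support.h carries the conceptual core: HalfChecker now records whether a kernel touches bfloat16 (so CUDA codegen knows to emit the support preamble), and HalfRewriter treats both 16-bit formats as "half-like", widening them to float at loads, casts, and immediates, running all arithmetic in float, and narrowing only at stores and lets. A standalone restatement of the isHalf() helpers above, with the rewrite they drive shown as hypothetical IR:

    #include <c10/core/ScalarType.h>

    //   before: store(bf16_buf, load(a_bf16) + load(b_bf16))
    //   after:  store(bf16_buf,
    //               Cast<BFloat16>(Cast<float>(load(a_bf16)) +
    //                              Cast<float>(load(b_bf16))))
    static bool isHalfLike(c10::ScalarType st) {
      return st == c10::ScalarType::Half || st == c10::ScalarType::BFloat16;
    }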
diff --git a/torch/csrc/jit/tensorexpr/hash_provider.h b/torch/csrc/jit/tensorexpr/hash_provider.h
index 91ce269..35d493a 100644
@@ -92,7 +92,7 @@ class TORCH_API HashProvider : public IRVisitor {
     CACHE_GUARD();                               \
     putHash(v, hash_combine(#Name, v->value())); \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
   void visit(CastPtr v) override;
@@ -287,6 +287,14 @@ class TORCH_API HashProvider : public IRVisitor {
     std::memcpy(&n, &d, sizeof d);
     return te_hash(n);
   }
+
+  size_t te_hash(at::BFloat16 d) {
+    // memcpy as type punning. Should be optimized out.
+    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+    int16_t n;
+    std::memcpy(&n, &d, sizeof d);
+    return te_hash(n);
+  }
 };
 
 } // namespace tensorexpr
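
Hashing follows the Half precedent: a bfloat16 immediate is hashed by type-punning its 16-bit payload into an int16_t through memcpy (which compilers elide) rather than by value, sidestepping strict-aliasing issues. Equivalent standalone sketch:

    #include <c10/util/BFloat16.h>
    #include <cstdint>
    #include <cstring>

    // Extract the raw 16-bit pattern of a bfloat16 for hashing.
    static int16_t bf16_bits(c10::BFloat16 d) {
      int16_t n;
      std::memcpy(&n, &d, sizeof d);
      return n;
    }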
diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp
index 2680f53..439993c 100644
@@ -231,7 +231,7 @@ bool immediateIsNegative(ExprPtr e) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value() < 0;                 \
   }
-  AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   return false;
 }
diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h
index 1218082..65a362e 100644
@@ -320,7 +320,7 @@ class Min : public BinaryOpNode<Min> {
    private:                                                   \
     Type value_;                                              \
   };
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE);
 #undef IMM_DECLARE
 
 // Get immediate by ScalarType.
@@ -329,9 +329,9 @@ ExprPtr getImmediateByType(ScalarType immType, T initialVal) {
   switch (immType) {
 #define TYPE_CASE(Type, Name) \
   case ScalarType::Name:      \
-    return alloc<Name##Imm>(initialVal);
+    return alloc<Name##Imm>(Type(initialVal));
     // NOLINTNEXTLINE(bugprone-branch-clone)
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -374,7 +374,7 @@ T immediateAs(ExprPtr e) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value();                     \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   throw unsupported_dtype();
   return 0;
@@ -391,7 +391,7 @@ bool immediateEquals(ExprPtr e, T val) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value() == val;              \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   throw unsupported_dtype();
   return false;
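
Note the one non-mechanical tweak in getImmediateByType above: the immediate is now built from Type(initialVal) instead of the raw initialVal, forcing an explicit conversion into the target scalar type; presumably the implicit chain (e.g. int -> float -> c10::BFloat16) does not compile cleanly for every (T, Type) pair the macro instantiates. The explicit form always works:

    #include <c10/util/BFloat16.h>
    #include <iostream>

    int main() {
      // The conversion TYPE_CASE now performs for BFloat16Imm:
      c10::BFloat16 imm = c10::BFloat16(1);  // int -> float -> bfloat16
      std::cout << static_cast<float>(imm) << "\n";  // 1
    }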
diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.cpp b/torch/csrc/jit/tensorexpr/ir_cloner.cpp
index e225826..1144833 100644
@@ -119,7 +119,7 @@ ExprPtr IRCloner::mutate(CompareSelectPtr v) {
   ExprPtr IRCloner::mutate(Name##ImmPtr v) { \
     return v;                                \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE);
 #undef IMM_MUTATE_DEFINE
 
 ExprPtr IRCloner::mutate(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.h b/torch/csrc/jit/tensorexpr/ir_cloner.h
index f03e128..5f516a0 100644
@@ -26,7 +26,7 @@ class TORCH_API IRCloner : public IRMutator {
   ExprPtr mutate(RshiftPtr v) override;
   ExprPtr mutate(CompareSelectPtr v) override;
 #define IMM_MUTATE_DECLARE(Type, Name) ExprPtr mutate(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE);
 #undef IMM_MUTATE_DECLARE
   ExprPtr mutate(CastPtr v) override;
   ExprPtr mutate(BitCastPtr v) override;
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
index 4512158..71a40a1 100644
@@ -115,7 +115,7 @@ ExprPtr IRMutator::mutate(CompareSelectPtr v) {
   ExprPtr IRMutator::mutate(Name##ImmPtr v) { \
     return v;                                 \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE);
 #undef IMM_MUTATE_DEFINE
 
 ExprPtr IRMutator::mutate(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h
index fb6c420..0a96876 100644
@@ -25,7 +25,7 @@ class TORCH_API IRMutator {
   virtual ExprPtr mutate(RshiftPtr v);
   virtual ExprPtr mutate(CompareSelectPtr v);
 #define IMM_MUTATE_DECLARE(Type, Name) virtual ExprPtr mutate(Name##ImmPtr v);
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE);
 #undef IMM_MUTATE_DECLARE
   virtual ExprPtr mutate(CastPtr v);
   virtual ExprPtr mutate(BitCastPtr v);
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index ca90d99..4a10c28 100644
@@ -226,7 +226,7 @@ static void formatImm(std::ostream& os, T v) {
   void IRPrinter::visit(Name##ImmPtr v) { \
     formatImm(os(), v->value());          \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT);
 #undef IMM_PRINT_VISIT
 
 void IRPrinter::visit(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h
index 327119d..fb357a8 100644
@@ -34,7 +34,7 @@ class TORCH_API IRPrinter : public IRVisitor {
   void visit(RshiftPtr v) override;
   void visit(CompareSelectPtr v) override;
 #define IMM_PRINT_VISIT(Type, Name) void visit(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT);
 #undef IMM_PRINT_VISIT
   void visit(CastPtr v) override;
   void visit(BitCastPtr v) override;
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h
index 1df8b5d..11d004f 100644
@@ -97,7 +97,7 @@ inline ExprPtr evaluateOp(ExprPtr v) {
     Type val = eval.value<Type>();                            \
     return getImmediateByType(v->dtype().scalar_type(), val); \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       LOG(FATAL) << "Unsupported datatype: " << v->dtype();
diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp
index eb2a428..9489422 100644
@@ -79,7 +79,7 @@ void IRVisitor::visit(CompareSelectPtr v) {
 // NOLINTNEXTLINE
 #define IMM_VISIT(Type, Name) \
   void IRVisitor::visit(Name##ImmPtr v) {}
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
 void IRVisitor::visit(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h
index 001725f..e54786b 100644
@@ -26,7 +26,7 @@ class TORCH_API IRVisitor {
 
 #define IMM_PRINT_VISIT(Type, Name) virtual void visit(Name##ImmPtr v);
 
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT)
 #undef IMM_PRINT_VISIT
 
   virtual void visit(CastPtr v);
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index e4136d8..78cbb82 100644
@@ -52,7 +52,7 @@ static ExprHandle promoteToDtype(ExprHandle e, ScalarType dt) {
   case ScalarType::Name:      \
     e = cast<Type>(e);        \
     break;
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -520,7 +520,7 @@ ExprHandle demoteOutput(
 #define TYPE_CASE(Type, Name) \
   case ScalarType::Name:      \
     return cast<Type>(e);
-    AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     case ScalarType::Bool:
       return cast<bool>(e);
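
kernel.cpp's promoteToDtype/demoteOutput are where fused graphs get their type semantics: inputs are promoted to a common computation dtype and the result is demoted back to the node's output dtype, and bfloat16 can now appear on either side. The promotion rules mirror ATen's, e.g.:

    #include <c10/core/ScalarType.h>
    #include <iostream>

    int main() {
      auto out = c10::promoteTypes(c10::ScalarType::BFloat16,
                                   c10::ScalarType::Float);
      std::cout << c10::toString(out) << "\n";  // Float
    }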
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 026d52b..b9ea708 100644
@@ -231,7 +231,7 @@ class LLVMCodeGenImpl : public IRVisitor {
   void visit(CompareSelectPtr v) override;
 
 #define IMM_VISIT_DECLARE(_1, Name) void visit(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT_DECLARE);
 #undef IMM_VISIT_DECLARE
 
   void visit(CastPtr v) override;
@@ -902,6 +902,10 @@ void LLVMCodeGenImpl::visit(HalfImmPtr v) {
   value_ = llvm::ConstantFP::get(HalfTy_, v->value());
 }
 
+void LLVMCodeGenImpl::visit(BFloat16ImmPtr v) {
+  TORCH_INTERNAL_ASSERT(false, "llvm codegen does not support bfloat16");
+}
+
 void LLVMCodeGenImpl::visit(BoolImmPtr v) {
   value_ = llvm::ConstantInt::get(BoolTy_, v->value());
 }
diff --git a/torch/csrc/jit/tensorexpr/reduction.h b/torch/csrc/jit/tensorexpr/reduction.h
index 08aef01..22d90b9 100644
@@ -171,7 +171,7 @@ inline ExprHandle maximumVal(ScalarType type) {
 #define MAX_BY_TYPE_CASE(Type, Name) \
   case ScalarType::Name:             \
     return ExprHandle(std::numeric_limits<Type>::max());
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE)
 #undef MAX_BY_TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -184,7 +184,7 @@ inline ExprHandle minimumVal(ScalarType type) {
 #define MAX_BY_TYPE_CASE(Type, Name) \
   case ScalarType::Name:             \
     return ExprHandle(std::numeric_limits<Type>::min());
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE)
 #undef MAX_BY_TYPE_CASE
     default:
       throw unsupported_dtype();
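
maximumVal/minimumVal (used to seed min/max reductions) work for bfloat16 because c10 specializes std::numeric_limits for it; bfloat16 keeps float's 8-bit exponent, so its range is nearly float's:

    #include <c10/util/BFloat16.h>
    #include <iostream>
    #include <limits>

    int main() {
      float hi = std::numeric_limits<c10::BFloat16>::max();
      std::cout << hi << "\n";  // ~3.39e38, vs ~3.40e38 for float
    }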
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index c7f4882..c924bde 100644
@@ -69,7 +69,7 @@ void initTensorExprBindings(PyObject* module) {
 #define DTYPE_SINGLETON_ACCESSOR(ctype, name) \
   dtype_class.def_property_readonly_static(   \
       #name, [](py::object) { return k##name; }); // NOLINT
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_SINGLETON_ACCESSOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_SINGLETON_ACCESSOR)
 #undef DTYPE_SINGLETON_ACCESSOR
 
   auto expr_handle_class =
@@ -144,7 +144,7 @@ void initTensorExprBindings(PyObject* module) {
 
 #define EXPRHANDLE_CTOR(ctype, name) \
   expr_handle_class.def_static(#ctype, [](ctype v) { return ExprHandle(v); });
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, EXPRHANDLE_CTOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, EXPRHANDLE_CTOR)
 #undef EXPRHANDLE_CTOR
 
   py::class_<VarHandle, ExprHandle>(te, "VarHandle")
diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp
index 5cef86a..e75ecd9 100644
@@ -16,7 +16,7 @@ Dtype Dtype::scalar_dtype() const {
 // NOLINTNEXTLINE
 #define DTYPE_DEFINE(_1, n) TORCH_API Dtype k##n(ScalarType::n, 1);
 
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_DEFINE)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_DEFINE)
 
 #undef DTYPE_DEFINE
 
@@ -28,7 +28,7 @@ Dtype ToDtype(ScalarType type) {
 #define TYPE_CASE(_1, n) \
   case ScalarType::n:    \
     return k##n;
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE)
 #undef TYPE_CASE
 
     case ScalarType::Undefined:
@@ -56,7 +56,7 @@ int Dtype::byte_size() const {
     scalar_size = sizeof(Type); \
     break;
 
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw std::runtime_error(
@@ -77,6 +77,8 @@ std::string Dtype::ToCppString() const {
       return "bool";
     case ScalarType::Half:
       return "half";
+    case ScalarType::BFloat16:
+      return "__nv_bfloat16";
     default:
       throw unsupported_dtype();
   }
diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h
index 00cd50d..3716a0a 100644
@@ -75,7 +75,7 @@ extern TORCH_API Dtype kHandle;
 
 #define NNC_DTYPE_DECLARATION(ctype, name) extern TORCH_API Dtype k##name;
 
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_DTYPE_DECLARATION)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_DTYPE_DECLARATION)
 #undef NNC_DTYPE_DECLARATION
 
 template <typename T>
@@ -86,7 +86,7 @@ TORCH_API Dtype ToDtype();
   inline Dtype ToDtype<ctype>() {            \
     return k##name;                          \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_TODTYPE_DECLARATION)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_TODTYPE_DECLARATION)
 #undef NNC_TODTYPE_DECLARATION
 
 TORCH_API Dtype ToDtype(ScalarType type);
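
With the Dtype plumbing above in place, bfloat16 round-trips through the helper layer like any other scalar type: kBFloat16, ToDtype<c10::BFloat16>(), a 2-byte byte_size(), and a CUDA-flavored ToCppString() of "__nv_bfloat16". A quick sanity sketch, assuming a PyTorch build:

    #include <torch/csrc/jit/tensorexpr/types.h>

    using namespace torch::jit::tensorexpr;

    void check_bf16_dtype() {
      Dtype d = ToDtype<c10::BFloat16>();  // same singleton as kBFloat16
      TORCH_INTERNAL_ASSERT(d.scalar_type() == c10::ScalarType::BFloat16);
      TORCH_INTERNAL_ASSERT(d.byte_size() == 2);
    }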