[nnc] Disable erf and erfc (#63775)
authorBert Maher <bertrand@fb.com>
Wed, 25 Aug 2021 01:52:29 +0000 (18:52 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 25 Aug 2021 01:55:45 +0000 (18:55 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63775

These introduce small accuracy differences that cause some internal
tests to fail, and it's not worth fixing the tests right now because they're
slower than the ATen ops anyways.
ghstack-source-id: 136526229

Test Plan:
```
buck test mode/dev //aml/eccv/mcm/training:tests -- --exact 'aml/eccv/mcm/training:tests - test_build_torch_script_model (aml.eccv.mcm.training.tests.publish_helper_tests.TransformerPredictorPublishHelperTests)'
```

Reviewed By: navahgar

Differential Revision: D30484557

fbshipit-source-id: 095a9c810539a499105b76e1d96843dbc61b0079

test/test_jit_fuser_te.py
torch/csrc/jit/passes/tensorexpr_fuser.cpp

index 5e8204a..f2dce12 100644 (file)
@@ -1274,8 +1274,11 @@ class TestTEFuser(JitTestCase):
             lambda x: torch.threshold(x, 0, -10),
             lambda x: torch.clamp(x, -10, 10),
         ]
+        gpu_only = {torch.erf, torch.erfc}
         sizes = [(1,), (2,), (4, 4)]
         for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes):
+            if op in gpu_only and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device, size=size)
                 fn = apply(op)
index d4add03..3f0cd14 100644 (file)
@@ -948,6 +948,14 @@ class TensorExprFuser {
       "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       "aten::matmul(Tensor self, Tensor other) -> Tensor",
     };
+    static const OperatorSet gpu_only_operator_set{
+      // On CPU, these are slower and less accurate than ATen kernels, because
+      // ATen is able to use MKL-VML, whereas the fuser currently can't.  The
+      // fuser uses sleef instead because sleef provides functions that operate
+      // on vectors, instead of large buffers.
+      "aten::erf(Tensor self) -> Tensor",
+      "aten::erfc(Tensor self) -> Tensor",
+    };
     static const OperatorSet pow{
       "aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor",
     };
@@ -1026,6 +1034,17 @@ class TensorExprFuser {
       }
     }
 
+    // Operator is only supported on GPU.
+    if (node->isMemberOf(gpu_only_operator_set)) {
+      auto device = tensorexpr::pickDeviceType(node->inputs());
+      if (!device) {
+        device = tensorexpr::pickDeviceType(node->outputs());
+      }
+      if (!device || !device->is_cuda()) {
+        return false;
+      }
+    }
+
     if (node->kind() == aten::to) {
       // only support same-device conversion
       auto device = tensorexpr::pickDeviceType(node->inputs());