From 939877bf4b5f37307eebd0cb035d65212b9449f6 Mon Sep 17 00:00:00 2001
From: PenghuiCheng <42089598+penghuicheng@users.noreply.github.com>
Date: Fri, 7 Dec 2018 12:01:44 -0800
Subject: [PATCH] Implementation of WeightedSum op for mkl-dnn and fix FC op
 output shape issue.

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14407

Reviewed By: yinghai

Differential Revision: D13364364

Pulled By: wesolwsk

fbshipit-source-id: e69bcd1bc52e35b2f0e45e5dc40184f1bd66605d
---
 caffe2/ideep/operators/fully_connected_op.cc      | 10 +++-
 caffe2/ideep/operators/operator_fallback_ideep.cc |  5 --
 caffe2/ideep/operators/utility_ops.cc             | 36 ++++++++++++++
 caffe2/python/ideep/fc_op_test.py                 | 41 +++++++++++++++-
 caffe2/python/ideep/weightedsum_op_test.py        | 57 +++++++++++++++++++++++
 5 files changed, 141 insertions(+), 8 deletions(-)
 create mode 100644 caffe2/python/ideep/weightedsum_op_test.py
diff --git a/caffe2/ideep/operators/fully_connected_op.cc b/caffe2/ideep/operators/fully_connected_op.cc
index 80ed367..609e528 100644
--- a/caffe2/ideep/operators/fully_connected_op.cc
+++ b/caffe2/ideep/operators/fully_connected_op.cc
@@ -101,9 +101,17 @@ class IDEEPFullyConnectedGradientOp final : public IDEEPOperator {
 
     ideep::inner_product_backward_weights::compute(X_in, dY, *dfilter, *dbias);
 
+    /**
+     * In MKL-DNN, the weight gradient's shape is derived from X_in, so reshape
+     * it whenever it differs, keeping it consistent with the weight's own shape.
+     */
+    if (dfilter->get_dims() != filter.get_dims()) {
+      dfilter->reshape(filter.get_dims());
+    }
+
     if (OutputSize() > INPUT_GRAD) {
       ideep::inner_product_backward_data::compute(
-          dY, filter_in, X_in.get_dims(), *Output(INPUT_GRAD));
+          dY, filter_in, X.get_dims(), *Output(INPUT_GRAD));
     }
 
     return true;
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc
index 84d0dbb..a5f03a1 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.cc
+++ b/caffe2/ideep/operators/operator_fallback_ideep.cc
@@ -31,7 +31,6 @@
 #include <caffe2/operators/softmax_op.h>
 #include <caffe2/operators/tanh_op.h>
 #include <caffe2/operators/transpose_op.h>
-#include <caffe2/operators/utility_ops.h>
 #include <caffe2/operators/affine_channel_op.h>
 #include <caffe2/operators/stop_gradient.h>
 #include <caffe2/sgd/adam_op.h>
@@ -139,10 +138,6 @@ REGISTER_IDEEP_OPERATOR(
     LearningRate,
     IDEEPFallbackOp<LearningRateOp<float, CPUContext>>);
 REGISTER_IDEEP_OPERATOR(
-    WeightedSum,
-    IDEEPFallbackOp<WeightedSumOp<CPUContext>>);
-
-REGISTER_IDEEP_OPERATOR(
     LeakyRelu,
     IDEEPFallbackOp<LeakyReluOp<float, CPUContext>>);
 REGISTER_IDEEP_OPERATOR(
diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc
index ecb5f82..98dc335 100644
--- a/caffe2/ideep/operators/utility_ops.cc
+++ b/caffe2/ideep/operators/utility_ops.cc
@@ -67,9 +67,45 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
   }
 };
 
+// WeightedSum for IDEEP: Y = sum_k w_k * X_k, inputs being (X_0, w_0, X_1, ...).
+// Each w_k holds one element, read as float (assumes fp32 data -- TODO confirm).
+class IDEEPWeightedSumOp final : public IDEEPOperator {
+ public:
+  USE_IDEEP_DEF_ALIASES();
+  USE_IDEEP_OPERATOR_FUNCTIONS();
+
+  IDEEPWeightedSumOp(const OperatorDef& operator_def, Workspace* ws)
+      : IDEEPOperator(operator_def, ws) {}
+  bool RunOnDevice() override {
+    // Inputs come in (tensor, weight) pairs; reject an empty list up front,
+    // because Input(0) is read below to obtain the reference shape.
+    CAFFE_ENFORCE_GT(InputSize(), 0);
+    CAFFE_ENFORCE_EQ(InputSize() % 2, 0);
+    const auto ndims = Input(0).ndims();
+    const auto nelems = Input(0).get_nelems();
+    CAFFE_ENFORCE_GT(nelems, 0);
+    std::vector<float> scales;
+    scales.reserve(InputSize() / 2);
+    std::vector<itensor> inputs;
+    inputs.reserve(InputSize() / 2);
+    for (int i = 0; i < InputSize(); i += 2) {
+      const auto& X = Input(i);
+      const auto& w = Input(i + 1);
+      CAFFE_ENFORCE(X.ndims() == ndims);
+      CAFFE_ENFORCE(X.get_nelems() == nelems);
+      CAFFE_ENFORCE_EQ(w.get_nelems(), 1);
+      inputs.push_back(X);
+      scales.push_back(*static_cast<const float*>(w.get_data_handle()));
+    }
+    ideep::sum::compute(scales, inputs, *Output(0));
+    return true;
+  }
+};
+
 REGISTER_IDEEP_OPERATOR(CopyCPUToIDEEP, CopyCPUToIDEEPOp);
 REGISTER_IDEEP_OPERATOR(CopyIDEEPToCPU, CopyIDEEPToCPUOp);
 REGISTER_IDEEP_OPERATOR(Copy, IDEEPCopyOp);
+REGISTER_IDEEP_OPERATOR(WeightedSum, IDEEPWeightedSumOp);
 
 OPERATOR_SCHEMA(CopyCPUToIDEEP)
     .NumInputs(1)
diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py
index 03deedb..ba1ce10 100644
--- a/caffe2/python/ideep/fc_op_test.py
+++ b/caffe2/python/ideep/fc_op_test.py
@@ -12,11 +12,12 @@ from caffe2.python import core, workspace
 import caffe2.python.hypothesis_test_util as hu
 import caffe2.python.ideep_test_util as mu
 
+
 @unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.")
 class FcTest(hu.HypothesisTestCase):
     @given(n=st.integers(1, 5), m=st.integers(1, 5),
            k=st.integers(1, 5), **mu.gcs)
-    def test_fc(self,n, m, k, gc, dc):
+    def test_fc_2_dims(self, n, m, k, gc, dc):
         X = np.random.rand(m, k).astype(np.float32) - 0.5
         W = np.random.rand(n, k).astype(np.float32) - 0.5
         b = np.random.rand(n).astype(np.float32) - 0.5
@@ -25,7 +26,7 @@ class FcTest(hu.HypothesisTestCase):
             'FC',
             ['X', 'W', 'b'],
             ["Y"]
-            )
+        )
 
         self.assertDeviceChecks(dc, op, [X, W, b], [0])
 
@@ -222,6 +223,42 @@ class FcTest(hu.HypothesisTestCase):
             print(np.max(np.abs(db1 - db0)))
             self.assertTrue(False)
 
+    @given(n=st.integers(1, 5), m=st.integers(1, 5),
+           k=st.integers(1, 5), **mu.gcs)
+    def test_fc_4_dims_src(self, n, m, k, gc, dc):
+        # FC with a 4-D source (m, k, m, m) and a 2-D weight (n, k * m * m).
+        src = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
+        weight = np.random.rand(n, k * m * m).astype(np.float32) - 0.5
+        bias = np.random.rand(n).astype(np.float32) - 0.5
+        inputs = [src, weight, bias]
+
+        op = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"])
+
+        self.assertDeviceChecks(dc, op, inputs, [0])
+
+        # Check the gradient with respect to each of X, W and b in turn.
+        for input_idx in range(len(inputs)):
+            self.assertGradientChecks(gc, op, inputs, input_idx, [0])
+
+
+    @given(n=st.integers(1, 5), m=st.integers(1, 5),
+           k=st.integers(1, 5), **mu.gcs)
+    def test_fc_4_dims(self, n, m, k, gc, dc):
+        # FC where both the source (m, k, m, m) and weight (n, k, m, m) are 4-D.
+        src = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
+        weight = np.random.rand(n, k, m, m).astype(np.float32) - 0.5
+        bias = np.random.rand(n).astype(np.float32) - 0.5
+        inputs = [src, weight, bias]
+
+        op = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"])
+
+        self.assertDeviceChecks(dc, op, inputs, [0])
+
+        # Check the gradient with respect to each of X, W and b in turn.
+        for input_idx in range(len(inputs)):
+            self.assertGradientChecks(gc, op, inputs, input_idx, [0])
+
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py
new file mode 100644
index 0000000..2a0b3ec
--- /dev/null
+++ b/caffe2/python/ideep/weightedsum_op_test.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import hypothesis.strategies as st
+import unittest
+import caffe2.python.hypothesis_test_util as hu
+from caffe2.python import core, workspace
+from hypothesis import given
+import caffe2.python.ideep_test_util as mu
+
+
+@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.")
+class TestWeightedSumOp(hu.HypothesisTestCase):
+    @given(n=st.integers(5, 8), m=st.integers(1, 1),
+           d=st.integers(2, 4), grad_on_w=st.booleans(),
+           **mu.gcs_ideep_only)
+    def test_weighted_sum(self, n, m, d, grad_on_w, gc, dc):
+        """Check WeightedSum on m (X, w) pairs against a NumPy reference."""
+        input_names = []
+        input_vars = []
+        # NOTE(review): m is pinned to 1, so only a single pair is exercised.
+        for i in range(m):
+            # Inputs are interleaved: an (n, d) tensor, then its 1-element weight.
+            X_name = 'X' + str(i)
+            w_name = 'w' + str(i)
+            input_names.extend([X_name, w_name])
+            input_vars.append(np.random.rand(n, d).astype(np.float32))
+            input_vars.append(np.random.rand(1).astype(np.float32))
+
+        def weighted_sum_op_ref(*args):
+            # args mirrors input_vars: (X0, w0, X1, w1, ...).
+            res = np.zeros((n, d))
+            for i in range(m):
+                res = res + args[2 * i + 1] * args[2 * i]
+
+            return (res, )
+
+        op = core.CreateOperator(
+            "WeightedSum",
+            input_names,
+            ['Y'],
+            grad_on_w=grad_on_w,
+        )
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=op,
+            inputs=input_vars,
+            reference=weighted_sum_op_ref,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
2.7.4