add dense vector to id_list operator (#15090)
author Bill Li <goodmasterli@fb.com>
Wed, 19 Dec 2018 00:07:55 +0000 (16:07 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 19 Dec 2018 00:27:38 +0000 (16:27 -0800)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15090

as title
step 2 of the linked task

Reviewed By: ellie-wen

Differential Revision: D13425977

fbshipit-source-id: f3538ed68f42470ba39c5b779af764d4a5591a9d

caffe2/operators/dense_vector_to_id_list_op.cc [new file with mode: 0644]
caffe2/operators/dense_vector_to_id_list_op.h [new file with mode: 0644]
caffe2/python/operator_test/dense_vector_to_id_list_op_test.py [new file with mode: 0644]

diff --git a/caffe2/operators/dense_vector_to_id_list_op.cc b/caffe2/operators/dense_vector_to_id_list_op.cc
new file mode 100644 (file)
index 0000000..36dc345
--- /dev/null
@@ -0,0 +1,33 @@
+#include "caffe2/operators/dense_vector_to_id_list_op.h"
+
+namespace caffe2 {
+namespace {
+// Registers the CPU implementation; only CPUContext is instantiated here.
+REGISTER_CPU_OPERATOR(DenseVectorToIdList, DenseVectorToIdListOp<CPUContext>);
+
+OPERATOR_SCHEMA(DenseVectorToIdList)
+    .NumInputs(1)
+    .NumOutputs(2)
+    .SetDoc(R"DOC(
+DenseVectorToIdList: Convert a blob with dense features into an ID_LIST.
+
+An ID_LIST is a list of IDs (may be ints, often longs) that represents a single
+feature. As described in https://caffe2.ai/docs/sparse-operations.html, a batch
+of ID_LIST examples is represented as a pair of lengths and values where the
+`lengths` (int32) segment the `values` or ids (int32/int64) into examples.
+
+Input is a single 2-D float blob where the first dimension is the batch size
+and the second dimension is the length of the dense vectors. This operator
+produces an ID_LIST where out_values (int32) are the column indices of the
+non-zero entries of each row and out_lengths (int32) are the number of non-zero
+entries in each row. Entries are rounded before being compared with zero, so
+values close to zero (e.g. 0.3) count as zero.
+
+)DOC")
+    .Input(0, "values", "A 2-D float32 blob of dense vectors, one per row")
+    .Output(0, "out_lengths", "Lengths (int32) of the sparse feature")
+    .Output(1, "out_values", "Values (int32) of the sparse feature");
+// The outputs are integer indices and counts, so no gradient is registered.
+NO_GRADIENT(DenseVectorToIdList);
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/operators/dense_vector_to_id_list_op.h b/caffe2/operators/dense_vector_to_id_list_op.h
new file mode 100644 (file)
index 0000000..3ee2cfe
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef CAFFE2_OPERATORS_DENSE_VECTOR_TO_ID_LIST_OP_H_
+#define CAFFE2_OPERATORS_DENSE_VECTOR_TO_ID_LIST_OP_H_
+
+#include <set>
+#include <vector>
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+template <class Context>
+class DenseVectorToIdListOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(DenseVectorToIdListOp)
+
+  template <typename T, typename M>
+  bool DoRunWithType() {
+    auto& input = Input(0);
+    const auto* input_data = input.template data<T>();
+
+    CAFFE_ENFORCE_EQ(input.dim(), 2, "Sample should be 2-D");
+    const auto batch_size = input.size(0);
+    const auto col_num = input.size(1);
+
+    auto* out_lengths = Output(0);
+    out_lengths->Resize(batch_size);
+
+    auto* out_lengths_data = out_lengths->template mutable_data<int32_t>();
+
+    auto* out_values = Output(1);
+    out_values->Resize(batch_size * col_num);
+
+    auto* out_values_data = out_values->template mutable_data<M>();
+
+    auto v_pos = 0;
+    auto l_pos = 0;
+    for (auto i = 0; i < batch_size; i++) {
+      auto length = 0;
+      for (int j = 0; j < col_num; j++) {
+        if ((int)(input_data[i * col_num + j] + 0.5) != 0) {
+          out_values_data[v_pos++] = j;
+          length++;
+        }
+      }
+      out_lengths_data[l_pos++] = length;
+    }
+    out_values->Resize(v_pos);
+    out_lengths->Resize(l_pos);
+    return true;
+  }
+
+  bool RunOnDevice() override {
+    if (Input(0).template IsType<float>()) {
+      return DoRunWithType<float, int>();
+    } else {
+      CAFFE_THROW(
+          "DenseVectorToIdList operator only supports 32-bit float, but",
+          " input was of type ",
+          Input(0).dtype().name());
+    }
+  }
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_DENSE_VECTOR_TO_ID_LIST_OP_H_
diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py
new file mode 100644 (file)
index 0000000..aea30d8
--- /dev/null
@@ -0,0 +1,66 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+from hypothesis import given
+import caffe2.python.hypothesis_test_util as hu
+
+import hypothesis.extra.numpy as hnp
+import hypothesis.strategies as st
+import numpy as np
+
+
+@st.composite
+def id_list_batch(draw):
+    batch_size = draw(st.integers(2, 2))
+    values_dtype = np.float32
+    inputs = []
+    sample_size = draw(st.integers(5, 10))
+    for _ in range(batch_size):
+        values = draw(hnp.arrays(values_dtype, sample_size, st.integers(0, 1)))
+        inputs += [values]
+    return [np.array(inputs)]
+
+
+def dense_vector_to_id_list_ref(*arg):
+    arg = arg[0]
+    batch_size = len(arg)
+    assert batch_size > 0
+    out_length = []
+    out_values = []
+    for row in arg:
+        length = 0
+        for idx, entry in enumerate(row):
+            if entry != 0:
+                out_values += [idx]
+                length += 1
+        out_length += [length]
+    return (out_length, out_values)
+
+
+class TestDenseVectorToIdList(hu.HypothesisTestCase):
+    def test_dense_vector_to_id_list_ref(self):
+        # Verify that the reference implementation is correct!
+        dense_input = np.array(
+            [[1, 0, 0, 1, 0, 0, 0, 1],
+             [1, 0, 1, 0, 0, 0, 0, 1],
+             [0, 1, 0, 0, 0, 1, 0, 1]],
+            dtype=np.float32)
+        sparse_lengths, sparse_values = dense_vector_to_id_list_ref(dense_input)
+        expected_lengths = np.array([3, 3, 3], dtype=np.int32)
+        expected_values = np.array([0, 3, 7, 0, 2, 7, 1, 5, 7], dtype=np.int64)
+
+        np.testing.assert_array_equal(sparse_lengths, expected_lengths)
+        np.testing.assert_array_equal(sparse_values, expected_values)
+
+    @given(inputs=id_list_batch(), **hu.gcs_cpu_only)
+    def test_dense_vector_to_id_list_op(self, inputs, gc, dc):
+        op = core.CreateOperator(
+            "DenseVectorToIdList",
+            ["values"],
+            ["out_lengths", "out_values"]
+        )
+        self.assertDeviceChecks(dc, op, inputs, [0])
+        self.assertReferenceChecks(gc, op, inputs, dense_vector_to_id_list_ref)