--- /dev/null
+#include "caffe2/operators/inference_lstm_op.h"
+
+namespace caffe2 {
+namespace {
+
+bool InferenceLSTMOp::RunOnDevice() {
+ auto& _input = Input(0);
+ auto& hidden_0 = Input(1);
+ auto& hidden_1 = Input(2);
+ std::vector<Tensor> params;
+ for (int i = 3; i < InputSize(); i++) {
+ params.push_back(Input(i).UnsafeSharedInstance());
+ }
+ auto input = batch_first_ ? transpose(_input, 0, 1, &context_)
+ : _input.UnsafeSharedInstance();
+
+ auto cell_params = gather_params(params, has_biases_, &context_);
+ auto results = _lstm_impl(
+ input,
+ cell_params,
+ hidden_0,
+ hidden_1,
+ num_layers_,
+ bidirectional_,
+ &context_);
+
+ std::vector<Tensor> allOutputs(OutputSize());
+ allOutputs.at(0) = copy_ctor(std::get<0>(results));
+ if (batch_first_) {
+ allOutputs.at(0) = transpose(allOutputs.at(0), 0, 1, &context_);
+ }
+ allOutputs.at(1) = copy_ctor(std::get<1>(results));
+ allOutputs.at(2) = copy_ctor(std::get<2>(results));
+ for (int i = 0; i < OutputSize(); i++) {
+ auto output = XOutput(i, allOutputs.at(i).sizes(), dtype<float>());
+ context_.CopyItemsSameDevice(
+ allOutputs.at(i).dtype(),
+ allOutputs.at(i).numel(),
+ allOutputs.at(i).template data<float>(),
+ output.template mutable_data<float>());
+ }
+ return true;
+}
+
+REGISTER_CPU_OPERATOR(InferenceLSTM, InferenceLSTMOp);
+OPERATOR_SCHEMA(InferenceLSTM)
+ .NumInputs(1, INT_MAX)
+ .NumOutputs(3)
+ .Output(0, "output", "the output of the last layer of lstm")
+ .Output(1, "hidden", "hidden state at t = seq_len")
+ .Output(2, "cell", "cell state at t = seq_len")
+ .Arg("num_layers", "(*long*): number of layers in the lstm stack")
+ .Arg("has_biases", "(*bool*): whether the cells have biases or not")
+ .Arg("batch_first", "(*bool*): whether the batch is at dim 0")
+ .Arg("bidirectional", "(*bool*): if bidirectional");
+NO_GRADIENT(InferenceLSTM);
+} // namespace
+} // namespace caffe2
+
+C10_REGISTER_CAFFE2_OPERATOR_CPU(
+ InferenceLSTM,
+ (std::vector<c10::Argument>{
+ c10::Argument("input_list", ListType::ofTensors()),
+ c10::Argument("num_layers", IntType::get()),
+ c10::Argument("has_biases", BoolType::get()),
+ c10::Argument("batch_first", BoolType::get()),
+ c10::Argument("bidirectional", BoolType::get())}),
+ (std::vector<c10::Argument>{c10::Argument("output"),
+ c10::Argument("hidden"),
+ c10::Argument("cell")}),
+ caffe2::InferenceLSTMOp);
--- /dev/null
+#ifndef LSTM_OP_H_
+#define LSTM_OP_H_
+
+#include <c10/core/Tensor.h>
+#include <algorithm>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+#include "caffe2/core/blob_serialization.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math.h"
+#include "lstm_utils.h"
+
+C10_DECLARE_CAFFE2_OPERATOR(LSTMOp);
+
+namespace caffe2 {
+namespace {
+
+using t_tuple = std::tuple<Tensor, Tensor>;
+
+struct CellParams {
+ CellParams(
+ const Tensor& _w_ih,
+ const Tensor& _w_hh,
+ const Tensor& _b_ih,
+ const Tensor& _b_hh,
+ CPUContext* _context) {
+ initParams(_w_ih, _w_hh, _b_ih, _b_hh, _context);
+ }
+
+ CellParams(const CellParams& rhs) {
+ initParams(rhs.w_ih, rhs.w_hh, rhs.b_ih, rhs.b_hh, rhs.context);
+ }
+
+ CellParams& operator=(const CellParams& rhs) {
+ initParams(rhs.w_ih, rhs.w_hh, rhs.b_ih, rhs.b_hh, rhs.context);
+ return *this;
+ }
+
+ void initParams(
+ const Tensor& _w_ih,
+ const Tensor& _w_hh,
+ const Tensor& _b_ih,
+ const Tensor& _b_hh,
+ CPUContext* _context) {
+ w_ih = copy_ctor(_w_ih);
+ w_hh = copy_ctor(_w_hh);
+ b_ih = copy_ctor(_b_ih);
+ b_hh = copy_ctor(_b_hh);
+ context = _context;
+ }
+
+ Tensor w_ih;
+ Tensor w_hh;
+ Tensor b_ih; /* optional */
+ Tensor b_hh; /* optional */
+ CPUContext* context;
+
+ Tensor linear_ih(const Tensor& input) const {
+ return linear(input, w_ih, b_ih, context);
+ }
+ Tensor linear_hh(const Tensor& h) const {
+ return linear(h, w_hh, b_hh, context);
+ }
+};
+
+struct LSTMCell {
+ explicit LSTMCell(CPUContext* context) : context_(context) {}
+ t_tuple operator()(
+ const Tensor& input,
+ const t_tuple& hidden,
+ const CellParams& params) const {
+ const auto& hx = std::get<0>(hidden);
+ const auto& cx = std::get<1>(hidden);
+ auto linear_ih = params.linear_ih(input);
+ auto linear_hh = params.linear_hh(hx);
+ auto gates = add(linear_ih, linear_hh, context_);
+ auto chunked_gates = chunk(gates, 4, 1, context_);
+ auto ingate = sigmoid(chunked_gates[0]);
+ auto forgetgate = sigmoid(chunked_gates[1]);
+ auto cellgate = tanh(chunked_gates[2], context_);
+ auto outgate = sigmoid(chunked_gates[3]);
+
+ auto cy =
+ add(mul(forgetgate, cx, context_),
+ mul(ingate, cellgate, context_),
+ context_);
+ auto hy = mul(outgate, tanh(cy, context_), context_);
+ return std::make_tuple(std::move(hy), std::move(cy));
+ }
+ CPUContext* context_;
+};
+
+template <typename output_type, typename hidden_type>
+struct LayerOutput {
+ output_type outputs;
+ hidden_type final_hidden;
+
+ LayerOutput(const output_type& _outputs, const hidden_type& _hidden) {
+ outputs = copy_ctor(_outputs);
+ final_hidden = copy_ctor(_hidden);
+ }
+};
+
+template <typename hidden_type, typename param_type>
+struct Layer {
+ using output_type = LayerOutput<Tensor, hidden_type>;
+ virtual ~Layer() {}
+ virtual output_type operator()(
+ const Tensor& input,
+ const hidden_type& input_hidden,
+ const param_type& params) const = 0;
+};
+
+struct FullLSTMLayer : Layer<t_tuple, CellParams> {
+ FullLSTMLayer(LSTMCell& cell, CPUContext* context)
+ : cell_(cell), context_(context) {}
+
+ LayerOutput<std::vector<Tensor>, t_tuple> operator()(
+ const std::vector<Tensor>& step_inputs,
+ const std::tuple<Tensor, Tensor>& input_hidden,
+ const CellParams& params) const {
+ std::vector<Tensor> step_outputs;
+ auto hidden = copy_ctor(input_hidden);
+
+ for (size_t i = 0; i < step_inputs.size(); i++) {
+ hidden = cell_(step_inputs[i], hidden, params);
+ step_outputs.push_back(copy_ctor(std::get<0>(hidden)));
+ }
+
+ return {step_outputs, hidden};
+ }
+
+ LayerOutput<Tensor, t_tuple> operator()(
+ const Tensor& inputs,
+ const std::tuple<Tensor, Tensor>& input_hidden,
+ const CellParams& params) const override {
+ auto unstacked_output =
+ (*this)(unbind(inputs, 0, context_), input_hidden, params);
+ return {stack(unstacked_output.outputs, 0, context_),
+ unstacked_output.final_hidden};
+ }
+ LSTMCell cell_;
+ CPUContext* context_;
+};
+
+struct FullBidirectionalLSTMLayer
+ : Layer<std::pair<t_tuple, t_tuple>, std::pair<CellParams, CellParams>> {
+ using bidir_hidden_type = std::pair<t_tuple, t_tuple>;
+ using param_type = std::pair<CellParams, CellParams>;
+ using output_type = LayerOutput<Tensor, bidir_hidden_type>;
+
+ FullBidirectionalLSTMLayer(LSTMCell& cell, CPUContext* context)
+ : layer_(cell, context), context_(context) {}
+
+ output_type operator()(
+ const Tensor& input,
+ const bidir_hidden_type& input_hidden,
+ const param_type& params) const override {
+ std::vector<Tensor> outputs;
+ auto step_inputs = unbind(input, 0, context_);
+ auto fw_result = layer_(step_inputs, input_hidden.first, params.first);
+ auto fw_output = stack(fw_result.outputs, 0, context_);
+ outputs.push_back(copy_ctor(fw_output));
+ auto rev_step_inputs = reverse(std::move(step_inputs));
+ auto rev_result =
+ layer_(rev_step_inputs, input_hidden.second, params.second);
+ std::reverse(rev_result.outputs.begin(), rev_result.outputs.end());
+ auto rev_output = stack(rev_result.outputs, 0, context_);
+ outputs.push_back(copy_ctor(rev_output));
+ return {cat(outputs, fw_output.dim() - 1, context_),
+ std::make_pair(
+ std::move(fw_result.final_hidden),
+ std::move(rev_result.final_hidden))};
+ }
+
+ inline std::vector<Tensor> reverse(std::vector<Tensor>&& x) const {
+ std::reverse(x.begin(), x.end());
+ return std::move(x);
+ }
+
+ private:
+ FullLSTMLayer layer_;
+ CPUContext* context_;
+};
+
+template <typename hidden_type, typename weight_type>
+LayerOutput<Tensor, std::vector<hidden_type>> apply_layer_stack(
+ const Layer<hidden_type, weight_type>& layer,
+ const Tensor& input,
+ const std::vector<hidden_type>& hiddens,
+ const std::vector<weight_type>& weights,
+ int64_t num_layers) {
+ CAFFE_ENFORCE(
+ num_layers == hiddens.size(),
+ "Expected more hidden states in stacked_rnn");
+ CAFFE_ENFORCE(
+ num_layers == weights.size(), "Expected more weights in stacked_rnn");
+
+ auto layer_input = input.UnsafeSharedInstance();
+ auto hidden_it = hiddens.begin();
+ auto weight_it = weights.begin();
+ std::vector<hidden_type> final_hiddens(num_layers);
+ for (int64_t l = 0; l < num_layers; ++l) {
+ auto layer_output = layer(layer_input, *(hidden_it++), *(weight_it++));
+ final_hiddens.at(l) = std::move(layer_output.final_hidden);
+ layer_input = std::move(layer_output.outputs);
+ }
+ return {layer_input, final_hiddens};
+}
+
+std::tuple<Tensor, Tensor, Tensor> _lstm_impl(
+ const Tensor& input,
+ const std::vector<CellParams>& params,
+ const Tensor& hx,
+ const Tensor& cx,
+ int64_t num_layers,
+ bool bidirectional,
+ CPUContext* context) {
+ using stack_output = LayerOutput<Tensor, std::vector<t_tuple>>;
+ auto layer_hx = unbind(hx, 0, context);
+ auto layer_cx = unbind(cx, 0, context);
+ int64_t total_layers = layer_hx.size();
+ std::vector<std::tuple<Tensor, Tensor>> hiddens;
+ hiddens.reserve(total_layers);
+ for (int64_t i = 0; i < total_layers; ++i) {
+ hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i]));
+ }
+ LSTMCell cell(context);
+ std::shared_ptr<stack_output> stack_output_ptr;
+ if (bidirectional) {
+ auto bidir_result = apply_layer_stack(
+ FullBidirectionalLSTMLayer{cell, context},
+ input,
+ pair_vec(hiddens),
+ pair_vec(params),
+ num_layers);
+ stack_output_ptr.reset(new stack_output(
+ bidir_result.outputs,
+ unpair_vec(std::move(bidir_result.final_hidden))));
+ } else {
+ auto result = apply_layer_stack(
+ FullLSTMLayer{cell, context}, input, hiddens, params, num_layers);
+ stack_output_ptr = std::make_shared<stack_output>(std::move(result));
+ }
+
+ std::vector<Tensor> hy, cy;
+ hy.reserve(total_layers);
+ cy.reserve(total_layers);
+ for (auto& hidden : stack_output_ptr->final_hidden) {
+ hy.push_back(std::move(std::get<0>(hidden)));
+ cy.push_back(std::move(std::get<1>(hidden)));
+ }
+ return std::make_tuple(
+ std::move(stack_output_ptr->outputs),
+ stack(hy, 0, context),
+ stack(cy, 0, context));
+}
+
+// Parses a flat list of parameter tensors into a list of CellParams
+std::vector<CellParams> gather_params(
+ const std::vector<Tensor>& params,
+ bool has_biases,
+ CPUContext* context) {
+ Tensor undefined;
+ std::vector<CellParams> result;
+ if (has_biases) {
+ CAFFE_ENFORCE_EQ(
+ params.size() % 4, 0, "got an incorrect number of LSTM parameters");
+ for (size_t i = 0; i < params.size(); i += 4) {
+ result.emplace_back(
+ params[i], params[i + 1], params[i + 2], params[i + 3], context);
+ }
+ } else {
+ CAFFE_ENFORCE_EQ(
+ params.size() % 2, 0, "got an incorrect number of LSTM parameters");
+ for (size_t i = 0; i < params.size(); i += 2) {
+ result.emplace_back(
+ params[i], params[i + 1], undefined, undefined, context);
+ }
+ }
+ return result;
+}
+
+class InferenceLSTMOp : public Operator<CPUContext> {
+ public:
+ template <class... Args>
+ explicit InferenceLSTMOp(Args&&... args)
+ : Operator(std::forward<Args>(args)...),
+ num_layers_(this->template GetSingleArgument<int64_t>("num_layers", 1)),
+ bidirectional_(
+ this->template GetSingleArgument<bool>("bidirectional", false)),
+ has_biases_(this->template GetSingleArgument<bool>("has_biases", true)),
+ batch_first_(
+ this->template GetSingleArgument<bool>("batch_first", false)) {}
+
+ bool RunOnDevice() override;
+
+ protected:
+ int64_t num_layers_;
+ bool bidirectional_;
+ bool has_biases_;
+ bool batch_first_;
+};
+
+} // namespace
+} // namespace caffe2
+#endif // LSTM_OP_H_
--- /dev/null
+#include <algorithm>
+#include <vector>
+#include "caffe2/core/tensor.h"
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+namespace {
+
+using t_tuple = std::tuple<Tensor, Tensor>;
+
+template <typename T>
+T copy_ctor(const T& x) {
+ return x;
+}
+
+template <>
+Tensor copy_ctor(const Tensor& X) {
+ return X.UnsafeSharedInstance();
+}
+
+template <>
+t_tuple copy_ctor(const t_tuple& X) {
+ return std::make_tuple(copy_ctor(std::get<0>(X)), copy_ctor(std::get<1>(X)));
+}
+
+template <>
+std::pair<t_tuple, t_tuple> copy_ctor(const std::pair<t_tuple, t_tuple>& X) {
+ return std::make_pair(copy_ctor(X.first), copy_ctor(X.second));
+}
+
+template <>
+std::vector<Tensor> copy_ctor(const std::vector<Tensor>& X) {
+ std::vector<Tensor> Y(X.size());
+ std::transform(X.begin(), X.end(), Y.begin(), [](const Tensor& x) {
+ return copy_ctor(x);
+ });
+ return Y;
+}
+
+template <>
+std::vector<t_tuple> copy_ctor(const std::vector<t_tuple>& X) {
+ std::vector<t_tuple> Y(X.size());
+ std::transform(X.begin(), X.end(), Y.begin(), [](const t_tuple& x) {
+ return copy_ctor(x);
+ });
+ return Y;
+}
+
+template <>
+std::vector<std::pair<t_tuple, t_tuple>> copy_ctor(
+ const std::vector<std::pair<t_tuple, t_tuple>>& X) {
+ std::vector<std::pair<t_tuple, t_tuple>> Y(X.size());
+ std::transform(
+ X.begin(), X.end(), Y.begin(), [](const std::pair<t_tuple, t_tuple>& x) {
+ return copy_ctor(x);
+ });
+ return Y;
+}
+
+// Gathers every two elements of a vector in a vector of pairs
+template <typename T>
+static std::vector<std::pair<T, T>> pair_vec(const std::vector<T>& vals) {
+ CAFFE_ENFORCE_EQ(
+ vals.size() % 2,
+ 0,
+ "Odd number of params or hiddens given to a bidirectional RNN");
+ std::vector<std::pair<T, T>> result;
+ result.reserve(vals.size() / 2);
+ for (int64_t i = 0; i < vals.size(); i += 2) {
+ result.emplace_back(copy_ctor(vals[i]), copy_ctor(vals[i + 1]));
+ }
+ return result;
+}
+
+// Flattens a vector of pairs
+template <typename T>
+static std::vector<T> unpair_vec(std::vector<std::pair<T, T>>&& vals) {
+ std::vector<T> result;
+ result.reserve(vals.size() * 2);
+ for (int64_t i = 0; i < vals.size(); i++) {
+ result.push_back(std::move(vals[i].first));
+ result.push_back(std::move(vals[i].second));
+ }
+ return result;
+}
+
+Tensor matmul(const Tensor& X, const Tensor& W, CPUContext* context) {
+ const auto canonical_axis = X.canonical_axis_index(1);
+ const auto M = X.size_to_dim(canonical_axis);
+ const auto K = X.size_from_dim(canonical_axis);
+ const auto canonical_axis_w = W.canonical_axis_index(1);
+ const int N = W.size_to_dim(canonical_axis_w);
+ auto output_size = X.sizes().vec();
+ output_size.resize(canonical_axis + 1);
+ output_size[canonical_axis] = N;
+ Tensor C(output_size, CPU);
+ math::Gemm<float, CPUContext>(
+ CblasNoTrans,
+ CblasTrans,
+ M,
+ N,
+ K,
+ 1,
+ X.template data<float>(),
+ W.template data<float>(),
+ 0,
+ C.template mutable_data<float>(),
+ context);
+ return C;
+}
+
+Tensor
+linear(const Tensor& X, const Tensor& W, const Tensor& B, CPUContext* context) {
+ auto output = matmul(X, W, context);
+ if (B) {
+ const auto canonical_axis = X.canonical_axis_index(1);
+ const auto M = X.size_to_dim(canonical_axis);
+ const auto canonical_axis_w = W.canonical_axis_index(1);
+ const int N = W.size_to_dim(canonical_axis_w);
+ auto bias_multiplier_ = caffe2::empty({M}, CPU);
+ math::Set<float, CPUContext>(
+ M, 1, bias_multiplier_.template mutable_data<float>(), context);
+ math::Gemm<float, CPUContext>(
+ CblasNoTrans,
+ CblasNoTrans,
+ M,
+ N,
+ 1,
+ 1,
+ bias_multiplier_.template data<float>(),
+ B.template data<float>(),
+ 1,
+ output.template mutable_data<float>(),
+ context);
+ }
+ return output;
+}
+
+std::vector<Tensor>
+chunk(const Tensor& input, int chunks, int axis, CPUContext* context) {
+ int canonical_axis = input.canonical_axis_index(axis);
+ CAFFE_ENFORCE_LT(
+ canonical_axis, input.dim(), "Axis not in input ndim range.");
+ const int input_channels = input.dim32(canonical_axis);
+ CAFFE_ENFORCE_EQ(
+ input_channels % chunks,
+ 0,
+ "input channels should be divisible by the number of chunks.");
+ auto split_size = input_channels / chunks;
+ vector<int64_t> output_dims(input.sizes().vec());
+ int before = 1, after = 1;
+ for (int i = 0; i < canonical_axis; ++i) {
+ before *= input.dim32(i);
+ }
+ for (int i = canonical_axis + 1; i < input.dim(); ++i) {
+ after *= input.dim32(i);
+ }
+ size_t input_offset = 0;
+ std::vector<Tensor> outputs;
+ for (int i = 0; i < chunks; ++i) {
+ auto axis_dim = split_size;
+ output_dims[canonical_axis] = split_size;
+ Tensor output(output_dims, CPU);
+ math::CopyMatrix<CPUContext>(
+ input.itemsize(),
+ before,
+ axis_dim * after,
+ static_cast<const char*>(input.raw_data()) + input_offset,
+ input.dim32(canonical_axis) * after,
+ output.raw_mutable_data(input.dtype()),
+ axis_dim * after,
+ context,
+ input.dtype().copy());
+ input_offset += axis_dim * after * input.itemsize();
+ outputs.push_back(std::move(output));
+ }
+ return outputs;
+}
+
+std::vector<Tensor> unbind(const Tensor& input, int axis, CPUContext* context) {
+ // 1 - Chunk the input tensor along the given axis into N chunks where
+ // N is the dim(axis)
+ auto chunks = chunk(input, input.sizes()[axis], axis, context);
+ // 2 - Compute new dimensions
+ std::vector<int64_t> newDims = input.sizes().vec();
+ newDims.erase(newDims.begin() + axis);
+
+ // 3 - Reshape chunks to drop the extra dimension
+ for (int i = 0; i < chunks.size(); i++) {
+ CAFFE_ENFORCE_EQ(
+ chunks[i].sizes()[axis], 1, "Got an unexpected chunk size");
+ chunks[i].Reshape(newDims);
+ }
+ return chunks;
+}
+
+Tensor
+cat(const std::vector<Tensor>& tensorList, int axis, CPUContext* context) {
+ // Adopted from C2's concat operator
+ auto input_zero = copy_ctor(tensorList.at(0));
+ vector<int64_t> outputDims(input_zero.sizes().vec());
+ CAFFE_ENFORCE(outputDims.size() > 0);
+ for (int i = 1; i < tensorList.size(); i++) {
+ CAFFE_ENFORCE(input_zero.dtype() == tensorList.at(i).dtype());
+ outputDims[axis] += tensorList.at(i).sizes()[axis];
+ }
+ auto output_channels = outputDims[axis];
+ Tensor output(outputDims, CPU);
+ int before = 1, after = 1;
+ for (int i = 0; i < tensorList.at(0).dim(); ++i) {
+ if (i == axis) {
+ continue;
+ }
+ int dim = input_zero.dim32(i);
+ if (i < axis) {
+ before *= dim;
+ } else {
+ after *= dim;
+ }
+ }
+ size_t output_offset = 0;
+ for (const auto& input : tensorList) {
+ auto axis_dim = input.dim32(axis);
+ math::CopyMatrix<CPUContext>(
+ input.itemsize(),
+ before,
+ axis_dim * after,
+ input.raw_data(),
+ axis_dim * after,
+ static_cast<char*>(output.raw_mutable_data(input_zero.dtype())) +
+ output_offset,
+ output_channels * after,
+ context,
+ input_zero.dtype().copy());
+ output_offset += axis_dim * after * input.itemsize();
+ }
+
+ return output;
+}
+
+Tensor
+stack(const std::vector<Tensor>& tensorList, int axis, CPUContext* context) {
+ // 1 - Compute new dimensions
+ std::vector<int64_t> newDims(tensorList[0].sizes().vec());
+ std::vector<Tensor> expandedTensorList;
+ newDims.insert(newDims.begin() + axis, 1);
+ for (int i = 0; i < tensorList.size(); i++) {
+ expandedTensorList.emplace_back(tensorList[i].Clone());
+ expandedTensorList.at(i).Reshape(newDims);
+ }
+ return cat(expandedTensorList, axis, context);
+}
+
+Tensor sigmoid(const Tensor& X) {
+ Tensor Y(X.sizes(), CPU);
+ auto N = X.numel();
+ EigenVectorArrayMap<float>(Y.template mutable_data<float>(), N) = 1.0 /
+ (1.0 +
+ (-ConstEigenVectorArrayMap<float>(X.template data<float>(), N)).exp());
+ return Y;
+}
+
+Tensor tanh(const Tensor& X, CPUContext* context) {
+ Tensor Y(X.sizes(), CPU);
+ math::Tanh<float, CPUContext>(
+ X.numel(),
+ X.template data<float>(),
+ Y.template mutable_data<float>(),
+ context);
+ return Y;
+}
+
+Tensor add(const Tensor& X, const Tensor& Y, CPUContext* context) {
+ Tensor Z(X.sizes().vec(), CPU);
+ math::Add<float, CPUContext>(
+ X.numel(),
+ X.template data<float>(),
+ Y.template data<float>(),
+ Z.template mutable_data<float>(),
+ context);
+ return Z;
+}
+
+Tensor mul(const Tensor& X, const Tensor& Y, CPUContext* context) {
+ Tensor Z(X.sizes().vec(), CPU);
+ math::Mul<float, CPUContext>(
+ X.numel(),
+ X.template data<float>(),
+ Y.template data<float>(),
+ Z.template mutable_data<float>(),
+ context);
+ return Z;
+}
+
+Tensor transpose(const Tensor& X, int dim0, int dim1, CPUContext* context) {
+ int ndim = X.dim();
+ CAFFE_ENFORCE(ndim > dim0 && ndim > dim1, "Invalid transpose dimensions");
+ std::vector<int> axes(ndim);
+ std::iota(axes.begin(), axes.end(), 0);
+ std::swap(axes[dim0], axes[dim1]);
+ std::vector<int> Y_dims(ndim);
+ std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
+ for (int i = 0; i < ndim; ++i) {
+ Y_dims[i] = X_dims[axes[i]];
+ }
+ Tensor Y(Y_dims, CPU);
+ math::Transpose<float, CPUContext>(
+ ndim,
+ X_dims.data(),
+ axes.data(),
+ X.template data<float>(),
+ Y.template mutable_data<float>(),
+ context);
+ return Y;
+}
+} // namespace
+} // namespace caffe2
--- /dev/null
+#!/usr/bin/env python3
+import inspect
+
+import hypothesis.strategies as st
+import numpy as np
+import torch
+from caffe2.python import core, workspace
+from caffe2.python.test_util import TestCase
+from hypothesis import given
+from torch import nn
+
+
+class TestC2LSTM(TestCase):
+ @given(
+ bsz=st.integers(1, 5),
+ seq_lens=st.integers(1, 6),
+ emb_lens=st.integers(5, 10),
+ hidden_size=st.integers(3, 7),
+ num_layers=st.integers(1, 4),
+ has_biases=st.booleans(),
+ is_bidirectional=st.booleans(),
+ batch_first=st.booleans(),
+ )
+ def test_c2_lstm(
+ self,
+ bsz,
+ seq_lens,
+ emb_lens,
+ hidden_size,
+ num_layers,
+ has_biases,
+ is_bidirectional,
+ batch_first,
+ ):
+ net = core.Net("test_net")
+ num_directions = 2 if is_bidirectional else 1
+ py_lstm = nn.LSTM(
+ emb_lens,
+ hidden_size,
+ batch_first=batch_first,
+ bidirectional=is_bidirectional,
+ bias=has_biases,
+ num_layers=num_layers,
+ )
+
+ hx = np.zeros((num_layers * num_directions, bsz, hidden_size), dtype=np.float32)
+
+ if batch_first:
+ inputs = np.random.randn(bsz, seq_lens, emb_lens).astype(np.float32)
+ else:
+ inputs = np.random.randn(seq_lens, bsz, emb_lens).astype(np.float32)
+
+ py_results = py_lstm(torch.from_numpy(inputs))
+ lstm_in = [
+ torch.from_numpy(inputs),
+ torch.from_numpy(hx),
+ torch.from_numpy(hx),
+ ] + [param.detach() for param in py_lstm._flat_weights]
+
+ c2_results = torch.ops._caffe2.InferenceLSTM(
+ lstm_in, num_layers, has_biases, batch_first, is_bidirectional
+ )
+
+ np.testing.assert_array_almost_equal(
+ py_results[0].detach().numpy(), c2_results[0].detach().numpy()
+ )
+ np.testing.assert_array_almost_equal(
+ py_results[1][0].detach().numpy(), c2_results[1].detach().numpy()
+ )
+ np.testing.assert_array_almost_equal(
+ py_results[1][1].detach().numpy(), c2_results[2].detach().numpy()
+ )