From 5bf9e419383c673b4b8a0b00076a1fca5208d30a Mon Sep 17 00:00:00 2001
From: Hector Yuen
Date: Thu, 7 Mar 2019 12:52:54 -0800
Subject: [PATCH] move half<->float conversions to oss operators (#17548)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17548

Expose the half<->float operators to OSS.

common/math/Float16.h is the original implementation; it is substituted
by caffe2/c10/util/Half.h. From the comments, it seems like both
implementations don't handle denormals.

Reviewed By: jspark1105

Differential Revision: D14244200

fbshipit-source-id: f90ba28c5bf6a2b451b429cc4925b8cc376ac651
---
 .../operators/fused_rowwise_8bit_conversion_ops.cc |   6 +-
 caffe2/operators/half_float_ops.cc                 | 118 ++++++++++++++++++---
 caffe2/operators/half_float_ops_test.cc            | 106 ++++++++++++++++++
 caffe2/python/layers_test.py                       |   1 -
 4 files changed, 209 insertions(+), 22 deletions(-)
 create mode 100644 caffe2/operators/half_float_ops_test.cc

diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
index 105675f..2bfd2c4 100644
--- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
+++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
@@ -1,5 +1,4 @@
 #include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h"
-#include <fp16.h>
 #include "c10/util/Registry.h"
 
 namespace caffe2 {
@@ -11,14 +10,13 @@ void convertfp32fp32(float* dst, const float* src, size_t N) {
 
 void convertfp16fp32(float* dst, const at::Half* src, size_t N) {
   for (size_t i = 0; i < N; i++) {
-    dst[i] = fp16_ieee_to_fp32_value(src[i].x);
+    dst[i] = src[i];
   }
 }
 
 void convertfp32fp16(at::Half* dst, const float* src, size_t N) {
   for (size_t i = 0; i < N; i++) {
-    uint16_t out = fp16_ieee_from_fp32_value(src[i]);
-    memcpy(dst + i, &out, sizeof(uint16_t));
+    dst[i] = src[i];
   }
 }
 } // namespace
diff --git a/caffe2/operators/half_float_ops.cc b/caffe2/operators/half_float_ops.cc
index 208bfce..3745121 100644
--- a/caffe2/operators/half_float_ops.cc
+++ b/caffe2/operators/half_float_ops.cc
@@ -1,31 +1,115 @@
 #include "caffe2/operators/half_float_ops.h"
+#include <c10/util/Half.h>
 
 namespace caffe2 {
+
+template <>
+bool FloatToHalfOp<CPUContext>::RunOnDevice() {
+  auto& input = Input(0);
+
+  auto* output = Output(0, input.sizes(), at::dtype<at::Half>());
+  const float* data = input.template data<float>();
+  at::Half* out = output->template mutable_data<at::Half>();
+  auto N = input.numel();
+
+  for (auto i = 0; i < N; i++) {
+    out[i] = data[i];
+  }
+
+  return true;
+}
+
+template <>
+bool HalfToFloatOp<CPUContext>::RunOnDevice() {
+  auto& input = Input(0);
+
+  auto* output = Output(0, input.sizes(), at::dtype<float>());
+  const at::Half* data = input.template data<at::Half>();
+  float* out = output->template mutable_data<float>();
+  auto N = input.numel();
+
+  for (auto i = 0; i < N; i++) {
+    out[i] = data[i];
+  }
+  return true;
+}
+
+REGISTER_CPU_OPERATOR(FloatToHalf, FloatToHalfOp<CPUContext>);
+REGISTER_CPU_OPERATOR(HalfToFloat, HalfToFloatOp<CPUContext>);
+
 OPERATOR_SCHEMA(FloatToHalf)
     .NumInputs(1)
     .NumOutputs(1)
-    .TensorInferenceFunction(
-        [](const OperatorDef& def, const vector<TensorShape>& in) {
-          vector<TensorShape> out;
-          const TensorShape& X = in[0];
-          out.push_back(X);
-          out[0].set_data_type(TensorProto_DataType_FLOAT16);
+    .TensorInferenceFunction([](const OperatorDef& /* unused */,
+                                const vector<TensorShape>& in) {
+      vector<TensorShape> out;
+      const TensorShape& X = in[0];
+      out.push_back(X);
+      out[0].set_data_type(TensorProto_DataType_FLOAT16);
 
-          return out;
-        });
+      return out;
+    });
 
 OPERATOR_SCHEMA(HalfToFloat)
     .NumInputs(1)
     .NumOutputs(1)
-    .TensorInferenceFunction(
-        [](const OperatorDef& def, const vector<TensorShape>& in) {
-          vector<TensorShape> out;
-          const TensorShape& X = in[0];
-          out.push_back(X);
-          out[0].set_data_type(TensorProto_DataType_FLOAT);
-
-          return out;
-        });
+    .TensorInferenceFunction([](const OperatorDef& /* unused */,
+                                const vector<TensorShape>& in) {
+      vector<TensorShape> out;
+      const TensorShape& X = in[0];
+      out.push_back(X);
+      out[0].set_data_type(TensorProto_DataType_FLOAT);
+
+      return out;
+    });
+
+bool Float16ConstantFillOp::RunOnDevice() {
+  auto* output = Output(0, shape_, at::dtype<at::Half>());
+  const float givenValue =
+      this->template GetSingleArgument<float>("value", 0.0f);
+  at::Half givenFp16Value = givenValue;
+
+  if (output->numel()) {
+    at::Half* out = output->template mutable_data<at::Half>();
+    std::fill(out, out + output->numel(), givenFp16Value);
+  }
+  return true;
+}
+
+bool Float16UniformFillOp::RunOnDevice() {
+  auto* output = Output(0, shape_, at::dtype<at::Half>());
+  at::Half* out = output->template mutable_data<at::Half>();
+
+  // Get a batch row by row and convert
+  auto leading_dim_sz = output->size(0);
+  int rowsz = output->numel() / output->size(0);
+
+  vector<float> intermediate_data_;
+  intermediate_data_.resize(rowsz);
+  for (uint64_t i = 0; i < leading_dim_sz; i++) {
+    math::RandUniform<float, CPUContext>(
+        rowsz, min_, max_, intermediate_data_.data(), &context_);
+    for (uint64_t j = 0; j < rowsz; j++) {
+      out[i * rowsz + j] = intermediate_data_[j];
+    }
+  }
+  return true;
+}
+
+REGISTER_CPU_OPERATOR(Float16ConstantFill, Float16ConstantFillOp);
+REGISTER_CPU_OPERATOR(Float16UniformFill, Float16UniformFillOp);
+OPERATOR_SCHEMA(Float16UniformFill)
+    .NumInputs(0)
+    .NumOutputs(1)
+    .TensorInferenceFunction(Float16FillerTensorInference)
+    .SetDoc(
+        "Fills a half float tensor of a specified shape with"
+        " values from a uniform distribution [min, max]")
+    .Arg("shape", "Shape of the tensor")
+    .Arg("min", "Minimum value to generate")
+    .Arg("max", "Maximum value to generate");
+NO_GRADIENT(Float16UniformFill);
+
 OPERATOR_SCHEMA(Float16ConstantFill)
     .NumInputs(0)
     .NumOutputs(1)
diff --git a/caffe2/operators/half_float_ops_test.cc b/caffe2/operators/half_float_ops_test.cc
new file mode 100644
index 0000000..8403f88
--- /dev/null
+++ b/caffe2/operators/half_float_ops_test.cc
@@ -0,0 +1,106 @@
+#include <iostream>
+
+#include "caffe2/core/flags.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/operators/half_float_ops.h"
+#include "caffe2/utils/conversions.h"
+
+#include <gtest/gtest.h>
+
+C10_DECLARE_string(caffe_test_root);
+
+namespace caffe2 {
+
+TEST(Float16, SimpleTest) {
+  Workspace ws;
+  vector<float> data = {0.1f, 0.23f, 1.6f, 8.2f, -13.9f};
+
+  // loading input data
+  Blob* dataBlob = ws.CreateBlob("data");
+  auto tensor = BlobGetMutableTensor(dataBlob, CPU);
+  tensor->Resize(data.size());
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    tensor->mutable_data<float>()[i] = data[i];
+  }
+
+  // encoding fp32 -> fp16
+  OperatorDef def;
+  def.set_name("test");
+  def.set_type("FloatToHalf");
+  def.add_input("data");
+  def.add_output("data16");
+  unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
+  EXPECT_NE(nullptr, op.get());
+  EXPECT_TRUE(op->Run());
+
+  // run some sanity checks
+  Blob* outputBlob = ws.GetBlob("data16");
+  EXPECT_NE(nullptr, outputBlob);
+  EXPECT_TRUE(outputBlob->IsType<TensorCPU>());
+  const TensorCPU& outputTensor = outputBlob->Get<TensorCPU>();
+  EXPECT_EQ(outputTensor.numel(), 5);
+  EXPECT_NO_THROW(outputTensor.data<at::Half>());
+
+  // decode fp16 -> fp32
+  OperatorDef def2;
+  def2.set_name("test");
+  def2.set_type("HalfToFloat");
+  def2.add_input("data16");
+  def2.add_output("result");
+  unique_ptr<OperatorBase> op2(CreateOperator(def2, &ws));
+  EXPECT_NE(nullptr, op2.get());
+  EXPECT_TRUE(op2->Run());
+
+  // validate result
resultBlob = ws.GetBlob("result"); + EXPECT_NE(nullptr, resultBlob); + EXPECT_TRUE(resultBlob->IsType()); + const TensorCPU& resultTensor = resultBlob->Get(); + EXPECT_EQ(resultTensor.numel(), 5); + + for (auto i = 0; i < data.size(); ++i) { + EXPECT_NEAR(resultTensor.data()[i], data[i], 0.01); + } +} + +TEST(Float16, UniformDistributionTest) { + Workspace ws; + + OperatorDef def; + def.set_name("test"); + def.set_type("Float16UniformFill"); + int64_t size = 5000000L; + std::vector shape = {size, 32}; + long tot_size = shape[0]; + for (int i = 1; i < shape.size(); i++) { + tot_size *= shape[i]; + } + caffe2::AddArgument>("shape", shape, &def); + caffe2::AddArgument("min", -20.0, &def); + caffe2::AddArgument("max", 20.0, &def); + def.add_output("result"); + + unique_ptr op(CreateOperator(def, &ws)); + EXPECT_NE(nullptr, op.get()); + EXPECT_TRUE(op->Run()); + + Blob* resultBlob = ws.GetBlob("result"); + const TensorCPU& resultTensor = resultBlob->Get(); + EXPECT_EQ(resultTensor.numel(), tot_size); + double mean = 0.0, var = 0.0; + const at::Half* data = resultTensor.data(); + for (auto i = 0; i < resultTensor.numel(); i++) { + float x = caffe2::convert::Get(data[i]); + mean += x; + var += x * x; + } + mean /= tot_size; + var /= tot_size; + LOG(INFO) << "m " << mean << " " << var; + + // The uniform distribution of [-20,20] should have a mean of 0 + // and a variance of 40^2/12 + EXPECT_TRUE(fabs(mean) < 0.1); + EXPECT_TRUE(fabs(var - 133.33) / 133.33 < 0.1); +} + +} // namespace caffe2 diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index d01b267..b429b43 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -1278,7 +1278,6 @@ class TestLayers(LayersTestCase): assert len(ops[0].output) == 1 assert ops[0].output[0] in ops[1].input - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") def testHalfToFloatTypeInference(self): input = self.new_record(schema.Scalar((np.float32, (32,)))) -- 2.7.4