C10_DECLARE_int32(caffe2_dnnlowp_copy_to_32bit_frequency);
C10_DECLARE_bool(caffe2_dnnlowp_shared_int32_buffer);
+// Thresholds for falling back to 32-bit accumulation when 16-bit accumulation
+// doesn't provide performance benefits.
+C10_DEFINE_double(
+ caffe2_dnnlowp_fallback_to_32_bit_accumulation_density_threshold,
+ 0.05,
+ "If density of outlier is higher than this, fallback to 32-bit accumulation");
+C10_DEFINE_int32(
+ caffe2_dnnlowp_fallback_to_32_bit_accumulation_m_threshold,
+ 0,
+ "If m is smaller than this, fallback to 32-bit accumulation");
+C10_DEFINE_int32(
+ caffe2_dnnlowp_fallback_to_32_bit_accumulation_n_threshold,
+ 0,
+ "If n is smaller than this, fallback to 32-bit accumulation");
+C10_DEFINE_int32(
+ caffe2_dnnlowp_fallback_to_32_bit_accumulation_k_threshold,
+ 0,
+ "If k is smaller than this, fallback to 32-bit accumulation");
+
namespace caffe2 {
using namespace std;
template <bool ReluFused>
bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
+ if (fallback_to_32_bit_accumulation_) {
+ return BaseType::RunOnDeviceWithOrderNCHW();
+ }
const Tensor& X = InputTensorCPU_(INPUT);
if (X.template IsType<uint8_t>()) {
return RunOnDeviceWithOrderNCHWAndType_<uint8_t>();
template <bool ReluFused>
bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
+ if (fallback_to_32_bit_accumulation_) {
+ return BaseType::RunOnDeviceWithOrderNHWC();
+ }
const Tensor& X = InputTensorCPU_(INPUT);
if (X.template IsType<uint8_t>()) {
return RunOnDeviceWithOrderNHWCAndType_<uint8_t>();
int kernel_dim = this->KernelDim_();
const auto& filter = InputTensorCPU_(FILTER);
- int M = filter.dim32(0);
+ int num_out_channels = filter.dim32(0);
+
+ // Check if we should fall back to 32-bit accumulation.
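+ // In the im2col GEMM view, m = N * output_image_size,
+ // n = num_out_channels / group_, and k = kernel_dim.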
+ if (this->order_ == StorageOrder::NHWC) {
+ const Tensor& X = InputTensorCPU_(INPUT);
+ int N = X.dim32(0);
+
+ Tensor* Y = OutputTensorCPU_(0);
+ this->SetOutputSize(X, Y, filter.dim32(0));
+ const int output_image_size = this->GetDimsSize(*Y);
+
+ if (N * output_image_size <
+ FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_m_threshold) {
+ LOG(INFO)
+ << "M " << N * output_image_size << " is smaller than threshold "
+ << FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_m_threshold
+ << " . Falling back to acc32";
+ fallback_to_32_bit_accumulation_ = true;
+ return true;
+ }
+ if (num_out_channels / group_ <
+ FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_n_threshold) {
+ LOG(INFO)
+ << "N " << num_out_channels / group_ << " is smaller than threshold "
+ << FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_n_threshold
+ << " . Falling back to acc32";
+ fallback_to_32_bit_accumulation_ = true;
+ return true;
+ }
+ if (kernel_dim <
+ FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_k_threshold) {
+ LOG(INFO)
+ << "K " << kernel_dim << " is smaller than threshold "
+ << FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_k_threshold
+ << " . Falling back to acc32";
+ fallback_to_32_bit_accumulation_ = true;
+ return true;
+ }
+ }
+
+ if (nbits_in_non_outlier_ == 0) {
+ // nbits_in_non_outlier_ == 0 means everything is an outlier, so we can just
+ // use 32-bit accumulation.
+ LOG(INFO) << "nbits_in_non_outlier == 0 means everything is an outlier, so "
+ "we fall back to acc32";
+ fallback_to_32_bit_accumulation_ = true;
+ return true;
+ }
// Separate out outliers
- if (!Wq_outlier_ &&
- ConvPoolOpBase<CPUContext>::order_ == StorageOrder::NHWC &&
+ if (!Wq_outlier_ && this->order_ == StorageOrder::NHWC &&
nbits_in_non_outlier_ < 8) {
CAFFE_ENFORCE(!W_quantized_.empty());
Wq_outlier_.reset(ExtractOutlierMatrix(
- group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized_));
- int outlier_cnt = Wq_outlier_->ColPtr()[M];
+ group_,
+ kernel_dim,
+ num_out_channels,
+ nbits_in_non_outlier_,
+ W_quantized_));
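+ // The last column pointer is the total count of outlier weight entries.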
+ int outlier_cnt = Wq_outlier_->ColPtr()[num_out_channels];
LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
<< OperatorBase::debug_def().input(1) << " is "
- << (float)outlier_cnt / W_quantized_.size();
+ << static_cast<float>(outlier_cnt) / W_quantized_.size();
LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_
<< " copy_to_32bit_frequency " << copy_to_32bit_frequency_;
+
+ if (static_cast<float>(outlier_cnt) / W_quantized_.size() >
+ FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_density_threshold) {
+ LOG(INFO)
+ << "Density of outliers is higher than threshold "
+ << FLAGS_caffe2_dnnlowp_fallback_to_32_bit_accumulation_density_threshold
+ << " . Falling back to acc32";
+ fallback_to_32_bit_accumulation_ = true;
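+ // The acc32 path doesn't use the outlier matrix, so release it.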
+ Wq_outlier_.reset();
+ return true;
+ }
}
bool packW = ConvPoolOpBase<CPUContext>::order_ == StorageOrder::NHWC &&
Wq_acc16_packed_.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
fbgemm::matrix_op_t::Transpose,
group_ * kernel_dim,
- M / group_,
+ num_out_channels / group_,
W_quantized_.data(),
kernel_dim, // ld
nullptr, // pmat
if (!GetQuantizationParameters_()) {
return false;
}
+ if (fallback_to_32_bit_accumulation_) {
+ return BaseType::template RunOnDeviceWithOrderNCHWAndType_<InType>();
+ }
const Tensor& X = InputTensorCPU_(INPUT);
auto& filter = InputTensorCPU_(FILTER);
0,
"The number of output channels is not divisible by group.");
- ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
+ this->SetOutputSize(X, Y, filter.dim32(0));
const vector<int> input_dims = GetDims(X);
const vector<int> output_dims = GetDims(*Y);
vector<uint8_t> col_buffer_quantized;
if (X.template IsType<uint8_t>()) {
col_buffer_quantized_data =
- (uint8_t*)col_buffer_data + tid * col_buffer_size;
+ reinterpret_cast<uint8_t*>(col_buffer_data) +
+ tid * col_buffer_size;
} else {
col_buffer_quantized.resize(kernel_dim * output_image_size);
fbgemm::Quantize<uint8_t>(
- (const float*)col_buffer_data + tid * col_buffer_size,
+ reinterpret_cast<const float*>(col_buffer_data) +
+ tid * col_buffer_size,
col_buffer_quantized.data(),
col_buffer_quantized.size(),
in_qparams_[INPUT]);
template <bool ReluFused>
template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
-void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM(
+void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
PackAMatrix& packA,
const uint8_t* col_buffer_quantized_data,
vector<int32_t>* Y_int32,
auto& filter = InputTensorCPU_(FILTER);
const int M = filter.dim32(0);
+ bool fuse_output_pipeline = Wq_acc16_packed_ && !dequantize_output_;
+ assert(fuse_output_pipeline);
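+ // DispatchFBGEMM_ is only called from the fused output-pipeline path,
+ // so this should always hold.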
int kernel_dim = this->KernelDim_();
int nthreads = dnnlowp_get_num_threads();
const int kernel_dim = this->KernelDim_();
const int output_image_size = this->GetDimsSize(*Y);
- if (nbits_in_non_outlier_ == 0) {
- memset(Y_int32->data(), 0, sizeof((*Y_int32)[0]) * M * N);
- }
-
#ifdef _OPENMP
#pragma omp parallel
#endif
return false;
}
+ if (fallback_to_32_bit_accumulation_) {
+ return BaseType::template RunOnDeviceWithOrderNHWCAndType_<InType>();
+ }
+
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
t_end = chrono::system_clock::now();
double dt = chrono::duration<double>(t_end - t_begin).count();
const int M = filter.dim32(0);
CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
- ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
+ this->SetOutputSize(X, Y, filter.dim32(0));
// The dimension of each kernel
const int kernel_dim = this->KernelDim_();
// The output image size is the spatial size of the output.
t_begin = chrono::system_clock::now();
#endif
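+ // nbits_in_non_outlier_ == 0 already falls back to acc32 earlier,
+ // so it doesn't need to be checked here.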
+ bool fuse_output_pipeline = Wq_acc16_packed_ && !dequantize_output_;
bool no_im2col = this->NoIm2ColNHWC_();
// Im2Col, followed by gemm.
#endif
// quantize col_buffer
- uint8_t* col_buffer_quantized_data = nullptr;
+ const uint8_t* col_buffer_quantized_data = nullptr;
vector<uint8_t> col_buffer_quantized;
if (X.template IsType<uint8_t>()) {
- col_buffer_quantized_data = (uint8_t*)col_buffer_data;
+ col_buffer_quantized_data =
+ reinterpret_cast<const uint8_t*>(col_buffer_data);
} else {
col_buffer_quantized.resize(
group_ * kernel_dim * output_image_size * N);
t_begin = chrono::system_clock::now();
#endif
- bool fuse_output_pipeline =
- Wq_acc16_packed_ && nbits_in_non_outlier_ > 0 && !dequantize_output_;
-
using namespace fbgemm;
int row_offset_size_per_thread = -1;
int x_pack_buf_size_per_thread = -1;
Y_uint8_data = Y->template mutable_data<uint8_t>();
}
- if (nbits_in_non_outlier_ > 0) {
- // Main GEMM for non-outlier
- if (Wq_acc16_packed_) {
- // fast path
+ // Main GEMM for non-outlier
+ if (Wq_acc16_packed_)
#ifdef _OPENMP
#pragma omp parallel
#endif
- {
- int nthreads = dnnlowp_get_num_threads();
- int tid = dnnlowp_get_thread_num();
-
- if (fuse_output_pipeline) {
- PackAWithRowOffset<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- N * output_image_size,
- group_ * kernel_dim,
- col_buffer_quantized_data,
- group_ * kernel_dim,
- X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
- group_,
- row_offsets_.data() + tid * row_offset_size_per_thread);
-
- if (this->quantize_groupwise_) {
- DispatchFBGEMM<
- PackAWithRowOffset<uint8_t, int16_t>,
- QuantizationGranularity::GROUP>(
- packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
- } else {
- DispatchFBGEMM<
- PackAWithRowOffset<uint8_t, int16_t>,
- QuantizationGranularity::TENSOR>(
- packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
- }
- } else {
- // !fuse_output_pipeline
- PackAMatrix<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- N * output_image_size,
- group_ * kernel_dim,
- col_buffer_quantized_data,
- group_ * kernel_dim,
- X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
- group_); // group
-
- DoNothing<int32_t, int32_t> doNothingObj{};
- memCopy<> memCopyObj(doNothingObj);
- fbgemmPacked(
- packA,
- *Wq_acc16_packed_,
- Y_int32->data(),
- Y_int32->data(),
- M,
- memCopyObj,
- tid, // thread_id
- nthreads); // num_threads
- }
- } // omp parallel
- } else {
- // slow path
- conv_nhwc_acc16_ref_(
+ {
+ // fast path
+ int nthreads = dnnlowp_get_num_threads();
+ int tid = dnnlowp_get_thread_num();
+
+ if (fuse_output_pipeline) {
+ // no im2col fusion
+ PackAWithRowOffset<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ N * output_image_size,
+ group_ * kernel_dim,
+ col_buffer_quantized_data,
+ group_ * kernel_dim,
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
group_,
- N,
- output_image_size,
- M,
- kernel_dim,
+ row_offsets_.data() + tid * row_offset_size_per_thread);
+
+ if (this->quantize_groupwise_) {
+ DispatchFBGEMM_<
+ PackAWithRowOffset<uint8_t, int16_t>,
+ QuantizationGranularity::GROUP>(
+ packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
+ } else {
+ DispatchFBGEMM_<
+ PackAWithRowOffset<uint8_t, int16_t>,
+ QuantizationGranularity::TENSOR>(
+ packA, col_buffer_quantized_data, Y_int32, Y_uint8_data);
+ }
+ } else {
+ // !fuse_output_pipeline
+ PackAMatrix<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ N * output_image_size,
+ group_ * kernel_dim,
col_buffer_quantized_data,
- W_quantized_.data(),
- Y_int32->data()
+ group_ * kernel_dim,
+ X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
+ group_); // group
+
+ DoNothing<int32_t, int32_t> doNothingObj{};
+ memCopy<> memCopyObj(doNothingObj);
+ fbgemmPacked(
+ packA,
+ *Wq_acc16_packed_,
+ Y_int32->data(),
+ Y_int32->data(),
+ M,
+ memCopyObj,
+ tid, // thread_id
+ nthreads); // num_threads
+ } // omp parallel
+ } else {
+ // slow path
+ conv_nhwc_acc16_ref_(
+ group_,
+ N,
+ output_image_size,
+ M,
+ kernel_dim,
+ col_buffer_quantized_data,
+ W_quantized_.data(),
+ Y_int32->data()
#ifdef DNNLOWP_ACC16_IN_SLOW_PATH
- ,
- this
+ ,
+ this
#endif
- );
- } // slow path
- } // nbits_in_non_outlier_ > 0
+ );
+ } // slow path
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
t_end = chrono::system_clock::now();
out_quantized=st.booleans(),
weight_quantized=st.booleans(),
prepack_weight=st.booleans(),
- nbits_in_non_outlier=st.sampled_from((6, 8)),
+ nbits_in_non_outlier=st.sampled_from((0, 1, 6, 8)),
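+ # 0 and 1 exercise the fallback-to-acc32 paths (all-outlier / high outlier density).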
share_col_buffer=st.booleans(),
preserve_activation_sparsity=st.booleans(),
preserve_weight_sparsity=st.booleans(),
input_channels = input_channels_per_group * group
output_channels = output_channels_per_group * group
- if nbits_in_non_outlier == 0:
- X, W, b = generate_conv_inputs(
- stride,
- pad,
- kernel,
- dilation,
- size,
- group,
- input_channels_per_group,
- output_channels_per_group,
- batch_size,
- order,
- preserve_activation_sparsity=preserve_activation_sparsity,
- preserve_weight_sparsity=preserve_weight_sparsity,
- )
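+ # Pin the min and max values so the activation quantization range is deterministic.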
+ X_min = 0 if preserve_activation_sparsity else -77
+ X_max = X_min + 255
+ X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
+ X = np.round(X).astype(np.float32)
+ X[..., 0] = X_min
+ X[0, 0, 0, 1] = X_max
+
+ if preserve_weight_sparsity:
+ W_min = -128
+ W_max = 100
else:
- X_min = 0 if preserve_activation_sparsity else -77
- X_max = X_min + 255
- X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
- X = np.round(X).astype(np.float32)
- X[..., 0] = X_min
- X[0, 0, 0, 1] = X_max
-
- if preserve_weight_sparsity:
- W_min = -128
- W_max = 100
- else:
- W_min = -100
- W_max = W_min + 255
- W = (
- np.random.rand(
- output_channels, kernel, kernel, input_channels_per_group
- )
- * 4
- - 2
- + W_min
- + 128
+ W_min = -100
+ W_max = W_min + 255
+ W = (
+ np.random.rand(
+ output_channels, kernel, kernel, input_channels_per_group
)
- W = np.round(W).astype(np.float32)
- W[0, 0, 0, 0] = W_min
- W[1, 0, 0, 0] = W_max
- W[..., 1] = W_min + 128
+ * 4
+ - 2
+ + W_min
+ + 128
+ )
+ W = np.round(W).astype(np.float32)
+ W[0, 0, 0, 0] = W_min
+ W[1, 0, 0, 0] = W_max
+ W[..., 1] = W_min + 128 # "zeros"
- # No input quantization error in bias
- b = np.round(np.random.randn(output_channels)).astype(np.float32)
+ if order == "NCHW":
+ X = utils.NHWC2NCHW(X)
+ W = utils.NHWC2NCHW(W)
+
+ b = np.round(np.random.randn(output_channels)).astype(np.float32)
Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
outputs = []
bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3FastPath_() {
const Tensor& X = InputTensorCPU_(INPUT);
return StorageOrder::NHWC == ConvPoolOpBase<CPUContext>::order_ &&
- is_same<T, uint8_t>::value && X.template IsType<T>() &&
- this->debug_def().engine() != "DNNLOWP_ACC16" &&
+ is_same<T, uint8_t>::value && X.template IsType<T>() && !Acc16() &&
group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 &&
this->kernel_.size() == 2 && kernel_h() == 3 && kernel_w() == 3 &&
stride_h() == stride_w() && (stride_h() == 1 || stride_h() == 2) &&
bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3x3FastPath_() {
const Tensor& X = InputTensorCPU_(INPUT);
bool ret = StorageOrder::NHWC == ConvPoolOpBase<CPUContext>::order_ &&
- is_same<T, uint8_t>::value && X.template IsType<T>() &&
- this->debug_def().engine() != "DNNLOWP_ACC16" &&
+ is_same<T, uint8_t>::value && X.template IsType<T>() && !Acc16() &&
group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 &&
this->kernel_.size() == 3 && this->kernel_[0] == 3 &&
this->kernel_[1] == 3 && this->kernel_[2] == 3 &&
int M = filter.dim32(0);
bool packW = ConvPoolOpBase<CPUContext>::order_ == StorageOrder::NHWC &&
- OperatorBase::debug_def().engine() != "DNNLOWP_ACC16" &&
- is_same<T, uint8_t>::value && GetCpuId().avx2() &&
+ !Acc16() && is_same<T, uint8_t>::value && GetCpuId().avx2() &&
!FLAGS_caffe2_dnnlowp_force_slow_path;
bool depthwise_3x3_fast_path = false, depthwise_3x3x3_fast_path = false;
reason = "fbgemm only supports 8-bit integers";
} else if (!GetCpuId().avx2()) {
reason = "fbgemm only supports AVX2+";
- } else if (
- OperatorBase::debug_def().engine() == "DNNLOWP_ACC16" ||
- depthwise_3x3_fast_path) {
+ } else if (Acc16()) {
reason = "";
} else if (FLAGS_caffe2_dnnlowp_force_slow_path) {
reason = "slow path enforced";
template <typename T, bool ReluFused>
template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
-void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM(
+void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM_(
PackAMatrix& packA,
vector<int32_t>* Y_int32,
uint8_t* Y_uint8_data,
row_offsets_.data() + tid * row_offset_size_per_thread);
if (quantize_groupwise_) {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithIm2Col<uint8_t>,
QuantizationGranularity::GROUP>(
packA, Y_int32, Y_uint8_data, Y_float_data);
} else {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithIm2Col<uint8_t>,
QuantizationGranularity::TENSOR>(
packA, Y_int32, Y_uint8_data, Y_float_data);
row_offsets_.data() + tid * row_offset_size_per_thread);
if (quantize_groupwise_) {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithIm2Col<uint8_t, int32_t, 3>,
QuantizationGranularity::GROUP>(
packA, Y_int32, Y_uint8_data, Y_float_data);
} else {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithIm2Col<uint8_t, int32_t, 3>,
QuantizationGranularity::TENSOR>(
packA, Y_int32, Y_uint8_data, Y_float_data);
row_offsets_.data() + tid * row_offset_size_per_thread);
if (quantize_groupwise_) {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithRowOffset<uint8_t>,
QuantizationGranularity::GROUP>(
packA, Y_int32, Y_uint8_data, Y_float_data);
} else {
- DispatchFBGEMM<
+ DispatchFBGEMM_<
PackAWithRowOffset<uint8_t>,
QuantizationGranularity::TENSOR>(
packA, Y_int32, Y_uint8_data, Y_float_data);
in_quantized=st.booleans(),
out_quantized=st.booleans(),
prepack_weight=st.booleans(),
- nbits_in_non_outlier=st.sampled_from((6, 8)),
+ nbits_in_non_outlier=st.sampled_from((0, 1, 6, 8)),
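+ # 0 and 1 exercise the acc32 fallback paths.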
share_col_buffer=st.booleans(),
**hu.gcs_cpu_only
)
input_channels = input_channels_per_group * group
output_channels = output_channels_per_group * group
- if nbits_in_non_outlier == 0:
- X, W, b = generate_conv_inputs(
- stride,
- pad,
- kernel,
- dilation,
- size,
- group,
- input_channels_per_group,
- output_channels_per_group,
- batch_size,
- order,
- True, # group-wise
- )
- else:
- X_min = -77
- X_max = X_min + 255
- X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
- X = np.round(X).astype(np.float32)
- X[..., 0] = X_min
- X[0, 0, 0, 1] = X_max
+ X_min = -77
+ X_max = X_min + 255
+ X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
+ X = np.round(X).astype(np.float32)
+ X[..., 0] = X_min
+ X[0, 0, 0, 1] = X_max
- W_min = -100
- W_max = W_min + 255
- W = (
- np.random.rand(
- output_channels, kernel, kernel, input_channels_per_group
- )
- * 4
- - 2
- + W_min
- + 128
+ W_min = -100
+ W_max = W_min + 255
+ W = (
+ np.random.rand(
+ output_channels, kernel, kernel, input_channels_per_group
)
- W = np.round(W).astype(np.float32)
- W[..., 1] = W_min + 128 # "zeros"
- for g in range(group):
- W[g * output_channels_per_group, 0, 0, 0] = W_min
- W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
- W[
- g * output_channels_per_group : (g + 1) * output_channels_per_group,
- ] += g
+ * 4
+ - 2
+ + W_min
+ + 128
+ )
+ W = np.round(W).astype(np.float32)
+ W[..., 1] = W_min + 128 # "zeros"
+ for g in range(group):
+ W[g * output_channels_per_group, 0, 0, 0] = W_min
+ W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
+ W[
+ g * output_channels_per_group : (g + 1) * output_channels_per_group,
+ ] += g
- if order == "NCHW":
- X = utils.NHWC2NCHW(X)
- W = utils.NHWC2NCHW(W)
+ if order == "NCHW":
+ X = utils.NHWC2NCHW(X)
+ W = utils.NHWC2NCHW(W)
- # No input quantization error in bias
- b = np.round(np.random.randn(output_channels)).astype(np.float32)
+ b = np.round(np.random.randn(output_channels)).astype(np.float32)
Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
outputs = []