const OperatorDef& operator_def,
Workspace* ws)
: BaseType(operator_def, ws),
- trans_a_(OperatorBase::GetSingleArgument<int>("trans_a", 0)),
- trans_b_(OperatorBase::GetSingleArgument<int>("trans_b", 0)),
- broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
+ trans_a_(this->template GetSingleArgument<int>("trans_a", 0)),
+ trans_b_(this->template GetSingleArgument<int>("trans_b", 0)),
+ broadcast_(this->template GetSingleArgument<int>("broadcast", 0)),
is_B_constant_(
- OperatorBase::GetSingleArgument<bool>("constant_B", false)) {}
+ this->template GetSingleArgument<bool>("constant_B", false)) {}
template <typename T>
bool BatchMatMulDNNLowPOp<T>::RunOnDevice() {
int num_batches_B = B.numel() / (K * N);
if (!first_invocation_ && !Bq_packed_.empty() &&
num_batches_B * N != column_offsets_.size()) {
- LOG(INFO) << "Operator with output " << OperatorBase::debug_def().output(0)
+ LOG(INFO) << "Operator with output " << this->debug_def().output(0)
<< " does not have constant B";
is_B_constant_ = false;
Bq_packed_.clear();
vector<int8_t> B_quantized_temp(K * N);
column_offsets_.resize(num_batches_B * N);
for (int i = 0; i < num_batches_B; ++i) {
- if (OperatorBase::InputIsType<int8::Int8TensorCPU>(1)) {
+ if (this->template InputIsType<int8::Int8TensorCPU>(1)) {
B_qparams_.push_back(TensorQuantizationParams());
B_qparams_[i].scale =
- OperatorBase::Input<int8::Int8TensorCPU>(1).scale;
+ this->template Input<int8::Int8TensorCPU>(1).scale;
B_qparams_[i].zero_point =
- OperatorBase::Input<int8::Int8TensorCPU>(1).zero_point +
+ this->template Input<int8::Int8TensorCPU>(1).zero_point +
signed_min;
const T* B_data = B.template data<T>() + i * B_quantized_temp.size();
} else {
assert(false);
}
- LOG(WARNING) << "BatchMatMul with output "
- << OperatorBase::debug_def().output(0)
+ LOG(WARNING) << "BatchMatMul with output " << this->debug_def().output(0)
<< " falls back to slow path because " << reason;
}
B_qparams_.resize(1);
Workspace* ws)
: BaseType(operator_def, ws) {
if (HasArgument("axis")) {
- axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
- add_axis_ = OperatorBase::GetSingleArgument<int>("add_axis", 0);
+ axis_ = this->template GetSingleArgument<int>("axis", -1);
+ add_axis_ = this->template GetSingleArgument<int>("add_axis", 0);
} else {
axis_ = GetDimFromOrderString(
- OperatorBase::GetSingleArgument<string>("order", "NCHW"));
+ this->template GetSingleArgument<string>("order", "NCHW"));
add_axis_ = 0;
}
CAFFE_ENFORCE_GE(axis_, 0);
Tensor* split = nullptr;
int* axis_data = nullptr;
if (OutputSize() >= 2) {
- split = OperatorBase::Output<Tensor>(1, CPU);
+ split = this->template Output<Tensor>(1, CPU);
split->Resize(vector<int64_t>(1, InputSize()));
axis_data = split->template mutable_data<int>();
}
const OperatorDef& operator_def,
Workspace* ws)
: ConvDNNLowPOp<uint8_t, ReluFused>(operator_def, ws),
- nbits_in_non_outlier_(OperatorBase::GetSingleArgument<int>(
+ nbits_in_non_outlier_(this->template GetSingleArgument<int>(
"nbits_in_non_outlier",
FLAGS_caffe2_dnnlowp_nbits_in_non_outlier)),
- copy_to_32bit_frequency_(OperatorBase::GetSingleArgument<int>(
+ copy_to_32bit_frequency_(this->template GetSingleArgument<int>(
"copy_to_32bit_frequency",
FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {
if (nbits_in_non_outlier_ == 0) {
static int log_occurences = 0;
if (log_occurences < 32) {
++log_occurences;
- LOG(WARNING) << "Conv with weight "
- << OperatorBase::debug_def().input(FILTER)
+ LOG(WARNING) << "Conv with weight " << this->debug_def().input(FILTER)
<< " falls back to slow path because " << reason;
}
}
dt = chrono::duration<double>(t_end - t_very_begin).count();
double ops = 2. * N * output_image_size * M * kernel_dim;
double gops = ops / dt / 1e9;
- LOG(INFO) << "this=" << this << " " << OperatorBase::debug_def().type()
- << " output=" << OperatorBase::debug_def().output(0) << " "
+ LOG(INFO) << "this=" << this << " " << this->debug_def().type()
+ << " output=" << this->debug_def().output(0) << " "
<< N * output_image_size << "x" << M << "x" << kernel_dim
<< " G=" << group_ << " C/G=" << C / group_ << " K/G=" << M / group_
<< " R=" << kernel_h() << " S=" << kernel_w() << " : " << dt * 1e3
}
quantize_groupwise_ =
- OperatorBase::GetSingleArgument<bool>("quantize_groupwise", false);
+ this->template GetSingleArgument<bool>("quantize_groupwise", false);
}
template <typename T, bool ReluFused>
b_quantized_data_ = b_quantized_->data();
} else {
const auto& bias = InputTensorCPU_(BIAS);
- if (OperatorBase::InputIsType<int8::Int8TensorCPU>(BIAS)) {
+ if (this->template InputIsType<int8::Int8TensorCPU>(BIAS)) {
TensorQuantizationParams bias_qparams;
bias_qparams.scale =
- OperatorBase::Input<int8::Int8TensorCPU>(BIAS).scale;
+ this->template Input<int8::Int8TensorCPU>(BIAS).scale;
bias_qparams.zero_point =
- OperatorBase::Input<int8::Int8TensorCPU>(BIAS).zero_point;
+ this->template Input<int8::Int8TensorCPU>(BIAS).zero_point;
CAFFE_ENFORCE_LE(
std::abs(
bias_qparams.scale -
static int log_occurences = 0;
if (log_occurences < 32) {
++log_occurences;
- LOG(WARNING) << "Conv with weight "
- << OperatorBase::debug_def().input(FILTER)
+ LOG(WARNING) << "Conv with weight " << this->debug_def().input(FILTER)
<< " falls back to slow path because " << reason;
}
}
++log_occurences;
LOG(WARNING) << "Cannot do group-wise quantization without "
"static quantization of activations for "
- << OperatorBase::debug_def().output(0);
+ << this->debug_def().output(0);
}
}
N * Y_HxW * group_,
kernel_dim,
col_buffer_data,
- OperatorBase::debug_def().input(INPUT));
+ this->debug_def().input(INPUT));
// Dump weight
StoreMatrixInMatrixMarketFormat(
group_ * M,
kernel_dim,
W_quantized_.data(),
- OperatorBase::debug_def().input(FILTER));
+ this->debug_def().input(FILTER));
}
if (TakeDepthWise3x3x3FastPath_()) {
double ops = 2. * N * Y_HxW * M * kernel_dim;
dt = chrono::duration<double>(t_end - t_very_begin).count();
double gops = ops / dt / 1e9;
- LOG(INFO) << "this=" << this << " " << OperatorBase::debug_def().type()
- << " output=" << OperatorBase::debug_def().output(0) << " "
- << N * Y_HxW << "x" << M << "x" << kernel_dim << " G=" << group_
+ LOG(INFO) << "this=" << this << " " << this->debug_def().type()
+ << " output=" << this->debug_def().output(0) << " " << N * Y_HxW
+ << "x" << M << "x" << kernel_dim << " G=" << group_
<< " C/G=" << C / group_ << " K/G=" << M / group_
<< " R=" << kernel_h() << " S=" << kernel_w() << " : " << dt * 1e3
<< " ms " << gops << " gops";
template <typename T, class Context>
bool ConvReluOp<T, Context>::RunOnDeviceWithOrderNCHW() {
// Delegate to local conv operator
- for (int i = 0; i < OperatorBase::InputSize(); ++i) {
+ for (int i = 0; i < this->InputSize(); ++i) {
local_input_blobs_[i]->ShareExternal(
- const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
- OperatorBase::Inputs()[i]->meta());
+ const_cast<void*>(this->Inputs()[i]->GetRaw()),
+ this->Inputs()[i]->meta());
}
if (!local_op_->RunOnDeviceWithOrderNCHW()) {
template <typename T, class Context>
bool ConvReluOp<T, Context>::RunOnDeviceWithOrderNHWC() {
// Delegate to local conv operator
- for (int i = 0; i < OperatorBase::InputSize(); ++i) {
+ for (int i = 0; i < this->InputSize(); ++i) {
local_input_blobs_[i]->ShareExternal(
- const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
- OperatorBase::Inputs()[i]->meta());
+ const_cast<void*>(this->Inputs()[i]->GetRaw()),
+ this->Inputs()[i]->meta());
}
if (!local_op_->RunOnDeviceWithOrderNHWC()) {
GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
const TensorCPU& input = InputIsType<int8::Int8TensorCPU>(0)
- ? OperatorBase::Input<int8::Int8TensorCPU>(0).t
+ ? this->template Input<int8::Int8TensorCPU>(0).t
: Input(0);
CAFFE_ENFORCE(input.template IsType<T>());
arguments_parsed_ = true;
}
- auto& input = OperatorBase::Input<int8::Int8TensorCPU>(0).t;
+ auto& input = this->template Input<int8::Int8TensorCPU>(0).t;
auto& output = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>()->t;
output.ResizeLike(input);
functor_(
const OperatorDef& operator_def,
Workspace* ws)
: BaseType(operator_def, ws),
- axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
+ axis_(this->template GetSingleArgument<int>("axis", 1)) {}
template <typename T>
bool ElementwiseLinearDNNLowPOp<T>::RunOnDevice() {
const OperatorDef& operator_def,
Workspace* ws)
: FullyConnectedDNNLowPOp<uint8_t>(operator_def, ws),
- nbits_in_non_outlier_(OperatorBase::GetSingleArgument<int>(
+ nbits_in_non_outlier_(this->template GetSingleArgument<int>(
"nbits_in_non_outlier",
FLAGS_caffe2_dnnlowp_nbits_in_non_outlier)),
- copy_to_32bit_frequency_(OperatorBase::GetSingleArgument<int>(
+ copy_to_32bit_frequency_(this->template GetSingleArgument<int>(
"copy_to_32bit_frequency",
FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {}
int outlier_cnt = Wq_outlier_->ColPtr()[N];
LOG(INFO) << "Proportion of outlier for FC layer with weight blob "
- << OperatorBase::debug_def().input(1) << " is "
+ << this->debug_def().input(1) << " is "
<< (float)outlier_cnt / W_quantized_.size();
LOG(INFO) << "copy_to_32bit_frequency " << copy_to_32bit_frequency_;
const OperatorDef& operator_def,
Workspace* ws)
: BaseType(operator_def, ws),
- axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
- axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
+ axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
+ axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
b_quantized_(make_shared<vector<int32_t>>()),
column_offsets_(make_shared<vector<int32_t>>()),
is_weight_constant_(
- OperatorBase::GetSingleArgument<bool>("constant_weight", true)) {
+ this->template GetSingleArgument<bool>("constant_weight", true)) {
if (!is_weight_constant_) {
LOG(INFO) << operator_def.output(0) << " is_weight_constant "
<< is_weight_constant_;
if (FLAGS_caffe2_dnnlowp_dump_tensors) {
// Dump input activation
- StoreMatrixInMatrixMarketFormat(
- M, K, Xdata, OperatorBase::debug_def().input(0));
+ StoreMatrixInMatrixMarketFormat(M, K, Xdata, this->debug_def().input(0));
// Dump weight
- StoreMatrixInMatrixMarketFormat(
- N, K, Wdata, OperatorBase::debug_def().input(1));
+ StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
}
if (VLOG_IS_ON(3)) {
dt = chrono::duration<double>(t_end - t_very_begin).count();
double gops = ops / dt / 1e9;
VLOG(3) << "@PERF this=" << this
- << " output=" << OperatorBase::debug_def().output(0) << " " << M
- << "x" << N << "x" << K << ": " << dt * 1e3 << " ms " << gops
- << " gops";
+ << " output=" << this->debug_def().output(0) << " " << M << "x" << N
+ << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
}
return true;
int signed_min = -(1 << (qfactory_->GetWeightPrecision() - 1));
if (is_weight_constant_) {
bool fast_path = is_same<T, uint8_t>::value && GetCpuId().avx2() &&
- OperatorBase::debug_def().engine() != "DNNLOWP_ACC16";
+ this->debug_def().engine() != "DNNLOWP_ACC16";
if ((fast_path && !Wq_packed_) || (!fast_path && W_quantized_.empty())) {
if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
reason = "fbgemm only supports 8-bit integers";
} else if (!GetCpuId().avx2()) {
reason = "fbgemm only supports AVX2";
- } else if (OperatorBase::debug_def().engine() == "DNNLOWP_ACC16") {
+ } else if (this->debug_def().engine() == "DNNLOWP_ACC16") {
reason = "";
} else {
assert(false);
}
if (!reason.empty()) {
- LOG(WARNING) << "Conv with weight "
- << OperatorBase::debug_def().input(1)
+ LOG(WARNING) << "Conv with weight " << this->debug_def().input(1)
<< " falls back to slow path because " << reason;
}
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
// Constructor: forwards operator_def and ws to Operator<Context>, then
// initializes axis_ and axis_w_ from the int32_t arguments "axis" and
// "axis_w" (default 1 each) and float16_compute_ from the bool argument
// "float16_compute" (default false).
// The -/+ line pairs below differ only in how GetSingleArgument is qualified
// (OperatorBase:: versus this->template); argument names, types and defaults
// are the same on both sides.
FullyConnectedFakeLowpFPOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
- axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
- axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
+ axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
+ axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
float16_compute_(
- OperatorBase::GetSingleArgument<bool>("float16_compute", false)) {}
+ this->template GetSingleArgument<bool>("float16_compute", false)) {}
// Destructor with an empty body.
~FullyConnectedFakeLowpFPOp() {}
template <
const OperatorDef& operator_def,
Workspace* ws)
: Operator<Context>(operator_def, ws),
- axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
- axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
+ axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
+ axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
float16_compute_(
- OperatorBase::GetSingleArgument<bool>("float16_compute", false)) {}
+ this->template GetSingleArgument<bool>("float16_compute", false)) {}
// Destructor with an empty body.
~FullyConnectedGradientFakeLowpFPOp() {}
template <
const OperatorDef& operator_def,
Workspace* ws)
: BaseType(operator_def, ws),
- axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
- axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
+ axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
+ axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
b_quantized_(make_shared<vector<int32_t>>()),
column_offsets_(make_shared<vector<int32_t>>()),
is_weight_constant_(
- OperatorBase::GetSingleArgument<bool>("constant_weight", true)) {
+ this->template GetSingleArgument<bool>("constant_weight", true)) {
using namespace dnnlowp;
LOG(INFO) << "Using Rowwise Quantization!";
if (!is_weight_constant_) {
dt = chrono::duration<double>(t_end - t_very_begin).count();
double gops = ops / dt / 1e9;
VLOG(3) << "@PERF this=" << this
- << " output=" << OperatorBase::debug_def().output(0) << " " << M
- << "x" << N << "x" << K << ": " << dt * 1e3 << " ms " << gops
- << " gops";
+ << " output=" << this->debug_def().output(0) << " " << M << "x" << N
+ << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
}
return true;
const int C = gamma.size();
gamma_quantized_.resize(C);
gamma_quantized_data_ = gamma_quantized_.data();
- if (OperatorBase::InputIsType<int8::Int8TensorCPU>(GAMMA)) {
+ if (this->template InputIsType<int8::Int8TensorCPU>(GAMMA)) {
const auto& gamma_int8 =
- OperatorBase::Input<int8::Int8TensorCPU>(GAMMA);
+ this->template Input<int8::Int8TensorCPU>(GAMMA);
auto& gamma_qparams = in_qparams_[GAMMA];
gamma_qparams.scale = gamma_int8.scale;
const T* gamma_data = gamma.template data<T>();
const auto& X_qparams = in_qparams_[INPUT];
const auto& gamma_qparams = in_qparams_[GAMMA];
auto& beta_qparams = in_qparams_[BETA];
- if (OperatorBase::InputIsType<int8::Int8TensorCPU>(BETA)) {
- const auto& beta_int8 = OperatorBase::Input<int8::Int8TensorCPU>(BETA);
+ if (this->template InputIsType<int8::Int8TensorCPU>(BETA)) {
+ const auto& beta_int8 = this->template Input<int8::Int8TensorCPU>(BETA);
beta_qparams.scale = beta_int8.scale;
beta_qparams.zero_point = beta_int8.zero_point;
CAFFE_ENFORCE_LE(
Workspace* ws)
: LSTMUnitOp<CPUContext>(operator_def, ws),
drop_states_(
- OperatorBase::template GetSingleArgument<bool>("drop_states", false)),
+ this->template GetSingleArgument<bool>("drop_states", false)),
qfactory_(GetQuantizationFactoryOf(this)) {}
template <typename T>
// Returns a const reference to the TensorCPU for input `idx`: the `.t` member
// of the int8::Int8TensorCPU input when InputIsType<int8::Int8TensorCPU>(idx)
// is true, otherwise the result of Input(idx).
// The -/+ line pair differs only in the qualifier used to reach the templated
// Input<> accessor (OperatorBase:: versus this->template).
template <typename T>
const TensorCPU& LSTMUnitDNNLowPOp<T>::InputTensorCPU_(int idx) {
return InputIsType<int8::Int8TensorCPU>(idx)
- ? OperatorBase::Input<int8::Int8TensorCPU>(idx).t
+ ? this->template Input<int8::Int8TensorCPU>(idx).t
: Input(idx);
}
template <typename T>
bool ReluDNNLowPOp<T>::RunOnDevice() {
auto& X = InputIsType<int8::Int8TensorCPU>(0)
- ? OperatorBase::Input<int8::Int8TensorCPU>(0).t
+ ? (this->template Input<int8::Int8TensorCPU>(0)).t
: Input(0);
TensorCPU* Y = nullptr;
template <typename T>
class ReluDNNLowPOp final : public Operator<CPUContext> {
public:
+ USE_OPERATOR_FUNCTIONS(CPUContext);
// Constructor: forwards operator_def and ws to Operator<CPUContext> and
// initializes qfactory_ with dnnlowp::GetQuantizationFactoryOf(this).
ReluDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {}
bool DoRunWithType() {
// If we end up using it on GPU, doing an O(N) memcpy is probably not best :)
// TODO: implement prefetching if it starts mattering (TF does it)
- auto& data = OperatorBase::Input<int8::Int8TensorCPU>(DATA).t;
+ auto& data = (this->template Input<int8::Int8TensorCPU>(DATA)).t;
auto& indices = Input(INDICES);
auto* output = &Outputs()[0]->template GetMutable<int8::Int8TensorCPU>()->t;