"#endif\n\n";
  }
+  // Enable atomic_add used by get_valid_counts. Only needed for OpenCL < 1.1.
+  if (enable_atomics_) {
+    decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+                   "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n";
+  }
  return CodeGenC::Finish();
}
  return os.str();
}
}
+void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
+  if (op->op.same_as(builtin::address_of())) {
+    // Overload tvm_address_of to add storage scope (e.g. __global).
+    const LoadNode* load = op->args[0].as<LoadNode>();
+    CHECK(op->args.size() == 1 && load);
+    os << "((";
+    auto it = alloc_storage_scope_.find(load->buffer_var.get());
+    if (it != alloc_storage_scope_.end()) {
+      PrintStorageScope(it->second, os);
+    }
+    this->PrintType(load->dtype.element_of(), os);
+    os << " *)" << this->GetVarID(load->buffer_var.get()) << " + ";
+    this->PrintExpr(load->index, os);
+    os << ')';
+  } else if (op->op.same_as(builtin_call_extern_)) {
+    auto func = Downcast<StringImm>(op->args[0]);
+    // Enable atomics extension if used.
+    if (func->value == "atomic_add") {
+      enable_atomics_ = true;
+    }
+    CodeGenC::VisitExpr_(op, os);
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NOLINT(*)
  std::string v = PrintExpr(op->value);
  os << "((";
  std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*)
  // overload visitor
+  void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*)
  void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*)
  void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*)
  // whether enable fp16 and fp64 extension
  bool enable_fp16_{false};
  bool enable_fp64_{false};
+  // Whether to enable atomics extension.
+  bool enable_atomics_{false};
};
} // namespace codegen
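
To make the effect of these code-generator changes concrete, here is a hand-written sketch in OpenCL C of what an emitted kernel could look like once enable_atomics_ has been set by an extern atomic_add call. It is not actual compiler output: the kernel name, buffer names, box layout, and thresholding logic are illustrative only. The extension pragmas come from the new block in Finish(), and the pointer expression is the form printed by the overloaded address_of visitor.

// Illustrative OpenCL C only; names and data layout are hypothetical.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable

__kernel void count_valid_boxes(__global const float* data,
                                __global int* valid_count,
                                const float score_threshold) {
  const int i = get_global_id(0);
  // Keep a box whose score (second field of a five-element record) passes the threshold.
  if (data[i * 5 + 1] > score_threshold) {
    // tir.atomic_add lowers to the extern call below; its first argument is the
    // __global-qualified pointer produced by the overloaded address_of visitor.
    atomic_add(((__global int *)valid_count + 0), 1);
  }
}

Without the storage-scope qualifier added by the address_of overload, the casted pointer would live in the default private address space and would not match any overload of the atomic_add built-in, so the cast to __global is what makes the extern call compile.
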
            intrp = relay.create_executor("debug", ctx=ctx, target=target)
            out = intrp.evaluate(func)(np_data)
            tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
-            # get_valid_count for cuda doesn't do data rearrangement
-            if target == 'cuda':
+            # get_valid_count for cuda and opencl doesn't do data rearrangement
+            if target in ['cuda', 'opencl']:
                return
            tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
            tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
        return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
    raise RuntimeError("only support int32, float32 and float64")
+def opencl_atomic_add_rule(op):
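+    # OpenCL's built-in atomic_add is only defined for 32-bit integers
+    # (float and 64-bit atomics need separate extensions), so unlike the
+    # CUDA rule above this handles int32 only.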
+    if op.dtype == "int32":
+        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
+    raise RuntimeError("only support int32")
tvm.target.intrin.register_intrin_rule(
    "cuda", "atomic_add", cuda_atomic_add_rule, override=True)
+tvm.target.intrin.register_intrin_rule(
+    "opencl", "atomic_add", opencl_atomic_add_rule, override=True)
+
tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque)
def atomic_add(x, y):