Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62191
This moves start timestamping to the end of the callback. This way we don't
account for callstack/module-hierarchy-related overhead in op runtime.
Test Plan:
CI
Imported from OSS
Reviewed By: ilia-cher
Differential Revision:
D29910519
fbshipit-source-id:
f462031a81ae12b3db7993cf482e5ad93a35e096
#endif // USE_KINETO
auto ctx_ptr = std::make_unique<KinetoObserverContext>();
- ctx_ptr->startUs = getTimeUs();
ctx_ptr->correlationId = corr_id;
ctx_ptr->startThreadId = at::RecordFunction::currentThreadId();
ctx_ptr->module_hierarchy = jit::currentModuleHierarchy();
}
#endif
+ ctx_ptr->startUs = getTimeUs();
if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
try {
cudaStubs()->record(nullptr, &ctx_ptr->cuda_event_start_, nullptr);