From 37e33881b9e995f8328da0a149d71f4314bd7aa1 Mon Sep 17 00:00:00 2001 From: Samaresh Kumar Singh Date: Tue, 31 Mar 2026 21:39:08 -0500 Subject: [PATCH 1/3] profiler: fix USE_KINETO=OFF build failure due to unconditional ActivityType.h include kineto_shim.h was unconditionally including <ActivityType.h>, which is a Kineto header, causing a fatal compile error when building with USE_KINETO=OFF on a system where Kineto is not installed (e.g. on Gentoo with system libraries). Guard the include with #ifdef USE_KINETO and provide a minimal stub (enum class ActivityType : uint8_t { NONE = 0 }) so the data structures and function signatures that reference libkineto::ActivityType still compile. Guard the corresponding function bodies in kineto_shim.cpp (addCPUActivity, deviceTypeFromActivity) and collection.cpp (scopeToType, kinetoType) that use concrete enum values, adding no-op stubs for the non-Kineto paths. Fixes #178939 --- torch/csrc/profiler/collection.cpp | 8 ++++++++ torch/csrc/profiler/kineto_shim.cpp | 16 ++++++++++++++-- torch/csrc/profiler/kineto_shim.h | 7 +++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index 1be2d80310910..e8054d4f44df7 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -573,11 +573,13 @@ std::string toString(const ExtraFields& e) { e.callsite_.funcname_.str()); } +#ifdef USE_KINETO auto scopeToType(at::RecordScope scope) { return scope == at::RecordScope::USER_SCOPE ? 
libkineto::ActivityType::USER_ANNOTATION : libkineto::ActivityType::CPU_OP; } +#endif int64_t torchOpEndNS( const ExtraFields& e, @@ -626,6 +628,7 @@ std::string Result::overload_name() const { [](const auto& e) -> std::string { return ""; })); } +#ifdef USE_KINETO libkineto::ActivityType Result::kinetoType() const { return visit(c10::overloaded( ATTRIBUTE(TorchOp, scopeToType(e.scope_)), @@ -638,6 +641,11 @@ libkineto::ActivityType Result::kinetoType() const { ATTRIBUTE(PythonGC, libkineto::ActivityType::PYTHON_FUNCTION), ATTRIBUTE(Kineto, e.activity_type_))); } +#else +libkineto::ActivityType Result::kinetoType() const { + return libkineto::ActivityType::NONE; +} +#endif uint64_t Result::correlationID() const { return visit(c10::overloaded( diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp index fa232e1a01016..0ddf62997a07e 100644 --- a/torch/csrc/profiler/kineto_shim.cpp +++ b/torch/csrc/profiler/kineto_shim.cpp @@ -138,6 +138,7 @@ TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name) } #endif // USE_KINETO +#ifdef USE_KINETO activity_t* TraceWrapper::addCPUActivity( const std::string& name, const libkineto::ActivityType type, @@ -145,7 +146,6 @@ activity_t* TraceWrapper::addCPUActivity( const uint64_t correlation_id, const int64_t start_time, const int64_t end_time) { -#ifdef USE_KINETO TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace."); cpu_trace_->emplace_activity(cpu_trace_->span, type, name); auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back()); @@ -157,10 +157,18 @@ activity_t* TraceWrapper::addCPUActivity( act.endTime = end_time; } return cpu_trace_->activities.back().get(); +} #else +activity_t* TraceWrapper::addCPUActivity( + const std::string& name, + const libkineto::ActivityType type, + const DeviceAndResource device_and_resource, + const uint64_t correlation_id, + const int64_t start_time, + const int64_t end_time) { return nullptr; -#endif // 
USE_KINETO } +#endif // USE_KINETO void TraceWrapper::transferCpuTrace(int64_t end_time) { #ifdef USE_KINETO @@ -473,6 +481,7 @@ void logInvariantViolation( namespace autograd::profiler { c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) { +#ifdef USE_KINETO // PrivateUse1 kineto backend reuse some ActivityTypes, // If PrivateUse1 backend is enabled, this should return // c10::DeviceType::PrivateUse1. @@ -524,6 +533,9 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) { return c10::DeviceType::CPU; } } +#else + return c10::DeviceType::CPU; +#endif // USE_KINETO } void addMetadataJson(const std::string& key, const std::string& value) { diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h index 4f9bdc6770507..44bcb0e18a6e0 100644 --- a/torch/csrc/profiler/kineto_shim.h +++ b/torch/csrc/profiler/kineto_shim.h @@ -12,7 +12,14 @@ #undef USE_KINETO #endif +#ifdef USE_KINETO #include +#else +// Minimal stub so non-Kineto builds can compile types that hold ActivityType. +namespace libkineto { +enum class ActivityType : uint8_t { NONE = 0 }; +} // namespace libkineto +#endif #include #include From f5e3a9d9aa12c6e2006a9243c97ef7703fcbbe2f Mon Sep 17 00:00:00 2001 From: Samaresh Kumar Singh Date: Wed, 1 Apr 2026 10:34:24 -0500 Subject: [PATCH 2/3] Fixed the USER_ANNOTATION/GPU_USER_ANNOTATION build errors with external kineto When building against a system-installed kineto (e.g. Gentoo's sci-ml/kineto) that lacks USER_ANNOTATION and GPU_USER_ANNOTATION in its ActivityType enum, init.cpp failed to compile because it referenced those enum member names directly. 
--- torch/csrc/autograd/init.cpp | 7 +------ torch/csrc/autograd/profiler_kineto.cpp | 7 +++++++ torch/csrc/autograd/profiler_kineto.h | 1 + torch/csrc/profiler/kineto_shim.h | 11 +++++++++-- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 1e2442a5d99db..f8e8795c5805e 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -305,12 +305,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) { .def("privateuse1_elapsed_us", &KinetoEvent::privateuse1ElapsedUs) .def( "is_user_annotation", - [](const KinetoEvent& e) { - return e.activityType() == - (uint8_t)libkineto::ActivityType::USER_ANNOTATION || - e.activityType() == - (uint8_t)libkineto::ActivityType::GPU_USER_ANNOTATION; - }) + [](const KinetoEvent& e) { return e.isUserAnnotation(); }) .def( "is_python_function", [](const KinetoEvent& e) { return e.isPythonFunction(); }) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 9a076f58d7143..6b5165f0872fb 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -1081,6 +1081,13 @@ int64_t KinetoEvent::privateuse1ElapsedUs() const { return -1; } +bool KinetoEvent::isUserAnnotation() const { + constexpr uint8_t kUserAnnotation = 1; + constexpr uint8_t kGpuUserAnnotation = 2; + const auto type = activityType(); + return type == kUserAnnotation || type == kGpuUserAnnotation; +} + void KinetoEvent::getPerfEventCounters(std::vector& in) const { return result_->visit(c10::overloaded( [&in](const ExtraFields& e) -> void { diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 777b8a5851ed5..c3135c61245d4 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -67,6 +67,7 @@ struct TORCH_API KinetoEvent { bool isPythonFunction() const; int64_t cudaElapsedUs() const; int64_t 
privateuse1ElapsedUs() const; + bool isUserAnnotation() const; void getPerfEventCounters(torch::profiler::perf_counters_t& /*in*/) const; extra_meta_t extraMeta() const; std::string metadataJson() const; diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h index 44bcb0e18a6e0..ddc00c03fa9cc 100644 --- a/torch/csrc/profiler/kineto_shim.h +++ b/torch/csrc/profiler/kineto_shim.h @@ -15,9 +15,16 @@ #ifdef USE_KINETO #include #else -// Minimal stub so non-Kineto builds can compile types that hold ActivityType. namespace libkineto { -enum class ActivityType : uint8_t { NONE = 0 }; +enum class ActivityType : uint8_t { + CPU_OP = 0, + USER_ANNOTATION, + GPU_USER_ANNOTATION, + NONE = CPU_OP, +}; +inline const char* toString(ActivityType) { + return "CPU_OP"; +} } // namespace libkineto #endif From 717526b7ce90d242cb917db90f4cb13c85c93661 Mon Sep 17 00:00:00 2001 From: Samaresh Kumar Singh Date: Fri, 3 Apr 2026 13:01:01 -0500 Subject: [PATCH 3/3] profiler: guard GPU ActivityType checks with KINETO GPU backend macros The externalId() function referenced GPU-specific libkineto::ActivityType values (GPU_MEMCPY, GPU_MEMSET, CONCURRENT_KERNEL, CUDA_RUNTIME, CUDA_DRIVER, PRIVATEUSE1_RUNTIME, PRIVATEUSE1_DRIVER) that are only present in kineto builds with GPU backend support. System-installed kineto packages built without CUDA/ROCm support (LIBKINETO_NOCUPTI + LIBKINETO_NOROCTRACER) omit these enum values, causing compile errors. Guard the check with the existing GPU backend macros. Also restructures the condition to an early-return to avoid duplicating the correlation ID lookup. 
--- torch/csrc/autograd/profiler_kineto.cpp | 38 ++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 6b5165f0872fb..466e8d66ab518 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -1125,27 +1125,33 @@ int64_t KinetoEvent::externalId() const { return static_cast(linked); } +#if defined(USE_KINETO) && \ + (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER)) // Orphaned GPU activities (no linked CPU op) in these types should not get // an External id, to avoid incorrect cross-linking in trace viewers. + // These GPU-specific ActivityType values are only present when kineto is + // built with GPU backend support (CUPTI or ROCtracer). CPU-only kineto + // builds (e.g. system packages without GPU support) omit them. auto type = static_cast(activityType()); - if (type != libkineto::ActivityType::GPU_MEMCPY && - type != libkineto::ActivityType::GPU_MEMSET && - type != libkineto::ActivityType::CONCURRENT_KERNEL && - type != libkineto::ActivityType::CUDA_RUNTIME && - type != libkineto::ActivityType::CUDA_DRIVER && - type != libkineto::ActivityType::PRIVATEUSE1_RUNTIME && - type != libkineto::ActivityType::PRIVATEUSE1_DRIVER) { - return static_cast(result_->visit(c10::overloaded( - [](const ExtraFields& e) -> uint64_t { - return e.correlation_id_; - }, - [](const ExtraFields& e) -> uint64_t { - return e.correlation_id_; - }, - [](const auto&) -> uint64_t { return 0; }))); + if (type == libkineto::ActivityType::GPU_MEMCPY || + type == libkineto::ActivityType::GPU_MEMSET || + type == libkineto::ActivityType::CONCURRENT_KERNEL || + type == libkineto::ActivityType::CUDA_RUNTIME || + type == libkineto::ActivityType::CUDA_DRIVER || + type == libkineto::ActivityType::PRIVATEUSE1_RUNTIME || + type == libkineto::ActivityType::PRIVATEUSE1_DRIVER) { + return 0; } +#endif - return 0; + return 
static_cast(result_->visit(c10::overloaded( + [](const ExtraFields& e) -> uint64_t { + return e.correlation_id_; + }, + [](const ExtraFields& e) -> uint64_t { + return e.correlation_id_; + }, + [](const auto&) -> uint64_t { return 0; }))); } #define FORWARD_FROM_RESULT(method_name, result_expr) \