are too large. Splitting such files into separate files helps.
(Example: `THTensorMath`, `THTensorMoreMath`, `THTensorEvenMoreMath`.)
+* MSVC's preprocessor (but not the standard compiler) has a bug
+ where it incorrectly tokenizes raw string literals, ending when it sees a `"`.
+ This causes preprocessor tokens inside the literal, like an `#endif`, to be incorrectly
+ treated as preprocessor directives. See https://godbolt.org/z/eVTIJq as an example.
+
### Running Clang-Tidy
[Clang-Tidy](https://clang.llvm.org/extra/clang-tidy/index.html) is a C++
c.value()->setUniqueName(cname);
d.value()->setUniqueName(dname);
graph->registerOutput(d.value());
+ torch::jit::overrideCanFuseOnCPU(true);
FuseGraph(graph);
+ torch::jit::overrideCanFuseOnCPU(false);
return graph;
};
@unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows")
@unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
+ @skipIfRocm
def test_small_constant_cuda(self):
def fn_test_small_constant(x, y):
return (1e-8 * x + 5e-9 * y) * 1e8
"torch/csrc/jit/c10_ops/layer_norm.cpp",
"torch/csrc/utils/tensor_flatten.cpp",
"torch/csrc/utils/variadic.cpp",
+ "torch/csrc/jit/fuser/kernel_cache.cpp",
+ "torch/csrc/jit/fuser/compiler.cpp",
+ "torch/csrc/jit/fuser/executor.cpp",
+ "torch/csrc/jit/fuser/codegen.cpp",
+ "torch/csrc/jit/fuser/fallback.cpp",
+ "torch/csrc/jit/fuser/cpu/fused_kernel.cpp",
+ "torch/csrc/jit/fuser/cpu/dynamic_library_unix.cpp",
+ "torch/csrc/jit/fuser/interface.cpp",
]
r["torch_sources_no_python"] = (
torch_sources_no_python_default
- + ["torch/csrc/cuda/comm.cpp", "torch/csrc/cuda/nccl.cpp"]
- + native.glob(["torch/csrc/jit/fuser/**/*.cpp"])
+ + ["torch/csrc/cuda/comm.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/jit/fuser/cuda/fused_kernel.cpp"]
)
- r["torch_sources_no_python_cpu"] = torch_sources_no_python_default + native.glob(
- ["torch/csrc/jit/fuser/**/*.cpp"], exclude=["torch/csrc/jit/fuser/cuda/*.cpp"]
- )
+ r["torch_sources_no_python_cpu"] = torch_sources_no_python_default
r["torch_csrc_flags"] = {
"compiler_flags": [
"-Icaffe2/torch/csrc",
"-Icaffe2/torch/csrc/nn",
"-Icaffe2/torch/lib",
- "-DUSE_CPU_FUSER_FBCODE=1",
- "-DUSE_CUDA_FUSER_FBCODE=1",
],
}
"-Icaffe2/torch/csrc",
"-Icaffe2/torch/csrc/nn",
"-Icaffe2/torch/lib",
- "-DUSE_CPU_FUSER_FBCODE=1",
- "-DUSE_CUDA_FUSER_FBCODE=0",
]
return r
${TORCH_SRC_DIR}/csrc/jit/c10_ops/layer_norm.cpp
${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp
${TORCH_SRC_DIR}/csrc/utils/variadic.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp
${TORCH_ROOT}/test/cpp/jit/no-gtest.cpp
)
-SET(USE_CPU_FUSER 0)
-if (NOT WIN32)
- SET(USE_CPU_FUSER 1)
-
+if (WIN32)
list(APPEND TORCH_SRCS
- ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp
- ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp
- ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp
- ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp
- ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp
- ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_win.cpp
)
-endif()
-
-SET(USE_CUDA_FUSER 0)
-if (USE_CUDA AND NOT USE_ROCM AND NOT WIN32)
- SET(USE_CUDA_FUSER 1)
-
+else ()
list(APPEND TORCH_SRCS
- ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_unix.cpp
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp
)
-
-endif()
-
-CONFIGURE_FILE(
- ${TORCH_SRC_DIR}/csrc/jit/fuser/config.h.in
- ${CMAKE_CURRENT_SOURCE_DIR}/csrc/jit/fuser/config.h)
+ if (USE_CUDA AND NOT USE_ROCM)
+ list(APPEND TORCH_SRCS
+ ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp
+ )
+ endif()
+endif ()
if (NOT NO_API)
list(APPEND TORCH_SRCS
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
-
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/fuser/tensor_desc.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/jit/assertions.h>
#include <torch/csrc/jit/code_template.h>
#include <torch/csrc/jit/fuser/compiler.h>
-#include <torch/csrc/jit/fuser/config.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/fuser/tensor_info.h>
#include <torch/csrc/jit/ir.h>
-#if USE_CUDA_FUSER
-#include <torch/csrc/jit/fuser/cuda/resource_strings.h>
-#endif
-
-#if USE_CPU_FUSER
#include <torch/csrc/jit/fuser/cpu/resource_strings.h>
-#endif
+#include <torch/csrc/jit/fuser/cuda/resource_strings.h>
#include <cmath>
#include <cstdint>
for (const auto& input : inputs) {
emitFormal(input.first, input.second);
}
-
// Writes output parameters
for (const auto& output : outputs) {
}
}
-// Includes headers
-// Note: CUDA kernels support halfs and random generation, CPU kernels do not
-#if USE_CUDA_FUSER
+ // Includes headers
+ // Note: CUDA kernels support halfs and random generation, CPU kernels do not
if (has_half_tensor) {
env.s("HalfHeader", cuda::half_support_literal);
} else {
env.s("RandParam", "");
env.s("RandInit", "");
}
-#endif // USE_CUDA_FUSER
// Insantiates the CUDA or CPU-specific templates
env.s("tensorOffsets", tensorOffsets.str());
env.v("argument_loads", argument_loads);
std::string code_string;
if (use_cuda) {
-#if USE_CUDA_FUSER
env.s("type_declarations", cuda::type_declarations_template.format(env));
code_string = cuda::cuda_compilation_unit_template.format(env);
-#else
- throw std::runtime_error("CUDA Fusion requested but not supported.");
-#endif // USE_CUDA_FUSER
} else {
-#if USE_CPU_FUSER
env.s("type_declarations", cpu::type_declarations_template.format(env));
code_string = cpu::cpu_compilation_unit_template.format(env);
-#else
- throw std::runtime_error("CPU Fusion requested but not supported");
-#endif // USE_CPU_FUSER
}
if (debugFuser()) {
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/fuser/arg_spec.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/jit/type.h>
#include "torch/csrc/jit/fuser/interface.h"
-#if USE_CUDA_FUSER
-#include <torch/csrc/jit/fuser/cuda/fused_kernel.h>
-#endif // USE_CUDA_FUSER
-
-#if USE_CPU_FUSER
-#include <torch/csrc/jit/fuser/cpu/fused_kernel.h>
-#endif // USE_CUDA_FUSER
-
#include <atomic>
#include <iostream>
#include <memory>
namespace jit {
namespace fuser {
+// Guards the device-type -> kernel-constructor registry below.
+// NOTE(review): non-static namespace-scope global — consider `static` or an
+// anonymous namespace to keep it file-local.
+std::mutex fusion_backends_lock_;
+// Lazily-initialized registry mapping a device type (CPU/CUDA) to the
+// constructor used to build fused kernels for that device.
+static std::unordered_map<at::Device::Type, FusedKernelConstructor>&
+getFusionBackends() {
+  static std::unordered_map<at::Device::Type, FusedKernelConstructor>
+      fusion_backends;
+  return fusion_backends;
+}
+
+// Registers (or replaces) the kernel constructor for a device type. Invoked
+// at static-initialization time via RegisterFusionBackend in the per-device
+// fused_kernel.cpp translation units.
+void registerFusionBackend(
+    at::Device::Type backend_type,
+    FusedKernelConstructor ctor) {
+  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
+  getFusionBackends()[backend_type] = std::move(ctor);
+}
+
+// True iff a fusion backend has been registered for this device type.
+bool hasFusionBackend(at::Device::Type backend_type) {
+  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
+  return getFusionBackends().count(backend_type);
+}
+
+// Looks up the registered constructor; `at()` throws if the device type has
+// no backend registered.
+// NOTE(review): the reference is returned after the lock is released — safe
+// only if registry entries are never erased; confirm that invariant.
+const FusedKernelConstructor& getConstructor(at::Device::Type backend_type) {
+  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
+  return getFusionBackends().at(backend_type);
+}
+
+
// Counter for number of kernels compiled, used for debugging and
// creating arbitrary kernel names.
static std::atomic<size_t> next_kernel_id{0};
const std::string name = "kernel_" + std::to_string(next_kernel_id++);
const bool use_cuda = device.is_cuda();
- std::string code = generateKernel(name, *graph, flat_inputs, flat_outputs, use_cuda);
- std::shared_ptr<FusedKernel> fused_kernel;
- if (use_cuda) {
-#if USE_CUDA_FUSER
- fused_kernel = std::make_shared<cuda::FusedKernelCUDA>(
- device.index(),
- name,
- code,
- input_desc,
- output_desc,
- chunk_desc,
- concat_desc,
- spec.hasRandom());
-#else
- throw std::runtime_error("CUDA Fusion is not supported on this build.");
-#endif // USE_CUDA_FUSER
- } else {
-#if USE_CPU_FUSER
- fused_kernel = std::make_shared<cpu::FusedKernelCPU>(
- name,
- code,
- input_desc,
- output_desc,
- chunk_desc,
- concat_desc,
- spec.hasRandom());
-#else
- throw std::runtime_error("CPU Fusion is not supported on this build.");
-#endif // USE_CPU_FUSER
- }
-
- return fused_kernel;
+ std::string code =
+ generateKernel(name, *graph, flat_inputs, flat_outputs, use_cuda);
+ const FusedKernelConstructor& kernel_ctor =
+ getConstructor(use_cuda ? at::DeviceType::CUDA : at::DeviceType::CPU);
+ return kernel_ctor(
+ device.index(),
+ name,
+ code,
+ input_desc,
+ output_desc,
+ chunk_desc,
+ concat_desc,
+ spec.hasRandom());
}
} // namespace fuser
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/fuser/arg_spec.h>
-#include <torch/csrc/jit/fuser/config.h>
#include <torch/csrc/jit/fuser/fused_kernel.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/fuser/kernel_spec.h>
TORCH_API int debugFuser();
+using FusedKernelConstructor = std::function<std::shared_ptr<FusedKernel>(
+ int16_t device,
+ std::string name,
+ std::string code,
+ std::vector<TensorDesc> input_desc,
+ std::vector<TensorDesc> output_desc,
+ std::vector<PartitionDesc> chunk_desc,
+ std::vector<PartitionDesc> concat_desc,
+ bool has_random)>;
+
+TORCH_API void registerFusionBackend(
+ at::Device::Type backend_type,
+ FusedKernelConstructor ctor);
+TORCH_API bool hasFusionBackend(at::Device::Type backend_type);
+struct TORCH_API RegisterFusionBackend {
+ RegisterFusionBackend(
+ at::Device::Type backend_type,
+ FusedKernelConstructor ctor) {
+ registerFusionBackend(backend_type, std::move(ctor));
+ }
+};
+
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
+++ /dev/null
-#pragma once
-
-// clang-format off
-#define USE_CPU_FUSER @USE_CPU_FUSER@
-#define USE_CUDA_FUSER @USE_CUDA_FUSER@
-// clang-format on
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CPU_FUSER
#include <torch/csrc/jit/assertions.h>
#include <torch/csrc/utils/disallow_copy.h>
-#include <dlfcn.h>
-
namespace torch {
namespace jit {
namespace fuser {
namespace cpu {
-static void* checkDL(void* x) {
- if (!x) {
- AT_ERROR("error in dlopen or dlsym: ", dlerror());
- }
-
- return x;
-}
-
struct DynamicLibrary {
TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);
- DynamicLibrary(const char* name) {
- // NOLINTNEXTLINE(hicpp-signed-bitwise)
- handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
- }
+ DynamicLibrary(const char* name);
- void* sym(const char* name) {
- JIT_ASSERT(handle);
- return checkDL(dlsym(handle, name));
- }
+ void* sym(const char* name);
- ~DynamicLibrary() {
- if (!handle)
- return;
- dlclose(handle);
- }
+ ~DynamicLibrary();
private:
void* handle = nullptr;
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CPU_FUSER
--- /dev/null
+
+#include <torch/csrc/jit/assertions.h>
+#include <torch/csrc/jit/fuser/cpu/dynamic_library.h>
+#include <torch/csrc/utils/disallow_copy.h>
+
+#include <dlfcn.h>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cpu {
+
+// Raises AT_ERROR with dlerror() details when a dlopen/dlsym result is null;
+// otherwise passes the pointer through unchanged.
+static void* checkDL(void* x) {
+  if (!x) {
+    AT_ERROR("error in dlopen or dlsym: ", dlerror());
+  }
+
+  return x;
+}
+// Opens the shared library `name`; throws (via checkDL) on failure.
+DynamicLibrary::DynamicLibrary(const char* name) {
+  // NOLINTNEXTLINE(hicpp-signed-bitwise)
+  handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
+}
+
+// Resolves symbol `name` in the opened library; throws if it is missing.
+void* DynamicLibrary::sym(const char* name) {
+  JIT_ASSERT(handle);
+  return checkDL(dlsym(handle, name));
+}
+
+// Closes the library handle if one was successfully opened.
+DynamicLibrary::~DynamicLibrary() {
+  if (!handle)
+    return;
+  dlclose(handle);
+}
+
+} // namespace cpu
+} // namespace fuser
+} // namespace jit
+} // namespace torch
--- /dev/null
+#include <torch/csrc/jit/assertions.h>
+#include <torch/csrc/jit/fuser/cpu/dynamic_library.h>
+#include <torch/csrc/utils/disallow_copy.h>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cpu {
+
+// Windows placeholder: dynamic library loading for the CPU fuser is not yet
+// implemented here, so every operation raises AT_ERROR.
+DynamicLibrary::DynamicLibrary(const char* name) {
+  // NOTE(review): the NOLINT below is copied from the POSIX version; there is
+  // no signed-bitwise expression here, so it can likely be removed.
+  // NOLINTNEXTLINE(hicpp-signed-bitwise)
+  AT_ERROR("NYI: DynamicLibrary on Windows");
+}
+
+// Not implemented on Windows; always throws.
+void* DynamicLibrary::sym(const char* name) {
+  AT_ERROR("NYI: DynamicLibrary on Windows");
+}
+
+// Nothing to release — construction always throws before acquiring a handle.
+DynamicLibrary::~DynamicLibrary() {}
+
+} // namespace cpu
+} // namespace fuser
+} // namespace jit
+} // namespace torch
#include <torch/csrc/jit/fuser/cpu/fused_kernel.h>
-
#include <torch/csrc/jit/assertions.h>
#include <torch/csrc/jit/code_template.h>
#include <torch/csrc/jit/fuser/compiler.h>
#pragma GCC diagnostic pop
}
+// Adapter matching the FusedKernelConstructor signature for the CPU backend.
+// The `device` argument is accepted only for signature compatibility and is
+// ignored (FusedKernelCPU takes no device index).
+static std::shared_ptr<FusedKernel> createFusionKernel(
+    int16_t device,
+    std::string name,
+    std::string code,
+    std::vector<TensorDesc> input_desc,
+    std::vector<TensorDesc> output_desc,
+    std::vector<PartitionDesc> chunk_desc,
+    std::vector<PartitionDesc> concat_desc,
+    bool has_random) {
+  return std::make_shared<FusedKernelCPU>(
+      std::move(name),
+      std::move(code),
+      std::move(input_desc),
+      std::move(output_desc),
+      std::move(chunk_desc),
+      std::move(concat_desc),
+      has_random);
+}
+
+// Registers the CPU constructor at static-initialization time.
+RegisterFusionBackend reg(at::DeviceType::CPU, createFusionKernel);
} // namespace cpu
} // namespace fuser
} // namespace jit
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CPU_FUSER
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CPU_FUSER
#include <torch/csrc/jit/code_template.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CPU_FUSER
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CPU_FUSER
#include <torch/csrc/jit/fuser/cuda/fused_kernel.h>
+#include <torch/csrc/jit/fuser/compiler.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
at::cuda::set_device(prior_device);
}
+// Adapter matching the FusedKernelConstructor signature for the CUDA backend;
+// unlike the CPU variant, `device` is forwarded to FusedKernelCUDA.
+static std::shared_ptr<FusedKernel> createFusionKernel(
+    int16_t device,
+    std::string name,
+    std::string code,
+    std::vector<TensorDesc> input_desc,
+    std::vector<TensorDesc> output_desc,
+    std::vector<PartitionDesc> chunk_desc,
+    std::vector<PartitionDesc> concat_desc,
+    bool has_random) {
+  return std::make_shared<FusedKernelCUDA>(
+      device,
+      std::move(name),
+      std::move(code),
+      std::move(input_desc),
+      std::move(output_desc),
+      std::move(chunk_desc),
+      std::move(concat_desc),
+      has_random);
+}
+
+// Registers the CUDA constructor at static-initialization time.
+RegisterFusionBackend reg(at::DeviceType::CUDA, createFusionKernel);
+
+
} // namespace cuda
} // namespace fuser
} // namespace jit
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/code_template.h>
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
return val;
}
+)"
+// MSVC's preprocessor (but not the standard compiler) has a bug
+// where it incorrectly tokenizes raw string literals, ending when it sees a "
+// This causes the #endif in this string literal to be treated as a
+// preprocessor token which, in turn, causes sccache on Windows CI to fail.
+// See https://godbolt.org/z/eVTIJq as an example.
+// This workaround uses string-pasting to separate the " and the #endif into
+// different strings.
+R"(
#endif /* defined(__CUDACC__) */
#endif /* defined(__cplusplus) */
#undef __HALF_TO_US
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER
#include <ATen/ExpandUtils.h>
#include <c10/util/Optional.h>
#include <torch/csrc/jit/fuser/compiler.h>
-#include <torch/csrc/jit/fuser/config.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/fuser/kernel_cache.h>
#include <torch/csrc/jit/fuser/kernel_spec.h>
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/stack.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/jit/stack.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <ATen/ATen.h>
#include <torch/csrc/jit/fuser/partition_desc.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/jit/fuser/interface.h>
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/jit/fuser/compiler.h>
#include <torch/csrc/jit/fuser/executor.h>
#include <torch/csrc/jit/fuser/fallback.h>
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#include <stdexcept>
} // namespace detail
int64_t registerFusion(const Node* fusion_group) {
-#if USE_CUDA_FUSER || USE_CPU_FUSER
return fuser::registerFusion(fusion_group);
-#else
- throw std::runtime_error("Fusion not supported for this build.");
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
}
void runFusion(const int64_t key, Stack& stack) {
-#if USE_CUDA_FUSER || USE_CPU_FUSER
const auto result = fuser::runFusion(key, stack);
if (!result)
fuser::runFallback(key, stack);
-#else
- throw std::runtime_error("Fusion not supported for this build.");
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
}
bool canFuseOnCPU() {
-#if USE_CPU_FUSER
-  return detail::cpu_fuser_enabled;
-#endif // USE_CPU_FUSER
-
-  return false;
+  // CPU fusion requires both a registered CPU backend and the runtime enable
+  // flag (toggled via overrideCanFuseOnCPU).
+  return fuser::hasFusionBackend(at::DeviceType::CPU) &&
+      detail::cpu_fuser_enabled;
}
bool canFuseOnGPU() {
-#if USE_CUDA_FUSER
-  return true;
-#endif // USE_CUDA_FUSER
-
-  return false;
+  // GPU fusion is available whenever a CUDA backend registered itself.
+  // NOTE(review): unlike canFuseOnCPU there is no runtime override flag here
+  // — confirm the asymmetry is intended.
+  return fuser::hasFusionBackend(at::DeviceType::CUDA);
}
void overrideCanFuseOnCPU(bool value) {
std::vector<at::Tensor> debugLaunchGraph(
Graph& graph,
at::ArrayRef<at::Tensor> inputs) {
-#if USE_CUDA_FUSER || USE_CPU_FUSER
// Creates a fusion group node
auto wrapper_graph = std::make_shared<Graph>();
Node* fusion_group =
const auto key = fuser::registerFusion(fusion_group);
fuser::runFusion(key, stack);
return fmap(stack, [](const IValue& iv) { return iv.toTensor(); });
-#else
- throw std::runtime_error("Fusion not supported for this build.");
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
}
size_t nCompiledKernels() {
-#if USE_CUDA_FUSER || USE_CPU_FUSER
return fuser::nCompiledKernels();
-#else
- return 0;
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
}
} // namespace jit
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <c10/util/Optional.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <ATen/ATen.h>
#include <c10/util/Optional.h>
// Note: assumes the spec is a single block
// Note: This is the appropriate place to generalize if you want to add other
// passes to upfront compilation that walk the graph.
- KernelSpec(
- const int64_t _key,
- const std::shared_ptr<Graph>& _graph)
- : key_{_key},
- graph_{_graph},
- code_{_graph},
- nInputs_{_graph->inputs().size()},
- inputBroadcastGroups_{},
- inputChunks_{},
- has_random_{false},
- kernels_{} {
-
+ KernelSpec(const int64_t _key, const std::shared_ptr<Graph>& _graph)
+ : key_{_key},
+ graph_{_graph},
+ code_{_graph},
+ nInputs_{_graph->inputs().size()},
+ inputBroadcastGroups_{},
+ inputChunks_{},
+ has_random_{false},
+ kernels_{} {
for (const auto& n : graph_->nodes()) {
if (n->kind() == aten::rand_like) {
has_random_ = true;
break;
- }
+ }
}
}
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CPU_FUSER || USE_CUDA_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/assertions.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
#pragma once
-#include <torch/csrc/jit/fuser/config.h>
-#if USE_CUDA_FUSER || USE_CPU_FUSER
-
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <cstdint>
} // namespace fuser
} // namespace jit
} // namespace torch
-
-#endif // USE_CUDA_FUSER || USE_CPU_FUSER
py::register_exception<JITException>(m, "JITException");
- py::class_<python::IODescriptor>(
+ py::class_<python::IODescriptor> iodescriptor(
m, "IODescriptor"); // NOLINT(bugprone-unused-raii)
m.def("_jit_init", loadPythonClasses)
-#if USE_CUDA_FUSER || USE_CPU_FUSER
.def(
"_jit_debug_fuser_num_cached_kernel_specs",
torch::jit::fuser::debugNumCachedKernelSpecs)
-#endif
.def("_jit_pass_onnx", ToONNX)
.def("_jit_pass_lower_all_tuples", LowerAllTuples)
.def("_jit_pass_onnx_peephole", PeepholeOptimizeONNX)
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
#include <torch/csrc/jit/script/compiler.h>
#include <torch/csrc/jit/symbolic_variable.h>
-#include <unordered_map>
-#ifdef USE_CUDA
-#include <cuda.h> // for CUDA_VERSION
-#endif
+#include <unordered_map>
namespace torch {
namespace jit {
} // anonymous namespace
void FuseGraph(std::shared_ptr<Graph>& graph) {
-// NYI on Windows
-#ifndef _WIN32
-
-  GraphFuser(graph->block(), graph).run();
-  // After FuseGraph some common subexpressions may come back
-  EliminateCommonSubexpression(graph);
-  // We might have emitted a fair amount of useless shape propagating code, so
-  // remove it
-  EliminateDeadCode(graph);
-  // Improve the quality of shape propagation code that was left
-  PeepholeOptimizeShapeExpressions(graph->block());
-
-#endif
+  // Run the fuser only when at least one backend (CPU or CUDA) can actually
+  // compile fused kernels; otherwise leave the graph untouched.
+  if (canFuseOnCPU() || canFuseOnGPU()) {
+    GraphFuser(graph->block(), graph).run();
+    // After FuseGraph some common subexpressions may come back
+    EliminateCommonSubexpression(graph);
+    // We might have emitted a fair amount of useless shape propagating code, so
+    // remove it
+    EliminateDeadCode(graph);
+    // Improve the quality of shape propagation code that was left
+    PeepholeOptimizeShapeExpressions(graph->block());
+  }
}
} // namespace jit