From 206646d6ed692de8aee663dda7e79e8fc5f42f73 Mon Sep 17 00:00:00 2001
From: Priya Ramani
Date: Wed, 15 Sep 2021 19:12:47 -0700
Subject: [PATCH] Add NNC AOT Compiler executable (#63994)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63994

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30582149

Pulled By: priyaramani

fbshipit-source-id: 3bbf085428824c3cb308e006c18bb0a57f50fef6
---
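Usage sketch: a minimal example of how the emitted artifacts might be
consumed, assuming the generated LLVM assembly has been compiled and linked
into the application so the "nnc" backend can resolve the kernel at runtime.
The model path and input shape below are illustrative, not part of this patch:

    #include <iostream>
    #include <torch/csrc/jit/mobile/import.h>
    #include <torch/torch.h>

    int main() {
      // Load the mobile module written by aot_model_compiler (--output_model).
      auto module = torch::jit::_load_for_mobile("model.compiled.pt");
      // The input shape must match the --input_dims used at compile time.
      std::vector<c10::IValue> inputs{torch::rand({1, 3, 224, 224})};
      auto output = module.forward(inputs);
      std::cout << output.toTensor().sizes() << std::endl;
      return 0;
    }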
 binaries/CMakeLists.txt                    |   3 +
 binaries/aot_model_compiler.cc             | 170 +++++++++++++++++++++++++++++
 tools/build_variables.bzl                  |   1 +
 torch/CMakeLists.txt                       |   6 +
 torch/csrc/jit/mobile/nnc/aot_compiler.cpp | 112 +++++++++++++++++++
 torch/csrc/jit/mobile/nnc/aot_compiler.h   |  23 ++++
 6 files changed, 315 insertions(+)
 create mode 100644 binaries/aot_model_compiler.cc
 create mode 100644 torch/csrc/jit/mobile/nnc/aot_compiler.cpp
 create mode 100644 torch/csrc/jit/mobile/nnc/aot_compiler.h

diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt
index 4dfe767..f048aba 100644
--- a/binaries/CMakeLists.txt
+++ b/binaries/CMakeLists.txt
@@ -108,3 +108,6 @@
 caffe2_binary_target("tutorial_blob.cc")
 caffe2_binary_target("dump_operator_names.cc")
 caffe2_binary_target("optimize_for_mobile.cc")
+
+caffe2_binary_target(aot_model_compiler "aot_model_compiler.cc")
+target_link_libraries(aot_model_compiler aot_compiler)

diff --git a/binaries/aot_model_compiler.cc b/binaries/aot_model_compiler.cc
new file mode 100644
index 0000000..d757af1
--- /dev/null
+++ b/binaries/aot_model_compiler.cc
@@ -0,0 +1,170 @@
+#include <fstream>
+#include <sstream>
+
+#include <c10/util/Flags.h>
+#include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/backends/backend.h>
+#include <torch/csrc/jit/backends/backend_detail.h>
+#include <torch/csrc/jit/backends/backend_preprocess.h>
+#include <torch/csrc/jit/mobile/nnc/aot_compiler.h>
+#include <torch/csrc/jit/passes/freeze_module.h>
+#include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
+#include <torch/csrc/jit/serialization/import.h>
+
+
+C10_DEFINE_string(model, "", "The torch script model to optimize.");
+C10_DEFINE_string(model_name, "", "The name of the model.");
+C10_DEFINE_string(model_version, "", "The version of the model.");
+C10_DEFINE_string(
+    input_dims,
+    "",
+    "For input float TensorCPUs, specify the dimension using comma "
+    "separated numbers. If multiple inputs needed, use semicolon "
+    "to separate the dimension of different tensors.");
+C10_DEFINE_string(
+    output_llvm,
+    "",
+    "Name of the output llvm assembly to be saved.");
+C10_DEFINE_string(output_model, "", "Name of the output model to be saved.");
+
+namespace {
+
+std::vector<std::string> split(
+    char separator,
+    const std::string& string,
+    bool ignore_empty = true) {
+  std::vector<std::string> pieces;
+  std::stringstream ss(string);
+  std::string item;
+  while (getline(ss, item, separator)) {
+    if (!ignore_empty || !item.empty()) {
+      pieces.push_back(std::move(item));
+    }
+  }
+  return pieces;
+}
+
+std::vector<std::vector<int64_t>> parseInputShapes() {
+  CAFFE_ENFORCE_GT(FLAGS_input_dims.size(), 0, "Input dims must be specified.");
+  std::vector<std::string> input_dims_list = split(';', FLAGS_input_dims);
+  std::vector<std::vector<int64_t>> inputs;
+  for (const auto& input_dims_item : input_dims_list) {
+    auto input_dims_str = split(',', input_dims_item);
+    std::vector<int64_t> input_dims;
+    input_dims.reserve(input_dims_str.size());
+    for (const auto& s : input_dims_str) {
+      input_dims.push_back(c10::stoi(s));
+    }
+    inputs.push_back(input_dims);
+  }
+  return inputs;
+}
+
+c10::Dict<c10::IValue, c10::IValue> createCompileSpec() {
+  c10::Dict<c10::IValue, c10::IValue> compile_spec(
+      c10::StringType::get(), c10::AnyType::get());
+  c10::Dict<c10::IValue, c10::IValue> method_spec(
+      c10::StringType::get(), c10::AnyType::get());
+  auto input_shapes = parseInputShapes();
+  TORCH_CHECK(
+      input_shapes.size() == 1,
+      "Wrong # of input shapes: ",
+      input_shapes.size());
+  method_spec.insert("sizes", input_shapes[0]); // TODO: support multiple inputs
+  compile_spec.insert("forward", method_spec);
+  return compile_spec;
+}
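+
+// For example, --input_dims='1,3,224,224' produces a compile spec equivalent
+// to {"forward": {"sizes": [1, 3, 224, 224]}}.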
+
+std::vector<int64_t> getInputSizesForMethod(
+    const c10::Dict<c10::IValue, c10::IValue>& method_compile_spec,
+    const std::string& method_name) {
+  return method_compile_spec.at(method_name)
+      .toGenericDict()
+      .at("sizes")
+      .toIntVector();
+}
+
+std::string getNncKernelId(const std::string& method_name) {
+  // TODO: calculate the version_token.
+  const std::string version_token = "VERTOKEN";
+  return FLAGS_model_name + ":" + FLAGS_model_version + ":" + method_name +
+      ":" + version_token;
+}
+
+void writeOutputLlvmAssembly(const std::string& asm_code) {
+  std::string output_llvm_file_name = FLAGS_output_llvm;
+  if (output_llvm_file_name.empty()) {
+    output_llvm_file_name =
+        FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.ll";
+  }
+
+  std::ofstream output(output_llvm_file_name);
+  output << asm_code;
+}
+
+c10::IValue preprocess(
+    const torch::jit::Module& mod,
+    const c10::Dict<c10::IValue, c10::IValue>& method_compile_spec,
+    const torch::jit::BackendDebugHandleGenerator& generate_debug_handles) {
+  const std::string& method_name = "forward";
+  auto method = mod.get_method(method_name);
+  auto graph = method.function().graph()->copy();
+  auto sizes = getInputSizesForMethod(method_compile_spec, method_name);
+
+  std::string llvm_asm_code;
+  auto func = torch::jit::mobile::nnc::aotCompile(
+      method_name, graph, sizes, &llvm_asm_code);
+  writeOutputLlvmAssembly(llvm_asm_code);
+
+  func->set_nnc_kernel_id(getNncKernelId(method_name));
+
+  torch::jit::mobile::nnc::CompilationUnit cu;
+  cu.register_function(std::move(func));
+  return cu.serialize();
+}
+
+static auto reg = torch::jit::backend_preprocess_register("nnc", preprocess);
+
+} // namespace
+
+int main(int argc, char** argv) {
+  c10::SetUsageMessage(
+      "Run NNC AOT compiler for pytorch model. Example usage:\n"
+      "build/bin/aot_model_compiler"
+      " --model=<model file>"
+      " --model_name=<model name>"
+      " --model_version=<model version>"
+      " --input_dims='1,3,224,224'"
+      " [--output_llvm=<llvm assembly output file>]"
+      " [--output_model=<output model file>]");
+
+  if (!c10::ParseCommandLineFlags(&argc, &argv)) {
+    std::cerr << "Failed to parse command line flags!" << std::endl;
+    std::cout << c10::UsageMessage() << std::endl;
+    return 1;
+  }
+
+  CAFFE_ENFORCE(!FLAGS_model.empty(), c10::UsageMessage());
+
+  std::string output_model_name = FLAGS_output_model;
+  if (output_model_name.empty()) {
+    output_model_name =
+        FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.pt";
+  }
+
+  auto m = torch::jit::load(FLAGS_model);
+  m.eval();
+  auto frozen_m = torch::jit::freeze_module(m.clone());
+  auto graph = frozen_m.get_method("forward").graph();
+  torch::jit::OptimizeFrozenGraph(graph, true);
+
+  auto compile_spec = createCompileSpec();
+  auto any_dict_ty =
+      c10::DictType::create(c10::StringType::get(), c10::AnyType::get());
+  auto compiled_module = torch::jit::detail::codegen_backend_module(
+      "nnc", frozen_m, compile_spec, any_dict_ty);
+  compiled_module._save_for_mobile(output_model_name);
+  std::cout << "The compiled model was saved to " << output_model_name
+            << std::endl;
+  return 0;
+}

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index a139515..30ee081 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -183,6 +183,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/ir/subgraph_matcher.cpp",
     "torch/csrc/jit/jit_log.cpp",
     "torch/csrc/jit/jit_opt_limit.cpp",
+    "torch/csrc/jit/mobile/nnc/aot_compiler.cpp",
     "torch/csrc/jit/mobile/nnc/backend.cpp",
     "torch/csrc/jit/mobile/nnc/context.cpp",
     "torch/csrc/jit/mobile/nnc/registry.cpp",

diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 7c08685..4de3346 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -423,3 +423,9 @@ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     # Pybind11 requires explicit linking of the torch_python library
     target_link_libraries(nnapi_backend torch torch_python)
 endif()
+
+if(BUILD_BINARY)
+  add_library(aot_compiler SHARED
+    ${TORCH_SRC_DIR}/csrc/jit/mobile/nnc/aot_compiler.cpp
+  )
+endif()

diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp
new file mode 100644
index 0000000..0790fdf
--- /dev/null
+++ b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp
@@ -0,0 +1,112 @@
+#include <torch/csrc/jit/mobile/nnc/aot_compiler.h>
+
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+#include <torch/csrc/jit/passes/peephole.h>
+#include <torch/csrc/jit/passes/remove_mutation.h>
+#include <torch/csrc/jit/passes/shape_analysis.h>
+#include <torch/csrc/jit/passes/symbolic_shape_analysis.h>
+#include <torch/csrc/jit/tensorexpr/graph_opt.h>
+#include <torch/csrc/jit/tensorexpr/ir.h>
+#include <torch/csrc/jit/tensorexpr/kernel.h>
+
+using namespace torch::jit;
+using namespace torch::jit::tensorexpr;
+
+namespace torch {
+namespace jit {
+namespace mobile {
+namespace nnc {
+
+std::vector<int64_t> getConstSizes(const BufPtr b) {
+  std::vector<int64_t> r;
+  for (const auto& dim : b->dims()) {
+    LongImmPtr imm_dim = to<LongImm>(dim);
+    // TODO: assert it's actually immediate
+    int64_t s = imm_dim->value();
+    r.push_back(s);
+  }
+  return r;
+}
+
+void getCompiledFunction(
+    std::shared_ptr<tensorexpr::TensorExprKernel> kernel,
+    Function* func) {
+  std::vector<at::Tensor> parameters;
+
+  auto const_descriptors = kernel->getConstantDescriptors();
+  for (const auto& cd : const_descriptors) {
+    auto sizes = getConstSizes(cd.buf);
+    at::Tensor const_tensor = at::from_blob(cd.ptr, sizes).clone();
+    parameters.push_back(const_tensor);
+  }
+  func->set_parameters(c10::impl::toList(c10::List<at::Tensor>(parameters)));
+
+  MemoryPlan plan;
+  plan.buffer_sizes_ = {}; // temp_sizes_;
+  // TODO: implement prealloc optimization and fill in temp_sizes
+  func->set_memory_plan(plan);
+
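+  // Note: the indexing below assumes the kernel's buffer arguments are laid
+  // out as [inputs..., outputs...], so the output buffers occupy indices
+  // [n_inputs, n_inputs + n_outputs).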
+  int64_t n_inputs = kernel->graph()->inputs().size();
+  int64_t n_outputs = kernel->graph()->outputs().size();
+  std::vector<OutputSpec> out_spec;
+  for (int64_t idx = n_inputs; idx < n_inputs + n_outputs; idx++) {
+    const auto& ba = kernel->getBufferArgs()[idx];
+    OutputSpec output;
+    output.sizes_ = getConstSizes(ba.buf());
+    // TODO: assert the output is a buffer and not a scalar
+    // TODO: use actual dtype
+    output.dtype_ = c10::ScalarType::Float;
+    out_spec.push_back(output);
+  }
+  func->set_output_specs(out_spec);
+}
+
+std::unique_ptr<Function> aotCompile(
+    const std::string& method_name,
+    std::shared_ptr<Graph>& g,
+    const std::vector<int64_t>& sizes,
+    std::string* compiled_assembly) {
+  auto g2 = g->copy();
+  GRAPH_DEBUG("Input sizes ", sizes);
+
+  RemoveTensorMutation(g);
+  EliminateDeadCode(g->block());
+  g = tensorexpr::removeUnusedSelfArgument(g);
+  GRAPH_DUMP("graph before shape propagation ", g);
+
+  std::vector<c10::optional<at::Tensor>> example_inputs = {at::rand(sizes)};
+  tensorexpr::annotateInputShapes(g, example_inputs);
+
+  PropagateShapesOnGraph(g);
+  PeepholeOptimize(g, false);
+  ConstantPropagation(g);
+  PropagateShapesOnGraph(g);
+  GRAPH_DUMP("graph after shape propagation ", g);
+
+  std::shared_ptr<tensorexpr::TensorExprKernel> kernel =
+      std::make_shared<tensorexpr::TensorExprKernel>(g);
+  *compiled_assembly = kernel->getCodeText();
+
+  g = g2;
+
+  auto func = std::make_unique<Function>();
+  func->set_name(method_name);
+
+  InputSpec input;
+  input.sizes_ = sizes;
+  input.dtype_ = c10::ScalarType::Float;
+  func->set_input_specs({input});
+
+  getCompiledFunction(kernel, func.get());
+  return func;
+}
+
+} // namespace nnc
+} // namespace mobile
+} // namespace jit
+} // namespace torch

diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.h b/torch/csrc/jit/mobile/nnc/aot_compiler.h
new file mode 100644
index 0000000..71f6d92
--- /dev/null
+++ b/torch/csrc/jit/mobile/nnc/aot_compiler.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/mobile/nnc/context.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+namespace nnc {
+
+// Performs ahead-of-time compilation of a given method in a model,
+// returning the compiled function and its LLVM assembly code.
+TORCH_API std::unique_ptr<Function> aotCompile(
+    const std::string& method_name,
+    std::shared_ptr<Graph>& subgraph,
+    const std::vector<int64_t>& sizes,
+    std::string* compiled_assembly);
+
+} // namespace nnc
+} // namespace mobile
+} // namespace jit
+} // namespace torch
-- 
2.7.4