xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
+ // Builds an XLA allocator for the device.
+ XlaAllocator xla_allocator(client->platform(), ctx);
+
XlaCompiler::Options options;
options.client = client;
options.device_type = &cache->device_type();
options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
options.graph_def_version = ctx->function_library()->graph_def_version();
options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+ options.device_allocator = &xla_allocator;
const XlaCompiler::CompilationResult* kernel;
xla::LocalExecutable* executable;
VLOG(1) << "Executing XLA Computation...";
- // Builds an XLA allocator for the device.
- XlaAllocator xla_allocator(client->platform(), ctx);
-
std::unique_ptr<xla::ShapedBuffer> output;
// Build xla::ShapedBuffers that point directly to the Tensor buffers.
std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
xla::ExecutableBuildOptions build_options;
build_options.set_device_ordinal(client_->default_device_ordinal());
build_options.set_result_layout(result.xla_output_shape);
+ build_options.set_device_allocator(options.device_allocator);
auto compile_result =
client_->Compile(*result.computation, argument_layouts, build_options);
// device is created, and can be used to create metadata objects
// that can be accessed by XLA op kernels.
std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+
+ // If not nullptr, this memory allocator can be used by the compiler for
+ // temporary allocations it might want to make during compilation.
+ //
+ // For example, the compiler may want to try out different algorithms and
+ // choose the fastest one, and it might run those algorithms over buffers
+ // created using this allocator.
+ //
+ // The compiler can function correctly without an explicit allocator given
+ // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
+ // allocate most or all available memory on the device, leaving none for the
+ // compiler to access, unless it can use TensorFlow's allocator.
+ xla::DeviceMemoryAllocator* device_allocator = nullptr;
};
explicit XlaCompiler(Options options);
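A minimal usage sketch (editorial note, not part of the patch) that mirrors the launch-op hunk at the top of this change; `XlaAllocator`, `client`, and `ctx` are the names already used there, and the allocator must stay alive for the duration of compilation:

    // Sketch: wire a device allocator into the compiler options.
    XlaAllocator xla_allocator(client->platform(), ctx);  // must outlive compilation
    XlaCompiler::Options options;
    options.client = client;
    options.device_allocator = &xla_allocator;  // optional; may be left nullptr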
return result_layout_set_ ? &result_layout_ : nullptr;
}
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
+ DeviceMemoryAllocator* allocator) {
+ device_allocator_ = allocator;
+ return *this;
+}
+
+DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+ return device_allocator_;
+}
+
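Because the new setter returns *this, it composes with the existing builder-style setters. A short sketch (assuming set_device_ordinal follows the same chaining convention as set_result_layout; `result_shape` and `xla_allocator` are placeholder names):

    ExecutableBuildOptions build_options;
    build_options.set_device_ordinal(/*device_ordinal=*/0)
        .set_result_layout(result_shape)
        .set_device_allocator(&xla_allocator);  // optional; nullptr means no allocator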
namespace {
StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
Backend* backend) {
int device_ordinal = options.device_ordinal() == -1
? default_device_ordinal()
: options.device_ordinal();
- TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
- local_service_->CompileExecutable(
- computation.handle(), argument_layouts,
- options.result_layout(), device_ordinal));
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<Executable> executable,
+ local_service_->CompileExecutable(computation.handle(), argument_layouts,
+ options.result_layout(), device_ordinal,
+ options.device_allocator()));
return WrapUnique(new LocalExecutable(std::move(executable),
local_service_->mutable_backend(),
device_ordinal, options));
ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
const Shape* result_layout() const;
+ // If set, this specifies an allocator that can be used to allocate temporary
+ // space on the device during compilation. For example, the compiler might
+ // want to run various algorithms on the device and pick the fastest one -- it
+ // might allocate buffers for use by these algorithms using this allocator.
+ //
+ // This does not need to be the same as the DeviceMemoryAllocator passed when
+ // running the executable.
+ ExecutableBuildOptions& set_device_allocator(
+ DeviceMemoryAllocator* allocator);
+ DeviceMemoryAllocator* device_allocator() const;
+
private:
int device_ordinal_ = -1;
Shape result_layout_;
bool result_layout_set_ = false;
+ DeviceMemoryAllocator* device_allocator_ = nullptr;
};
class LocalExecutable {
// Returns the ID of the platform to which these options apply.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+ // Optional allocator that may be used for allocating temp space on the device
+ // during compilation.
+ DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
+ void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+ device_allocator_ = device_allocator;
+ }
+
protected:
AotCompilationOptions() = default;
+
+ private:
+ DeviceMemoryAllocator* device_allocator_ = nullptr;
};
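On the AOT path, a backend's CompileAheadOfTime can read the allocator back off the options it is given. A two-line sketch of a hypothetical call site, not in this patch:

    // Inside a backend's CompileAheadOfTime(modules, options):
    DeviceMemoryAllocator* allocator = options.device_allocator();  // nullptr if unset
    // Thread `allocator` into the same temp-allocation machinery the JIT path uses.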
// Abstract compiler interface that is subclassed for compilation on a
// Runs HLO passes to optimize the given HLO module and returns the optimized
// module.
+ //
+ // If device_allocator is not null, the compiler may use it to allocate temp
+ // space on the device for use during compilation. For example, the compiler
+ // may allocate buffers on the device and then run variants of a given
+ // algorithm over those buffers, to see which variant is fastest. Any space
+ // allocated should be deallocated before this function returns.
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* executor) = 0;
+ perftools::gputools::StreamExecutor* executor,
+ DeviceMemoryAllocator* device_allocator) = 0;
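To make the contract concrete, here is a hedged sketch of a hypothetical backend honoring it: allocate scratch space through the provided allocator, use it while timing candidate algorithms, and release it before returning. `MyCompiler` is invented, and the Allocate/Deallocate signatures on DeviceMemoryAllocator are assumptions, not quoted from this patch.

    StatusOr<std::unique_ptr<HloModule>> MyCompiler::RunHloPasses(
        std::unique_ptr<HloModule> module,
        perftools::gputools::StreamExecutor* stream_exec,
        DeviceMemoryAllocator* device_allocator) {
      if (device_allocator != nullptr) {
        // Assumed interface: Allocate(device_ordinal, size) -> StatusOr<DeviceMemoryBase>
        // and Deallocate(device_ordinal, &buffer).
        TF_ASSIGN_OR_RETURN(
            perftools::gputools::DeviceMemoryBase scratch,
            device_allocator->Allocate(stream_exec->device_ordinal(),
                                       /*size=*/1 << 20));
        // ... time candidate algorithms over `scratch` and pick the fastest ...
        TF_RETURN_IF_ERROR(device_allocator->Deallocate(
            stream_exec->device_ordinal(), &scratch));
      }
      return std::move(module);
    }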
// Compiles the HLO module for execution on a device given by the executor,
// and returns an executable object or an error status. No HLO passes are
// The compiler may optionally specialize to the individual device
// (not just type of device) indicated by the executor.
//
+ // device_allocator is optional; see RunHloPasses.
+ //
// Use the overload below to compile computations that run in parallel.
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* executor) = 0;
+ perftools::gputools::StreamExecutor* executor,
+ DeviceMemoryAllocator* device_allocator) = 0;
// Compiles a set of HLO modules that can run in parallel, potentially
// communicating data between the modules, and returns a corresponding
// sequence of executable objects.
//
+ // device_allocator is optional; see RunHloPasses.
+ //
// TODO(b/68666782): Remove this method after adding support for multiple
// modules to RunHloPasses and RunBackends.
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
- stream_exec) = 0;
+ stream_exec,
+ DeviceMemoryAllocator* device_allocator) = 0;
// Compiles the HLO module for ahead-of-time execution. This is intended for
// use in static compilation.
StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* /*stream_exec*/) {
+ perftools::gputools::StreamExecutor* /*stream_exec*/,
+ DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(2) << "Before optimization:";
XLA_VLOG_LINES(2, module->ToString());
StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* stream_exec) {
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* /*device_allocator*/) {
const string timer_message =
"Compiling [" + module->name() + "] for CPU using JIT";
XLA_SCOPED_LOGGING_TIMER(timer_message);
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
// Modifies the given HLO module so that it will be accepted by IrEmitter.
// Unlike optimization passes, these passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
+tensorflow::Status PrepareHloModuleForIrEmitting(
+ HloModule* hlo_module, se::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* /*device_allocator*/) {
// In some cases, we have to place the result of an instruction in a temporary
// buffer. For instance, the buffer that holds an external parameter is
// assumed immutable at this point, and should not be reused for output
.getPointerSize(0 /* default address space */)) {}
StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
- std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+ std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) {
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
Tracing::TraceMe annotation("HLO Transforms", module->name(),
/*is_expensive=*/true);
}
StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
- std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+ std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) {
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
TF_RET_CHECK(stream_exec != nullptr);
- TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
+ TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get(), stream_exec,
+ device_allocator));
llvm::LLVMContext llvm_context;
std::string buffer;
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
if (run_hlo_passes) {
TF_ASSIGN_OR_RETURN(
module, backend().compiler()->RunHloPasses(
- std::move(module), backend().default_stream_executor()));
+ std::move(module), backend().default_stream_executor(),
+ /*device_allocator=*/nullptr));
}
TF_ASSIGN_OR_RETURN(
std::unique_ptr<Executable> executable,
backend().compiler()->RunBackend(std::move(module),
- backend().default_stream_executor()));
+ backend().default_stream_executor(),
+ /*device_allocator=*/nullptr));
se::Stream stream(backend().default_stream_executor());
stream.Init();
}
StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
- std::unique_ptr<HloModule> hlo_module,
- se::StreamExecutor* /*stream_exec*/) {
+ std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
+ DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
return std::move(hlo_module);
}
StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
- std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
+ std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* /*device_allocator*/) {
TF_RET_CHECK(stream_exec != nullptr);
VLOG(1) << "Run backend " << hlo_module->name();
StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
- std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/) {
+ std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/,
+ DeviceMemoryAllocator* /*device_allocator*/) {
return tensorflow::errors::Unimplemented(
"Compilation of multiple HLO modules is not supported on Interpreter.");
}
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> hlo_module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> hlo_module,
- perftools::gputools::StreamExecutor* stream_exec) override;
+ perftools::gputools::StreamExecutor* stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
- stream_exec) override;
+ stream_exec,
+ DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
namespace xla {
StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> modules,
- std::vector<std::vector<perftools::gputools::StreamExecutor*>>
- stream_execs) {
+ std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+ DeviceMemoryAllocator* device_allocator) {
std::vector<std::unique_ptr<Executable>> result;
for (size_t i = 0; i < modules.size(); i++) {
if (stream_execs[i].size() != 1) {
"Model partitioning not implemented for the CPU/GPU compilers!");
}
- TF_ASSIGN_OR_RETURN(
- modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
+ TF_ASSIGN_OR_RETURN(modules[i],
+ RunHloPasses(std::move(modules[i]), stream_execs[i][0],
+ device_allocator));
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
- RunBackend(std::move(modules[i]), stream_execs[i][0]));
+ RunBackend(std::move(modules[i]), stream_execs[i][0],
+ device_allocator));
result.push_back(std::move(executable));
}
// Bring in
// StatusOr<std::unique_ptr<Executable>> RunBackend(
// std::unique_ptr<HloModule> module,
- // perftools::gputools::StreamExecutor* stream_exec)
+ // perftools::gputools::StreamExecutor* stream_exec,
+ // DeviceMemoryAllocator* device_allocator)
// StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
// std::unique_ptr<HloModule> module,
- // perftools::gputools::StreamExecutor* stream_exec)
+ // perftools::gputools::StreamExecutor* stream_exec,
+ // DeviceMemoryAllocator* device_allocator)
using Compiler::RunBackend;
using Compiler::RunHloPasses;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
- stream_execs) override;
+ stream_execs,
+ DeviceMemoryAllocator* device_allocator) override;
protected:
ModuleHook user_pre_optimization_hook_;
StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
const ComputationHandle& computation,
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
- const Shape* result_layout, int device_ordinal) {
+ const Shape* result_layout, int device_ordinal,
+ DeviceMemoryAllocator* device_allocator) {
TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
computation_tracker_.Resolve(computation));
VersionedComputationHandle versioned_handle =
execute_backend_->stream_executor(device_ordinal));
return BuildExecutable(versioned_handle, std::move(module_config),
- execute_backend_.get(), executor);
+ execute_backend_.get(), executor, device_allocator);
}
StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
// Builds an Executable with the given argument layouts and options. If
// result_layout is non-null, then the executable is compiled to produce a
- // result of the given layout.
+ // result of the given layout. If device_allocator is non-null, then the
+ // compiler may use it to allocate temp space on the device. The compiler is
+ // responsible for freeing any memory it allocates this way.
StatusOr<std::unique_ptr<Executable>> CompileExecutable(
const ComputationHandle& computation,
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
- const Shape* result_layout, int device_ordinal);
+ const Shape* result_layout, int device_ordinal,
+ DeviceMemoryAllocator* device_allocator);
// Returns the device ordinal that corresponds to the given replica number.
//
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
- std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors) {
+ std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+ DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);
// Dump computation proto state if flag is set.
TF_ASSIGN_OR_RETURN(
std::vector<std::unique_ptr<Executable>> executables,
- backend->compiler()->Compile(std::move(modules), std::move(executors)));
+ backend->compiler()->Compile(std::move(modules), std::move(executors),
+ device_allocator));
for (size_t i = 0; i < versioned_handles.size(); ++i) {
if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
const VersionedComputationHandle& versioned_handle,
- std::unique_ptr<HloModuleConfig> module_config,
- Backend* backend, se::StreamExecutor* executor) {
+ std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+ se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
versioned_handle.ToString().c_str());
TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
TF_ASSIGN_OR_RETURN(
- module, backend->compiler()->RunHloPasses(std::move(module), executor));
+ module, backend->compiler()->RunHloPasses(std::move(module), executor,
+ device_allocator));
- TF_ASSIGN_OR_RETURN(
- std::unique_ptr<Executable> executable,
- backend->compiler()->RunBackend(std::move(module), executor));
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+ backend->compiler()->RunBackend(
+ std::move(module), executor, device_allocator));
if (!other_directory_path.empty()) {
executable->set_session_module(std::move(session_module));
StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
- std::unique_ptr<HloModuleConfig> module_config,
- Backend* backend, perftools::gputools::StreamExecutor* executor,
- ExecutionProfile* profile) {
+ std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+ perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+ DeviceMemoryAllocator* device_allocator) {
std::shared_ptr<Executable> executable =
compilation_cache_.LookUp(versioned_handle, *module_config);
TF_ASSIGN_OR_RETURN(
std::unique_ptr<Executable> executable_unique_ptr,
BuildExecutable(versioned_handle, std::move(module_config), backend,
- executor));
+ executor, device_allocator));
if (profile != nullptr) {
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
// Build the user computations into HloModules and compile to generate the
// executables.
+ //
+ // TODO(jlebar): There's currently no way to pass a device allocator to
+ // ExecuteParallel, so we have to pass a null device_allocator below.
TF_ASSIGN_OR_RETURN(
std::vector<std::unique_ptr<Executable>> executables,
BuildExecutables(versioned_handles, std::move(module_configs),
- execute_backend_.get(), all_executors));
+ execute_backend_.get(), all_executors,
+ /*device_allocator=*/nullptr));
std::vector<Executable*> executable_ptrs;
executable_ptrs.reserve(executables.size());
for (const auto& executable : executables) {
const UserComputation& user_computation);
// Builds an Executable for the given parameters.
+ //
+ // If device_allocator is not null, the compiler may use it to allocate temp
+ // buffers, which the compiler is responsible for freeing. The allocator
+ // given here need not match the allocator used when running the executable.
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const VersionedComputationHandle& versioned_handle,
- std::unique_ptr<HloModuleConfig> module_config,
- Backend* backend, perftools::gputools::StreamExecutor* executor);
+ std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+ perftools::gputools::StreamExecutor* executor,
+ DeviceMemoryAllocator* device_allocator = nullptr);
// Same as BuildExecutable() above, but builds a list of Executables for the
// given computations that may interact with each other.
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
- std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors);
+ std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+ DeviceMemoryAllocator* device_allocator);
// Similar to BuildExecutable, but looks in the compilation cache for the
// executable first. If the executable is not in the cache, it is built and
// inserted into the cache.
StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
- std::unique_ptr<HloModuleConfig> module_config,
- Backend* backend, perftools::gputools::StreamExecutor* executor,
- ExecutionProfile* profile);
+ std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+ perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+ DeviceMemoryAllocator* device_allocator = nullptr);
// Runs the given executable with the given arguments and registers the result
// in the allocation tracker. The handle of the result from the tracker is
std::unique_ptr<HloModule> hlo_module) {
TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
std::move(hlo_module),
- backend().default_stream_executor()));
+ backend().default_stream_executor(),
+ /*device_allocator=*/nullptr));
return backend().compiler()->RunBackend(std::move(hlo_module),
- backend().default_stream_executor());
+ backend().default_stream_executor(),
+ /*device_allocator=*/nullptr);
}
StatusOr<std::unique_ptr<AotCompilationResult>>
ASSERT_TRUE(compiler
->RunBackend(std::move(hlo_module),
- backend_->default_stream_executor())
+ backend_->default_stream_executor(),
+ /*device_allocator=*/nullptr)
.ok());
// Test that hooks were called.
executors.push_back({backend_->default_stream_executor()});
executors.push_back({backend_->default_stream_executor()});
- EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors)));
+ EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors),
+ /*device_allocator=*/nullptr));
}
private:
layouts.push_back(&program_shape->parameters(i));
}
StatusOr<std::unique_ptr<Executable>> executable =
- local_service->CompileExecutable(computation.handle(), layouts,
- &program_shape->result(),
- /*device_ordinal=*/0);
+ local_service->CompileExecutable(
+ computation.handle(), layouts, &program_shape->result(),
+ /*device_ordinal=*/0, /*device_allocator=*/nullptr);
const HloModule& module = executable.ValueOrDie()->module();
layouts.push_back(&program_shape->parameters(i));
}
StatusOr<std::unique_ptr<Executable>> executable =
- local_service->CompileExecutable(computation.handle(), layouts,
- &program_shape->result(),
- /*device_ordinal=*/0);
+ local_service->CompileExecutable(
+ computation.handle(), layouts, &program_shape->result(),
+ /*device_ordinal=*/0, /*device_allocator=*/nullptr);
const HloModule& module = executable.ValueOrDie()->module();