From 29ae608baabb673fb0b2d3ced9321f0e1798f72e Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Sun, 10 May 2020 22:05:59 -0500 Subject: [PATCH] [RUNTIME] Hexagon driver for offloading kernels to simulator (#5492) * [RUNTIME] Hexagon driver for offloading kernels to simulator * Add sim_dev as external project when building with Hexagon/sim support * Change target CPU for sim_dev to v60 --- cmake/modules/Hexagon.cmake | 9 + src/runtime/hexagon/sim/driver/CMakeLists.txt | 62 +++ src/runtime/hexagon/sim/driver/README.md | 38 ++ src/runtime/hexagon/sim/driver/fake_pthread.cc | 292 +++++++++++++ src/runtime/hexagon/sim/driver/pthread.h | 96 +++++ src/runtime/hexagon/sim/driver/sched.h | 31 ++ src/runtime/hexagon/sim/driver/sim_device.cc | 573 +++++++++++++++++++++++++ src/runtime/threading_backend.cc | 11 + 8 files changed, 1112 insertions(+) create mode 100644 src/runtime/hexagon/sim/driver/CMakeLists.txt create mode 100644 src/runtime/hexagon/sim/driver/README.md create mode 100644 src/runtime/hexagon/sim/driver/fake_pthread.cc create mode 100644 src/runtime/hexagon/sim/driver/pthread.h create mode 100644 src/runtime/hexagon/sim/driver/sched.h create mode 100644 src/runtime/hexagon/sim/driver/sim_device.cc diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index e70a964..30b4ccb 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+include(ExternalProject) + set(PICK_SIM "sim") set(PICK_HW "target") set(PICK_NONE "OFF") @@ -77,6 +79,13 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}") include_directories("${HEXAGON_TOOLCHAIN}/include/iss") link_directories("${HEXAGON_TOOLCHAIN}/lib/iss") list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper") + ExternalProject_Add(sim_dev + SOURCE_DIR "${CMAKE_SOURCE_DIR}/src/runtime/hexagon/sim/driver" + CMAKE_ARGS + "-DCMAKE_C_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang" + "-DCMAKE_CXX_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang++" + INSTALL_COMMAND "true" + ) elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") find_hexagon_sdk_root() find_hexagon_toolchain() diff --git a/src/runtime/hexagon/sim/driver/CMakeLists.txt b/src/runtime/hexagon/sim/driver/CMakeLists.txt new file mode 100644 index 0000000..8632b49 --- /dev/null +++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# cmake_minimum_required must come before project() so that the policy
+# settings are in effect when project() configures the toolchain.
+cmake_minimum_required(VERSION 3.0.2)
+project(SIM_DEV C CXX)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+# Optional per-build overrides.
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
+  include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
+endif()
+
+set(EXTRA_CXX_FLAGS
+  "-O2"
+  "-Wno-format"
+  "-mhvx -mhvx-length=128b"
+  "-mv60"
+  "-stdlib=libc++"
+)
+
+set(EXTRA_LINK_FLAGS
+  "-stdlib=libc++"
+  "-G0"
+  "-Wl,--force-dynamic"
+  "-Wl,--export-dynamic"
+  "-Wl,--whole-archive"   # This should link entire libc, libc++ and libc++abi.
+  "-Wl,--defsym=HEAP_SIZE=0x40000000"
+)
+
+# The flag lists above are CMake lists; flatten them into space-separated
+# strings before prepending them to the compiler/linker flag variables.
+string(REGEX REPLACE ";" " " EXTRA_CXX_FLAGS_STR "${EXTRA_CXX_FLAGS}")
+string(REGEX REPLACE ";" " " EXTRA_LINK_FLAGS_STR "${EXTRA_LINK_FLAGS}")
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "${EXTRA_CXX_FLAGS_STR} ${CMAKE_CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${EXTRA_LINK_FLAGS_STR} ${CMAKE_EXE_LINKER_FLAGS}")
+
+# Set project properties.
+
+file(GLOB SOURCE_FILES "*.cc")
+add_executable(sim_dev ${SOURCE_FILES})
+target_include_directories(sim_dev
+  PUBLIC "."
+  PUBLIC ".."
+  PUBLIC "../../../../../include"
+  PUBLIC "../../../../../3rdparty/dlpack/include"
+)
+
+target_link_libraries(sim_dev "-ldl")
diff --git a/src/runtime/hexagon/sim/driver/README.md b/src/runtime/hexagon/sim/driver/README.md
new file mode 100644
index 0000000..3aee1a1
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/README.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Hexagon simulator driver
+
+The driver (`sim_dev` executable) is the process running on the Hexagon simulator that handles the Hexagon-side communication with the TVM runtime running on x86. The location of `sim_dev` should be added to `PATH` before running any python code that uses Hexagon. The `sim_dev` executable is not intended to be run by users; it is automatically loaded by the simulator control code (in `hexagon_device_sim.cc`).
+
+### Prerequisites
+
+1. Hexagon C/C++ toolchain (such as the one in Hexagon SDK version 3.5.0 or later).
+ +Hexagon SDK is available at https://developer.qualcomm.com/software/hexagon-dsp-sdk. + +### Configuring + +Set +``` +CMAKE_C_COMPILER=hexagon-clang +CMAKE_CXX_COMPILER=hexagon-clang++ +``` + +### Building + +There are no special options required for `make` (or the tool selected with `cmake`). The location of the resulting binary `sim_dev` should be added to `PATH`. diff --git a/src/runtime/hexagon/sim/driver/fake_pthread.cc b/src/runtime/hexagon/sim/driver/fake_pthread.cc new file mode 100644 index 0000000..74090d0 --- /dev/null +++ b/src/runtime/hexagon/sim/driver/fake_pthread.cc @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "pthread.h" +#include "sched.h" + +/*! + * Implementation of a subset of pthread API for single-threaded execution. + * + * The main idea is that the thread function ("start_routine" in the call + * to pthread_create) is executed immediately. When pthread_create returns, + * the thread function has already finished. 
+ * + * Since the thread routine can itself call pthread_create, it is possible + * to have multiple threads existing at the same time, although only the + * last one is running. + * + * There are two main things that need to be taken care of: + * - thread-specific data, i.e. pthread_setspecific, pthread_getspecific, + * and the handling of thread keys, + * - handling of thread return values. + * + * Threads are identified by thread ids (of type pthread_t). The main process + * thread has the id of 0, the remaining threads have ids starting at 1 and + * incrementing by 1. For each thread there is some data (thread_info_t) + * associated with it, and stored in the "thread_data" map. When a thread + * terminates, the corresponding entry from "thread_data" cannot be removed + * until the return value is claimed (pthread_join), unless it is explicitly + * discarded (pthread_detach). When a new thread is created, it gets the + * first available id for which there is no entry in "thread_data". This + * could be an id that was never allocated, or an id that was used, but + * has since been removed from the map. + * A thread can terminate through pthread_exit. This means that when the + * thread function calls pthread_exit, the execution should return to the + * pthread_create call that ran it. This is implemented via setjmp/longjmp + * (neither longjmp nor pthread_exit unwind the stack). + * + * Mutexes and condition variables cannot block, or else it would cause + * a deadlock. Since there is only one thread running at a time, locking + * a mutex or waiting for a condition always succeeds (returns immediately). 
+ */
+
+/*! \brief Value and destructor stored for one thread-specific key. */
+struct key_entry_t {
+  key_entry_t(void* v, void (*d)(void*)) : value(v), dtor(d) {}
+  void* value = nullptr;
+  void (*dtor)(void*) = nullptr;
+};
+
+/*! \brief Bookkeeping kept for every thread id present in "thread_data". */
+struct thread_info_t {
+  thread_info_t() = default;
+  std::map<pthread_key_t, key_entry_t> keys;  // Thread-specific data, by key.
+  std::jmp_buf env;           // Where pthread_exit jumps to for this thread.
+  void* ret_value = nullptr;  // Handed out by pthread_join.
+  bool finished = false;
+  bool detached = false;
+};
+
+static pthread_t main_thread_id = 0;
+
+static std::map<pthread_t, thread_info_t> thread_data = {
+    // Reserve the 0th entry.
+    {main_thread_id, {}}};
+
+// Stack of thread ids; the last element is the thread currently executing.
+static std::vector<pthread_t> running_threads = {main_thread_id};
+
+/*!
+ * \brief Return the smallest key >= 1 that has no entry in the map.
+ *
+ * The scan starts at the first entry whose key is at least 1. Starting at
+ * m.begin() would stop immediately on the permanent key-0 entry of
+ * "thread_data" and hand out id 1 every time, even while it is in use.
+ */
+template <typename K, typename V>
+K first_available_key(const std::map<K, V>& m) {
+  K key = 1;
+  for (auto i = m.lower_bound(key), e = m.end(); i != e && key == i->first;
+       ++i, ++key) {
+  }
+  return key;
+}
+
+// Condition variables and mutexes never block (see the comment at the top of
+// the file), so all of the functions below simply report success.
+
+int pthread_cond_destroy(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_init(pthread_cond_t* __restrict cond,
+                      const pthread_condattr_t* __restrict attr) {
+  return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_broadcast(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_timedwait(pthread_cond_t* __restrict cond,
+                           pthread_mutex_t* __restrict mutex,
+                           const struct timespec* __restrict abstime) {
+  return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t* __restrict cond,
+                      pthread_mutex_t* __restrict mutex) {
+  return 0;
+}
+
+int pthread_mutexattr_init(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_destroy(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type) {
+  return 0;
+}
+
+int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr,
+                              int* __restrict type) {
+  *type = PTHREAD_MUTEX_NORMAL;
+  return 0;
+}
+
+int pthread_mutex_init(pthread_mutex_t* __restrict mutex,
+                       const pthread_mutexattr_t* __restrict attr) {
+  return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_lock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_trylock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_unlock(pthread_mutex_t* mutex) { return 0; }
+
+/*! \brief Run init_routine the first time it is requested for once_control. */
+int pthread_once(pthread_once_t* once_control, void (*init_routine)(void)) {
+  static_assert(PTHREAD_ONCE_INIT != PTHREAD_ONCE_DONE,
+                "PTHREAD_ONCE_INIT must be different from PTHREAD_ONCE_DONE");
+  if (*once_control == PTHREAD_ONCE_INIT) {
+    init_routine();
+    *once_control = PTHREAD_ONCE_DONE;
+  }
+  return 0;
+}
+
+int pthread_equal(pthread_t t1, pthread_t t2) { return t1 == t2; }
+
+/*!
+ * \brief Create a thread by running start_routine(arg) to completion.
+ *
+ * \param thread Receives the id of the new thread.
+ * \param attr Ignored.
+ * \param start_routine Thread function, executed immediately.
+ * \param arg Argument passed to start_routine.
+ * \return Always 0.
+ */
+int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                   void* (*start_routine)(void*), void* arg) {
+  // Register the new thread before calling setjmp. pthread_exit jumps to the
+  // "env" of the thread that is exiting, so the jump target has to be stored
+  // in the new thread's own entry (not in the creator's). None of the locals
+  // below are modified after setjmp, so they keep their values when the
+  // function is re-entered through longjmp.
+  const pthread_t tid = first_available_key(thread_data);
+  *thread = tid;
+  thread_info_t& thr = thread_data[tid];
+  running_threads.push_back(tid);
+
+  if (setjmp(thr.env) == 0) {
+    thr.ret_value = start_routine(arg);
+  }
+  // Reached either after start_routine returned, or via longjmp from
+  // pthread_exit (which has already stored the return value).
+  thr.finished = true;
+
+  // Destroy all keys. This happens while the exiting thread is still the
+  // current one, so that destructors operate on this thread's keys.
+  bool repeat = true;
+  size_t iter = 0;
+  while (repeat && iter++ < PTHREAD_DESTRUCTOR_ITERATIONS) {
+    repeat = false;
+    // Assume that destructors can create new keys (i.e. modify the map).
+    for (size_t k = 0; k != PTHREAD_KEYS_MAX; ++k) {
+      auto f = thr.keys.find(k);
+      if (f == thr.keys.end()) {
+        continue;
+      }
+      key_entry_t& key = f->second;
+      if (key.dtor == nullptr || key.value == nullptr) {
+        continue;
+      }
+      // Clear the value before calling the destructor (as POSIX specifies),
+      // otherwise the same pointer would be destroyed again on every pass.
+      void* value = key.value;
+      key.value = nullptr;
+      key.dtor(value);
+      repeat = true;
+    }
+  }
+
+  running_threads.pop_back();
+
+  if (thr.detached) {
+    thread_data.erase(tid);
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief Claim the return value of a finished thread and release its id.
+ * \return 0 on success, ESRCH for an unknown id, EDEADLK if the thread has
+ *         not finished (waiting for it would never return).
+ */
+int pthread_join(pthread_t thread, void** retval) {
+  auto f = thread_data.find(thread);
+  if (f == thread_data.end()) {
+    return ESRCH;
+  }
+  thread_info_t& thr = f->second;
+  if (!thr.finished) {
+    return EDEADLK;
+  }
+  if (retval != nullptr) {
+    *retval = thr.ret_value;
+  }
+  thread_data.erase(f);
+  return 0;
+}
+
+/*!
+ * \brief Discard the return value of a thread.
+ * \return 0 on success, ESRCH for an unknown id.
+ */
+int pthread_detach(pthread_t thread) {
+  auto f = thread_data.find(thread);
+  if (f == thread_data.end()) {
+    return ESRCH;
+  }
+  if (f->second.finished) {
+    // Nobody will join this thread anymore: release the entry right away,
+    // since pthread_create only erases entries detached before they finish.
+    thread_data.erase(f);
+  } else {
+    // Can discard the return value. The entry is erased by pthread_create
+    // once the thread function is done.
+    f->second.detached = true;
+  }
+  return 0;
+}
+
+/*!
+ * \brief Terminate the calling thread, returning control to the
+ *        pthread_create call that is running it.
+ */
+void pthread_exit(void* retval) {
+  pthread_t sid = pthread_self();
+  if (sid != main_thread_id) {
+    thread_info_t& self = thread_data[sid];
+    self.ret_value = retval;
+    self.finished = true;
+    longjmp(self.env, 1);
+  }
+  exit(0);  // Only executes for the main thread, plus silences
+            // the "should not return" warning.
+}
+
+/*!
+ * \brief Allocate a key in the calling thread's key table.
+ * \return 0 on success, EINVAL if key is null, EAGAIN if no key is free.
+ */
+int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  if (key == nullptr) {
+    return EINVAL;
+  }
+  auto& keys = thread_data[pthread_self()].keys;
+  pthread_key_t k = first_available_key(keys);
+  if (k >= PTHREAD_KEYS_MAX) {
+    return EAGAIN;
+  }
+  *key = k;
+  keys.emplace(k, key_entry_t{nullptr, destructor});
+  return 0;
+}
+
+int pthread_key_delete(pthread_key_t key) {
+  auto& keys = thread_data[pthread_self()].keys;
+  auto f = keys.find(key);
+  if (f == keys.end()) {
+    return EINVAL;
+  }
+  // pthread_key_delete does not call key destructors.
+  keys.erase(f);
+  return 0;
+}
+
+int pthread_setspecific(pthread_key_t key, const void* value) {
+  auto& keys = thread_data[pthread_self()].keys;
+  auto f = keys.find(key);
+  if (f == keys.end()) {
+    return EINVAL;
+  }
+  f->second.value = const_cast<void*>(value);
+  return 0;
+}
+
+void* pthread_getspecific(pthread_key_t key) {
+  auto& keys = thread_data[pthread_self()].keys;
+  auto f = keys.find(key);
+  if (f != keys.end()) {
+    return f->second.value;
+  }
+  return nullptr;
+}
+
+pthread_t pthread_self(void) { return running_threads.back(); }
+
+int sched_yield(void) { return 0; }
+
+// Give nanosleep C linkage. The guard must test __cplusplus; a misspelled
+// macro name would silently drop the declaration.
+#ifdef __cplusplus
+extern "C" int nanosleep(const struct timespec* req, struct timespec* rem);
+#endif
+
+int nanosleep(const struct timespec* req, struct timespec* rem) { return 0; }
diff --git a/src/runtime/hexagon/sim/driver/pthread.h b/src/runtime/hexagon/sim/driver/pthread.h
new file mode 100644
index 0000000..1748d61
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/pthread.h
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ +#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ + +#define _PROVIDE_POSIX_TIME_DECLS 1 +#include +#undef _PROVIDE_POSIX_TIME_DECLS + +typedef int pthread_t; +typedef int pthread_attr_t; +typedef int pthread_cond_t; +typedef int pthread_condattr_t; +typedef int pthread_key_t; +typedef int pthread_mutex_t; +typedef int pthread_mutexattr_t; +typedef int pthread_once_t; + +enum { + PTHREAD_COND_INITIALIZER, + PTHREAD_MUTEX_DEFAULT, + PTHREAD_MUTEX_ERRORCHECK, + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_MUTEX_NORMAL, + PTHREAD_MUTEX_RECURSIVE, + PTHREAD_ONCE_INIT = 0, // Must be same as in QuRT + PTHREAD_ONCE_DONE, // Non-standard +}; + +const size_t PTHREAD_KEYS_MAX = 128; +const size_t PTHREAD_DESTRUCTOR_ITERATIONS = 4; + +#ifdef __cplusplus +extern "C" { +#endif +int pthread_cond_destroy(pthread_cond_t* cond); +int pthread_cond_init(pthread_cond_t* __restrict cond, + const pthread_condattr_t* __restrict attr); +int pthread_cond_signal(pthread_cond_t* cond); +int pthread_cond_broadcast(pthread_cond_t* cond); +int pthread_cond_timedwait(pthread_cond_t* __restrict cond, + pthread_mutex_t* __restrict mutex, + const struct timespec* __restrict abstime); +int pthread_cond_wait(pthread_cond_t* __restrict cond, + pthread_mutex_t* __restrict mutex); + +int pthread_mutexattr_init(pthread_mutexattr_t* attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t* attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr, + int* __restrict type); +int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type); + +int pthread_mutex_init(pthread_mutex_t* __restrict mutex, + const pthread_mutexattr_t* __restrict attr); +int pthread_mutex_destroy(pthread_mutex_t* mutex); +int pthread_mutex_lock(pthread_mutex_t* mutex); +int pthread_mutex_trylock(pthread_mutex_t* mutex); +int pthread_mutex_unlock(pthread_mutex_t* mutex); + +int pthread_once(pthread_once_t* once_control, void (*init_routine)(void)); +int 
pthread_equal(pthread_t t1, pthread_t t2); + +int pthread_create(pthread_t* thread, const pthread_attr_t* attr, + void* (*start_routine)(void*), void* arg); +int pthread_join(pthread_t thread, void** retval); +int pthread_detach(pthread_t thread); +void pthread_exit(void* retval) __attribute__((__noreturn__)); + +int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)); +int pthread_key_delete(pthread_key_t key); +int pthread_setspecific(pthread_key_t key, const void* value); +void* pthread_getspecific(pthread_key_t key); + +pthread_t pthread_self(void); +#ifdef __cplusplus +} +#endif + +#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ diff --git a/src/runtime/hexagon/sim/driver/sched.h b/src/runtime/hexagon/sim/driver/sched.h new file mode 100644 index 0000000..cc63630 --- /dev/null +++ b/src/runtime/hexagon/sim/driver/sched.h @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ +#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ + +#ifdef __cplusplus +extern "C" { +#endif +int sched_yield(void); +#ifdef __cplusplus +} +#endif + +#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ diff --git a/src/runtime/hexagon/sim/driver/sim_device.cc b/src/runtime/hexagon/sim/driver/sim_device.cc new file mode 100644 index 0000000..23dc053 --- /dev/null +++ b/src/runtime/hexagon/sim/driver/sim_device.cc @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + Required options: + -ldl -G0 For dlinit/dlopen/dlclose. + -Wl,--force-dynamic Make this a dynamic executable (with dynamic + symbol table). + -Wl,-E Export all defined symbols as dynamic. + -Wl,--whole-archive Link the entire contents of libc. + -mhvx -mhvx-length=128b Enable HVX. + -Wno-format Silence format warning (unsigned vs uint32_t). 
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "hexagon_sim_proto.h"
+#include "pthread.h"
+#include "tvm/runtime/c_runtime_api.h"
+
+/*! \brief Return the current local time formatted as "[hh:mm:ss]". */
+static std::string timeNow() {
+  char str[11];  // [hh:mm:ss]
+  time_t time_value = time(NULL);
+  tm* pnow = localtime(&time_value);  // NOLINT(runtime/threadsafe_fn)
+  if (pnow == nullptr) {
+    // localtime can fail; do not dereference a null result just to log.
+    return std::string("[--:--:--]");
+  }
+
+  snprintf(str, sizeof(str), "[%02d:%02d:%02d]", pnow->tm_hour, pnow->tm_min,
+           pnow->tm_sec);
+  return std::string(str);
+}
+
+// Print a timestamped message with the source location to stderr.
+#define LOG(FMT, ...)                                                 \
+  fprintf(stderr, "%s %s:%d: " FMT "\n", timeNow().c_str(), __FILE__, \
+          __LINE__, ##__VA_ARGS__)
+
+using HVX_Vector =
+    int __attribute__((__vector_size__(128))) __attribute__((aligned(128)));
+
+/*!
+ * \brief Return the vector length in bytes (128 or 64), determined by
+ *        checking how far a splat of 0x01 bytes reaches.
+ */
+static unsigned getVectorLength() {
+  HVX_Vector v = __builtin_HEXAGON_V6_lvsplatw_128B(0x01010101);
+  unsigned char* p = reinterpret_cast<unsigned char*>(&v);
+  if (p[127] == 1) return 128;
+  assert(p[63] == 1);
+  return 64;
+}
+
+extern "C" {
+// Print vector functions. They can be used to help debug tensorized
+// code, via
+//   ib.emit(tvm.call_extern('int32', 'V6_pv8', 'vector:', v))
+//   ib.emit(tvm.call_extern('int32', 'V6_pv16', 'info:', v))
+//   ib.emit(tvm.call_extern('int32', 'V6_pv32', 'value:', v))
+
+// The first argument is a string printed before the vector contents.
+int V6_pv8(const char* s, HVX_Vector v);
+int V6_pv16(const char* s, HVX_Vector v);
+int V6_pv32(const char* s, HVX_Vector v);
+}
+
+// Print the vector as a sequence of 8-bit hex values.
+int V6_pv8(const char* s, HVX_Vector v) {
+  unsigned vlen = getVectorLength();
+  uint8_t* ptr = reinterpret_cast<uint8_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned i = 0; i != vlen; ++i) {
+    fprintf(stderr, " %02x", ptr[i]);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+// Print the vector as a sequence of 16-bit hex values.
+int V6_pv16(const char* s, HVX_Vector v) {
+  unsigned vlen = getVectorLength();
+  uint16_t* ptr = reinterpret_cast<uint16_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned i = 0; i != vlen / sizeof(uint16_t); ++i) {
+    fprintf(stderr, " %04x", ptr[i]);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+// Print the vector as a sequence of 32-bit hex values.
+int V6_pv32(const char* s, HVX_Vector v) {
+  unsigned vlen = getVectorLength();
+  uint32_t* ptr = reinterpret_cast<uint32_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned i = 0; i != vlen / sizeof(uint32_t); ++i) {
+    fprintf(stderr, " %08x", ptr[i]);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+extern "C" {
+// Function referenced from libc++.a, but not defined in libc.a.
+int clock_gettime(clockid_t clock_id, struct timespec* tp);
+// pthread_create is wrapped so that we can set a bigger stack size
+// for QuRT. Here this isn't needed, but we still need to implement
+// the wrapper.
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                          void* (*start_routine)(void*), void* arg);
+}
+
+int clock_gettime(clockid_t clock_id, struct timespec* tp) {
+  // Stub implementation. Since success is reported, give the caller a
+  // defined time value (zero) instead of leaving *tp uninitialized.
+  if (tp != nullptr) {
+    tp->tv_sec = 0;
+    tp->tv_nsec = 0;
+  }
+  return 0;
+}
+
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                          void* (*start_routine)(void*), void* arg) {
+  LOG("%s", __func__);
+  return pthread_create(thread, attr, start_routine, arg);
+}
+
+// FIXME(kparzysz-quic): query the cfg register to compute the VTCM base.
+// This works now.
+const unsigned int TCM_BASE = 0xD8000000;
+const unsigned int VTCM_BASE = TCM_BASE + 0x400000;
+
+/*!
+ * \brief Tracks memory handed out to the host, both heap and VTCM.
+ *
+ * All live allocations are kept in "allocations_", sorted by address, so
+ * that a pointer can be looked up with a binary search when it is freed.
+ */
+class Allocator {
+ private:
+  struct Block {
+    Block(void* p, size_t s) : ptr_(p), size_(s), vtcm_(false) {}
+    Block(void* p, size_t s, bool v) : ptr_(p), size_(s), vtcm_(v) {}
+    // Blocks are ordered by their start address.
+    bool operator<(const Block& b) const {
+      return uintptr_t(ptr_) < uintptr_t(b.ptr_);
+    }
+    void* ptr_;
+    size_t size_;
+    bool vtcm_;  // True if the block was carved out of VTCM.
+  };
+
+  using vector_type = std::vector<Block>;
+  using iterator = vector_type::iterator;
+  vector_type allocations_;
+
+  // Next unused VTCM address.
+  uintptr_t cur_vtcm = VTCM_BASE;
+
+ public:
+  void* alloc(unsigned size, size_t align);
+  void* vtcm_alloc(unsigned size, size_t align);
+  void free(void* p);
+};
+
+/*!
+ * \brief Allocate heap memory and record it in the allocation list.
+ * \param size Number of bytes to allocate.
+ * \param align Required alignment, passed on to aligned_alloc.
+ * \return Pointer to the memory, or nullptr if the allocation failed.
+ */
+void* Allocator::alloc(unsigned size, size_t align) {
+  void* ptr = aligned_alloc(align, size);
+  if (ptr == nullptr) {
+    // perror appends ": <reason>" itself, so the prefix has no trailing colon.
+    perror("device: error allocating memory");
+    return ptr;
+  }
+
+  Block b(ptr, size);
+  iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b);
+  iterator w = allocations_.insert(i, b);
+  // Sanity checks: the new block must not overlap either neighbor. Blocks
+  // that merely touch are fine, hence "<=" (same check as in vtcm_alloc).
+  if (w != allocations_.begin()) {
+    iterator pw = w - 1;
+    assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_));
+  }
+  if (w + 1 != allocations_.end()) {
+    iterator nw = w + 1;
+    assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_));
+  }
+
+  LOG("device: allocated %u bytes aligned at %u: %p", size,
+      static_cast<unsigned>(align), ptr);
+  return ptr;
+}
+
+// For now, just allocate sequentially. This needs to be improved to use a
+// free list.
+void* Allocator::vtcm_alloc(unsigned size, size_t align) { + uintptr_t a = cur_vtcm; + a = (a + (align - 1)) & -align; + cur_vtcm = a + size; + void* ptr = reinterpret_cast(a); + if (ptr == nullptr) { + perror("device: error allocating vtcm memory:"); + return ptr; + } + + Block b(ptr, size, true); + iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b); + iterator w = allocations_.insert(i, b); + if (w != allocations_.begin()) { + iterator pw = w - 1; + assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_)); + } + if (w + 1 != allocations_.end()) { + iterator nw = w + 1; + assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_)); + } + + LOG("device: allocated vtcm %d bytes aligned at %d: %p", size, align, ptr); + return ptr; +} + +void Allocator::free(void* ptr) { + LOG("device: freeing %p", ptr); + iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), + Block(ptr, 0)); + assert(i != allocations_.end()); + assert(i->ptr_ == ptr); + if (!i->vtcm_) ::free(i->ptr_); + allocations_.erase(i); +} + +static void printMsgCall(const MsgCall& mc) { + auto to_dec_string = [](int v) { + char tmp[11]; + snprintf(tmp, sizeof(tmp), "%d", v); + return std::string(tmp); + }; + auto to_hex_string = [](uint32_t v) { + char tmp[9]; + snprintf(tmp, sizeof(tmp), "%lx", v); + return std::string(tmp); + }; + std::string str = "device: launching " + to_hex_string(mc.func_va) + + " sc:" + to_dec_string(mc.scalar_num) + " {"; + for (unsigned i = 0; i != mc.scalar_num; ++i) { + str += ' ' + to_hex_string(mc.data[i]); + if (i + 1 != mc.scalar_num) str += ','; + } + str += " }, st:" + to_dec_string(mc.stack_num) + " {"; + for (unsigned i = 0; i != mc.stack_num; ++i) { + str += ' ' + to_hex_string(mc.data[i + mc.scalar_num]); + if (i + 1 != mc.stack_num) str += ','; + } + str += " }"; + LOG("%s", str.c_str()); +} + +static std::vector task_queue; + +struct Environment { + Allocator alloc; + void* dl_handle = nullptr; +}; + +extern "C" { 
+volatile Message message_buffer; +int dispatch(Environment* env) __attribute__((noinline)); +} + +static volatile unsigned char payload_buffer[4096]; + +static void setMsg(uint32_t code, uint32_t len, uint32_t va) { + message_buffer.code = code; + message_buffer.len = len; + message_buffer.va = va; +} + +inline void* pointer(uint32_t v) { + return reinterpret_cast(static_cast(v)); +} + +inline uint32_t va(const volatile void* p) { + return static_cast(reinterpret_cast(p)); +} + +__attribute__((naked)) uint32_t launcher(volatile MsgCall* mc, uint64_t* pcc) { + __asm__( + "// This function is intentionally written to be readable, \n" + "// rather than fast. \n" + "// r0 = value of 'volatile MsgCall *mc' \n" + "// r1 = address where to store the program cycle count \n" + "{ memd(r29+#-16) = r21:20 \n" + " allocframe(#24) } \n" + "{ memd(r29+#0) = r17:16 \n" + " memd(r29+#8) = r19:18 } \n" + "{ r17:16 = combine(r1,r0) \n" + " r18 = r29 \n" + " r1 = memw(r0+#4) // scalar_num \n" + " r2 = memw(r0+#8) } // stack_num \n" + "// If there are no stack values, skip the stack setup. \n" + "{ p0 = cmp.eq(r2,#0) \n" + " if (p0.new) jump:t .Llauncher1 } \n" + + "// Allocate space on the stack. Let r2 = needed space \n" + "// rounded up to a multiple of 8. \n" + "{ loop0(.Llauncher0,r2) \n" + " r2 = asl(r2,#2) } \n" + "{ r2 = add(r2,#4) } \n" + "{ r2 = clrbit(r2,#2) } \n" + "{ r29 = sub(r29,r2) } \n" + + "// Copy stack contents onto the stack. Stack contents start \n" + "// at r3 = r0 + offsetof(data) + scalar_num*4 \n" + "{ r3 = addasl(r0,r1,#2) \n" + " r4 = r29 } \n" + "{ r3 = add(r3,#12) } // offsetof(data) \n" + ".Llauncher0: \n" + "{ r5 = memw(r3++#4) \n" + " memw(r4++#4) = r5.new } :endloop0 \n" + + "// Load registers. 
Some of the loaded data may actually be \n"
+      "// values from the stack part of 'data', but it's not an issue.\n"
+      ".Llauncher1: \n"
+      "{ r0 = memw(r16+#12) // mc + offsetof(data) \n"
+      " r1 = memw(r16+#16) } \n"
+      "{ r2 = memw(r16+#20) \n"
+      " r3 = memw(r16+#24) } \n"
+      "{ r4 = memw(r16+#28) \n"
+      " r5 = memw(r16+#32) } \n"
+
+      "// Call. \n"
+      "{ r6 = memw(r16+#0) \n"
+      " r21:20 = upcycle } \n"
+      "{ callr r6 } \n"
+
+      "// Restore stack pointer (free up r18), calculate cycle count. \n"
+      "{ r29 = r18 \n"
+      " r19:18 = upcycle } \n"
+      "{ r19:18 = sub(r19:18, r21:20) } \n"
+
+      "// Store pcount, restore non-volatile registers, and return. \n"
+      "{ memd(r17+#0) = r19:18 \n"
+      " r21:20 = memd(r29+#16) } \n"
+      "{ r19:18 = memd(r29+#8) \n"
+      " r17:16 = memd(r29+#0) } \n"
+      "{ dealloc_return } // implicit-use r1:0 \n");
+}
+
+// Services the single request currently held in message_buffer and publishes
+// the reply with setMsg().
+//   - kMsgReq is acknowledged with kMsgAck plus the size and address of
+//     payload_buffer.
+//   - Every other known code is carried out and answered with kNone; replies
+//     that carry data place it in payload_buffer.
+//   - An unknown code is logged and the process abort()s.
+// Returns 0 on every path that returns, which keeps the loop in main() going.
+int dispatch(Environment* env) {
+  uint32_t code = message_buffer.code;
+  // Special handling of MsgReq.
+  if (code == kMsgReq) {
+    assert(message_buffer.len <= sizeof(payload_buffer));
+    setMsg(kMsgAck, sizeof(payload_buffer), va(payload_buffer));
+    return 0;
+  }
+
+  switch (code) {
+    case kAlloc: {
+      LOG("device: {kAlloc, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.alloc(ma->size, ma->align);
+      // Reply with the address of the new allocation.
+      reinterpret_cast<MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kFree: {
+      LOG("device: {kFree, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<MsgPointer*>(message_buffer.va);
+      env->alloc.free(pointer(mp->va));
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kAllocVtcm: {
+      LOG("device: {kAllocVtcm, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.vtcm_alloc(ma->size, ma->align);
+      reinterpret_cast<MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCopy: {
+      LOG("device: {kCopy, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgCopy));
+      auto* mc = reinterpret_cast<MsgCopy*>(message_buffer.va);
+      memcpy(pointer(mc->dst), pointer(mc->src), mc->len);
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kLoad: {
+      // Only one library is kept open at a time: drop the previous one first.
+      if (env->dl_handle != nullptr) dlclose(env->dl_handle);
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      // LOG(stderr, "device: dlopen(%s)", name);
+      env->dl_handle = dlopen(name, RTLD_LAZY);
+      if (env->dl_handle == nullptr) LOG("dlopen: %s\n", dlerror());
+      assert(env->dl_handle != nullptr);
+      reinterpret_cast<MsgPointer*>(payload_buffer)->va =
+          va(env->dl_handle);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kUnload: {
+      assert(env->dl_handle != nullptr);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<MsgPointer*>(message_buffer.va);
+      // The handle named in the request must be the one currently loaded.
+      assert(pointer(mp->va) == env->dl_handle);
+      dlclose(env->dl_handle);
+      env->dl_handle = nullptr;
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kResolve: {
+      LOG("device: {kResolve, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(env->dl_handle != nullptr);
+      // Discard any error left over from an earlier dl* call.
+      dlerror();
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      void* s = dlsym(env->dl_handle, name);
+      // The result of dlsym is passed back as-is, including a null result.
+      reinterpret_cast<MsgPointer*>(payload_buffer)->va = va(s);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCall: {
+      LOG("device: {kCall, %lu, %lx}", message_buffer.len, message_buffer.va);
+      // Add the task to the queue.
+      auto* mc = reinterpret_cast<MsgCall*>(message_buffer.va);
+      // NOTE(review): presumably three 32-bit header words followed by the
+      // scalar and stack words -- confirm against the MsgCall definition.
+      uint32_t size = 4 * (3 + mc->scalar_num + mc->stack_num);
+      MsgCall* t = static_cast<MsgCall*>(malloc(size));
+      // Do not copy into a failed allocation.
+      assert(t != nullptr);
+      memcpy(t, mc, size);
+      task_queue.push_back(t);
+      // Return 0.
+      *reinterpret_cast<uint32_t*>(payload_buffer) = 0;
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    case kFlush: {
+      LOG("device: {kFlush}");
+      LOG("device: %u tasks in the queue",
+          static_cast<unsigned>(task_queue.size()));
+      // Execute all tasks from the queue and release memory buffers
+      // for as long as the return values are 0. Upon receiving a non-zero
+      // return value, continue freeing memory but no longer execute
+      // any tasks. The task queue will be cleared in any case.
+      uint32_t rv = 0;
+      uint64_t pcc = 0;  // Pcycle counter, will be 0 under simulator (upcycle).
+      for (MsgCall* t : task_queue) {
+        if (rv == 0) {
+          printMsgCall(*t);
+          rv = launcher(t, &pcc);
+          LOG("device: execution took %llu pcycles",
+              static_cast<unsigned long long>(pcc));
+        }
+        free(t);
+      }
+      task_queue.clear();
+      *reinterpret_cast<uint32_t*>(payload_buffer) = rv;
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    default:
+      LOG("device: unknown code: %lu", message_buffer.code);
+      abort();
+      break;
+  }
+  return 0;
+}
+
+extern "C" {
+int acquire_vector_unit(int);
+void release_vector_unit();
+}
+
+// Splits 'arg' on ':' and appends every non-empty component to 'list'.
+// A backslash makes the following character literal (so "\:" yields ':'
+// inside a component); the backslash itself is not copied.
+static void makePathList(const std::string& arg,
+                         std::vector<std::string>* list) {
+  size_t p = 0, e = arg.size();
+  std::vector<char> tmp;
+
+  while (p < e) {
+    tmp.clear();
+    // False only for the single character that follows a backslash.
+    bool check_next = true;
+    size_t i = p;
+    for (; i != e; ++i) {
+      char c = arg[i];
+      if (check_next) {
+        if (c == '\\') {
+          check_next = false;
+          continue;
+        } else if (c == ':') {
+          break;
+        }
+      }
+      check_next = true;
+      tmp.push_back(c);
+    }
+    if (!tmp.empty()) list->emplace_back(tmp.begin(), tmp.end());
+    // Resume after the separator (or past the end, which ends the loop).
+    p = i + 1;
+  }
+}
+
+// Returns "<dir>/<filename>" for the first directory in the ':'-separated
+// 'paths' where access(..., X_OK) succeeds, or the bare 'filename' if none
+// does.
+static std::string findInPaths(const std::string& filename,
+                               const std::string& paths) {
+  std::vector<std::string> path_list;
+  makePathList(paths, &path_list);
+
+  for (const auto& p : path_list) {
+    std::string pf = p + '/' + filename;
+    // Returning the local by name already moves it.
+    if (access(pf.c_str(), X_OK) == 0) return pf;
+  }
+  // If the search failed, try bare filename. If it cannot be loaded,
+  // dlerror will print a meaningful message.
+  return filename;
+}
+
+// Presence of this function indicates that sim_dev is running.
+extern "C" int running_in_sim_dev_17bc90206f6cf5a7();
+int running_in_sim_dev_17bc90206f6cf5a7() { return 0; }
+
+// Entry point of the driver.
+// Accepts "-L path1[:path2...]" (may be repeated) to build the search path
+// for libtvm_runtime.so, loads the runtime, registers device_api.hexagon as
+// an alias of device_api.cpu, and then calls dispatch() for as long as it
+// returns 0. Returns 1 on a usage error, if the runtime cannot be loaded, or
+// if the remapping fails.
+int main(int argc, char* argv[]) {
+  int opt;
+  std::string ld_path;
+  while ((opt = getopt(argc, argv, "L:")) != -1) {
+    switch (opt) {
+      case 'L':
+        // Empty components (such as the leading one produced here) are
+        // skipped by makePathList.
+        ld_path += ':' + std::string(optarg);
+        break;
+      case '?':
+        LOG("Usage %s: [-L path1[:path2...]]", argv[0]);
+        return 1;
+    }
+  }
+
+  std::string rt_path = findInPaths("libtvm_runtime.so", ld_path);
+  LOG("TVM runtime path: %s", rt_path.c_str());
+
+  Environment env;
+  acquire_vector_unit(0);
+
+  const char* builtin[] = {
+      "libgcc.so",    "libc.so",     "libc++.so",
+      "libc++abi.so", "libc++.so.1", "libc++abi.so.1"  // Alternative names.
+  };
+  dlinit(sizeof(builtin) / sizeof(builtin[0]), const_cast<char**>(builtin));
+  void* rt_handle = dlopen(rt_path.c_str(), RTLD_GLOBAL);
+  if (rt_handle == nullptr) {
+    LOG("error loading TVM runtime: %s", dlerror());
+    return 1;
+  }
+
+  // When running TVM runtime on Hexagon there is no longer a device
+  // for Hexagon, but standalone ops can still refer to it. All of
+  // required DeviceAPI's functionality is adequately implemented
+  // via the CPU device, so remap device_api.hexagon to device_api.cpu.
+  auto* get_global = reinterpret_cast<decltype(&TVMFuncGetGlobal)>(
+      dlsym(rt_handle, "TVMFuncGetGlobal"));
+  assert(get_global != nullptr);
+  auto* register_global = reinterpret_cast<decltype(&TVMFuncRegisterGlobal)>(
+      dlsym(rt_handle, "TVMFuncRegisterGlobal"));
+  assert(register_global != nullptr);
+
+  TVMFunctionHandle cpu_api;
+  if (get_global("device_api.cpu", &cpu_api) != 0 ||
+      register_global("device_api.hexagon", cpu_api, true) != 0) {
+    LOG("error setting device_api.hexagon");
+    return 1;
+  }
+
+  while (!dispatch(&env)) {
+  }
+
+  dlclose(rt_handle);
+  release_vector_unit();
+  return 0;
+}
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 9d14d3a..0a2a60c 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -255,6 +255,17 @@ int MaxConcurrency() {
     max_concurrency = std::thread::hardware_concurrency();
 #if defined(_M_X64) || defined(__x86_64__)
     max_concurrency /= 2;  // ignore hyper-threading
+#elif defined(__hexagon__)
+    // With unsigned PDs, getting the number of available hardware threads
+    // is not supported in earlier versions of QuRT. In such cases assume 4.
+    // If running on simulator, set max_concurrency to 1.
+    if (max_concurrency == 0) {
+      if (dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7")) {
+        max_concurrency = 1;
+      } else {
+        max_concurrency = 4;
+      }
+    }
 #endif
   }
   return std::max(max_concurrency, 1);
-- 
2.7.4