/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting,
/// thread_local is not supported. In that case, we don't provide
/// `at::NonVariableTypeMode`.
-#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
thread_local bool NonVariableTypeMode_enabled = false;
NonVariableTypeMode_enabled = enabled;
}
-#else // C10_MOBILE || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#else // defined(C10_MOBILE) || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
bool NonVariableTypeMode::is_enabled() {
throw std::runtime_error("NonVariableTypeMode is not supported on mobile");
// To explicitly use interned strings as symbols in your code, you must add
// them to this list.
-#if !C10_MOBILE
+#ifndef C10_MOBILE
#define FORALL_ATEN_BASE_SYMBOLS(_) \
_(aten, __and__) \
_(aten, __iand__) \
namespace c10 {
-#if !C10_MOBILE
+#ifndef C10_MOBILE
#define FORALL_NS_SYMBOLS(_) \
_(namespaces, prim) \
_(namespaces, aten) \
!is_dilated() && // or dilation
!transposed && // or transposed tensors
input.ndimension() == 4 // must be in NCHW format
-#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
&& input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
#endif
;
#define C10_DEVICE __device__
#define C10_HOST __host__
// constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
-// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
-// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
+// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
+// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
// Hence, the following macros adjust the input values from the user to resolve potential warnings.
#if __CUDA_ARCH__ >= 750
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
// CUDA_MAX_THREADS_PER_BLOCK is the same for all architectures currently
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block size.
-// 256 is a good number for this fallback and should give good occupancy and
+// 256 is a good number for this fallback and should give good occupancy and
// versatility across all architectures.
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
// NOTE: if you are thinking of constexpr-ifying the inputs to launch bounds, it
-// turns out that although __launch_bounds__ can take constexpr, it
-// can't take a constexpr that has anything to do with templates.
-// Currently we use launch_bounds that depend on template arguments in
-// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
+// turns out that although __launch_bounds__ can take constexpr, it
+// can't take a constexpr that has anything to do with templates.
+// Currently we use launch_bounds that depend on template arguments in
+// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
// C10_MIN_BLOCKS_PER_SM are kept as macros.
-// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
-// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
+// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
+// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
// which will also properly respect limits on old architectures.
// Thread count to request per block: `threads` is passed through unchanged
// when it does not exceed CUDA_MAX_THREADS_PER_BLOCK; otherwise
// CUDA_THREADS_PER_BLOCK_FALLBACK is substituted.
#define C10_MAX_THREADS_PER_BLOCK(threads)               \
  (((threads) <= CUDA_MAX_THREADS_PER_BLOCK) ? (threads) \
                                             : CUDA_THREADS_PER_BLOCK_FALLBACK)
// Blocks-per-SM value to request: `blocks_per_sm` is kept when
// threads_per_block * blocks_per_sm does not exceed CUDA_MAX_THREADS_PER_SM;
// otherwise it becomes CUDA_MAX_THREADS_PER_SM / threads_per_block, rounded up.
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm)        \
  ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \
        ? (blocks_per_sm)                                              \
        : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) /       \
           (threads_per_block))))
#define C10_MOBILE 1
#elif (defined(__APPLE__) && TARGET_OS_MAC)
#define C10_IOS 1
-#define C10_MOBILE 0
-#else
-#define C10_MOBILE 0
#endif // ANDROID / IOS / MACOS
// Portably determine if a type T is trivially copyable or not.
// Defines the boolean flag `caffe2_cpu_numa_enabled`, defaulting to false;
// the string argument is the flag's help text.
C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");
-#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && C10_MOBILE == 0
+#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && !defined(C10_MOBILE)
#include <numa.h>
#include <numaif.h>
#include <unistd.h>
* - calling C10_REGISTER_CAFFE2_OPERATOR_CUDA is optional and can be omitted if
* you don't want to expose the operator for CUDA operations.
*/
-#if !C10_MOBILE
+#ifndef C10_MOBILE
#define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \
namespace caffe2 { \
namespace _c10_ops { \
Workspace*);
// TODO Also register c10 operators on mobile
-#if !C10_MOBILE
+#ifndef C10_MOBILE
// TODO Currently we only register the CPU variant. This is going to be fixed
// once the tensor detemplatization lands.
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OperatorHandle, Name, NumOutputParameters) \
#include "caffe2/core/flags.h"
#include "caffe2/core/tensor.h"
-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
#endif
bool allowMetalOperators) {
caffe2::NetDef metal_predict_net;
bool usingMetalOperators = false;
-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
if (allowMetalOperators) {
caffe2::dumpDef(predict_net);
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
bool disableMultithreadProcessing,
bool usingMetalOperators)
: usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
-#if C10_MOBILE
+#ifdef C10_MOBILE
if (disableMultithreadProcessing) {
caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
if (threadpool != nullptr) {
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/core/common.h"
-#if C10_MOBILE
+#ifdef C10_MOBILE
#include "mpscnn_context.h"
#include "mpscnn_kernels.h"
#include "caffe2/core/common.h"
-#if C10_MOBILE && defined(CAFFE2_USE_MPSCNN_TEST)
+#if defined(C10_MOBILE) && defined(CAFFE2_USE_MPSCNN_TEST)
#include "mpscnn_context.h"
#include "mpscnn_graph_mask.h"
// r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
// }
state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
-#if C10_MOBILE
+#ifdef C10_MOBILE
ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
for (size_t v = 0; v < range; ++v) {
#include "caffe2/utils/map_utils.h"
#include "caffe2/utils/proto_utils.h"
-#if !C10_MOBILE
+#ifndef C10_MOBILE
#include "onnx/checker.h"
#include "onnx/optimizer/optimize.h"
#endif
return d;
}
-#if !C10_MOBILE
+#ifndef C10_MOBILE
ModelProto OptimizeOnnx(const ModelProto& input, bool init) {
std::vector<std::string> passes{"fuse_consecutive_transposes",
"eliminate_nop_transpose",
const std::vector<Caffe2Ops>& extras) {
auto device_option = GetDeviceOption(Device(device));
-#if !C10_MOBILE
+#ifndef C10_MOBILE
ModelProto init_model = OptimizeOnnx(onnx_model, true);
ModelProto pred_model = OptimizeOnnx(onnx_model, false);
#else
ModelProto onnx_model;
ParseProtoFromLargeString(onnx_model_str, &onnx_model);
-#if !C10_MOBILE
+#ifndef C10_MOBILE
::ONNX_NAMESPACE::checker::check_model(onnx_model);
#endif
namespace caffe2 {
-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE
// mobile-only implementation (tiled + vectorized + multithreaded)
REGISTER_CPU_OPERATOR_WITH_ENGINE(
ConvTranspose,
#include "caffe2/core/common.h"
-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/common.h"
-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE
#include "caffe2/core/logging.h"
#include "caffe2/operators/conv_op_shared.h"
if (run_init) {
CAFFE_ENFORCE(ws.RunNetOnce(init_net));
}
-#if C10_MOBILE
+#ifdef C10_MOBILE
GlobalInit();
#endif
if (optimization &&
Timer t;
-#if C10_MOBILE
+#ifdef C10_MOBILE
ws_->GetThreadPool()->run(
[&](int, int n_g) {
const int g = n_g / N;
#include "observers/perf_observer.h"
#include "observers/observer_config.h"
-#if !C10_MOBILE
+#ifndef C10_MOBILE
#include "caffe2/core/flags.h"
#include "observers/net_observer_reporter_print.h"
#endif
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
-#if !C10_MOBILE
+#ifndef C10_MOBILE
C10_DEFINE_int64(
aiBench_netInitSampleRate,
0,
return caffe2::make_unique<PerfNetObserver>(subject);
});
-#if !C10_MOBILE
+#if !defined(C10_MOBILE)
// for aibench usage
caffe2::ObserverConfig::setReporter(
caffe2::make_unique<caffe2::NetObserverReporterPrint>());