caffe2 mobile opengl (#15322)
author Jerry Zhang <jerryzh@fb.com>
Tue, 18 Dec 2018 16:17:56 +0000 (08:17 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Tue, 18 Dec 2018 16:20:52 +0000 (08:20 -0800)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15322

The caffe2 mobile OpenGL code is not used; this change deletes it to reduce complications when we perform other changes.

Reviewed By: Maratyszcza

Differential Revision: D13499943

fbshipit-source-id: 6479f6b9f50f08b5ae28f8f0bc4a1c4fc3f3c3c2

70 files changed:
CMakeLists.txt
caffe2/mobile/contrib/CMakeLists.txt
caffe2/mobile/contrib/opengl/CMakeLists.txt [deleted file]
caffe2/mobile/contrib/opengl/android/AndroidGLContext.cc [deleted file]
caffe2/mobile/contrib/opengl/android/AndroidGLContext.h [deleted file]
caffe2/mobile/contrib/opengl/android/CMakeLists.txt [deleted file]
caffe2/mobile/contrib/opengl/android/GLContext.cc [deleted file]
caffe2/mobile/contrib/opengl/android/GLImageAllocator.cc [deleted file]
caffe2/mobile/contrib/opengl/android/arm_neon_support.h [deleted file]
caffe2/mobile/contrib/opengl/android/gl3stub.c [deleted file]
caffe2/mobile/contrib/opengl/android/gl3stub.h [deleted file]
caffe2/mobile/contrib/opengl/core/CMakeLists.txt [deleted file]
caffe2/mobile/contrib/opengl/core/DataTransfer.cc [deleted file]
caffe2/mobile/contrib/opengl/core/DataTransfer.h [deleted file]
caffe2/mobile/contrib/opengl/core/GL.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLContext.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLContext.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLFilter.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLFilter.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLImage.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLImage.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLImageAllocator.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLImageAllocator.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLLogging.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLPBO.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLPBO.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLPlainTexture.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLPlainTexture.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLPredictor.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLPredictor.h [deleted file]
caffe2/mobile/contrib/opengl/core/GLTexture.cc [deleted file]
caffe2/mobile/contrib/opengl/core/GLTexture.h [deleted file]
caffe2/mobile/contrib/opengl/core/ImageAllocator.h [deleted file]
caffe2/mobile/contrib/opengl/core/arm_neon_support.h [deleted file]
caffe2/mobile/contrib/opengl/core/rewrite_net.cc [deleted file]
caffe2/mobile/contrib/opengl/core/rewrite_net.h [deleted file]
caffe2/mobile/contrib/opengl/ios/CMakeLists.txt [deleted file]
caffe2/mobile/contrib/opengl/ios/GLContext.cc [deleted file]
caffe2/mobile/contrib/opengl/ios/GLImageAllocator.cc [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLContext.h [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLContext.mm [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.cc [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.h [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLTexture.h [deleted file]
caffe2/mobile/contrib/opengl/ios/IOSGLTexture.mm [deleted file]
caffe2/mobile/contrib/opengl/operators/CMakeLists.txt [deleted file]
caffe2/mobile/contrib/opengl/operators/GLAdd.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLConcat.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLConvolution.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLConvolution.h [deleted file]
caffe2/mobile/contrib/opengl/operators/GLCopyOps.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLMul.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLNormPlanarYUV.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLPRelu.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLPadImage.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLPool.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLResize.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLStylizer.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/GLSub.cc [deleted file]
caffe2/mobile/contrib/opengl/operators/gl_tiling_utils.h [deleted file]
caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc [deleted file]
caffe2/mobile/contrib/opengl/test/TestGLConvolution.h [deleted file]
caffe2/mobile/contrib/opengl/test/opengl_test.cc [deleted file]
caffe2/mobile/contrib/opengl/test/opengl_test.h [deleted file]
cmake/Dependencies.cmake
cmake/Summary.cmake
scripts/build_android.sh

index a14f38c..a29cd82 100644 (file)
@@ -92,7 +92,6 @@ option(USE_LEVELDB "Use LEVELDB" ON)
 option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
 option(USE_LMDB "Use LMDB" ON)
 option(USE_METAL "Use Metal for iOS build" ON)
-option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 option(USE_NCCL "Use NCCL" ON)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
index e49c2ef..33e8237 100644 (file)
@@ -1,8 +1,4 @@
 add_subdirectory(ios)
-# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
-if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
-  # add_subdirectory(opengl)
-endif()
 if (USE_ACL)
   # add_subdirectory(arm-compute)
 endif()
diff --git a/caffe2/mobile/contrib/opengl/CMakeLists.txt b/caffe2/mobile/contrib/opengl/CMakeLists.txt
deleted file mode 100644 (file)
index 6d11625..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-add_subdirectory(core)
-add_subdirectory(operators)
-
-if (ANDROID)
-  add_subdirectory(android)
-endif()
-
-if (IOS)
-  add_subdirectory(ios)
-endif()
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/android/AndroidGLContext.cc b/caffe2/mobile/contrib/opengl/android/AndroidGLContext.cc
deleted file mode 100644 (file)
index 14a23c4..0000000
+++ /dev/null
@@ -1,156 +0,0 @@
-#include "AndroidGLContext.h"
-#include "caffe2/core/logging.h"
-#include "gl3stub.h"
-#include <regex>
-
-namespace {
-
-static const std::unordered_map<std::string, GL_Renderer>& renderer_map() {
-  static std::unordered_map<std::string, GL_Renderer> m = {
-      {"Adreno", Adreno},
-      {"Mali", Mali},
-      {"NVIDIA", Tegra} /*, {"PowerVR", PowerVR} */};
-  return m;
-}
-
-} // namespace
-
-EGLContext AndroidGLContext::create_opengl_thread_context() {
-  EGLSurface surface = EGL_NO_SURFACE;
-  EGLContext context = EGL_NO_CONTEXT;
-  EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-  if (display == EGL_NO_DISPLAY) {
-    // We failed to get a display
-    CAFFE_THROW("Problem with OpenGL context");
-    return context;
-  }
-
-  EGLint major;
-  EGLint minor;
-  eglInitialize(display, &major, &minor);
-
-  const EGLint configAttr[] = {EGL_RENDERABLE_TYPE,
-                               EGL_OPENGL_ES2_BIT,
-                               EGL_SURFACE_TYPE,
-                               EGL_PBUFFER_BIT, // we create a pixelbuffer surface
-                               EGL_NONE};
-
-  EGLint numConfig;
-  EGLConfig eglConfig;
-  if (!eglChooseConfig(display, configAttr, &eglConfig, 1, &numConfig)) {
-    // We failed to find a suitable config
-    eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
-    eglTerminate(display);
-    display = EGL_NO_DISPLAY;
-    CAFFE_THROW("Problem with OpenGL context");
-    return context;
-  }
-
-  const EGLint ctxAttr[] = {EGL_CONTEXT_CLIENT_VERSION,
-                            2, // very important!
-                            EGL_NONE};
-
-  // Create an EGL context based on the chosen configuration.
-  context = eglCreateContext(display, eglConfig, EGL_NO_CONTEXT, ctxAttr);
-
-  // We need a surface. For most mixed JNI/Java based apps it is suggested
-  // that we pass a Java surface through JNI and extract the surface
-  // Pure NDK apps get passed the android_app structure which includes a surface
-  // We want our own OpenGL context for the current thread.
-  // Here we create a fake 1x1 'pixel buffer' surface.
-  // We don't expecting to run vertex or fragment shaders.
-
-  const EGLint surfaceAttr[] = {EGL_WIDTH, 1, EGL_HEIGHT, 1, EGL_NONE};
-
-  surface = eglCreatePbufferSurface(display, eglConfig, surfaceAttr);
-
-  // Bind context, draw and surface to current thread
-  eglMakeCurrent(display, surface, surface, context);
-
-  // Bind the API for this context.  In our case we want to use OpenGL_ES
-  eglBindAPI(EGL_OPENGL_ES_API);
-  return context;
-}
-
-bool AndroidGLContext::opengl_thread_context_exists() {
-  return eglGetCurrentContext() != EGL_NO_CONTEXT;
-}
-
-bool AndroidGLContext::release_opengl_thread_context() {
-  EGLContext display = eglGetCurrentDisplay();
-  if (display != EGL_NO_DISPLAY) {
-    if (_eglcontext != EGL_NO_CONTEXT) {
-      eglDestroyContext(display, _eglcontext);
-      _eglcontext = EGL_NO_CONTEXT;
-    }
-    EGLSurface surface = eglGetCurrentSurface(EGL_DRAW);
-    if (surface != EGL_NO_SURFACE) {
-      eglDestroySurface(display, surface);
-      surface = EGL_NO_SURFACE;
-    }
-    surface = eglGetCurrentSurface(EGL_READ);
-    if (surface != EGL_NO_SURFACE) {
-      eglDestroySurface(display, surface);
-      surface = EGL_NO_SURFACE;
-    }
-    eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
-    eglTerminate(display);
-    display = EGL_NO_DISPLAY;
-  }
-  eglReleaseThread();
-  return true;
-}
-
-void AndroidGLContext::init_gles3() {
-  if (!gl3stubInit()) {
-    CAFFE_THROW("OpenGL ES 3 not initialized");
-  } else {
-    LOG(INFO) << "OpenGL ES 3 successfully enabled";
-  }
-}
-
-GL_Renderer AndroidGLContext::get_platform() {
-  std::string rendererStr((const char*)glGetString(GL_RENDERER));
-  std::regex regStr("^[A-Za-z]*");
-  std::smatch matchs;
-  if (std::regex_search(rendererStr, matchs, regStr)) {
-    const std::string renderer = *matchs.begin();
-    auto found = renderer_map().find(renderer);
-    if (found != renderer_map().end()) {
-      return found->second;
-    }
-  }
-  CAFFE_THROW("Unsupported GPU renderer");
-}
-
-AndroidGLContext::AndroidGLContext() {
-  if (!opengl_thread_context_exists()) {
-    _eglcontext = create_opengl_thread_context();
-    LOG(INFO) << "New EGLContext created";
-
-    if (!supportOpenGLES3(&half_float_supported)) {
-      CAFFE_THROW("OpenGL ES 3 not supported");
-    }
-
-    if (!isSupportedDevice()) {
-      LOG(ERROR) << "Device not fully supported";
-    }
-  } else {
-    _eglcontext = EGL_NO_CONTEXT;
-    LOG(INFO) << "Reusing EGLContext, make sure OpenGL ES 3 is supported";
-  }
-  static std::once_flag once;
-  std::call_once(once, [&]() { init_gles3(); });
-}
-
-AndroidGLContext::~AndroidGLContext() {
-  if (_eglcontext != EGL_NO_CONTEXT) {
-    release_opengl_thread_context();
-  }
-}
-
-void AndroidGLContext::set_context() {}
-
-void AndroidGLContext::reset_context() {}
-
-void AndroidGLContext::flush_context() {}
diff --git a/caffe2/mobile/contrib/opengl/android/AndroidGLContext.h b/caffe2/mobile/contrib/opengl/android/AndroidGLContext.h
deleted file mode 100644 (file)
index 51f1970..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-#include <unordered_map>
-
-enum GL_Renderer { Adreno, Mali, Tegra /*, PowerVR */ };
-
-class AndroidGLContext : public GLContext {
- private:
-  EGLContext _eglcontext;
-
-  EGLContext create_opengl_thread_context();
-  bool opengl_thread_context_exists();
-  bool release_opengl_thread_context();
-
- public:
-  AndroidGLContext();
-  ~AndroidGLContext();
-  void set_context();
-  void reset_context();
-  void flush_context();
-  void init_gles3();
-  GL_Renderer get_platform();
-};
diff --git a/caffe2/mobile/contrib/opengl/android/CMakeLists.txt b/caffe2/mobile/contrib/opengl/android/CMakeLists.txt
deleted file mode 100644 (file)
index 9fe2085..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-file(GLOB_RECURSE tmp *.cc *.c)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/android/GLContext.cc b/caffe2/mobile/contrib/opengl/android/GLContext.cc
deleted file mode 100644 (file)
index ea707e1..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#include "AndroidGLContext.h"
-
-std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
-
-void GLContext::initGLContext() {
-  if (_glcontext == nullptr) {
-    _glcontext.reset(new AndroidGLContext());
-  }
-}
-
-GLContext* GLContext::getGLContext() {
-  if (_glcontext == nullptr) {
-    initGLContext();
-  }
-  return _glcontext.get();
-}
-
-void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }
diff --git a/caffe2/mobile/contrib/opengl/android/GLImageAllocator.cc b/caffe2/mobile/contrib/opengl/android/GLImageAllocator.cc
deleted file mode 100644 (file)
index 1c05833..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-
-#include "../core/GLImageAllocator.h"
-#include "../core/arm_neon_support.h"
-
-template <typename T>
-GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
-  return new GLImageAllocator<T>();
-}
-
-template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
-template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();
diff --git a/caffe2/mobile/contrib/opengl/android/arm_neon_support.h b/caffe2/mobile/contrib/opengl/android/arm_neon_support.h
deleted file mode 100644 (file)
index ddd9a85..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-
-#pragma once
-
-#include <arm_neon.h>
-typedef __fp16 float16_t;
diff --git a/caffe2/mobile/contrib/opengl/android/gl3stub.c b/caffe2/mobile/contrib/opengl/android/gl3stub.c
deleted file mode 100755 (executable)
index 13411de..0000000
+++ /dev/null
@@ -1,357 +0,0 @@
-
-// clang-format off
-
-#include <EGL/egl.h>
-#include "gl3stub.h"
-
-GLboolean gl3stubInit() {
-    #define FIND_PROC(s) s = (void*)eglGetProcAddress(#s)
-    FIND_PROC(glReadBuffer);
-    FIND_PROC(glDrawRangeElements);
-    FIND_PROC(glTexImage3D);
-    FIND_PROC(glTexSubImage3D);
-    FIND_PROC(glCopyTexSubImage3D);
-    FIND_PROC(glCompressedTexImage3D);
-    FIND_PROC(glCompressedTexSubImage3D);
-    FIND_PROC(glGenQueries);
-    FIND_PROC(glDeleteQueries);
-    FIND_PROC(glIsQuery);
-    FIND_PROC(glBeginQuery);
-    FIND_PROC(glEndQuery);
-    FIND_PROC(glGetQueryiv);
-    FIND_PROC(glGetQueryObjectuiv);
-    FIND_PROC(glUnmapBuffer);
-    FIND_PROC(glGetBufferPointerv);
-    FIND_PROC(glDrawBuffers);
-    FIND_PROC(glUniformMatrix2x3fv);
-    FIND_PROC(glUniformMatrix3x2fv);
-    FIND_PROC(glUniformMatrix2x4fv);
-    FIND_PROC(glUniformMatrix4x2fv);
-    FIND_PROC(glUniformMatrix3x4fv);
-    FIND_PROC(glUniformMatrix4x3fv);
-    FIND_PROC(glBlitFramebuffer);
-    FIND_PROC(glRenderbufferStorageMultisample);
-    FIND_PROC(glFramebufferTextureLayer);
-    FIND_PROC(glMapBufferRange);
-    FIND_PROC(glFlushMappedBufferRange);
-    FIND_PROC(glBindVertexArray);
-    FIND_PROC(glDeleteVertexArrays);
-    FIND_PROC(glGenVertexArrays);
-    FIND_PROC(glIsVertexArray);
-    FIND_PROC(glGetIntegeri_v);
-    FIND_PROC(glBeginTransformFeedback);
-    FIND_PROC(glEndTransformFeedback);
-    FIND_PROC(glBindBufferRange);
-    FIND_PROC(glBindBufferBase);
-    FIND_PROC(glTransformFeedbackVaryings);
-    FIND_PROC(glGetTransformFeedbackVarying);
-    FIND_PROC(glVertexAttribIPointer);
-    FIND_PROC(glGetVertexAttribIiv);
-    FIND_PROC(glGetVertexAttribIuiv);
-    FIND_PROC(glVertexAttribI4i);
-    FIND_PROC(glVertexAttribI4ui);
-    FIND_PROC(glVertexAttribI4iv);
-    FIND_PROC(glVertexAttribI4uiv);
-    FIND_PROC(glGetUniformuiv);
-    FIND_PROC(glGetFragDataLocation);
-    FIND_PROC(glUniform1ui);
-    FIND_PROC(glUniform2ui);
-    FIND_PROC(glUniform3ui);
-    FIND_PROC(glUniform4ui);
-    FIND_PROC(glUniform1uiv);
-    FIND_PROC(glUniform2uiv);
-    FIND_PROC(glUniform3uiv);
-    FIND_PROC(glUniform4uiv);
-    FIND_PROC(glClearBufferiv);
-    FIND_PROC(glClearBufferuiv);
-    FIND_PROC(glClearBufferfv);
-    FIND_PROC(glClearBufferfi);
-    FIND_PROC(glGetStringi);
-    FIND_PROC(glCopyBufferSubData);
-    FIND_PROC(glGetUniformIndices);
-    FIND_PROC(glGetActiveUniformsiv);
-    FIND_PROC(glGetUniformBlockIndex);
-    FIND_PROC(glGetActiveUniformBlockiv);
-    FIND_PROC(glGetActiveUniformBlockName);
-    FIND_PROC(glUniformBlockBinding);
-    FIND_PROC(glDrawArraysInstanced);
-    FIND_PROC(glDrawElementsInstanced);
-    FIND_PROC(glFenceSync);
-    FIND_PROC(glIsSync);
-    FIND_PROC(glDeleteSync);
-    FIND_PROC(glClientWaitSync);
-    FIND_PROC(glWaitSync);
-    FIND_PROC(glGetInteger64v);
-    FIND_PROC(glGetSynciv);
-    FIND_PROC(glGetInteger64i_v);
-    FIND_PROC(glGetBufferParameteri64v);
-    FIND_PROC(glGenSamplers);
-    FIND_PROC(glDeleteSamplers);
-    FIND_PROC(glIsSampler);
-    FIND_PROC(glBindSampler);
-    FIND_PROC(glSamplerParameteri);
-    FIND_PROC(glSamplerParameteriv);
-    FIND_PROC(glSamplerParameterf);
-    FIND_PROC(glSamplerParameterfv);
-    FIND_PROC(glGetSamplerParameteriv);
-    FIND_PROC(glGetSamplerParameterfv);
-    FIND_PROC(glVertexAttribDivisor);
-    FIND_PROC(glBindTransformFeedback);
-    FIND_PROC(glDeleteTransformFeedbacks);
-    FIND_PROC(glGenTransformFeedbacks);
-    FIND_PROC(glIsTransformFeedback);
-    FIND_PROC(glPauseTransformFeedback);
-    FIND_PROC(glResumeTransformFeedback);
-    FIND_PROC(glGetProgramBinary);
-    FIND_PROC(glProgramBinary);
-    FIND_PROC(glProgramParameteri);
-    FIND_PROC(glInvalidateFramebuffer);
-    FIND_PROC(glInvalidateSubFramebuffer);
-    FIND_PROC(glTexStorage2D);
-    FIND_PROC(glTexStorage3D);
-    FIND_PROC(glGetInternalformativ);
-
-    // Bind GL_EXT_texture_border_clamp
-
-    FIND_PROC(glTexParameterIivEXT);
-    FIND_PROC(glTexParameterIuivEXT);
-    FIND_PROC(glGetTexParameterIivEXT);
-    FIND_PROC(glGetTexParameterIuivEXT);
-    FIND_PROC(glSamplerParameterIivEXT);
-    FIND_PROC(glSamplerParameterIuivEXT);
-    FIND_PROC(glGetSamplerParameterIivEXT);
-    FIND_PROC(glGetSamplerParameterIuivEXT);
-
-    #undef FIND_PROC
-
-    if (!glReadBuffer ||
-        !glDrawRangeElements ||
-        !glTexImage3D ||
-        !glTexSubImage3D ||
-        !glCopyTexSubImage3D ||
-        !glCompressedTexImage3D ||
-        !glCompressedTexSubImage3D ||
-        !glGenQueries ||
-        !glDeleteQueries ||
-        !glIsQuery ||
-        !glBeginQuery ||
-        !glEndQuery ||
-        !glGetQueryiv ||
-        !glGetQueryObjectuiv ||
-        !glUnmapBuffer ||
-        !glGetBufferPointerv ||
-        !glDrawBuffers ||
-        !glUniformMatrix2x3fv ||
-        !glUniformMatrix3x2fv ||
-        !glUniformMatrix2x4fv ||
-        !glUniformMatrix4x2fv ||
-        !glUniformMatrix3x4fv ||
-        !glUniformMatrix4x3fv ||
-        !glBlitFramebuffer ||
-        !glRenderbufferStorageMultisample ||
-        !glFramebufferTextureLayer ||
-        !glMapBufferRange ||
-        !glFlushMappedBufferRange ||
-        !glBindVertexArray ||
-        !glDeleteVertexArrays ||
-        !glGenVertexArrays ||
-        !glIsVertexArray ||
-        !glGetIntegeri_v ||
-        !glBeginTransformFeedback ||
-        !glEndTransformFeedback ||
-        !glBindBufferRange ||
-        !glBindBufferBase ||
-        !glTransformFeedbackVaryings ||
-        !glGetTransformFeedbackVarying ||
-        !glVertexAttribIPointer ||
-        !glGetVertexAttribIiv ||
-        !glGetVertexAttribIuiv ||
-        !glVertexAttribI4i ||
-        !glVertexAttribI4ui ||
-        !glVertexAttribI4iv ||
-        !glVertexAttribI4uiv ||
-        !glGetUniformuiv ||
-        !glGetFragDataLocation ||
-        !glUniform1ui ||
-        !glUniform2ui ||
-        !glUniform3ui ||
-        !glUniform4ui ||
-        !glUniform1uiv ||
-        !glUniform2uiv ||
-        !glUniform3uiv ||
-        !glUniform4uiv ||
-        !glClearBufferiv ||
-        !glClearBufferuiv ||
-        !glClearBufferfv ||
-        !glClearBufferfi ||
-        !glGetStringi ||
-        !glCopyBufferSubData ||
-        !glGetUniformIndices ||
-        !glGetActiveUniformsiv ||
-        !glGetUniformBlockIndex ||
-        !glGetActiveUniformBlockiv ||
-        !glGetActiveUniformBlockName ||
-        !glUniformBlockBinding ||
-        !glDrawArraysInstanced ||
-        !glDrawElementsInstanced ||
-        !glFenceSync ||
-        !glIsSync ||
-        !glDeleteSync ||
-        !glClientWaitSync ||
-        !glWaitSync ||
-        !glGetInteger64v ||
-        !glGetSynciv ||
-        !glGetInteger64i_v ||
-        !glGetBufferParameteri64v ||
-        !glGenSamplers ||
-        !glDeleteSamplers ||
-        !glIsSampler ||
-        !glBindSampler ||
-        !glSamplerParameteri ||
-        !glSamplerParameteriv ||
-        !glSamplerParameterf ||
-        !glSamplerParameterfv ||
-        !glGetSamplerParameteriv ||
-        !glGetSamplerParameterfv ||
-        !glVertexAttribDivisor ||
-        !glBindTransformFeedback ||
-        !glDeleteTransformFeedbacks ||
-        !glGenTransformFeedbacks ||
-        !glIsTransformFeedback ||
-        !glPauseTransformFeedback ||
-        !glResumeTransformFeedback ||
-        !glGetProgramBinary ||
-        !glProgramBinary ||
-        !glProgramParameteri ||
-        !glInvalidateFramebuffer ||
-        !glInvalidateSubFramebuffer ||
-        !glTexStorage2D ||
-        !glTexStorage3D ||
-        !glGetInternalformativ)
-    {
-        return GL_FALSE;
-    }
-
-    return GL_TRUE;
-}
-
-/* Function pointer definitions */
-GL_APICALL void           (* GL_APIENTRY glReadBuffer) (GLenum mode);
-GL_APICALL void           (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
-GL_APICALL void           (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
-GL_APICALL void           (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
-GL_APICALL void           (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
-GL_APICALL void           (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
-GL_APICALL void           (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
-GL_APICALL void           (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
-GL_APICALL void           (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
-GL_APICALL GLboolean      (* GL_APIENTRY glIsQuery) (GLuint id);
-GL_APICALL void           (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
-GL_APICALL void           (* GL_APIENTRY glEndQuery) (GLenum target);
-GL_APICALL void           (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
-GL_APICALL void           (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
-GL_APICALL GLboolean      (* GL_APIENTRY glUnmapBuffer) (GLenum target);
-GL_APICALL void           (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
-GL_APICALL void           (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-GL_APICALL void           (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-GL_APICALL void           (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
-GL_APICALL GLvoid*        (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-GL_APICALL void           (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
-GL_APICALL void           (* GL_APIENTRY glBindVertexArray) (GLuint array);
-GL_APICALL void           (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
-GL_APICALL void           (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
-GL_APICALL GLboolean      (* GL_APIENTRY glIsVertexArray) (GLuint array);
-GL_APICALL void           (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
-GL_APICALL void           (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
-GL_APICALL void           (* GL_APIENTRY glEndTransformFeedback) (void);
-GL_APICALL void           (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
-GL_APICALL void           (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
-GL_APICALL void           (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
-GL_APICALL void           (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
-GL_APICALL void           (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
-GL_APICALL void           (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
-GL_APICALL void           (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
-GL_APICALL GLint          (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
-GL_APICALL void           (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
-GL_APICALL void           (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
-GL_APICALL void           (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
-GL_APICALL void           (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
-GL_APICALL void           (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void           (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void           (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void           (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void           (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
-GL_APICALL void           (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
-GL_APICALL void           (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
-GL_APICALL void           (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
-GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
-GL_APICALL void           (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
-GL_APICALL void           (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
-GL_APICALL void           (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
-GL_APICALL GLuint         (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
-GL_APICALL void           (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
-GL_APICALL void           (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
-GL_APICALL void           (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-GL_APICALL void           (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
-GL_APICALL void           (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
-GL_APICALL GLsync         (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
-GL_APICALL GLboolean      (* GL_APIENTRY glIsSync) (GLsync sync);
-GL_APICALL void           (* GL_APIENTRY glDeleteSync) (GLsync sync);
-GL_APICALL GLenum         (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-GL_APICALL void           (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-GL_APICALL void           (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
-GL_APICALL void           (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
-GL_APICALL void           (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
-GL_APICALL void           (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
-GL_APICALL void           (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
-GL_APICALL void           (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
-GL_APICALL GLboolean      (* GL_APIENTRY glIsSampler) (GLuint sampler);
-GL_APICALL void           (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
-GL_APICALL void           (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
-GL_APICALL void           (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
-GL_APICALL void           (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
-GL_APICALL void           (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
-GL_APICALL void           (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
-GL_APICALL void           (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
-GL_APICALL void           (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
-GL_APICALL void           (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
-GL_APICALL void           (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
-GL_APICALL void           (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
-GL_APICALL GLboolean      (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
-GL_APICALL void           (* GL_APIENTRY glPauseTransformFeedback) (void);
-GL_APICALL void           (* GL_APIENTRY glResumeTransformFeedback) (void);
-GL_APICALL void           (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
-GL_APICALL void           (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
-GL_APICALL void           (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
-GL_APICALL void           (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
-GL_APICALL void           (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
-GL_APICALL void           (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-GL_APICALL void           (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-GL_APICALL void           (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
-
-// GL_EXT_texture_border_clamp
-
-GL_APICALL void           (* GL_APIENTRY  glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
-GL_APICALL void           (* GL_APIENTRY  glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
-GL_APICALL void           (* GL_APIENTRY  glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
-GL_APICALL void           (* GL_APIENTRY  glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
-GL_APICALL void           (* GL_APIENTRY  glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
-GL_APICALL void           (* GL_APIENTRY  glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
-GL_APICALL void           (* GL_APIENTRY  glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
-GL_APICALL void           (* GL_APIENTRY  glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
-
-// End GL_EXT_texture_border_clamp
-
-// clang-format on
diff --git a/caffe2/mobile/contrib/opengl/android/gl3stub.h b/caffe2/mobile/contrib/opengl/android/gl3stub.h
deleted file mode 100644 (file)
index 49637b6..0000000
+++ /dev/null
@@ -1,488 +0,0 @@
-
-#ifndef __gl3_h_
-#define __gl3_h_
-
-/*
- * stub gl3.h for dynamic loading, based on:
- * gl3.h last updated on $Date: 2013-02-12 14:37:24 -0800 (Tue, 12 Feb 2013) $
- *
- * Changes:
- * - Added #include <GLES2/gl2.h>
- * - Removed duplicate OpenGL ES 2.0 declarations
- * - Converted OpenGL ES 3.0 function prototypes to function pointer
- *   declarations
- * - Added gl3stubInit() declaration
- */
-
-#include <GLES2/gl2.h>
-#include <android/api-level.h>
-
-// clang-format off
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Call this function before calling any OpenGL ES 3.0 functions. It will
- * return GL_TRUE if the OpenGL ES 3.0 was successfully initialized, GL_FALSE
- * otherwise. */
-GLboolean gl3stubInit();
-
-/*-------------------------------------------------------------------------
- * Data type definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES 3.0 */
-
-typedef unsigned short   GLhalf;
-#if __ANDROID_API__ <= 19
-typedef khronos_int64_t  GLint64;
-typedef khronos_uint64_t GLuint64;
-typedef struct __GLsync *GLsync;
-#endif
-
-/*-------------------------------------------------------------------------
- * Token definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES core versions */
-#define GL_ES_VERSION_3_0                                1
-
-/* OpenGL ES 3.0 */
-
-#define GL_READ_BUFFER                                   0x0C02
-#define GL_UNPACK_ROW_LENGTH                             0x0CF2
-#define GL_UNPACK_SKIP_ROWS                              0x0CF3
-#define GL_UNPACK_SKIP_PIXELS                            0x0CF4
-#define GL_PACK_ROW_LENGTH                               0x0D02
-#define GL_PACK_SKIP_ROWS                                0x0D03
-#define GL_PACK_SKIP_PIXELS                              0x0D04
-#define GL_COLOR                                         0x1800
-#define GL_DEPTH                                         0x1801
-#define GL_STENCIL                                       0x1802
-#define GL_RED                                           0x1903
-#define GL_RGB8                                          0x8051
-#define GL_RGBA8                                         0x8058
-#define GL_RGB10_A2                                      0x8059
-#define GL_TEXTURE_BINDING_3D                            0x806A
-#define GL_UNPACK_SKIP_IMAGES                            0x806D
-#define GL_UNPACK_IMAGE_HEIGHT                           0x806E
-#define GL_TEXTURE_3D                                    0x806F
-#define GL_TEXTURE_WRAP_R                                0x8072
-#define GL_MAX_3D_TEXTURE_SIZE                           0x8073
-#define GL_UNSIGNED_INT_2_10_10_10_REV                   0x8368
-#define GL_MAX_ELEMENTS_VERTICES                         0x80E8
-#define GL_MAX_ELEMENTS_INDICES                          0x80E9
-#define GL_TEXTURE_MIN_LOD                               0x813A
-#define GL_TEXTURE_MAX_LOD                               0x813B
-#define GL_TEXTURE_BASE_LEVEL                            0x813C
-#define GL_TEXTURE_MAX_LEVEL                             0x813D
-#define GL_MIN                                           0x8007
-#define GL_MAX                                           0x8008
-#define GL_DEPTH_COMPONENT24                             0x81A6
-#define GL_MAX_TEXTURE_LOD_BIAS                          0x84FD
-#define GL_TEXTURE_COMPARE_MODE                          0x884C
-#define GL_TEXTURE_COMPARE_FUNC                          0x884D
-#define GL_CURRENT_QUERY                                 0x8865
-#define GL_QUERY_RESULT                                  0x8866
-#define GL_QUERY_RESULT_AVAILABLE                        0x8867
-#define GL_BUFFER_MAPPED                                 0x88BC
-#define GL_BUFFER_MAP_POINTER                            0x88BD
-#define GL_STREAM_READ                                   0x88E1
-#define GL_STREAM_COPY                                   0x88E2
-#define GL_STATIC_READ                                   0x88E5
-#define GL_STATIC_COPY                                   0x88E6
-#define GL_DYNAMIC_READ                                  0x88E9
-#define GL_DYNAMIC_COPY                                  0x88EA
-#define GL_MAX_DRAW_BUFFERS                              0x8824
-#define GL_DRAW_BUFFER0                                  0x8825
-#define GL_DRAW_BUFFER1                                  0x8826
-#define GL_DRAW_BUFFER2                                  0x8827
-#define GL_DRAW_BUFFER3                                  0x8828
-#define GL_DRAW_BUFFER4                                  0x8829
-#define GL_DRAW_BUFFER5                                  0x882A
-#define GL_DRAW_BUFFER6                                  0x882B
-#define GL_DRAW_BUFFER7                                  0x882C
-#define GL_DRAW_BUFFER8                                  0x882D
-#define GL_DRAW_BUFFER9                                  0x882E
-#define GL_DRAW_BUFFER10                                 0x882F
-#define GL_DRAW_BUFFER11                                 0x8830
-#define GL_DRAW_BUFFER12                                 0x8831
-#define GL_DRAW_BUFFER13                                 0x8832
-#define GL_DRAW_BUFFER14                                 0x8833
-#define GL_DRAW_BUFFER15                                 0x8834
-#define GL_MAX_FRAGMENT_UNIFORM_COMPONENTS               0x8B49
-#define GL_MAX_VERTEX_UNIFORM_COMPONENTS                 0x8B4A
-#define GL_SAMPLER_3D                                    0x8B5F
-#define GL_SAMPLER_2D_SHADOW                             0x8B62
-#define GL_FRAGMENT_SHADER_DERIVATIVE_HINT               0x8B8B
-#define GL_PIXEL_PACK_BUFFER                             0x88EB
-#define GL_PIXEL_UNPACK_BUFFER                           0x88EC
-#define GL_PIXEL_PACK_BUFFER_BINDING                     0x88ED
-#define GL_PIXEL_UNPACK_BUFFER_BINDING                   0x88EF
-#define GL_FLOAT_MAT2x3                                  0x8B65
-#define GL_FLOAT_MAT2x4                                  0x8B66
-#define GL_FLOAT_MAT3x2                                  0x8B67
-#define GL_FLOAT_MAT3x4                                  0x8B68
-#define GL_FLOAT_MAT4x2                                  0x8B69
-#define GL_FLOAT_MAT4x3                                  0x8B6A
-#define GL_SRGB                                          0x8C40
-#define GL_SRGB8                                         0x8C41
-#define GL_SRGB8_ALPHA8                                  0x8C43
-#define GL_COMPARE_REF_TO_TEXTURE                        0x884E
-#define GL_MAJOR_VERSION                                 0x821B
-#define GL_MINOR_VERSION                                 0x821C
-#define GL_NUM_EXTENSIONS                                0x821D
-#define GL_RGBA32F                                       0x8814
-#define GL_RGB32F                                        0x8815
-#define GL_RGBA16F                                       0x881A
-#define GL_RGB16F                                        0x881B
-#define GL_VERTEX_ATTRIB_ARRAY_INTEGER                   0x88FD
-#define GL_MAX_ARRAY_TEXTURE_LAYERS                      0x88FF
-#define GL_MIN_PROGRAM_TEXEL_OFFSET                      0x8904
-#define GL_MAX_PROGRAM_TEXEL_OFFSET                      0x8905
-#define GL_MAX_VARYING_COMPONENTS                        0x8B4B
-#define GL_TEXTURE_2D_ARRAY                              0x8C1A
-#define GL_TEXTURE_BINDING_2D_ARRAY                      0x8C1D
-#define GL_R11F_G11F_B10F                                0x8C3A
-#define GL_UNSIGNED_INT_10F_11F_11F_REV                  0x8C3B
-#define GL_RGB9_E5                                       0x8C3D
-#define GL_UNSIGNED_INT_5_9_9_9_REV                      0x8C3E
-#define GL_TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH         0x8C76
-#define GL_TRANSFORM_FEEDBACK_BUFFER_MODE                0x8C7F
-#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS    0x8C80
-#define GL_TRANSFORM_FEEDBACK_VARYINGS                   0x8C83
-#define GL_TRANSFORM_FEEDBACK_BUFFER_START               0x8C84
-#define GL_TRANSFORM_FEEDBACK_BUFFER_SIZE                0x8C85
-#define GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN         0x8C88
-#define GL_RASTERIZER_DISCARD                            0x8C89
-#define GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 0x8C8A
-#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS       0x8C8B
-#define GL_INTERLEAVED_ATTRIBS                           0x8C8C
-#define GL_SEPARATE_ATTRIBS                              0x8C8D
-#define GL_TRANSFORM_FEEDBACK_BUFFER                     0x8C8E
-#define GL_TRANSFORM_FEEDBACK_BUFFER_BINDING             0x8C8F
-#define GL_RGBA32UI                                      0x8D70
-#define GL_RGB32UI                                       0x8D71
-#define GL_RGBA16UI                                      0x8D76
-#define GL_RGB16UI                                       0x8D77
-#define GL_RGBA8UI                                       0x8D7C
-#define GL_RGB8UI                                        0x8D7D
-#define GL_RGBA32I                                       0x8D82
-#define GL_RGB32I                                        0x8D83
-#define GL_RGBA16I                                       0x8D88
-#define GL_RGB16I                                        0x8D89
-#define GL_RGBA8I                                        0x8D8E
-#define GL_RGB8I                                         0x8D8F
-#define GL_RED_INTEGER                                   0x8D94
-#define GL_RGB_INTEGER                                   0x8D98
-#define GL_RGBA_INTEGER                                  0x8D99
-#define GL_SAMPLER_2D_ARRAY                              0x8DC1
-#define GL_SAMPLER_2D_ARRAY_SHADOW                       0x8DC4
-#define GL_SAMPLER_CUBE_SHADOW                           0x8DC5
-#define GL_UNSIGNED_INT_VEC2                             0x8DC6
-#define GL_UNSIGNED_INT_VEC3                             0x8DC7
-#define GL_UNSIGNED_INT_VEC4                             0x8DC8
-#define GL_INT_SAMPLER_2D                                0x8DCA
-#define GL_INT_SAMPLER_3D                                0x8DCB
-#define GL_INT_SAMPLER_CUBE                              0x8DCC
-#define GL_INT_SAMPLER_2D_ARRAY                          0x8DCF
-#define GL_UNSIGNED_INT_SAMPLER_2D                       0x8DD2
-#define GL_UNSIGNED_INT_SAMPLER_3D                       0x8DD3
-#define GL_UNSIGNED_INT_SAMPLER_CUBE                     0x8DD4
-#define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY                 0x8DD7
-#define GL_BUFFER_ACCESS_FLAGS                           0x911F
-#define GL_BUFFER_MAP_LENGTH                             0x9120
-#define GL_BUFFER_MAP_OFFSET                             0x9121
-#define GL_DEPTH_COMPONENT32F                            0x8CAC
-#define GL_DEPTH32F_STENCIL8                             0x8CAD
-#define GL_FLOAT_32_UNSIGNED_INT_24_8_REV                0x8DAD
-#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING         0x8210
-#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE         0x8211
-#define GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE               0x8212
-#define GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE             0x8213
-#define GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE              0x8214
-#define GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE             0x8215
-#define GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE             0x8216
-#define GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE           0x8217
-#define GL_FRAMEBUFFER_DEFAULT                           0x8218
-#define GL_FRAMEBUFFER_UNDEFINED                         0x8219
-#define GL_DEPTH_STENCIL_ATTACHMENT                      0x821A
-#define GL_DEPTH_STENCIL                                 0x84F9
-#define GL_UNSIGNED_INT_24_8                             0x84FA
-#define GL_DEPTH24_STENCIL8                              0x88F0
-#define GL_UNSIGNED_NORMALIZED                           0x8C17
-#define GL_DRAW_FRAMEBUFFER_BINDING                      GL_FRAMEBUFFER_BINDING
-#define GL_READ_FRAMEBUFFER                              0x8CA8
-#define GL_DRAW_FRAMEBUFFER                              0x8CA9
-#define GL_READ_FRAMEBUFFER_BINDING                      0x8CAA
-#define GL_RENDERBUFFER_SAMPLES                          0x8CAB
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER          0x8CD4
-#define GL_MAX_COLOR_ATTACHMENTS                         0x8CDF
-#define GL_COLOR_ATTACHMENT1                             0x8CE1
-#define GL_COLOR_ATTACHMENT2                             0x8CE2
-#define GL_COLOR_ATTACHMENT3                             0x8CE3
-#define GL_COLOR_ATTACHMENT4                             0x8CE4
-#define GL_COLOR_ATTACHMENT5                             0x8CE5
-#define GL_COLOR_ATTACHMENT6                             0x8CE6
-#define GL_COLOR_ATTACHMENT7                             0x8CE7
-#define GL_COLOR_ATTACHMENT8                             0x8CE8
-#define GL_COLOR_ATTACHMENT9                             0x8CE9
-#define GL_COLOR_ATTACHMENT10                            0x8CEA
-#define GL_COLOR_ATTACHMENT11                            0x8CEB
-#define GL_COLOR_ATTACHMENT12                            0x8CEC
-#define GL_COLOR_ATTACHMENT13                            0x8CED
-#define GL_COLOR_ATTACHMENT14                            0x8CEE
-#define GL_COLOR_ATTACHMENT15                            0x8CEF
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE            0x8D56
-#define GL_MAX_SAMPLES                                   0x8D57
-#define GL_HALF_FLOAT                                    0x140B
-#define GL_MAP_READ_BIT                                  0x0001
-#define GL_MAP_WRITE_BIT                                 0x0002
-#define GL_MAP_INVALIDATE_RANGE_BIT                      0x0004
-#define GL_MAP_INVALIDATE_BUFFER_BIT                     0x0008
-#define GL_MAP_FLUSH_EXPLICIT_BIT                        0x0010
-#define GL_MAP_UNSYNCHRONIZED_BIT                        0x0020
-#define GL_RG                                            0x8227
-#define GL_RG_INTEGER                                    0x8228
-#define GL_R8                                            0x8229
-#define GL_RG8                                           0x822B
-#define GL_R16F                                          0x822D
-#define GL_R32F                                          0x822E
-#define GL_RG16F                                         0x822F
-#define GL_RG32F                                         0x8230
-#define GL_R8I                                           0x8231
-#define GL_R8UI                                          0x8232
-#define GL_R16I                                          0x8233
-#define GL_R16UI                                         0x8234
-#define GL_R32I                                          0x8235
-#define GL_R32UI                                         0x8236
-#define GL_RG8I                                          0x8237
-#define GL_RG8UI                                         0x8238
-#define GL_RG16I                                         0x8239
-#define GL_RG16UI                                        0x823A
-#define GL_RG32I                                         0x823B
-#define GL_RG32UI                                        0x823C
-#define GL_VERTEX_ARRAY_BINDING                          0x85B5
-#define GL_R8_SNORM                                      0x8F94
-#define GL_RG8_SNORM                                     0x8F95
-#define GL_RGB8_SNORM                                    0x8F96
-#define GL_RGBA8_SNORM                                   0x8F97
-#define GL_SIGNED_NORMALIZED                             0x8F9C
-#define GL_PRIMITIVE_RESTART_FIXED_INDEX                 0x8D69
-#define GL_COPY_READ_BUFFER                              0x8F36
-#define GL_COPY_WRITE_BUFFER                             0x8F37
-#define GL_COPY_READ_BUFFER_BINDING                      GL_COPY_READ_BUFFER
-#define GL_COPY_WRITE_BUFFER_BINDING                     GL_COPY_WRITE_BUFFER
-#define GL_UNIFORM_BUFFER                                0x8A11
-#define GL_UNIFORM_BUFFER_BINDING                        0x8A28
-#define GL_UNIFORM_BUFFER_START                          0x8A29
-#define GL_UNIFORM_BUFFER_SIZE                           0x8A2A
-#define GL_MAX_VERTEX_UNIFORM_BLOCKS                     0x8A2B
-#define GL_MAX_FRAGMENT_UNIFORM_BLOCKS                   0x8A2D
-#define GL_MAX_COMBINED_UNIFORM_BLOCKS                   0x8A2E
-#define GL_MAX_UNIFORM_BUFFER_BINDINGS                   0x8A2F
-#define GL_MAX_UNIFORM_BLOCK_SIZE                        0x8A30
-#define GL_MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS        0x8A31
-#define GL_MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS      0x8A33
-#define GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT               0x8A34
-#define GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH          0x8A35
-#define GL_ACTIVE_UNIFORM_BLOCKS                         0x8A36
-#define GL_UNIFORM_TYPE                                  0x8A37
-#define GL_UNIFORM_SIZE                                  0x8A38
-#define GL_UNIFORM_NAME_LENGTH                           0x8A39
-#define GL_UNIFORM_BLOCK_INDEX                           0x8A3A
-#define GL_UNIFORM_OFFSET                                0x8A3B
-#define GL_UNIFORM_ARRAY_STRIDE                          0x8A3C
-#define GL_UNIFORM_MATRIX_STRIDE                         0x8A3D
-#define GL_UNIFORM_IS_ROW_MAJOR                          0x8A3E
-#define GL_UNIFORM_BLOCK_BINDING                         0x8A3F
-#define GL_UNIFORM_BLOCK_DATA_SIZE                       0x8A40
-#define GL_UNIFORM_BLOCK_NAME_LENGTH                     0x8A41
-#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORMS                 0x8A42
-#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES          0x8A43
-#define GL_UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER     0x8A44
-#define GL_UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER   0x8A46
-#define GL_INVALID_INDEX                                 0xFFFFFFFFu
-#define GL_MAX_VERTEX_OUTPUT_COMPONENTS                  0x9122
-#define GL_MAX_FRAGMENT_INPUT_COMPONENTS                 0x9125
-#define GL_MAX_SERVER_WAIT_TIMEOUT                       0x9111
-#define GL_OBJECT_TYPE                                   0x9112
-#define GL_SYNC_CONDITION                                0x9113
-#define GL_SYNC_STATUS                                   0x9114
-#define GL_SYNC_FLAGS                                    0x9115
-#define GL_SYNC_FENCE                                    0x9116
-#define GL_SYNC_GPU_COMMANDS_COMPLETE                    0x9117
-#define GL_UNSIGNALED                                    0x9118
-#define GL_SIGNALED                                      0x9119
-#define GL_ALREADY_SIGNALED                              0x911A
-#define GL_TIMEOUT_EXPIRED                               0x911B
-#define GL_CONDITION_SATISFIED                           0x911C
-#define GL_WAIT_FAILED                                   0x911D
-#define GL_SYNC_FLUSH_COMMANDS_BIT                       0x00000001
-#define GL_TIMEOUT_IGNORED                               0xFFFFFFFFFFFFFFFFull
-#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR                   0x88FE
-#define GL_ANY_SAMPLES_PASSED                            0x8C2F
-#define GL_ANY_SAMPLES_PASSED_CONSERVATIVE               0x8D6A
-#define GL_SAMPLER_BINDING                               0x8919
-#define GL_RGB10_A2UI                                    0x906F
-#define GL_TEXTURE_SWIZZLE_R                             0x8E42
-#define GL_TEXTURE_SWIZZLE_G                             0x8E43
-#define GL_TEXTURE_SWIZZLE_B                             0x8E44
-#define GL_TEXTURE_SWIZZLE_A                             0x8E45
-#define GL_GREEN                                         0x1904
-#define GL_BLUE                                          0x1905
-#define GL_INT_2_10_10_10_REV                            0x8D9F
-#define GL_TRANSFORM_FEEDBACK                            0x8E22
-#define GL_TRANSFORM_FEEDBACK_PAUSED                     0x8E23
-#define GL_TRANSFORM_FEEDBACK_ACTIVE                     0x8E24
-#define GL_TRANSFORM_FEEDBACK_BINDING                    0x8E25
-#define GL_PROGRAM_BINARY_RETRIEVABLE_HINT               0x8257
-#define GL_PROGRAM_BINARY_LENGTH                         0x8741
-#define GL_NUM_PROGRAM_BINARY_FORMATS                    0x87FE
-#define GL_PROGRAM_BINARY_FORMATS                        0x87FF
-#define GL_COMPRESSED_R11_EAC                            0x9270
-#define GL_COMPRESSED_SIGNED_R11_EAC                     0x9271
-#define GL_COMPRESSED_RG11_EAC                           0x9272
-#define GL_COMPRESSED_SIGNED_RG11_EAC                    0x9273
-#define GL_COMPRESSED_RGB8_ETC2                          0x9274
-#define GL_COMPRESSED_SRGB8_ETC2                         0x9275
-#define GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2      0x9276
-#define GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2     0x9277
-#define GL_COMPRESSED_RGBA8_ETC2_EAC                     0x9278
-#define GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC              0x9279
-#define GL_TEXTURE_IMMUTABLE_FORMAT                      0x912F
-#define GL_MAX_ELEMENT_INDEX                             0x8D6B
-#define GL_NUM_SAMPLE_COUNTS                             0x9380
-#define GL_TEXTURE_IMMUTABLE_LEVELS                      0x82DF
-
-/*-------------------------------------------------------------------------
- * Entrypoint definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES 3.0 */
-
-extern GL_APICALL void           (* GL_APIENTRY glReadBuffer) (GLenum mode);
-extern GL_APICALL void           (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
-extern GL_APICALL void           (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
-extern GL_APICALL void           (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
-extern GL_APICALL void           (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
-extern GL_APICALL void           (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
-extern GL_APICALL void           (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
-extern GL_APICALL void           (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
-extern GL_APICALL void           (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glIsQuery) (GLuint id);
-extern GL_APICALL void           (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
-extern GL_APICALL void           (* GL_APIENTRY glEndQuery) (GLenum target);
-extern GL_APICALL void           (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
-extern GL_APICALL void           (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glUnmapBuffer) (GLenum target);
-extern GL_APICALL void           (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
-extern GL_APICALL void           (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-extern GL_APICALL void           (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-extern GL_APICALL void           (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
-extern GL_APICALL GLvoid*        (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-extern GL_APICALL void           (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
-extern GL_APICALL void           (* GL_APIENTRY glBindVertexArray) (GLuint array);
-extern GL_APICALL void           (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
-extern GL_APICALL void           (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glIsVertexArray) (GLuint array);
-extern GL_APICALL void           (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
-extern GL_APICALL void           (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
-extern GL_APICALL void           (* GL_APIENTRY glEndTransformFeedback) (void);
-extern GL_APICALL void           (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
-extern GL_APICALL void           (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
-extern GL_APICALL void           (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
-extern GL_APICALL void           (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
-extern GL_APICALL void           (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
-extern GL_APICALL void           (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
-extern GL_APICALL void           (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
-extern GL_APICALL GLint          (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
-extern GL_APICALL void           (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
-extern GL_APICALL void           (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
-extern GL_APICALL void           (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
-extern GL_APICALL void           (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
-extern GL_APICALL void           (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void           (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void           (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
-extern GL_APICALL void           (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
-extern GL_APICALL void           (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
-extern GL_APICALL void           (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
-extern GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
-extern GL_APICALL void           (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
-extern GL_APICALL void           (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
-extern GL_APICALL void           (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
-extern GL_APICALL GLuint         (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
-extern GL_APICALL void           (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
-extern GL_APICALL void           (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
-extern GL_APICALL void           (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-extern GL_APICALL void           (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
-extern GL_APICALL void           (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
-extern GL_APICALL GLsync         (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glIsSync) (GLsync sync);
-extern GL_APICALL void           (* GL_APIENTRY glDeleteSync) (GLsync sync);
-extern GL_APICALL GLenum         (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-extern GL_APICALL void           (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-extern GL_APICALL void           (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
-extern GL_APICALL void           (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
-extern GL_APICALL void           (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
-extern GL_APICALL void           (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
-extern GL_APICALL void           (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
-extern GL_APICALL void           (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glIsSampler) (GLuint sampler);
-extern GL_APICALL void           (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
-extern GL_APICALL void           (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
-extern GL_APICALL void           (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
-extern GL_APICALL void           (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
-extern GL_APICALL void           (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
-extern GL_APICALL void           (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
-extern GL_APICALL void           (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
-extern GL_APICALL void           (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
-extern GL_APICALL void           (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
-extern GL_APICALL void           (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
-extern GL_APICALL void           (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
-extern GL_APICALL GLboolean      (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
-extern GL_APICALL void           (* GL_APIENTRY glPauseTransformFeedback) (void);
-extern GL_APICALL void           (* GL_APIENTRY glResumeTransformFeedback) (void);
-extern GL_APICALL void           (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
-extern GL_APICALL void           (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
-extern GL_APICALL void           (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
-extern GL_APICALL void           (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
-extern GL_APICALL void           (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
-extern GL_APICALL void           (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-extern GL_APICALL void           (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-extern GL_APICALL void           (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
-
-#ifndef GL_EXT_texture_border_clamp
-#define GL_EXT_texture_border_clamp 1
-#define GL_TEXTURE_BORDER_COLOR_EXT       0x1004
-#define GL_CLAMP_TO_BORDER_EXT            0x812D
-extern GL_APICALL void           (* GL_APIENTRY  glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
-extern GL_APICALL void           (* GL_APIENTRY  glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
-extern GL_APICALL void           (* GL_APIENTRY  glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
-extern GL_APICALL void           (* GL_APIENTRY  glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
-extern GL_APICALL void           (* GL_APIENTRY  glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
-extern GL_APICALL void           (* GL_APIENTRY  glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
-extern GL_APICALL void           (* GL_APIENTRY  glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
-extern GL_APICALL void           (* GL_APIENTRY  glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
-#endif /* GL_EXT_texture_border_clamp */
-
-#ifdef __cplusplus
-}
-#endif
-// clang-format on
-
-#endif
diff --git a/caffe2/mobile/contrib/opengl/core/CMakeLists.txt b/caffe2/mobile/contrib/opengl/core/CMakeLists.txt
deleted file mode 100644 (file)
index dbc170e..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-file(GLOB_RECURSE tmp *.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/core/DataTransfer.cc b/caffe2/mobile/contrib/opengl/core/DataTransfer.cc
deleted file mode 100644 (file)
index 775d921..0000000
+++ /dev/null
@@ -1,249 +0,0 @@
-
-#include "DataTransfer.h"
-#include "GLLogging.h"
-
-#include "caffe2/core/common.h"
-
-inline uint16x4x4_t vld4_u16_aligned16(const uint16_t* address) {
-  return vld4_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 16)));
-}
-
-inline uint16x4_t vld1_u16_aligned8(const uint16_t* address) {
-  return vld1_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 8)));
-}
-
-inline void vst4_u16_aligned16(uint16_t* address, uint16x4x4_t data) {
-  vst4_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 16)), data);
-}
-
-inline void vst1_u16_aligned8(uint16_t* address, uint16x4_t data) {
-  vst1_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 8)), data);
-}
-
-template <int input_channels>
-static void interleaveSlice(
-    void* output, const float* input, size_t width, size_t height, size_t row_stride) {
-  const float* input_r = input;
-  const float* input_g = input_r + height * width;
-  const float* input_b = input_g + height * width;
-  const float* input_a = input_b + height * width;
-  uint16_t* output_f16 = static_cast<uint16_t*>(output);
-  if (width >= 4) {
-    for (size_t y = 0; y < height; y++) {
-      size_t nx = width;
-      while (nx >= 4) {
-        const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
-        input_r += 4;
-        uint16x4_t g, b, a;
-        g = b = a = vdup_n_u16(0);
-        if (input_channels >= 2) {
-          g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
-          input_g += 4;
-          if (input_channels >= 3) {
-            b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
-            input_b += 4;
-            if (input_channels >= 4) {
-              a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
-              input_a += 4;
-            }
-          }
-        }
-
-        const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
-        vst4_u16_aligned16(output_f16, rgba);
-        output_f16 += 4 * 4;
-
-        nx -= 4;
-      }
-      if (nx != 0) {
-        output_f16 -= (4 - nx) * 4;
-        input_r -= 4 - nx;
-        if (input_channels >= 2) {
-          input_g -= 4 - nx;
-          if (input_channels >= 3) {
-            input_b -= 4 - nx;
-            if (input_channels >= 4) {
-              input_a -= 4 - nx;
-            }
-          }
-        }
-
-        const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
-        input_r += 4;
-        uint16x4_t g, b, a;
-        g = b = a = vdup_n_u16(0);
-        if (input_channels >= 2) {
-          g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
-          input_g += 4;
-          if (input_channels >= 3) {
-            b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
-            input_b += 4;
-            if (input_channels >= 4) {
-              a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
-              input_a += 4;
-            }
-          }
-        }
-
-        const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
-        vst4_u16_aligned16(output_f16, rgba);
-        output_f16 += 4 * 4;
-      }
-      output_f16 += (row_stride - width) * 4;
-    }
-  } else {
-    for (size_t y = 0; y < height; y++) {
-      for (size_t x = 0; x < width; x++) {
-        float32x4_t rgba = vld1q_dup_f32(input_r++);
-        if (input_channels >= 2) {
-          rgba = vld1q_lane_f32(input_g++, rgba, 1);
-          if (input_channels >= 3) {
-            rgba = vld1q_lane_f32(input_b++, rgba, 2);
-            if (input_channels >= 4) {
-              rgba = vld1q_lane_f32(input_a++, rgba, 3);
-            }
-          }
-        }
-        vst1_u16_aligned8(output_f16, uint16x4_t(vcvt_f16_f32(rgba)));
-        output_f16 += 4;
-      }
-      output_f16 += (row_stride - width) * 4;
-    }
-  }
-}
-
-void interleaveSlice(void* output,
-                     const float* input,
-                     size_t width,
-                     size_t height,
-                     size_t row_stride,
-                     uint16_t input_channels) {
-  switch (input_channels) {
-  case 1:
-    interleaveSlice<1>(output, input, width, height, row_stride);
-    break;
-  case 2:
-    interleaveSlice<2>(output, input, width, height, row_stride);
-    break;
-  case 3:
-    interleaveSlice<3>(output, input, width, height, row_stride);
-    break;
-  case 4:
-    interleaveSlice<4>(output, input, width, height, row_stride);
-    break;
-  }
-}
-
-template <int output_channels>
-static void deInterleaveSlice(
-    float* output, const void* input, size_t width, size_t height, size_t row_stride) {
-  float* output_r = output;
-  float* output_g = output_r + height * width;
-  float* output_b = output_g + height * width;
-  float* output_a = output_b + height * width;
-  const uint16_t* input_f16 = static_cast<const uint16_t*>(input);
-  if (width >= 4) {
-    for (size_t y = 0; y < height; y++) {
-      size_t nx = width;
-      while (nx >= 4) {
-        const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
-        input_f16 += 4 * 4;
-        const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
-        vst1q_f32(output_r, r);
-        output_r += 4;
-        if (output_channels >= 2) {
-          const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
-          vst1q_f32(output_g, g);
-          output_g += 4;
-          if (output_channels >= 3) {
-            const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
-            vst1q_f32(output_b, b);
-            output_b += 4;
-            if (output_channels >= 4) {
-              const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
-              vst1q_f32(output_a, a);
-              output_a += 4;
-            }
-          }
-        }
-
-        nx -= 4;
-      }
-      if (nx != 0) {
-        input_f16 -= (4 - nx) * 4;
-        output_r -= 4 - nx;
-        if (output_channels >= 2) {
-          output_g -= 4 - nx;
-          if (output_channels >= 3) {
-            output_b -= 4 - nx;
-            if (output_channels >= 4) {
-              output_a -= 4 - nx;
-            }
-          }
-        }
-
-        const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
-        input_f16 += 4 * 4;
-        const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
-        vst1q_f32(output_r, r);
-        output_r += 4;
-        if (output_channels >= 2) {
-          const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
-          vst1q_f32(output_g, g);
-          output_g += 4;
-          if (output_channels >= 3) {
-            const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
-            vst1q_f32(output_b, b);
-            output_b += 4;
-            if (output_channels >= 4) {
-              const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
-              vst1q_f32(output_a, a);
-              output_a += 4;
-            }
-          }
-        }
-      }
-      input_f16 += (row_stride - width) * 4;
-    }
-  } else {
-    for (size_t y = 0; y < height; y++) {
-      for (size_t x = 0; x < width; x++) {
-        const float32x4_t rgba = vcvt_f32_f16(float16x4_t(vld1_u16_aligned8(input_f16)));
-        input_f16 += 4;
-        vst1q_lane_f32(output_r++, rgba, 0);
-        if (output_channels >= 2) {
-          vst1q_lane_f32(output_g++, rgba, 1);
-          if (output_channels >= 3) {
-            vst1q_lane_f32(output_b++, rgba, 2);
-            if (output_channels >= 4) {
-              vst1q_lane_f32(output_a++, rgba, 3);
-            }
-          }
-        }
-      }
-      input_f16 += (row_stride - width) * 4;
-    }
-  }
-}
-
-void deInterleaveSlice(float* output,
-                       const void* input,
-                       size_t width,
-                       size_t height,
-                       size_t row_stride,
-                       uint32_t output_channels) {
-  switch (output_channels) {
-  case 1:
-    deInterleaveSlice<1>(output, input, width, height, row_stride);
-    break;
-  case 2:
-    deInterleaveSlice<2>(output, input, width, height, row_stride);
-    break;
-  case 3:
-    deInterleaveSlice<3>(output, input, width, height, row_stride);
-    break;
-  case 4:
-    deInterleaveSlice<4>(output, input, width, height, row_stride);
-    break;
-  }
-}
diff --git a/caffe2/mobile/contrib/opengl/core/DataTransfer.h b/caffe2/mobile/contrib/opengl/core/DataTransfer.h
deleted file mode 100644 (file)
index 59a91dd..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-
-#pragma once
-
-#include "arm_neon_support.h"
-
-void interleaveSlice(void* output,
-                     const float* input,
-                     size_t width,
-                     size_t height,
-                     size_t row_stride,
-                     uint16_t input_channels);
-void deInterleaveSlice(float* output,
-                       const void* input,
-                       size_t width,
-                       size_t height,
-                       size_t input_stride,
-                       uint32_t output_channels);
diff --git a/caffe2/mobile/contrib/opengl/core/GL.h b/caffe2/mobile/contrib/opengl/core/GL.h
deleted file mode 100644 (file)
index 31e1c0d..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-
-#pragma once
-#include "caffe2/core/common.h"
-
-#if CAFFE2_IOS
-#include <OpenGLES/ES3/gl.h>
-#include <OpenGLES/ES3/glext.h>
-#elif CAFFE2_ANDROID
-#include <EGL/egl.h>
-#include <GLES2/gl2.h>
-#include "caffe2/mobile/contrib/opengl/android/gl3stub.h"
-#endif
diff --git a/caffe2/mobile/contrib/opengl/core/GLContext.cc b/caffe2/mobile/contrib/opengl/core/GLContext.cc
deleted file mode 100644 (file)
index 0f5086c..0000000
+++ /dev/null
@@ -1,126 +0,0 @@
-
-#include "caffe2/core/logging.h"
-
-#include "GL.h"
-#include "GLContext.h"
-#include "GLLogging.h"
-
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-
-#if CAFFE2_IOS
-#include "sys/utsname.h"
-#include <regex>
-#endif
-
-void getOpenGLESVersion(int& major, int& minor) {
-  glGetIntegerv(GL_MAJOR_VERSION, &major);
-  glGetIntegerv(GL_MINOR_VERSION, &minor);
-}
-
-bool checkOpenGLExtensions(std::string gl_ext_str) {
-  static std::unordered_set<std::string> extensions;
-  if (extensions.empty()) {
-    const caffe2::string extension_str((const char*)glGetString(GL_EXTENSIONS));
-    LOG(INFO) << "GL_EXTENSIONS: " << extension_str;
-
-    std::stringstream ss(extension_str);
-    while (!ss.eof()) {
-      std::string extension;
-      ss >> extension;
-      extensions.insert(extension);
-    }
-  }
-
-  return extensions.count(gl_ext_str) > 0;
-}
-
-bool GLContext::GL_EXT_texture_border_clamp_defined() {
-  static int major = 0, minor = 0;
-  if (major == 0) {
-    getOpenGLESVersion(major, minor);
-  }
-
-  if (major == 3 && minor == 2) {
-    return true;
-  }
-
-  return checkOpenGLExtensions("GL_EXT_texture_border_clamp") || // Most common
-         checkOpenGLExtensions("GL_OES_texture_border_clamp");
-}
-
-bool supportOpenGLES3(bool* half_float_supported) {
-  int major = 0, minor = 0;
-  getOpenGLESVersion(major, minor);
-
-  LOG(INFO) << "GL_VERSION: OpenGL ES " << major << "." << minor;
-
-  if (major < 3) {
-    LOG(ERROR) << "OpenGL ES 3.0 not supported";
-    return false;
-  }
-
-  if (!checkOpenGLExtensions("GL_EXT_color_buffer_half_float")) {
-    LOG(ERROR) << "GL_EXT_color_buffer_half_float is not available";
-    if (half_float_supported) {
-      *half_float_supported = false;
-    }
-  }
-  return true;
-}
-
-#if CAFFE2_IOS
-int iPhoneVersion() {
-  static int version = 0;
-  static std::once_flag once;
-  std::call_once(once, [&]() {
-    struct utsname systemInfo;
-    uname(&systemInfo);
-    std::string iphone_ver_str = systemInfo.machine;
-    LOG(INFO) << systemInfo.machine;
-
-    if (iphone_ver_str.find("iPhone") != std::string::npos) {
-      std::regex regStr("([0-9]+)");
-      std::smatch matchs;
-      if (std::regex_search(iphone_ver_str, matchs, regStr)) {
-        version = stoi(matchs[0]);
-      }
-    }
-  });
-  return version;
-}
-#endif
-
-#if CAFFE2_ANDROID
-// whitelist of supported GPUs
-bool isSupportedRenderer() {
-  static std::unordered_set<std::string> supported_renderers = {
-      "Adreno (TM) 540",
-      "Adreno (TM) 530",
-      "Adreno (TM) 510",
-      "Adreno (TM) 430",
-      "Adreno (TM) 418",
-      "Mali-G71",
-      "Mali-T880",
-      "NVIDIA Tegra"};
-  std::string rendererStr((const char*)glGetString(GL_RENDERER));
-  LOG(INFO) << "GL_RENDERER: " << rendererStr;
-
-  int start = rendererStr.find_first_not_of(" ");
-  int end = rendererStr.find_last_not_of(" ");
-  rendererStr = rendererStr.substr(start, end - start + 1);
-  return supported_renderers.count(rendererStr) > 0;
-}
-#endif
-
-bool isSupportedDevice() {
-#if CAFFE2_IOS
-  return iPhoneVersion() >= 7; // iPhone 6 and up
-#elif CAFFE2_ANDROID
-  return isSupportedRenderer();
-#else
-  return false;
-#endif
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLContext.h b/caffe2/mobile/contrib/opengl/core/GLContext.h
deleted file mode 100644 (file)
index fc84f8f..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#pragma once
-#include "GLTexture.h"
-#include "caffe2/core/common.h"
-#include <functional>
-
-class GLContext {
- private:
-  static std::unique_ptr<GLContext> _glcontext;
-  std::function<const GLTexture*(const int width, const int height)> foreignTextureAllocator =
-      nullptr;
-
- protected:
-  bool half_float_supported = true;
-
- public:
-  virtual void set_context() = 0;
-  virtual void reset_context() = 0;
-  virtual void flush_context() = 0;
-  virtual ~GLContext(){};
-
-  static void initGLContext();
-  static GLContext* getGLContext();
-  static void deleteGLContext();
-
-  static bool GL_EXT_texture_border_clamp_defined();
-
-  inline bool halfFloatTextureSupported() { return half_float_supported; }
-
-  void setTextureAllocator(
-      std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
-    foreignTextureAllocator = textureAllocator;
-  }
-
-  std::function<const GLTexture*(const int width, const int height)> getTextureAllocator() {
-    return foreignTextureAllocator;
-  }
-};
-
-bool supportOpenGLES3(bool* hfs = nullptr);
-
-bool isSupportedDevice();
-
-#if CAFFE2_IOS
-int iPhoneVersion();
-#endif
diff --git a/caffe2/mobile/contrib/opengl/core/GLFilter.cc b/caffe2/mobile/contrib/opengl/core/GLFilter.cc
deleted file mode 100644 (file)
index 7c039e9..0000000
+++ /dev/null
@@ -1,567 +0,0 @@
-
-#include "GLFilter.h"
-#include <sstream>
-
-GLFilter::GLFilter(const std::string _kernel_name,
-                   const std::string _vertex_shader,
-                   const std::string _fragment_shader,
-                   const std::vector<binding*> uniforms,
-                   const std::vector<binding*> uniform_blocks,
-                   const std::vector<binding*> attributes,
-                   const replacements_t& _replacements)
-    : kernel_name(_kernel_name),
-      uniforms_(uniforms),
-      uniform_blocks_(uniform_blocks),
-      attributes_(attributes) {
-  // shader program
-  if (createProgram(_vertex_shader.c_str(),
-                    process_replacements(_fragment_shader, _replacements).c_str(),
-                    &program)) {
-    gl_log(GL_VERBOSE, "created program %d\n", program);
-  } else {
-    releaseBuffers();
-
-    throwRuntimeError(
-        [&](std::stringstream& errmsg) { errmsg << "Problem initializing OpenGL program"; });
-  }
-}
-
-const char* shader_utils = R"GLSL(
-#define unpackHalf4x16(pd) vec4(unpackHalf2x16(pd.x), unpackHalf2x16(pd.y))
-#define packHalf4x16(pd) uvec2(packHalf2x16(pd.xy), packHalf2x16(pd.zw))
-)GLSL";
-
-const char* half_float_texture_utils = R"GLSL(
-precision mediump sampler2D;
-
-#define TEXTURE_OUTPUT(_loc, _var) \
-        layout(location = _loc) out mediump vec4 _var
-#define TEXTURE_INPUT(_var) \
-        uniform sampler2D _var
-#define TEXTURE_LOAD(_input, _coord) \
-        texelFetch((_input), (_coord), 0)
-#define TEXTURE_STORE(_val) \
-        (_val)
-)GLSL";
-
-const char* half_float_compat_texture_utils = R"GLSL(
-precision highp usampler2D;
-
-#define TEXTURE_OUTPUT(_loc, _var) \
-        layout(location = _loc) out highp uvec2 _var
-#define TEXTURE_INPUT(_var) \
-        uniform usampler2D _var
-#define TEXTURE_LOAD(_input, _coord) \
-        unpackHalf4x16(texelFetch((_input), (_coord), 0).xy)
-#define TEXTURE_STORE(_val) \
-        (uvec2(packHalf4x16((_val))))
-)GLSL";
-
-std::string GLFilter::process_replacements(std::string shader,
-                                           const replacements_t& replacements) const {
-  for (auto&& replacement : replacements) {
-    std::string tag = "$(" + replacement.first + ")";
-    std::string value = replacement.second;
-
-    size_t position = shader.find(tag);
-    if (position != std::string::npos) {
-      shader.replace(position, tag.size(), value);
-    } else {
-      throwRuntimeError(
-          [&](std::stringstream& errmsg) { errmsg << "Couldn't find replacement tag: " << tag; });
-    }
-  }
-
-  // Add some #defines for convenience
-  std::string version_tag = "#version 300 es";
-  if (GLContext::getGLContext()->halfFloatTextureSupported()) {
-    shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
-  } else {
-    shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
-  }
-  shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
-  return shader;
-}
-
-template <typename T>
-void GLFilter::attach_uniform_buffer(const binding* block,
-                                     GLuint bindingPoint,
-                                     std::function<void(T*, size_t)> loader) {
-  if (block->location >= 0) {
-    if (bindingPoint < kMaxUniformBlocks) {
-      if (uniformBlock[bindingPoint] == 0) {
-        // Associate the uniform block index with a binding point
-        glUniformBlockBinding(program, block->location, bindingPoint);
-
-        // Get the size of block
-        glGetActiveUniformBlockiv(program, block->location, GL_UNIFORM_BLOCK_DATA_SIZE, &blockSize[bindingPoint]);
-
-        // Create and fill a buffer object
-        glGenBuffers(1, &uniformBlock[bindingPoint]);
-
-        gl_log(GL_VERBOSE, "created uniform buffer block %d\n", uniformBlock[bindingPoint]);
-      }
-
-      // Fill a buffer object
-      glBindBuffer(GL_UNIFORM_BUFFER, uniformBlock[bindingPoint]);
-      glBufferData(GL_UNIFORM_BUFFER, blockSize[bindingPoint], NULL, GL_DYNAMIC_DRAW);
-
-      checkGLError([&](std::stringstream& errmsg) {
-        errmsg << "Unable to bind uniform buffer " << block->name << ":" << block->location
-               << " at binding point " << bindingPoint;
-      });
-
-      T* blockData = (T*)glMapBufferRange(
-          GL_UNIFORM_BUFFER, 0, blockSize[bindingPoint], GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
-      if (blockData != NULL) {
-        // Copy the data into the mapped buffer
-        if (loader)
-          loader(blockData, blockSize[bindingPoint]);
-
-        // Unmap the buffer
-        if (glUnmapBuffer(GL_UNIFORM_BUFFER) == GL_TRUE) {
-          // Bind the buffer object to the uniform block binding point
-          glBindBufferBase(GL_UNIFORM_BUFFER, bindingPoint, uniformBlock[bindingPoint]);
-        } else {
-          throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "Error unmapping element buffer object"; });
-        }
-      } else {
-        throwRuntimeError([&](std::stringstream& errmsg) {
-          errmsg << "Error mapping element buffer object, blockSize: " << blockSize;
-        });
-      }
-
-      glBindBuffer(GL_UNIFORM_BUFFER, 0);
-    } else {
-      throwRuntimeError([&](std::stringstream& errmsg) {
-        errmsg << "Uniform block binding point out of range: " << bindingPoint << ", should be < "
-               << kMaxUniformBlocks;
-      });
-    }
-  } else {
-    throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "unbound uniform block"; });
-  }
-}
-
-template void GLFilter::attach_uniform_buffer<float16_t>(const binding* block,
-                                                         GLuint bindingPoint,
-                                                         std::function<void(float16_t*, size_t)> loader);
-
-static const GLenum unused_capability[] = {GL_CULL_FACE,
-                                           GL_BLEND,
-                                           GL_DITHER,
-                                           GL_STENCIL_TEST,
-                                           GL_DEPTH_TEST,
-                                           GL_SCISSOR_TEST,
-                                           GL_POLYGON_OFFSET_FILL,
-                                           GL_SAMPLE_ALPHA_TO_COVERAGE,
-                                           GL_SAMPLE_COVERAGE};
-
-void GLFilter::run(const std::vector<texture_attachment>& input,
-                   const std::vector<const GLTexture*>& output,
-                   std::function<void(void)> uniforms_initializer,
-                   int width,
-                   int height) {
-  const int first_texture_id = GL_TEXTURE0;
-
-  GLint defaultFramebuffer = 0;
-  glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
-
-  gl_log(GL_VERBOSE,
-         "GLFilter::run %s - inputs: %d, outputs: %d, width: %d, height: %d\n",
-         kernel_name.c_str(),
-         input.size(),
-         output.size(),
-         width,
-         height);
-
-  if (output.size() > 4) {
-    throwRuntimeError([&](std::stringstream& errmsg) {
-      errmsg << "Too many output textures: " << output.size() << ", should be <= 4";
-    });
-  }
-
-  if (frameBuffer == 0) {
-    // create the frame buffer
-    glGenFramebuffers(1, &frameBuffer);
-    gl_log(GL_VERBOSE, "created frame buffer %d\n", frameBuffer);
-  }
-
-  glBindFramebuffer(GL_FRAMEBUFFER, frameBuffer);
-  checkGLError([&](std::stringstream& errmsg) { errmsg << "glBindFramebuffer"; });
-
-  // Set up the output textures
-  for (int i = 0; i < output.size(); i++) {
-    GLenum target = output[i]->target();
-    GLuint texture = output[i]->name();
-
-    glBindTexture(target, texture);
-    glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, target, texture, 0);
-
-    checkGLError([&](std::stringstream& errmsg) {
-      errmsg << "Unable to connect output texture " << texture << " at color attachment " << i;
-    });
-
-    gl_log(GL_VERBOSE, "connected output texture %d to color attachment %d\n", texture, i);
-  }
-
-  // Bind the output textures to the frame buffer attachments
-  if (!frame_buffer_initialized) {
-    const int attachments_number = output.size();
-    const GLenum attachments[4] = {
-        GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1, GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3};
-
-    glDrawBuffers(attachments_number, attachments);
-
-    int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
-
-    if (fbs != GL_FRAMEBUFFER_COMPLETE) {
-      throwRuntimeError(
-          [&](std::stringstream& errmsg) { errmsg << "Frame buffer incomplete: " << fbs; });
-    }
-
-    frame_buffer_initialized = true;
-  }
-
-  glUseProgram(program);
-  checkGLError([&](std::stringstream& errmsg) { errmsg << "glUseProgram"; });
-
-  // Set up the input textures
-  GLenum texture_idx = first_texture_id;
-  for (int i = 0; i < input.size(); i++, texture_idx++) {
-    if (input[i].uniform->location >= 0) {
-      GLenum target = input[i].texture->target();
-      GLuint texture = input[i].texture->name();
-
-      glActiveTexture(texture_idx);
-      glBindTexture(target, texture);
-      glUniform1i(input[i].uniform->location, texture_idx - GL_TEXTURE0);
-
-      checkGLError([&](std::stringstream& errmsg) {
-        errmsg << ": Unable to attach input texture " << texture << " to uniform "
-               << input[i].uniform->name << ":" << input[i].uniform->location << " at index "
-               << texture_idx - GL_TEXTURE0;
-      });
-
-      gl_log(GL_VERBOSE,
-             "connected input texture %d to texture unit %d\n",
-             texture,
-             texture_idx - GL_TEXTURE0);
-    } else {
-      gl_log(GL_VERBOSE, "something wrong happened when i = %d\n", i);
-    }
-  }
-
-  // Caller supplied uniforms initializer
-  if (uniforms_initializer) {
-    uniforms_initializer();
-
-    checkGLError([&](std::stringstream& errmsg) {
-      errmsg << "errors in the uniforms initializer callback";
-    });
-  }
-
-  // Validate program
-  if (check_opengl_errors && !validateProgram(program)) {
-    throwRuntimeError(
-        [&](std::stringstream& errmsg) { errmsg << "Couldn't validate OpenGL program"; });
-  }
-
-  glViewport(0, 0, width, height);
-
-  // Disable stuff we don't need and make sure that we have all the channels ebabled
-  for (int i = 0; i < sizeof(unused_capability) / sizeof(GLenum); i++) {
-    glDisable(unused_capability[i]);
-  }
-  glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
-
-  // glDrawElements should be more efficient, but on iOS glDrawArrays is faster.
-
-  const bool useDrawArrays = true;
-
-  if (useDrawArrays) {
-    enum { ATTRIB_VERTEX, ATTRIB_TEXTUREPOSITON, NUM_ATTRIBUTES };
-
-    static const GLfloat squareVertices[] = {
-        -1.0f,
-        -1.0f, // bottom left
-        1.0f,
-        -1.0f, // bottom right
-        -1.0f,
-        1.0f, // top left
-        1.0f,
-        1.0f, // top right
-    };
-
-    static const float textureVertices[] = {
-        0.0f,
-        0.0f, // bottom left
-        1.0f,
-        0.0f, // bottom right
-        0.0f,
-        1.0f, // top left
-        1.0f,
-        1.0f, // top right
-    };
-
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, squareVertices);
-    glEnableVertexAttribArray(ATTRIB_VERTEX);
-    checkGLError(
-        [&](std::stringstream& errmsg) { errmsg << "glEnableVertexAttribArray(ATTRIB_VERTEX)"; });
-
-    glVertexAttribPointer(ATTRIB_TEXTUREPOSITON, 2, GL_FLOAT, 0, 0, textureVertices);
-    glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON);
-    checkGLError([&](std::stringstream& errmsg) {
-      errmsg << "glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON)";
-    });
-
-    gl_log(GL_VERBOSE, "Calling glDrawArrays\n");
-    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-
-    checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawArrays"; });
-  } else {
-    // Run the shaders on the output geometry
-    static const GLfloat vVertices[] = {
-        -1.0f, -1.0f, 0.0f, // Position 0
-        0.0f,  0.0f, // TexCoord 0
-        -1.0f, 1.0f,  0.0f, // Position 1
-        0.0f,  1.0f, // TexCoord 1
-        1.0f,  1.0f,  0.0f, // Position 2
-        1.0f,  1.0f, // TexCoord 2
-        1.0f,  -1.0f, 0.0f, // Position 3
-        1.0f,  0.0f // TexCoord 3
-    };
-    static const GLushort indices[] = {0, 1, 2, 0, 2, 3};
-
-    // Load the vertex position
-    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), vVertices);
-    // Load the texture coordinate
-    glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), &vVertices[3]);
-
-    glEnableVertexAttribArray(0);
-    glEnableVertexAttribArray(1);
-
-    gl_log(GL_VERBOSE, "Calling glDrawElements\n");
-    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, indices);
-
-    checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawElements"; });
-  }
-
-#if CAFFE2_ANDROID
-  glFlush();
-#endif
-
-  // Unbind the current texture - Man, this is expensive!
-  for (int i = texture_idx - 1; i >= first_texture_id; i--) {
-    gl_log(GL_VERBOSE, "unbinding texture unit %d\n", i - GL_TEXTURE0);
-    glActiveTexture(i);
-    glBindTexture(GL_TEXTURE_2D, 0);
-
-    checkGLError([&](std::stringstream& errmsg) {
-      errmsg << "Error unbinding texture unit " << i - GL_TEXTURE0;
-    });
-  }
-
-  glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
-}
-
-void GLFilter::releaseBuffers() {
-  for (int i = 0; i < kMaxUniformBlocks; i++) {
-    if (uniformBlock[i]) {
-      gl_log(GL_VERBOSE, "deleting uniform buffer block %d\n", uniformBlock[i]);
-      glDeleteBuffers(1, &uniformBlock[i]);
-      uniformBlock[i] = 0;
-    }
-  }
-  if (frameBuffer) {
-    gl_log(GL_VERBOSE, "deleting frame buffer %d\n", frameBuffer);
-    glDeleteFramebuffers(1, &frameBuffer);
-    frameBuffer = 0;
-  }
-}
-
-void GLFilter::deleteProgram() {
-  if (program) {
-    gl_log(GL_VERBOSE, "deleting program %d\n", program);
-    glDeleteProgram(program);
-    program = 0;
-  }
-}
-
-void GLFilter::deleteBindings() {
-  for (binding* uniform : uniforms_) {
-    delete uniform;
-  }
-  for (binding* uniform_block : uniform_blocks_) {
-    delete uniform_block;
-  }
-  for (binding* attribute : attributes_) {
-    delete attribute;
-  }
-}
-
-// Simple vertex shader setting up the coordinates system
-const char* GLFilter::vertex_shader = R"GLSL(#version 300 es
-
-  layout(location = 0) in vec4 a_position;
-  layout(location = 1) in vec2 a_texCoord;
-  out vec2 v_texCoord;
-
-  void main()
-  {
-     gl_Position = a_position;
-     v_texCoord = a_texCoord;
-  }
-)GLSL";
-
-bool GLFilter::createProgram(const GLchar* vertSource,
-                             const GLchar* fragSource,
-                             GLuint* program) const {
-  GLuint vertShader = 0, fragShader = 0, prog = 0, status = 1;
-
-  // Clear the error state. We check error state later in the function and
-  // want to capture only errors in filter program initialization.
-  glGetError();
-
-  // Create shader program
-  prog = glCreateProgram();
-
-  // Create and compile vertex shader
-  status *= compileShader(GL_VERTEX_SHADER, 1, &vertSource, &vertShader);
-
-  // Create and compile fragment shader
-  status *= compileShader(GL_FRAGMENT_SHADER, 1, &fragSource, &fragShader);
-
-  // Attach vertex shader to program
-  glAttachShader(prog, vertShader);
-
-  // Attach fragment shader to program
-  glAttachShader(prog, fragShader);
-
-  // Bind attribute locations
-  // This needs to be done prior to linking
-  for (auto&& attribute : attributes_) {
-    glBindAttribLocation(prog, attribute->location, attribute->name.c_str());
-
-    checkGLError([&](std::stringstream& errmsg) {
-      errmsg << "Couldn't bind attribute: " << attribute->name << " at location "
-             << attribute->location;
-    });
-  }
-
-  // Link program
-  status *= linkProgram(prog);
-
-  // Get locations of uniforms
-  if (status) {
-    for (auto&& uniform : uniforms_) {
-      uniform->location = glGetUniformLocation(prog, uniform->name.c_str());
-
-      checkGLError([&](std::stringstream& errmsg) {
-        errmsg << "Couldn't resolve uniform: " << uniform->name;
-      });
-    }
-
-    for (auto&& uniform_block : uniform_blocks_) {
-      uniform_block->location = glGetUniformBlockIndex(prog, uniform_block->name.c_str());
-      gl_log(GL_VERBOSE,
-             "Getting location for uniform block: %s, location: %d\n",
-             uniform_block->name.c_str(),
-             uniform_block->location);
-
-      checkGLError([&](std::stringstream& errmsg) {
-        errmsg << "Couldn't resolve uniform block: " << uniform_block->name;
-      });
-    }
-
-    *program = prog;
-  }
-
-  // Release vertex and fragment shaders
-  if (vertShader) {
-    glDetachShader(prog, vertShader);
-    glDeleteShader(vertShader);
-  }
-  if (fragShader) {
-    glDetachShader(prog, fragShader);
-    glDeleteShader(fragShader);
-  }
-
-  return status == 1;
-}
-
-#include <stdlib.h>
-
-/* Compile a shader from the provided source(s) */
-GLint GLFilter::compileShader(GLenum target,
-                              GLsizei count,
-                              const GLchar** sources,
-                              GLuint* shader) const {
-  GLint status = 1;
-
-  *shader = glCreateShader(target);
-  glShaderSource(*shader, count, sources, NULL);
-  glCompileShader(*shader);
-
-  GLint logLength = 0;
-  glGetShaderiv(*shader, GL_INFO_LOG_LENGTH, &logLength);
-  if (logLength > 0) {
-    std::vector<GLchar> log(logLength);
-    glGetShaderInfoLog(*shader, logLength, &logLength, &log[0]);
-    gl_log(GL_ERR, "Shader compile log:\n%s", &log[0]);
-  }
-
-  glGetShaderiv(*shader, GL_COMPILE_STATUS, &status);
-  if (status == 0) {
-    int i;
-
-    gl_log(GL_ERR, "Failed to compile shader:\n");
-    for (i = 0; i < count; i++)
-      gl_log(GL_ERR, "%s", sources[i]);
-  }
-
-  return status;
-}
-
-/* Link a program with all currently attached shaders */
-GLint GLFilter::linkProgram(GLuint program) const {
-  GLint status = 1;
-
-  glLinkProgram(program);
-
-  GLint logLength = 0;
-  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
-  if (logLength > 0) {
-    std::vector<GLchar> log(logLength);
-    glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
-    gl_log(GL_ERR, "Program link log:\n%s", &log[0]);
-  }
-
-  glGetProgramiv(program, GL_LINK_STATUS, &status);
-  if (status == 0)
-    gl_log(GL_ERR, "Failed to link program %d\n", program);
-
-  return status;
-}
-
-/* Validate a program (for i.e. inconsistent samplers) */
-GLint GLFilter::validateProgram(GLuint program) const {
-  GLint status = 1;
-
-  glValidateProgram(program);
-
-  GLint logLength = 0;
-  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
-  if (logLength > 0) {
-    std::vector<GLchar> log(logLength);
-    glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
-    gl_log(GL_ERR, "Program validate log:\n%s", &log[0]);
-  }
-
-  glGetProgramiv(program, GL_VALIDATE_STATUS, &status);
-  if (status == 0)
-    gl_log(GL_ERR, "Failed to validate program %d\n", program);
-
-  return status;
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLFilter.h b/caffe2/mobile/contrib/opengl/core/GLFilter.h
deleted file mode 100644 (file)
index d34eac0..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-
-#pragma once
-
-#include "GLContext.h"
-#include "GLTexture.h"
-#include "arm_neon_support.h"
-
-#include <functional>
-#include <string>
-#include <vector>
-
-#define BINDING(variableName) (variableName = new binding{#variableName})
-#define ATTRIBUTE(variableName, value) (variableName = new binding{#variableName, value})
-
-class GLFilter {
- protected:
-  const std::string kernel_name;
-  GLuint program = 0;
-  GLuint frameBuffer = 0;
-  static constexpr int kMaxUniformBlocks = 12;
-  GLuint uniformBlock[kMaxUniformBlocks] = {0};
-  GLint blockSize[kMaxUniformBlocks]     = {0};
-  bool frame_buffer_initialized = false;
-
-  // glGetError() can be expensive, we should turn error checking off when we're done with debugging
-
-  static constexpr bool check_opengl_errors = true;
-
-public:
-  typedef std::vector<std::pair<std::string, std::string>> replacements_t;
-
-  struct binding {
-    const std::string name;
-    GLint location;
-  };
-
-  struct texture_attachment {
-    const GLTexture* texture;
-    const binding* uniform;
-  };
-
-  GLFilter(const std::string kernel_name,
-           const std::string vertex_shader,
-           const std::string fragment_shader,
-           const std::vector<binding*> uniforms,
-           const std::vector<binding*> uniform_blocks = {},
-           const std::vector<binding*> attributes = {},
-           const replacements_t& replacements = {});
-
-  // TODO: The set and reset context need to be commented out for unit testing
-  ~GLFilter() {
-    releaseBuffers();
-    deleteProgram();
-    deleteBindings();
-  }
-
-  void throwRuntimeError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
-    std::stringstream errmsg;
-    errmsg << kernel_name << ": ";
-    error_formatter(errmsg);
-    throw std::runtime_error(errmsg.str());
-  }
-
-  void checkGLError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
-    if (check_opengl_errors) {
-      GLenum glError = glGetError();
-      if (glError != GL_NO_ERROR) {
-        throwRuntimeError([&](std::stringstream& errmsg) {
-          error_formatter(errmsg);
-          errmsg << ", " << glError;
-        });
-      }
-    }
-  }
-
-  template <typename T>
-  void attach_uniform_buffer(const binding* block,
-                             GLuint bindingPoint, std::function<void(T*, size_t)> loader);
-
-  void run(const std::vector<texture_attachment>& input,
-           const std::vector<const GLTexture*>& output,
-           std::function<void(void)> uniforms_initializer,
-           int width,
-           int height);
-
-  void releaseBuffers();
-  void deleteProgram();
-  void deleteBindings();
-
-  static const char* vertex_shader;
-
- private:
-  const std::vector<binding*> uniforms_;
-  const std::vector<binding*> uniform_blocks_;
-  const std::vector<binding*> attributes_;
-
-  std::string process_replacements(std::string source, const replacements_t& replacements) const;
-
-  bool createProgram(const GLchar* vertSource, const GLchar* fragSource, GLuint* program) const;
-
-  GLint compileShader(GLenum target, GLsizei count, const GLchar** sources, GLuint* shader) const;
-  GLint linkProgram(GLuint program) const;
-  GLint validateProgram(GLuint program) const;
-};
diff --git a/caffe2/mobile/contrib/opengl/core/GLImage.cc b/caffe2/mobile/contrib/opengl/core/GLImage.cc
deleted file mode 100644 (file)
index 19956e5..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#include "GLImage.h"
-#include "arm_neon_support.h"
-#include <c10/util/typeid.h>
-
-namespace caffe2 {
-CAFFE_KNOWN_TYPE(GLImage<float>);
-CAFFE_KNOWN_TYPE(GLImage<uint8_t>);
-CAFFE_KNOWN_TYPE(GLImageVector<float>);
-CAFFE_KNOWN_TYPE(GLImageVector<uint8_t>);
-#ifdef __ARM_NEON__
-CAFFE_KNOWN_TYPE(GLImage<float16_t>);
-CAFFE_KNOWN_TYPE(GLImageVector<float16_t>);
-#endif
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/core/GLImage.h b/caffe2/mobile/contrib/opengl/core/GLImage.h
deleted file mode 100644 (file)
index 4b3a057..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-
-#pragma once
-
-#include "GLTexture.h"
-#include "caffe2/core/logging.h"
-
-#include <functional>
-#include <vector>
-
-template <typename T>
-class GLImage {
- public:
-  const int width;
-  const int height;
-  const int channels;
-  const int data_size;
-
-  const int tile_x;
-  const int tile_y;
-  const int texture_width;
-  const int texture_height;
-  const int slices;
-
-  const std::vector<const GLTexture*> textures;
-
-  constexpr static int slice_channels = 4;
-
-  static constexpr int channels_to_slices(int channels, int tile_x, int tile_y) {
-    return ((channels + slice_channels - 1) / slice_channels + tile_x * tile_y - 1) /
-           (tile_x * tile_y);
-  }
-
-  static const std::vector<const GLTexture*> allocate_textures(
-      int slices, std::function<const GLTexture*(int slice)> texture_loader) {
-    std::vector<const GLTexture*> textures;
-    for (int i = 0; i < slices; i++) {
-      textures.push_back(texture_loader(i));
-    }
-    return textures;
-  }
-
-  GLImage(int _width,
-          int _height,
-          int _channels,
-          int _tile_x,
-          int _tile_y,
-          std::function<const GLTexture*(int slice)> texture_loader)
-      : width(_width),
-        height(_height),
-        channels(_channels),
-        data_size(sizeof(T)),
-        tile_x(_tile_x),
-        tile_y(_tile_y),
-        texture_width(_width * _tile_x),
-        texture_height(_height * _tile_y),
-        slices(channels_to_slices(_channels, _tile_x, _tile_y)),
-        textures(allocate_textures(slices, texture_loader)) {
-    CAFFE_ENFORCE_EQ(
-        slices, ((channels + 3) / 4 + tile_x * tile_y - 1) / (tile_x * tile_y));
-  }
-
-  GLImage(int _width,
-          int _height,
-          int _channels,
-          int _tile_x,
-          int _tile_y,
-          bool _destroy,
-          std::function<const GLTexture*(int slice)> texture_loader)
-      : width(_width),
-        height(_height),
-        channels(_channels),
-        data_size(sizeof(T)),
-        tile_x(_tile_x),
-        tile_y(_tile_y),
-        texture_width(_width * _tile_x),
-        texture_height(_height * _tile_y),
-        slices(channels_to_slices(_channels, _tile_x, _tile_y)),
-        textures(allocate_textures(slices, texture_loader)) {
-    CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
-  }
-
-  GLImage()
-      : width(0),
-        height(0),
-        channels(0),
-        data_size(sizeof(T)),
-        tile_x(0),
-        tile_y(0),
-        texture_width(0),
-        texture_height(0),
-        slices(0){};
-
-  virtual ~GLImage() {
-    gl_log(GL_VERBOSE, "deleting GLImage\n");
-    for (auto&& texture : textures) {
-      delete texture;
-    }
-  }
-};
-
-template <typename T>
-class GLImageVector {
- private:
-  std::vector<GLImage<T>*> images_;
-  int num_images_ = 0;
-  int width_ = 0;
-  int height_ = 0;
-  int channels_ = 0;
-  int tile_x_ = 0;
-  int tile_y_ = 0;
-
- public:
-  GLImage<T>* operator[](int index) const {
-    CAFFE_ENFORCE_LT(index, num_images_, "Out of bounds when accessing GLImageVector");
-    return images_[index];
-  }
-
-  void push_back(GLImage<T>* image) {
-    CAFFE_ENFORCE_EQ(image->channels, channels_);
-    CAFFE_ENFORCE_EQ(image->width, width_);
-    CAFFE_ENFORCE_EQ(image->height, height_);
-    CAFFE_ENFORCE_EQ(image->tile_x, tile_x_);
-    CAFFE_ENFORCE_EQ(image->tile_y, tile_y_);
-    images_.push_back(image);
-    CAFFE_ENFORCE_LE(images_.size(), num_images_);
-  }
-
-  int size() const { return images_.size(); }
-  int channels() const { return channels_; }
-  int width() const { return width_; }
-  int height() const { return height_; }
-  int tile_x() const { return tile_x_; }
-  int tile_y() const { return tile_y_; }
-  int slices() const { return size() > 0 ? images_[0]->slices : 0; }
-
-  GLImageVector(int num_images, int width, int height, int channels, int tile_x = 1, int tile_y = 1)
-      : num_images_(num_images),
-        width_(width),
-        height_(height),
-        channels_(channels),
-        tile_x_(tile_x),
-        tile_y_(tile_y) {}
-
-  GLImageVector() {}
-
-  ~GLImageVector() {
-    for (int i = 0; i < images_.size(); i++) {
-      delete images_[i];
-    }
-  }
-};
diff --git a/caffe2/mobile/contrib/opengl/core/GLImageAllocator.cc b/caffe2/mobile/contrib/opengl/core/GLImageAllocator.cc
deleted file mode 100644 (file)
index 5f15840..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-
-#include "GLImageAllocator.h"
-#include "arm_neon_support.h"
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::newImage(
-    int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output) {
-  GLImageVector<T>* images =
-      new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
-  for (int i = 0; i < num_images; i++) {
-    images->push_back(
-        new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
-          bool usePadding = is_output;
-          return new GLPlainTexture(type, nullptr, width * tile_x, height * tile_y, usePadding);
-        }));
-  }
-  return images;
-}
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::newImage(
-    int num_images,
-    int width,
-    int height,
-    int channels,
-    int tile_x,
-    int tile_y,
-    std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
-  GLImageVector<T>* images =
-      new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
-  for (int i = 0; i < num_images; i++) {
-    images->push_back(
-        new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
-          return textureAllocator(width, height);
-        }));
-  }
-  return images;
-}
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::ShareTexture(const GLuint textureID,
-                                                    int num_images,
-                                                    int width,
-                                                    int height,
-                                                    int channels,
-                                                    int tile_x,
-                                                    int tile_y) {
-  GLImageVector<T>* images =
-      new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
-  for (int i = 0; i < num_images; i++) {
-    images->push_back(
-        new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
-          return new GLPlainTexture(
-              GLImageAllocator<T>::type, textureID, width * tile_x, height * tile_y);
-        }));
-  }
-  return images;
-}
-
-template <>
-const GLTexture::Type& GLImageAllocator<float16_t>::type = GLTexture::FP16;
-template <>
-const GLTexture::Type& GLImageAllocator<uint8_t>::type = GLTexture::UI8;
-
-template class GLImageAllocator<float16_t>;
-template class GLImageAllocator<uint8_t>;
diff --git a/caffe2/mobile/contrib/opengl/core/GLImageAllocator.h b/caffe2/mobile/contrib/opengl/core/GLImageAllocator.h
deleted file mode 100644 (file)
index a6764e6..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-
-#pragma once
-
-#include "GLImage.h"
-#include "GLPlainTexture.h"
-
-template <class T>
-class GLImageAllocator {
- public:
-  static const GLTexture::Type& type;
-
-  GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
-  virtual ~GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
-  virtual GLImageVector<T>* newImage(
-      int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output);
-
-  virtual GLImageVector<T>* newImage(
-      int num_images,
-      int width,
-      int height,
-      int channels,
-      int tile_x,
-      int tile_y,
-      std::function<const GLTexture*(const int width, const int height)> textureAllocator);
-
-  virtual GLImageVector<T>* ShareTexture(const GLuint textureID,
-                                         int num_images,
-                                         int width,
-                                         int height,
-                                         int channels,
-                                         int tile_x = 1,
-                                         int tile_y = 1);
-
-  static GLImageAllocator<T>* newGLImageAllocator();
-};
diff --git a/caffe2/mobile/contrib/opengl/core/GLLogging.h b/caffe2/mobile/contrib/opengl/core/GLLogging.h
deleted file mode 100644 (file)
index 9e57660..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-
-#pragma once
-
-#include <stdarg.h>
-#include <stdio.h>
-
-enum { GL_ERR = -1, GL_LOG = 0, GL_VERBOSE = 1 };
-
-static constexpr int GL_LOG_LEVEL = GL_LOG;
-
-static inline int gl_log(int level, const char* format, ...) {
-  int r = 0;
-  if (level <= GL_LOG_LEVEL) {
-    va_list args;
-    va_start(args, format);
-    r = vfprintf(stderr, format, args);
-    va_end(args);
-  }
-  return r;
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLPBO.cc b/caffe2/mobile/contrib/opengl/core/GLPBO.cc
deleted file mode 100644 (file)
index eea3bed..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-
-GLPBO::~GLPBO() {
-  if (pboId != 0) {
-    gl_log(GL_LOG, "deleting PBO buffer %d\n", pboId);
-    glDeleteBuffers(1, &pboId);
-    pboId = 0;
-  }
-  if (pboFrameBuffer != 0) {
-    gl_log(GL_LOG, "deleting PBO frame buffer %d\n", pboFrameBuffer);
-    glDeleteFramebuffers(1, &pboFrameBuffer);
-    pboFrameBuffer = 0;
-  }
-}
-
-GLPBO* GLPBO::pboContext = NULL;
-
-GLPBO* GLPBO::getContext() {
-  if (pboContext == NULL) {
-    pboContext = new GLPBO();
-  }
-  return pboContext;
-}
-
-void GLPBO::mapTextureData(GLuint _textureId,
-                           GLsizei _width,
-                           GLsizei _height,
-                           GLsizei _stride,
-                           GLsizei _channels,
-                           const GLTexture::Type& _type,
-                           std::function<void(const void* buffer,
-                                              size_t width,
-                                              size_t height,
-                                              size_t stride,
-                                              size_t channels,
-                                              const GLTexture::Type& type)> process) {
-  GLint defaultFramebuffer = 0;
-  glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
-
-  if (pboFrameBuffer == 0) {
-    glGenFramebuffers(1, &pboFrameBuffer);
-    gl_log(GL_VERBOSE, "created PBO frame buffer %d\n", pboFrameBuffer);
-  }
-
-  glBindFramebuffer(GL_FRAMEBUFFER, pboFrameBuffer);
-
-  glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _textureId, 0);
-
-  int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
-  if (fbs != GL_FRAMEBUFFER_COMPLETE) {
-    std::stringstream errmsg;
-    errmsg << ": Frame buffer incomplete: " << fbs;
-    throw std::runtime_error(errmsg.str());
-  }
-
-  if (pboId == 0) {
-    glGenBuffers(1, &pboId);
-    gl_log(GL_VERBOSE, "created PBO buffer %d\n", pboId);
-  }
-  glBindBuffer(GL_PIXEL_PACK_BUFFER, pboId);
-
-  size_t buffer_size = _stride * _height * _channels * _type.dataSize();
-
-  if (buffer_size > pboSize) {
-    LOG(INFO) << "Allocating PBO of capacity " << buffer_size;
-
-    glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, NULL, GL_DYNAMIC_READ);
-    pboSize = buffer_size;
-  }
-
-  glReadBuffer(GL_COLOR_ATTACHMENT0);
-  glReadPixels(0, 0, _stride, _height, _type.format, _type.type, 0);
-
-  GLhalf* ptr = (GLhalf*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, buffer_size, GL_MAP_READ_BIT);
-
-  if (ptr) {
-    process(ptr, _width, _height, _stride, _channels, _type);
-  } else {
-    std::stringstream errmsg;
-    errmsg << ": glMapBufferRange using PBO incomplete";
-    throw std::runtime_error(errmsg.str());
-  }
-
-  // Unmap buffer
-  glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
-  glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
-
-  // Bind to the default FrameBuffer
-  glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLPBO.h b/caffe2/mobile/contrib/opengl/core/GLPBO.h
deleted file mode 100644 (file)
index c904656..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#pragma once
-
-#include "GLTexture.h"
-#include <functional>
-
-class GLPBO {
-  GLuint pboId = 0;
-  GLuint pboSize = 0;
-  GLuint pboFrameBuffer = 0;
-
-  ~GLPBO();
-
-  static GLPBO* pboContext;
-
- public:
-  void mapTextureData(GLuint _textureId,
-                      GLsizei _width,
-                      GLsizei _height,
-                      GLsizei _stride,
-                      GLsizei _channels,
-                      const GLTexture::Type& type,
-                      std::function<void(const void* buffer,
-                                         size_t width,
-                                         size_t height,
-                                         size_t stride,
-                                         size_t channels,
-                                         const GLTexture::Type& type)> process);
-
-  static GLPBO* getContext();
-};
diff --git a/caffe2/mobile/contrib/opengl/core/GLPlainTexture.cc b/caffe2/mobile/contrib/opengl/core/GLPlainTexture.cc
deleted file mode 100644 (file)
index 71b69c5..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-
-#include "GLPlainTexture.h"
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/timer.h"
-
-#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
-
-#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
-
-GLPlainTexture::GLPlainTexture(
-    const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
-    : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
-  //  caffe2::Timer timer;
-  //  timer.Start();
-  glGenTextures(1, &_textureId);
-  glBindTexture(GL_TEXTURE_2D, _textureId);
-  glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
-
-  gl_log(
-      GL_VERBOSE,
-      "GLPlainTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
-      _textureId,
-      _type.internalFormat,
-      _type.format,
-      _type.type);
-
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, _filter);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, _filter);
-
-#if GL_EXT_texture_border_clamp
-  GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
-  glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
-  // Set the texture to use the border clamp wrapping mode.
-  _wrap = GL_CLAMP_TO_BORDER_EXT;
-#endif
-
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, _wrap);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, _wrap);
-
-  glBindTexture(GL_TEXTURE_2D, 0);
-  //  LOG(INFO) << "glTexImage2D takes " << timer.MilliSeconds() << " ms";
-}
-
-GLPlainTexture::GLPlainTexture(
-    const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
-    : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
-  _textureId = textureID;
-  isOwner = false;
-  gl_log(
-      GL_VERBOSE,
-      "GLPlainTexture() - wrapped textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
-      _textureId,
-      _type.internalFormat,
-      _type.format,
-      _type.type);
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLPlainTexture.h b/caffe2/mobile/contrib/opengl/core/GLPlainTexture.h
deleted file mode 100644 (file)
index 4a211ac..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-
-#pragma once
-
-#include "GLContext.h"
-#include "GLTexture.h"
-
-class GLPlainTexture : public GLTexture {
- private:
-  bool isOwner = true;
-
- public:
-  GLPlainTexture(const Type& type,
-                 const void* input,
-                 GLsizei width,
-                 GLsizei height,
-                 bool use_padding = false,
-                 GLint filter = GL_NEAREST,
-                 GLint wrap = GL_CLAMP_TO_EDGE);
-
-  GLPlainTexture(const Type& type,
-                 const GLuint textureID,
-                 GLsizei width,
-                 GLsizei height,
-                 bool use_padding = false,
-                 GLint filter = GL_NEAREST,
-                 GLint wrap = GL_CLAMP_TO_EDGE);
-
-  ~GLPlainTexture() {
-    if (glIsTexture(_textureId)) {
-      if (isOwner) {
-        gl_log(GL_VERBOSE, "~GLPlainTexture() - deleting texture %d\n", _textureId);
-        glDeleteTextures(1, &_textureId);
-      }
-    } else {
-      gl_log(GL_ERR, "not deleting texture %d\n", _textureId);
-    }
-  }
-
-  GLuint name() const { return _textureId; };
-
-  GLenum target() const { return GL_TEXTURE_2D; };
-
-  bool flipped() const { return false; };
-};
diff --git a/caffe2/mobile/contrib/opengl/core/GLPredictor.cc b/caffe2/mobile/contrib/opengl/core/GLPredictor.cc
deleted file mode 100644 (file)
index 405292a..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include "GLPredictor.h"
-#include "GLContext.h"
-#include "rewrite_net.h"
-#include <vector>
-
-namespace caffe2 {
-
-template <class T>
-void shareInputGLImage(Workspace* ws, const std::string& name, GLImageVector<T>* input) {
-  auto* blob = ws->GetBlob(name);
-  CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  blob->ShareExternal<GLImageVector<T>>(input);
-}
-
-template <class T>
-const GLImageVector<T>* extractOutputGLImage(Workspace* ws, const std::string& name) {
-  auto* blob = ws->GetBlob(name);
-  CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  return &blob->template Get<GLImageVector<T>>();
-}
-
-const NetDef create_gl_run_net(const NetDef& init_net,
-                               const NetDef& run_net,
-                               bool use_texture_input) {
-  NetDef gl_run_net;
-  if (!tryConvertToOpenGL(init_net, run_net, &gl_run_net, use_texture_input)) {
-    CAFFE_THROW("Failed to convert model to OpenGL");
-  }
-  return gl_run_net;
-}
-
-GLPredictor::GLPredictor(const NetDef& init_net,
-                         const NetDef& run_net,
-                         bool use_texture_input,
-                         Workspace* parent)
-    : Predictor(init_net, create_gl_run_net(init_net, run_net, use_texture_input), parent) {}
-
-GLPredictor::~GLPredictor() {}
-
-template <class T>
-bool GLPredictor::run(std::vector<GLImageVector<T>*>& inputs,
-                      std::vector<const GLImageVector<T>*>* outputs) {
-  const NetDef& run_net_ = Predictor::def();
-  CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size());
-  for (auto i = 0; i < inputs.size(); ++i) {
-    shareInputGLImage<T>(Predictor::ws(), run_net_.external_input(i), inputs[i]);
-  }
-
-  if (!Predictor::ws()->RunNet(run_net_.name())) {
-    return false;
-  }
-
-  for (auto i = 0; i < run_net_.external_output_size(); ++i) {
-    outputs->push_back(extractOutputGLImage<T>(Predictor::ws(), run_net_.external_output(i)));
-  }
-
-  return true;
-}
-
-template bool GLPredictor::run(std::vector<GLImageVector<uint8_t>*>& inputs,
-                               std::vector<const GLImageVector<uint8_t>*>* outputs);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/core/GLPredictor.h b/caffe2/mobile/contrib/opengl/core/GLPredictor.h
deleted file mode 100644 (file)
index 24c3197..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#pragma once
-
-#include "GLImage.h"
-#include "caffe2/core/net.h"
-#include "caffe2/predictor/predictor.h"
-
-namespace caffe2 {
-class GLPredictor : public Predictor {
- public:
-  GLPredictor(const NetDef& init_net,
-              const NetDef& run_net,
-              bool use_texture_input = false,
-              Workspace* parent = nullptr);
-
-  template <class T>
-  bool run(std::vector<GLImageVector<T>*>& inputs, std::vector<const GLImageVector<T>*>* outputs);
-
-  ~GLPredictor();
-};
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/core/GLTexture.cc b/caffe2/mobile/contrib/opengl/core/GLTexture.cc
deleted file mode 100644 (file)
index 2e3e406..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-
-#include "GLTexture.h"
-#include "DataTransfer.h"
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/timer.h"
-
-#if CAFFE2_ANDROID && defined(__ARM_NEON__)
-
-#include "../android/AndroidGLContext.h"
-
-// https://community.arm.com/thread/10002
-void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz) {
-  if (sz & 63) {
-    sz = (sz & -64) + 64;
-  }
-
-  asm volatile(
-      "NEONCopyPLD: \n"
-      " VLDM %[src]!,{d0-d7} \n"
-      " VSTM %[dst]!,{d0-d7} \n"
-      " SUBS %[sz],%[sz],#0x40 \n"
-      " BGT NEONCopyPLD \n"
-      : [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(sz)
-      :
-      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
-}
-#endif
-
-const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
-const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
-const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
-
-void GLTexture::map_read(std::function<void(const void* buffer,
-                                            size_t width,
-                                            size_t height,
-                                            size_t stride,
-                                            size_t channels,
-                                            const Type& type)> process) const {
-  GLPBO* pbo = GLPBO::getContext();
-  pbo->mapTextureData(_textureId, _width, _height, _stride, _channels, _type, process);
-}
-
-void GLTexture::map_load(std::function<void(void* buffer,
-                                            size_t width,
-                                            size_t height,
-                                            size_t stride,
-                                            size_t channels,
-                                            const Type& type)> process) const {
-  const int alignment = 32; // 4 * _type.dataSize();
-  void* buffer = nullptr;
-  size_t buffer_size = _width * _height * _channels * _type.dataSize();
-
-#ifdef __ANDROID__
-  buffer = (void*)memalign(alignment, buffer_size);
-#else
-  posix_memalign((void**)&buffer, alignment, buffer_size);
-#endif
-  CAFFE_ENFORCE(buffer);
-
-  process(buffer, _width, _height, _width, _channels, _type);
-  loadData(buffer);
-  free(buffer);
-}
-
-void GLTexture::loadData(const void* pixels) const {
-  glBindTexture(GL_TEXTURE_2D, _textureId);
-  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, _width, _height, _type.format, _type.type, pixels);
-  glBindTexture(GL_TEXTURE_2D, 0);
-}
diff --git a/caffe2/mobile/contrib/opengl/core/GLTexture.h b/caffe2/mobile/contrib/opengl/core/GLTexture.h
deleted file mode 100644 (file)
index c12152e..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-
-#pragma once
-#include "GL.h"
-#include "GLLogging.h"
-
-class GLTexture {
- public:
-  struct Type {
-    const GLenum internalFormat;
-    const GLenum format;
-    const GLenum type;
-
-    int dataSize() const {
-      switch (type) {
-      case GL_UNSIGNED_INT:
-        return 4;
-      case GL_HALF_FLOAT:
-        return 2;
-      case GL_UNSIGNED_BYTE:
-        return 1;
-      default:
-        throw std::runtime_error("Unknown Texture Type");
-      }
-    }
-
-    int channels() const {
-      switch (format) {
-      case GL_R8:
-        return 1;
-      case GL_RG8:
-        return 2;
-      // case GL_BGRA:
-      case GL_RG_INTEGER:
-      case GL_RGBA:
-        return 4;
-      default:
-        throw std::runtime_error("Unknown Texture Format");
-      }
-    }
-  };
-
-  static const Type FP16;
-  static const Type FP16_COMPAT;
-  static const Type UI8;
-
- protected:
-  const Type& _type;
-
-  const GLsizei _width;
-  const GLsizei _height;
-  const GLsizei _stride;
-  const GLsizei _channels;
-  const bool _use_padding;
-
-  GLint _filter;
-  GLint _wrap;
-  GLuint _textureId;
-
- public:
-  GLTexture(const Type& type,
-            int width,
-            int height,
-            int stride,
-            bool use_padding,
-            GLint filter,
-            GLint wrap)
-      : _type(type),
-        _width(width),
-        _height(height),
-        _stride(stride),
-        _channels(type.channels()),
-        _use_padding(use_padding),
-        _filter(filter),
-        _wrap(wrap) {}
-
-  GLTexture(const Type& type, int width, int height, bool use_padding, GLint filter, GLint wrap)
-      : GLTexture(type,
-                  width,
-                  height,
-                  use_padding ? (width + 7) / 8 * 8 : width,
-                  use_padding,
-                  filter,
-                  wrap) {}
-
-  virtual ~GLTexture() {}
-  virtual GLuint name() const = 0;
-  virtual GLenum target() const = 0;
-  virtual bool flipped() const = 0;
-
-  virtual void map_read(std::function<void(const void* buffer,
-                                           size_t width,
-                                           size_t height,
-                                           size_t stride,
-                                           size_t channels,
-                                           const Type& type)> process) const;
-
-  virtual void map_load(std::function<void(void* buffer,
-                                           size_t width,
-                                           size_t height,
-                                           size_t stride,
-                                           size_t channels,
-                                           const Type& type)> process) const;
-
-  void loadData(const void* pixels) const;
-};
diff --git a/caffe2/mobile/contrib/opengl/core/ImageAllocator.h b/caffe2/mobile/contrib/opengl/core/ImageAllocator.h
deleted file mode 100644 (file)
index 22eb25d..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-
-#pragma once
-
-#include "GLImageAllocator.h"
-
-namespace caffe2 {
-
-template <class T>
-class ImageAllocator {
-  GLImageAllocator<T>* glImageAllocator;
-
- public:
-  ImageAllocator() : glImageAllocator(GLImageAllocator<T>::newGLImageAllocator()) {}
-
-  virtual ~ImageAllocator() { delete glImageAllocator; }
-
-  GLImageVector<T>* newImage(
-      int num_images, int width, int height, int channels, bool is_output = false) {
-    const int tile_x = 1, tile_y = 1;
-    return glImageAllocator->newImage(
-        num_images, width, height, channels, tile_x, tile_y, is_output);
-  }
-
-  GLImageVector<T>* newImage(int num_images,
-                             int width,
-                             int height,
-                             int channels,
-                             int tile_x,
-                             int tile_y,
-                             bool is_output = false) {
-    return glImageAllocator->newImage(
-        num_images, width, height, channels, tile_x, tile_y, is_output);
-  }
-
-  GLImageVector<T>* newImage(
-      int num_images,
-      int width,
-      int height,
-      int channels,
-      int tile_x,
-      int tile_y,
-      std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
-    return glImageAllocator->newImage(
-        num_images, width, height, channels, tile_x, tile_y, textureAllocator);
-  }
-};
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/core/arm_neon_support.h b/caffe2/mobile/contrib/opengl/core/arm_neon_support.h
deleted file mode 100644 (file)
index d1e1a58..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-
-#pragma once
-
-#include "caffe2/core/common.h"
-
-#ifdef __ARM_NEON__
-#if CAFFE2_IOS
-#include "arm_neon.h"
-#elif CAFFE2_ANDROID
-#include "caffe2/mobile/contrib/opengl/android/arm_neon_support.h"
-#endif
-#endif
diff --git a/caffe2/mobile/contrib/opengl/core/rewrite_net.cc b/caffe2/mobile/contrib/opengl/core/rewrite_net.cc
deleted file mode 100644 (file)
index 1919c90..0000000
+++ /dev/null
@@ -1,367 +0,0 @@
-
-#include "rewrite_net.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/proto_utils.h"
-#include <unordered_map>
-#include <unordered_set>
-
-#ifdef CAFFE2_ANDROID
-#include "../android/AndroidGLContext.h"
-#endif
-
-namespace caffe2 {
-
-// Result of the blob-version analysis of a NetDef.
-struct Analysis {
-  // Blob versions seen by a single operator.
-  struct SSA {
-    // blob name -> version number
-    using BlobVersions = std::unordered_map<std::string, size_t>;
-    // Version of each input blob read by the operator.
-    BlobVersions inVersions;
-    // Version of each output blob written by the operator.
-    BlobVersions outVersions;
-  };
-  // One entry per operator.
-  std::vector<SSA> ssa;
-  // blob name -> blob version -> indices of the operators that take that
-  // version as an input.
-  std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
-};
-
-static Analysis analyzeNet(const NetDef& net) {
-  Analysis::SSA::BlobVersions frontier;
-  Analysis analysis;
-
-  auto play = [&](size_t i, const OperatorDef& op) {
-    Analysis::SSA::BlobVersions inVersions;
-    for (const auto& s : op.input()) {
-      inVersions[s] = frontier[s];
-      analysis.inUsages[s][frontier[s]].push_back(i);
-    }
-    Analysis::SSA::BlobVersions outVersions;
-    for (const auto& s : op.output()) {
-      if (frontier.find(s) != frontier.end()) {
-        frontier[s] += 1;
-      }
-      outVersions[s] = frontier[s];
-    }
-    analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
-  };
-
-  for (auto i = 0; i < net.op_size(); ++i) {
-    play(i, net.op(i));
-  }
-  return analysis;
-}
-
-// Appends a "CopyToOpenGL" operator that reads `cpu_blob` and writes
-// `cpu_blob` + "_M".
-static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
-  static const char* const kCopyToOpenGL = "CopyToOpenGL";
-  OperatorDef* copyOp = predictNet.add_op();
-  copyOp->set_type(kCopyToOpenGL);
-  copyOp->set_name(kCopyToOpenGL);
-  copyOp->add_input(cpu_blob);
-  copyOp->add_output(cpu_blob + "_M");
-}
-
-// Marks the operator currently at the end of `predictNet` with the argument
-// is_last = 1 and then appends a "CopyFromOpenGL" operator that reads
-// `cpu_blob` + "_M" and writes `cpu_blob`.
-//
-// Throws (via CAFFE_ENFORCE_GE) if `predictNet` has no operator yet, because
-// there would be nothing to attach the "is_last" argument to.
-static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
-  CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
-
-  // add argument "is_last" to the last op to signal this is the last operator before the
-  // CopyFromOpenGL op
-  auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1);
-  auto* arg = last_op->add_arg();
-  arg->set_name("is_last");
-  arg->set_i(1);
-
-  auto* op = predictNet.add_op();
-  op->set_name("CopyFromOpenGL");
-  op->set_type("CopyFromOpenGL");
-  op->add_input(cpu_blob + "_M");
-  op->add_output(cpu_blob);
-}
-
-// Returns a copy of `def` in which CopyToOpenGL / CopyFromOpenGL operators
-// are inserted wherever a blob crosses between operators listed in `glOps`
-// and all other (CPU) operators. Blobs produced or consumed on the OpenGL
-// side are renamed to "<name>_M". Throws (via CAFFE_ENFORCE*) when the
-// structural requirements listed below are not met.
-static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) {
-  // Do some validation of the outputs. For this version, we require:
-  // - a single input (first element of external_input()) is consumed by the NetDef
-  // - a single output (first element of external_output()) is produced by the NetDef.
-  // - the input is consumed by def.op(0), and this is the only consumer.
-  // - the output is produced by def.op(-1).
-  CAFFE_ENFORCE_GE(def.external_input_size(), 1);
-  CAFFE_ENFORCE_GE(def.external_output_size(), 1);
-  auto analysis = analyzeNet(def);
-  // enforce a single use of the input blob.
-  CAFFE_ENFORCE_GE(def.op_size(), 1);
-
-  const auto& inputBlob = def.external_input(0);
-  // Enforce that the input blob has a single usage - in the first operator.
-  CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
-  // Enforce that the external_output(0) blob is produced by the last operator in this sequence.
-  const auto& outputBlob = def.external_output(0);
-  CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) !=
-                analysis.ssa.back().outVersions.end());
-  const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
-  // This should hold true by definition of the SSA analysis.
-  CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
-                analysis.inUsages[outputBlob].end());
-
-  // Start from a copy of the net with its operator list emptied; operators
-  // are re-added one by one below.
-  NetDef mdef;
-  mdef.CopyFrom(def);
-  mdef.clear_op();
-
-  // blob name -> set of versions recorded as CPU-side / OpenGL-side.
-  std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
-  cpu_blobs[def.external_input(0)].insert(0);
-
-  for (auto i = 0; i < def.op_size(); i++) {
-    const auto& currentOp = def.op(i);
-    if (glOps.count(currentOp.type()) > 0) {
-      // OpenGL Op
-      // insert copyToOpenGLOp
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto& input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (cpu_blobs[input].count(version) > 0) {
-          insertCopyToGPUOp(mdef, input);
-          gpu_blobs[input].insert(version);
-          cpu_blobs[input].erase(version);
-        }
-        // Only the first input should be OpenGL texture
-        // Otherwise, copyToOpenGLOp will be inserted for the weights,
-        // which are outputs of QuantDecode
-        if (currentOp.type().find("OpenGLConv") == 0) {
-          if (j == 0) {
-            break;
-          }
-        }
-      }
-
-      auto* op = mdef.add_op();
-      op->CopyFrom(currentOp);
-
-      // swap input blob
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto& input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (gpu_blobs[input].count(version) > 0) {
-          op->set_input(j, input + "_M");
-        }
-      }
-
-      // swap output blob
-      for (auto j = 0; j < currentOp.output_size(); j++) {
-        auto& output = currentOp.output(j);
-        auto version = analysis.ssa[i].outVersions[output];
-        op->set_output(j, output + "_M");
-        gpu_blobs[output].insert(version);
-      }
-      // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op
-      if (i == def.op_size() - 1) {
-        insertCopyFromGPUOp(mdef, currentOp.output(0));
-      }
-    } else {
-      // CPU Op
-      // insert copyFromOpenGLOp
-      // NOTE(review): the copied-back version is not recorded in cpu_blobs
-      // here, so a second CPU consumer of the same OpenGL blob appears to get
-      // another CopyFromOpenGL op -- confirm whether that is intended.
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto& input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (gpu_blobs[input].count(version) > 0) {
-          insertCopyFromGPUOp(mdef, input);
-        }
-      }
-      auto* op = mdef.add_op();
-      op->CopyFrom(currentOp);
-      for (auto j = 0; j < currentOp.output_size(); j++) {
-        auto& output = currentOp.output(j);
-        auto version = analysis.ssa[i].outVersions[output];
-        cpu_blobs[output].insert(version);
-      }
-    }
-  }
-  return mdef;
-}
-
-static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
-                               const OperatorDef& nextOp,
-                               OperatorDef* fusedOp,
-                               std::unordered_set<std::string>& glOps) {
-  // Check for possible invalid opportunities.
-  if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
-    return false;
-  }
-  // The fused op cannot be inplace
-  if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
-    return false;
-  }
-
-  static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
-      {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"},
-      {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"},
-      {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"},
-      {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}};
-  auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
-  if (it == fusionOpportunities.end()) {
-    return false;
-  }
-
-  glOps.insert(it->second);
-  fusedOp->CopyFrom(currentOp);
-  fusedOp->set_output(0, nextOp.output(0));
-  fusedOp->set_type(it->second);
-  for (auto i = 1; i < nextOp.input_size(); i++) {
-    fusedOp->add_input(nextOp.input(i));
-  }
-  return true;
-}
-
-static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
-  CHECK_GE(def.op_size(), 1);
-  NetDef mdef;
-  mdef.CopyFrom(def);
-  mdef.clear_op();
-  auto i = 0;
-
-  while (i < def.op_size()) {
-    if (i == def.op_size() - 1) {
-      VLOG(2) << "Last operator, skipping";
-      auto* op = mdef.add_op();
-      op->CopyFrom(def.op(i));
-      i += 1;
-      continue;
-    }
-
-    const auto& currentOp = def.op(i);
-    const auto& nextOp = def.op(i + 1);
-    OperatorDef fusedOp;
-    if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
-      VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
-      // We can fuse.
-      auto* op = mdef.add_op();
-      op->CopyFrom(fusedOp);
-      i += 2;
-      continue;
-    }
-    VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
-    // Just emit the current type.
-    auto* op = mdef.add_op();
-    op->CopyFrom(currentOp);
-    i += 1;
-  }
-  return mdef;
-}
-
-// Logs one "<first input> -> <type> -> <first output>" line per operator of
-// `d` at INFO level. An operator without inputs (or outputs) is logged with
-// an empty name in that position instead of reading a missing element.
-void dumpDefForOpenGL(const NetDef& d) {
-  for (const auto& op : d.op()) {
-    const std::string firstInput = op.input_size() > 0 ? op.input(0) : std::string();
-    const std::string firstOutput = op.output_size() > 0 ? op.output(0) : std::string();
-    LOG(INFO) << firstInput << " -> " << op.type() << " -> " << firstOutput;
-  }
-}
-
-// // For debugging
-// void dumpDefForOpenGL(const NetDef &net) {
-//  for (const auto &op : net.op()) {
-//    printf("***Operator: %s\n", op.type().c_str());
-//    for (auto input : op.input()) {
-//      printf("\tInput: %s\n", input.c_str());
-//    }
-//
-//    for (auto output : op.output()) {
-//      printf("\tOutput: %s\n", output.c_str());
-//    }
-//  }
-//}
-
-NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
-  CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
-  NetDef net;
-  net.CopyFrom(predictNet);
-
-  std::unordered_map<std::string, std::string> replacements(
-      {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess",
-        useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess"
-                        : "OpenGLTensorToTextureStylizerPreprocess"},
-       {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess",
-        useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess"
-                        : "OpenGLTextureToTensorStylizerDeprocess"}});
-
-  std::unordered_set<std::string> openGLOps; // Used to insert copy ops
-  bool needCopyOps = false;
-
-  const auto& opKeyList = CPUOperatorRegistry()->Keys();
-  auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end());
-
-#ifdef CAFFE2_ANDROID
-  // TODO: debug InstanceNorm models on Mali devices
-  AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext();
-  if (context->get_platform() == Mali) {
-    opKeySet.erase("OpenGLInstanceNorm");
-    opKeySet.erase("OpenGLInstanceNormPRelu");
-  }
-#endif
-  for (auto i = 0; i < net.op_size(); ++i) {
-    auto* op = net.mutable_op(i);
-    string openGLOp = std::string("OpenGL") + op->type();
-    if (replacements.count(openGLOp) > 0) {
-      openGLOp = replacements[openGLOp];
-    }
-
-    if (opKeySet.find(openGLOp) != opKeySet.end()) {
-      op->set_type(openGLOp);
-      openGLOps.insert(openGLOp);
-
-      if (useTiling) {
-        auto* arg = op->add_arg();
-        arg->set_name("tiling");
-        arg->set_i(1);
-      }
-    } else {
-      needCopyOps = true;
-    }
-  }
-
-  if (useTextureInput && needCopyOps) {
-    CAFFE_THROW("OpenGL operator missing");
-  }
-
-  if (runFusion) {
-    net = runOpenGLFusion(net, openGLOps);
-  }
-
-  if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
-    // For end-to-end testing
-    if (net.op(net.op_size() - 1).type() !=
-        replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) {
-      auto* last_op = net.mutable_op(net.op_size() - 1);
-      auto output = last_op->output(0) + "M";
-      last_op->set_output(0, output);
-      auto* copy_op = net.add_op();
-      copy_op->set_name("CopyFromOpenGL");
-      copy_op->set_type("CopyFromOpenGL");
-      copy_op->add_input(output);
-      // rename output blob in case input and output blob has the same name
-      copy_op->add_output(net.external_output(0));
-    }
-  } else {
-    if (!useTextureInput) {
-      needCopyOps = true;
-    }
-  }
-
-  // copy ops are needed when the input is not a texture
-  if (needCopyOps) {
-    // For non style transfer cases
-    net = insertInputOutputCopyOps(net, openGLOps);
-  }
-
-  return net;
-}
-
-bool tryConvertToOpenGL(const NetDef& initNet,
-                        const NetDef& predictNet,
-                        NetDef* glPredictNet,
-                        bool useTextureInput,
-                        bool useTiling,
-                        bool runFusion) {
-  try {
-    // Throws if unsupported operators are found.
-    *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
-    dumpDefForOpenGL(*glPredictNet);
-    // Throws if unsupported parameters are found.
-    Workspace ws;
-    ws.RunNetOnce(initNet);
-    ws.CreateNet(*glPredictNet);
-    LOG(INFO) << "OpenGL is successfully enabled";
-    return true;
-  } catch (const std::exception& e) {
-    LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
-    return false;
-  }
-}
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/core/rewrite_net.h b/caffe2/mobile/contrib/opengl/core/rewrite_net.h
deleted file mode 100644 (file)
index d0bc921..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-
-#pragma once
-#include "GLPredictor.h"
-#include "caffe2/predictor/predictor.h"
-
-namespace caffe2 {
-// Rewrites `predictNet` for OpenGL into `*glPredictNet` and checks that the
-// result can be created in a Workspace initialized with `initNet`.
-// Returns false (after logging) if an exception is raised along the way.
-bool tryConvertToOpenGL(const NetDef& initNet,
-                        const NetDef& predictNet,
-                        NetDef* glPredictNet,
-                        bool useTextureInput = false,
-                        bool useTiling       = false,
-                        bool runFusion       = true);
-
-// Exposed for testing
-// Returns the OpenGL version of `predictNet`; throws on failure.
-NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
-                                  bool useTextureInput = false,
-                                  bool useTiling       = false,
-                                  bool runFusion       = true);
-// Logs a one-line summary of every operator in `net`.
-void dumpDefForOpenGL(const NetDef& net);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/ios/CMakeLists.txt b/caffe2/mobile/contrib/opengl/ios/CMakeLists.txt
deleted file mode 100644 (file)
index 8801961..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-# Collect every .mm and .cc file below this directory and append them to the
-# parent scope's Caffe2_CPU_SRCS list.
-file(GLOB_RECURSE tmp *.mm *.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/ios/GLContext.cc b/caffe2/mobile/contrib/opengl/ios/GLContext.cc
deleted file mode 100644 (file)
index f708ce6..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#include "IOSGLContext.h"
-
-std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
-
-// Creates the process-wide IOSGLContext on first use; later calls are no-ops.
-void GLContext::initGLContext() {
-  if (_glcontext) {
-    return;
-  }
-  _glcontext.reset(new IOSGLContext());
-}
-
-// Returns the shared context, creating it first if it does not exist yet.
-GLContext* GLContext::getGLContext() {
-  if (!_glcontext) {
-    initGLContext();
-  }
-  GLContext* const shared = _glcontext.get();
-  return shared;
-}
-
-// Destroys the shared context, if any.
-void GLContext::deleteGLContext() {
-  _glcontext.reset();
-}
diff --git a/caffe2/mobile/contrib/opengl/ios/GLImageAllocator.cc b/caffe2/mobile/contrib/opengl/ios/GLImageAllocator.cc
deleted file mode 100644 (file)
index 9f6e04a..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-
-#include "IOSGLImageAllocator.h"
-#include <arm_neon.h>
-
-// iOS factory: returns a heap-allocated IOSGLImageAllocator<T>. The object is
-// created with `new` and returned as a raw pointer.
-template <typename T>
-GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
-  return new IOSGLImageAllocator<T>();
-}
-
-template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
-template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLContext.h b/caffe2/mobile/contrib/opengl/ios/IOSGLContext.h
deleted file mode 100644 (file)
index 09e807a..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-// iOS implementation of GLContext built on EAGLContext and a
-// CVOpenGLESTextureCache.
-class IOSGLContext : public GLContext {
-  // Retained EAGLContext used by this object, stored as an opaque pointer.
-  void* oglContext;
-  // Retained EAGLContext that was current when this object was constructed
-  // (may be null), stored as an opaque pointer.
-  void* oldContext;
-  // Texture cache created against oglContext.
-  CVOpenGLESTextureCacheRef textureCache;
-
- public:
-  IOSGLContext();
-  ~IOSGLContext();
-
-  // Wraps `pixelBuffer` in a newly allocated texture of the given type.
-  const GLTexture* createNewTexture(CVPixelBufferRef pixelBuffer, const GLTexture::Type& type);
-  // Makes oglContext the current EAGL context.
-  void set_context();
-  // Makes oldContext the current EAGL context again.
-  void reset_context();
-  // Flushes the texture cache.
-  void flush_context();
-};
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLContext.mm b/caffe2/mobile/contrib/opengl/ios/IOSGLContext.mm
deleted file mode 100644 (file)
index c436e59..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-
-#include "IOSGLContext.h"
-#include "IOSGLTexture.h"
-#import <sstream>
-
-#import <OpenGLES/EAGL.h>
-
-// Remembers (and retains) the EAGL context that is current at construction,
-// then picks the context to work with: the current one if it is an
-// OpenGL ES 3 context, otherwise a freshly created OpenGL ES 3 context.
-// Makes that context current and creates the texture cache for it.
-// Throws std::runtime_error if no context is available or the texture cache
-// cannot be created.
-// NOTE(review): the retains taken below are not released on the throwing
-// paths (the destructor does not run for a partially constructed object) --
-// confirm whether that leak matters for callers.
-IOSGLContext::IOSGLContext() {
-  auto const currentContext = [EAGLContext currentContext];
-  oldContext = (void*)CFBridgingRetain(currentContext);
-
-  if (currentContext != nil && [currentContext API] == kEAGLRenderingAPIOpenGLES3) {
-    // Second retain of the same object: oglContext and oldContext are
-    // released independently in the destructor.
-    oglContext = (void*)CFBridgingRetain(currentContext);
-
-    gl_log(GL_LOG, "Reusing current context %p\n", oglContext);
-  } else {
-    oglContext =
-        (void*)CFBridgingRetain([[EAGLContext alloc] initWithAPI:kEAGLRenderingAPIOpenGLES3]);
-
-    gl_log(GL_LOG, "Created a new context %p\n", oglContext);
-  }
-
-  if (!oglContext) {
-    throw std::runtime_error("Problem with OpenGL context");
-  }
-
-  set_context();
-  textureCache = NULL;
-  CVReturn err = CVOpenGLESTextureCacheCreate(
-      kCFAllocatorDefault, NULL, (__bridge EAGLContext*)oglContext, NULL, &textureCache);
-
-  if (err) {
-    std::stringstream errmsg;
-    errmsg << "Error at CVOpenGLESTextureCacheCreate " << err;
-    throw std::runtime_error(errmsg.str());
-  }
-}
-
-// Releases the texture cache while this object's context is current,
-// switches back to the previously current context and finally drops the
-// references held on both contexts.
-// NOTE(review): set_context() and reset_context() can throw; confirm that
-// throwing from this destructor is acceptable.
-IOSGLContext::~IOSGLContext() {
-  gl_log(GL_VERBOSE, "~IOSGLContext()");
-
-  set_context();
-  if (textureCache) {
-    CFRelease(textureCache);
-    textureCache = 0;
-  }
-  reset_context();
-
-  // Explicitly release only after we `reset_context` since otherwise we are going to read from a
-  // dangling pointer.
-  if (oglContext) {
-    CFBridgingRelease(oglContext);
-  }
-  if (oldContext) {
-    CFBridgingRelease(oldContext);
-  }
-}
-
-// Allocates a new IOSGLTexture of the given type over `pixelBuffer`, using
-// this context's texture cache. The texture is created with `new` and
-// returned as a raw pointer.
-const GLTexture* IOSGLContext::createNewTexture(CVPixelBufferRef pixelBuffer,
-                                                const GLTexture::Type& type) {
-  return new IOSGLTexture(type, textureCache, pixelBuffer);
-}
-
-// Makes oglContext the current EAGL context unless it already is.
-// Throws std::runtime_error if the switch is rejected; a pending GL error
-// after the switch is only logged.
-void IOSGLContext::set_context() {
-  EAGLContext* const active = [EAGLContext currentContext];
-  if ((__bridge void*)active == oglContext) {
-    // Already current: nothing to do.
-    return;
-  }
-
-  const BOOL switched = [EAGLContext setCurrentContext:(__bridge EAGLContext*)oglContext];
-  if (!switched) {
-    throw std::runtime_error("Problem setting OpenGL context");
-  }
-
-  const GLenum status = glGetError();
-  if (status != GL_NO_ERROR) {
-    gl_log(GL_ERR, "There is an error: 0x%X\n", status);
-  }
-  gl_log(GL_VERBOSE, "Set context to %p\n", oglContext);
-}
-
-// Makes oldContext the current EAGL context again unless it already is.
-// GL errors pending before and after the switch are logged; a rejected
-// switch throws std::runtime_error.
-void IOSGLContext::reset_context() {
-  EAGLContext* const active = [EAGLContext currentContext];
-  if ((__bridge void*)active == oldContext) {
-    // Already restored: nothing to do.
-    return;
-  }
-
-  const GLenum statusBefore = glGetError();
-  if (statusBefore != GL_NO_ERROR) {
-    gl_log(GL_ERR, "There is an error before: 0x%X\n", statusBefore);
-  }
-
-  const BOOL switched = [EAGLContext setCurrentContext:(__bridge EAGLContext*)oldContext];
-  if (!switched) {
-    throw std::runtime_error("Problem setting OpenGL context");
-  }
-
-  const GLenum statusAfter = glGetError();
-  if (statusAfter != GL_NO_ERROR) {
-    gl_log(GL_ERR, "There is an error after: 0x%X\n", statusAfter);
-  }
-  gl_log(GL_VERBOSE, "Reset context to %p\n", oldContext);
-}
-
-// Flushes this context's CVOpenGLESTextureCache.
-void IOSGLContext::flush_context() { CVOpenGLESTextureCacheFlush(textureCache, 0); }
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.cc b/caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.cc
deleted file mode 100644 (file)
index 2c4824a..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-
-#include "IOSGLImageAllocator.h"
-
-#include "../core/GLImage.h"
-#include "../core/GLImageAllocator.h"
-#include "../core/GLPlainTexture.h"
-
-#include "IOSGLContext.h"
-#include "IOSGLTexture.h"
-
-#include "../core/arm_neon_support.h"
-
-// Builds a GLImageVector holding `num_images` images of the given geometry.
-//
-// With useCVPixelBuffer, each texture slice is backed by a CVPixelBuffer kept
-// in `pixelbuffers` at index (image index * slices + slice): a buffer is
-// created and appended when that index does not exist yet, and reused
-// otherwise. Without it, each slice is a GLPlainTexture with no initial data.
-// In both cases a texture is (width * tile_x) x (height * tile_y).
-//
-// NOTE(review): a reused buffer is not checked against the requested
-// dimensions, and a newly created buffer is always appended at the end of
-// `pixelbuffers` -- confirm callers keep sizes and call order consistent.
-// NOTE(review): the lambdas capture by reference (including the loop index
-// `i`); this presumably relies on GLImage invoking them during construction
-// -- verify.
-template <class T>
-GLImageVector<T>* IOSGLImageAllocator<T>::newImage(int num_images,
-                                                   int width,
-                                                   int height,
-                                                   int channels,
-                                                   int tile_x,
-                                                   int tile_y,
-                                                   bool useCVPixelBuffer) {
-  GLImageVector<T>* output_images =
-      new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
-  if (useCVPixelBuffer) {
-    IOSGLContext* gl_context = (IOSGLContext*)GLContext::getGLContext();
-    for (int i = 0; i < num_images; i++) {
-      GLImage<T>* output_image = new GLImage<T>(
-          width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
-            gl_log(GL_VERBOSE,
-                   "%s pixelbuffers.size(): %ld\n",
-                   __PRETTY_FUNCTION__,
-                   pixelbuffers.size());
-
-            CVPixelBufferRef buffer = NULL;
-            // One slice per group of up to four channels.
-            int slices = (channels + 3) / 4;
-            int slice_index = i * slices + slice;
-            if (pixelbuffers.size() < slice_index + 1) {
-              const int texture_width = width * tile_x;
-              const int texture_height = height * tile_y;
-              buffer =
-                  IOSGLTexture::createCVPixelBuffer(pixelFormat, texture_width, texture_height);
-              gl_log(GL_VERBOSE,
-                     "created a new buffer %p for image %d slice %d of dimensions %dx%d\n",
-                     buffer,
-                     i,
-                     slice,
-                     texture_width,
-                     texture_height);
-              pixelbuffers.push_back(buffer);
-            } else {
-              buffer = pixelbuffers[slice_index];
-
-              gl_log(GL_VERBOSE, "reused buffer %p for image %d slice %d\n", buffer, i, slice);
-            }
-
-            return gl_context->createNewTexture(buffer, GLImageAllocator<T>::type);
-          });
-      output_images->push_back(output_image);
-    }
-  } else {
-    for (int i = 0; i < num_images; i++) {
-      GLImage<T>* image = new GLImage<T>(
-          width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
-            return new GLPlainTexture(
-                GLImageAllocator<T>::type, nullptr, width * tile_x, height * tile_y);
-          });
-      output_images->push_back(image);
-    }
-  }
-  return output_images;
-}
-
-template <>
-const FourCharCode IOSGLImageAllocator<float16_t>::pixelFormat = kCVPixelFormatType_64RGBAHalf;
-template <>
-const FourCharCode IOSGLImageAllocator<uint8_t>::pixelFormat = kCVPixelFormatType_32BGRA;
-
-template class IOSGLImageAllocator<float16_t>;
-template class IOSGLImageAllocator<uint8_t>;
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.h b/caffe2/mobile/contrib/opengl/ios/IOSGLImageAllocator.h
deleted file mode 100644 (file)
index ced28bc..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-
-#pragma once
-
-#include "../core/GLImageAllocator.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-template <class T>
-class IOSGLImageAllocator : public GLImageAllocator<T> {
-  static const GLTexture::Type& type;
-
-  std::vector<CVPixelBufferRef> pixelbuffers;
-
- public:
-  static const FourCharCode pixelFormat;
-
-  IOSGLImageAllocator() : GLImageAllocator<T>() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
-  ~IOSGLImageAllocator() {
-    gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__);
-
-    for (auto&& pixelbuffer : pixelbuffers) {
-      CFRelease(pixelbuffer);
-    }
-  }
-
-  GLImageVector<T>* newImage(int num_images,
-                             int width,
-                             int height,
-                             int channels,
-                             int tile_x,
-                             int tile_y,
-                             bool useCVPixelBuffer);
-};
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLTexture.h b/caffe2/mobile/contrib/opengl/ios/IOSGLTexture.h
deleted file mode 100644 (file)
index 94eb7a8..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-class IOSGLTexture : public GLTexture {
-  CVOpenGLESTextureRef textureRef;
-
-  IOSGLTexture(const Type& type,
-               CVOpenGLESTextureCacheRef textureCache,
-               CVPixelBufferRef sourceImage,
-               GLint _filter = GL_NEAREST,
-               GLint _wrap = GL_CLAMP_TO_EDGE);
-
-  friend class IOSGLContext;
-
- public:
-  const CVPixelBufferRef sourceImage;
-
-  ~IOSGLTexture() { CFRelease(textureRef); }
-
-  void map_buffer(std::function<void(void* buffer,
-                                     size_t width,
-                                     size_t height,
-                                     size_t stride,
-                                     size_t channels,
-                                     const Type& type)> process) const;
-
-  virtual void map_read(std::function<void(const void* buffer,
-                                           size_t width,
-                                           size_t height,
-                                           size_t stride,
-                                           size_t channels,
-                                           const Type& type)> process) const;
-
-  virtual void map_load(std::function<void(void* buffer,
-                                           size_t width,
-                                           size_t height,
-                                           size_t stride,
-                                           size_t channels,
-                                           const Type& type)> process) const;
-
-  GLuint name() const { return CVOpenGLESTextureGetName(textureRef); }
-  GLenum target() const { return CVOpenGLESTextureGetTarget(textureRef); };
-  bool flipped() const { return CVOpenGLESTextureIsFlipped(textureRef); };
-
-  static CVPixelBufferRef createCVPixelBuffer(OSType pixelType, int32_t width, int32_t height);
-};
diff --git a/caffe2/mobile/contrib/opengl/ios/IOSGLTexture.mm b/caffe2/mobile/contrib/opengl/ios/IOSGLTexture.mm
deleted file mode 100644 (file)
index 7fc1bbc..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-
-#include "IOSGLTexture.h"
-#include "../core/DataTransfer.h"
-
-// Wraps `_sourceImage` in an OpenGL ES texture obtained from `textureCache`
-// and applies the requested filtering and wrapping to it.
-//
-// The GLTexture base is initialized with the pixel buffer's width and height
-// and with bytes-per-row divided by (type.channels() * type.dataSize()).
-// `_sourceImage` is also kept in the `sourceImage` member.
-//
-// filter - applied to both GL_TEXTURE_MIN_FILTER and GL_TEXTURE_MAG_FILTER.
-// wrap   - applied to GL_TEXTURE_WRAP_S and GL_TEXTURE_WRAP_T, unless
-//          GL_EXT_texture_border_clamp is set, in which case it is overridden.
-IOSGLTexture::IOSGLTexture(const Type& type,
-                           CVOpenGLESTextureCacheRef textureCache,
-                           CVPixelBufferRef _sourceImage,
-                           GLint filter,
-                           GLint wrap)
-    : GLTexture(type,
-                CVPixelBufferGetWidth(_sourceImage),
-                CVPixelBufferGetHeight(_sourceImage),
-                CVPixelBufferGetBytesPerRow(_sourceImage) / (type.channels() * type.dataSize()),
-                false,
-                filter,
-                wrap),
-      sourceImage(_sourceImage) {
-  CVReturn err = CVOpenGLESTextureCacheCreateTextureFromImage(kCFAllocatorDefault,
-                                                              textureCache,
-                                                              _sourceImage,
-                                                              NULL,
-                                                              GL_TEXTURE_2D,
-                                                              _type.internalFormat,
-                                                              _width,
-                                                              _height,
-                                                              _type.format,
-                                                              _type.type,
-                                                              0,
-                                                              &textureRef);
-
-  // NOTE(review): a failure is only logged. Construction continues and name()
-  // below still reads textureRef; textureRef is also not assigned in this
-  // constructor before the call above - confirm this path cannot be reached.
-  if (!textureRef || err) {
-    gl_log(GL_ERR,
-           "something went wrong, sourceImage: %p, width: %d, height: %d, filter: %d, wrap: %d\n",
-           _sourceImage,
-           _width,
-           _height,
-           filter,
-           wrap);
-  }
-  _textureId = name();
-  gl_log(
-      GL_VERBOSE,
-      "IOSGLTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
-      _textureId,
-      _type.internalFormat,
-      _type.format,
-      _type.type);
-
-  // Configure sampling with the texture bound on unit 0; the binding is
-  // cleared again at the end of the constructor.
-  glActiveTexture(GL_TEXTURE0);
-  glBindTexture(GL_TEXTURE_2D, _textureId);
-
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
-
-#if GL_EXT_texture_border_clamp
-  // With the extension macro set, the border color becomes all zeros and the
-  // caller's `wrap` value is replaced.
-  GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
-  glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
-  // Set the texture to use the border clamp wrapping mode.
-  wrap = GL_CLAMP_TO_BORDER_EXT;
-#endif
-
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, wrap);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, wrap);
-
-  glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-CVPixelBufferRef IOSGLTexture::createCVPixelBuffer(OSType pixelFormat,
-                                                   int32_t width,
-                                                   int32_t height) {
-  NSDictionary* pixelBufferAttributes = @{
-    (id)kCVPixelBufferPixelFormatTypeKey : @(pixelFormat),
-    (id)kCVPixelFormatOpenGLESCompatibility : @YES,
-    (id)kCVPixelBufferIOSurfacePropertiesKey : @{/*empty dictionary*/}
-  };
-
-  CVPixelBufferRef buffer = NULL;
-  CVPixelBufferCreate(kCFAllocatorDefault,
-                      width,
-                      height,
-                      pixelFormat,
-                      (__bridge CFDictionaryRef)(pixelBufferAttributes),
-                      &buffer);
-  return buffer;
-}
-
-void IOSGLTexture::map_buffer(std::function<void(void* buffer,
-                                                 size_t width,
-                                                 size_t height,
-                                                 size_t stride,
-                                                 size_t channels,
-                                                 const Type& type)> process) const {
-  if (CVPixelBufferLockBaseAddress(sourceImage, 0) == kCVReturnSuccess) {
-    void* buffer = CVPixelBufferGetBaseAddress(sourceImage);
-    int buffer_stride = CVPixelBufferGetBytesPerRow(sourceImage) / (_channels * _type.dataSize());
-    process(buffer, _width, _height, buffer_stride, _channels, _type);
-
-    CVPixelBufferUnlockBaseAddress(sourceImage, 0);
-  }
-}
-
-// Gives `process` writable access to the pixel buffer by forwarding to
-// map_buffer(); no additional work is done here.
-void IOSGLTexture::map_load(std::function<void(void* buffer,
-                                               size_t width,
-                                               size_t height,
-                                               size_t stride,
-                                               size_t channels,
-                                               const Type& type)> process) const {
-  map_buffer(process);
-}
-
-// Gives `process` read-only access to the pixel buffer: issues glFlush() and
-// then forwards to map_buffer().
-void IOSGLTexture::map_read(std::function<void(const void* buffer,
-                                               size_t width,
-                                               size_t height,
-                                               size_t stride,
-                                               size_t channels,
-                                               const Type& type)> process) const {
-  // TODO: why is glFlush() only necessary when running tests
-  glFlush();
-
-  map_buffer(process);
-}
diff --git a/caffe2/mobile/contrib/opengl/operators/CMakeLists.txt b/caffe2/mobile/contrib/opengl/operators/CMakeLists.txt
deleted file mode 100644 (file)
index dbc170e..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-# Collect every .cc file at or below this directory and publish
-# Caffe2_CPU_SRCS, extended with those files, to the parent scope.
-# NOTE(review): the glob is evaluated when CMake configures, so a newly added
-# source file is only picked up after CMake is re-run.
-file(GLOB_RECURSE tmp *.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/operators/GLAdd.cc b/caffe2/mobile/contrib/opengl/operators/GLAdd.cc
deleted file mode 100644 (file)
index 755678f..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-// GLFilter specialization for element-wise addition of two images. It
-// registers three bindings with the base class (outputSize and the two
-// inputData entries) and passes no uniform blocks, attributes or replacements.
-class GLAdd : public GLFilter {
- public:
-  // Bindings for the two input textures referenced by the shader.
-  binding* inputData[2];
-  // Binding for the `outputSize` uniform set on every pass in add().
-  binding* outputSize;
-
-  GLAdd()
-      : GLFilter("GLAdd",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>(
-                     {BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {/* no replacements */}) {}
-
-  // Adds input_image0 and input_image1 slice by slice into output_image.
-  template <typename T>
-  void add(const GLImageVector<T>& input_image0,
-           const GLImageVector<T>& input_image1,
-           const GLImageVector<T>& output_image);
-
-  // GLSL source of the fragment shader, defined after the class.
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL - GLAdd fragment shader: stores inputData[0] + inputData[1] for each output texel.
-
-const char* GLAdd::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData[2]);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-    ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-    vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
-    vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
-    vec4 value = A + B;
-    outputData = TEXTURE_STORE(value);
-}
-
-)GLSL";
-
-template <typename T>
-void GLAdd::add(const GLImageVector<T>& input_images0,
-                const GLImageVector<T>& input_images1,
-                const GLImageVector<T>& output_images) {
-  const int num_images = input_images0.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image0 = input_images0[i];
-    GLImage<T>* input_image1 = input_images1[i];
-    int input_slices = input_image0->slices;
-    GLImage<T>* output_image = output_images[i];
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      std::vector<texture_attachment> input_attachments;
-      input_attachments.push_back({input_image0->textures[is], inputData[0]});
-      input_attachments.push_back({input_image1->textures[is], inputData[1]});
-
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-template <typename T>
-class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLAddOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
-                           "OpenGLAdd does not support broadcast");
-
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
-    const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
-
-    CAFFE_ENFORCE_EQ(input0.size(), input1.size());
-
-    const int num_images = input0.size();
-    const int input_channels = input0.channels();
-    const int input_width = input0.width();
-    const int input_height = input0.height();
-    const int input_tile_x   = input0.tile_x();
-    const int input_tile_y   = input0.tile_y();
-
-    CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
-    CAFFE_ENFORCE_EQ(input1.width(), input_width);
-    CAFFE_ENFORCE_EQ(input1.height(), input_height);
-    CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
-    CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-    const int output_tile_x   = input_tile_x;
-    const int output_tile_y   = input_tile_y;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
-    if (!_add) {
-      _add.reset(new GLAdd());
-    }
-
-    _add->add(input0, input1, *output);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLAdd> _add;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLAdd, OpenGLAddOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLAdd).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLConcat.cc b/caffe2/mobile/contrib/opengl/operators/GLConcat.cc
deleted file mode 100644 (file)
index a3d8bfc..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-#include "gl_tiling_utils.h"
-
-#include <iostream>
-#include <vector>
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/utils/math.h"
-
-// GLFilter specialization that copies input textures into an output image.
-// The constructor passes the tiling flag and the output tile geometry to the
-// base class as replacement values (TILING, OUTPUT_TILES, OUTPUT_TILE_X,
-// OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT), the same names the fragment shader
-// references as $(NAME).
-class GLConcat : public GLFilter {
- public:
-  // Copy of the constructor's `tiling` argument; concat() reads it to decide
-  // whether to advance the output texture index.
-  bool tiling_;
-  // Bindings for the shader's input texture and uniforms.
-  binding* inputData;
-  binding* outputSize;
-  binding* inputTileRange;
-  binding* input_tile_x;
-
-  GLConcat(tile_descriptor output_tile_geometries, bool tiling = false)
-      : GLFilter(
-            "GLConcat",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(outputSize),
-                                   BINDING(inputData),
-                                   BINDING(inputTileRange),
-                                   BINDING(input_tile_x)}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"TILING", c10::to_string(tiling)},
-             {"OUTPUT_TILES", c10::to_string(output_tile_geometries.tiles)},
-             {"OUTPUT_TILE_X",
-              c10::to_string(output_tile_geometries.tile_dims.x)},
-             {"OUTPUT_TILE_WIDTH",
-              c10::to_string(output_tile_geometries.tile_size.x)},
-             {"OUTPUT_TILE_HEIGHT",
-              c10::to_string(output_tile_geometries.tile_size.y)}}),
-        tiling_(tiling) {}
-
-  // Concatenates `size` input image vectors into output_image.
-  template <typename T>
-  void concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_image, int size);
-  // GLSL source of the fragment shader, defined after the class.
-  static const char* fragment_shader;
-};
-
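-// MARK: GLSL - GLConcat fragment shader; in the tiled path inputTileRange is tested as the half-open range [x, y).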
-
-const char* GLConcat::fragment_shader = R"GLSL(#version 300 es
-#define TILING                      $(TILING)
-
-// tiling
-#define OUTPUT_TILES                $(OUTPUT_TILES)
-#define OUTPUT_TILE_X               $(OUTPUT_TILE_X)
-#define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-uniform ivec2 outputSize;
-uniform ivec2 inputTileRange; // (]
-uniform int input_tile_x;
-
-#if TILING
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
-  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
-  if (tileNum >= inputTileRange.x && tileNum < inputTileRange.y) {
-    tileNum = tileNum - inputTileRange.x;
-    texelCoord = ivec2(tileNum % input_tile_x, tileNum / input_tile_x)  * ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT) + tileCoord;
-    vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-    outputData = TEXTURE_STORE(value);
-  } else {
-    // early termination
-    discard;
-  }
-}
-
-#else
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE(value);
-}
-#endif
-
-)GLSL";
-
-template <typename T>
-void GLConcat::concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_images, int input_size) {
-  for (int k = 0; k < output_images.size(); k++) {
-    GLImage<T>* output_image = output_images[k];
-
-    int is = 0, os = 0;
-    for (int i = 0; i < input_size; i++) {
-      for (int j = 0; j < input_images[i]->slices(); j++) {
-        GLImage<T>* input_image = (*input_images[i])[k];
-        std::vector<texture_attachment> input_attachments;
-        input_attachments.push_back({input_image->textures[j], inputData});
-
-        run(input_attachments,
-            {output_image->textures.begin() + os, output_image->textures.begin() + os + 1},
-            [&]() {
-              glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
-              glUniform2i(inputTileRange->location, is, is + input_image->tile_x * input_image->tile_y);
-              glUniform1i(input_tile_x->location, input_image->tile_x);
-            },
-            output_image->texture_width,
-            output_image->texture_height);
-        if (!tiling_) {
-          os++; // for tiling, you always write to the same texture
-        }
-        is += input_image->tile_x * input_image->tile_y;
-      }
-    }
-  }
-}
-
-namespace caffe2 {
-template <typename T>
-class OpenGLConcatOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLConcatOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input0.size();
-
-    const GLImageVector<T>** input_images = new const GLImageVector<T>*[Inputs().size()];
-    input_images[0] = &input0;
-    int channelCount = input0.channels();
-
-    bool tiling = OperatorBase::GetSingleArgument<int>("tiling", 0);
-
-    // Only supports input channels divisible by 4 for now
-    CAFFE_ENFORCE_EQ(input0.channels() % 4, 0);
-    for (auto i = 1; i < Inputs().size(); i++) {
-      const GLImageVector<T>& inputi = Inputs()[i]->template Get<GLImageVector<T>>();
-      channelCount += inputi.channels();
-      CAFFE_ENFORCE_EQ(num_images, inputi.size());
-      CAFFE_ENFORCE_EQ(inputi.channels() % 4, 0);
-      CAFFE_ENFORCE_EQ(input0.width(), inputi.width());
-      CAFFE_ENFORCE_EQ(input0.height(), inputi.height());
-      input_images[i] = &inputi;
-
-      if (inputi.tile_x() > 1 || inputi.tile_y() > 1) {
-        tiling = true;
-      }
-    }
-
-    const int input_width = input0.width();
-    const int input_height = input0.height();
-
-    const int output_channels = channelCount;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int output_tile_x = 1;
-    int output_tile_y = 1;
-    if (tiling) {
-      computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-    }
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-    if (!_concat) {
-      tile_descriptor output_tile_geometries{
-          {output_tile_x, output_tile_y}, {output_width, output_height}, output_tile_x * output_tile_y};
-      _concat.reset(new GLConcat(output_tile_geometries, tiling));
-    }
-
-    _concat->concat(input_images, *output, Inputs().size());
-    delete[] input_images;
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  StorageOrder order_;
-  std::unique_ptr<GLConcat> _concat;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLConcat, OpenGLConcatOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLConcat).NumInputs(2, 4).NumOutputs(1, 2);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLConvolution.cc b/caffe2/mobile/contrib/opengl/operators/GLConvolution.cc
deleted file mode 100644 (file)
index 0926a61..0000000
+++ /dev/null
@@ -1,1068 +0,0 @@
-#include "GLConvolution.h"
-#include "../core/GLContext.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/conv_transpose_unpool_op_base.h"
-#include <iostream>
-#include <vector>
-
-#define MaxOutputTileBatchSize 2
-
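-// MARK: GLSL - GLConvolution fragment shader; TILED_CONVOLUTION selects between the tiled and the batched variant.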
-const char* GLConvolution::fragment_shader = R"GLSL(#version 300 es
-#define TILED_CONVOLUTION           $(TILED_CONVOLUTION)
-#define TRANSPOSED_CONVOLUTION      $(TRANSPOSED_CONVOLUTION)
-
-// batching
-#define INPUT_BATCH_SIZE            $(INPUT_BATCH_SIZE)
-#define OUTPUT_BATCH_SIZE           $(OUTPUT_BATCH_SIZE)
-
-// tiling
-#define INPUT_TILES                 $(INPUT_TILES)
-#define OUTPUT_TILES                $(OUTPUT_TILES)
-#define INPUT_TILE_WIDTH            $(INPUT_TILE_WIDTH)
-#define INPUT_TILE_HEIGHT           $(INPUT_TILE_HEIGHT)
-#define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
-#define INPUT_TILE_X                $(INPUT_TILE_X)
-#define OUTPUT_TILE_X               $(OUTPUT_TILE_X)
-#define INPUT_TILE_CHUNK_SIZE       $(INPUT_TILE_CHUNK_SIZE)
-#define OUTPUT_TILE_CHUNK_SIZE      $(OUTPUT_TILE_CHUNK_SIZE)
-#define OUTPUT_TILE_BATCH_SIZE      $(OUTPUT_TILE_BATCH_SIZE)
-
-#define BOUNDS_CHECK_MODE           $(BOUNDS_CHECK_MODE)
-
-// common
-const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
-const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
-const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
-
-precision mediump float;
-precision mediump int;
-precision mediump sampler2D;
-
-in highp vec2 v_texCoord;
-
-#define unpackKernel(pk) \
-  mat4(vec4(unpackHalf2x16(pk.packed_data[0].x), unpackHalf2x16(pk.packed_data[0].y)), \
-       vec4(unpackHalf2x16(pk.packed_data[0].z), unpackHalf2x16(pk.packed_data[0].w)), \
-       vec4(unpackHalf2x16(pk.packed_data[1].x), unpackHalf2x16(pk.packed_data[1].y)), \
-       vec4(unpackHalf2x16(pk.packed_data[1].z), unpackHalf2x16(pk.packed_data[1].w)))
-
-#if BOUNDS_CHECK_MODE == 0
-  #define IN_BOUNDS(p, p0, p1) (true)
-#else
-  #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
-#endif
-
-#if TILED_CONVOLUTION
-// Tiled convolution
-const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-uniform ivec2 outputSize;
-uniform bool accumulate;
-uniform bool fusePRelu;
-
-uniform ivec2 inputTileRange;
-
-TEXTURE_INPUT(inputData[1]);
-TEXTURE_INPUT(previousData[1]);
-
-struct packedKernel {
-  highp uvec4 packed_data[2];
-};
-
-struct kernel {
-  packedKernel data[kernel_size.x * kernel_size.y];
-};
-
-layout (std140) uniform Kernel_block {
-  kernel kernel_data[INPUT_TILE_CHUNK_SIZE * OUTPUT_TILE_CHUNK_SIZE];
-} kernel_block[OUTPUT_TILE_BATCH_SIZE];
-
-layout (std140) uniform bias_block {
-  highp uvec4 bias[(OUTPUT_TILES + 1) / 2];
-};
-
-layout (std140) uniform prelu_scale_block {
-  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
-};
-
-TEXTURE_OUTPUT(0, outputData0);
-
-#if TRANSPOSED_CONVOLUTION
-
-#define CONVOLUTION(ib) { \
-  ivec2 p0 = (input_padding + input_stride - tileCoord % input_stride) % input_stride; \
-  for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
-    for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
-      int i = y * kernel_size.x + x; \
-      ivec2 idx = tileCoord + ivec2(x, y) - input_padding; \
-      if IN_BOUNDS(idx, ivec2(0), inputTileSize * input_stride) { \
-        vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx / input_stride); \
-        mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
-        sum += k * data; \
-      } \
-    } \
-  } \
-}
-
-#else
-
-#define CONVOLUTION(ib) { \
-  for (int y = 0, i = 0; y < kernel_size.y; y++) { \
-    for (int x = 0; x < kernel_size.x; x++, i++) { \
-      ivec2 idx = tileCoord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
-        vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx); \
-        mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
-        sum += k * data; \
-      } \
-    } \
-  } \
-}
-#endif // TRANSPOSED_CONVOLUTION
-
-void main() {
-  ivec2 inputSize = textureSize(inputData[0], 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
-
-  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
-#if !TRANSPOSED_CONVOLUTION
-  tileCoord = input_stride * tileCoord - input_padding;
-#endif
-
-  highp vec4 sum = vec4(0);
-
-  for (int tile_idx = inputTileRange.x; tile_idx < inputTileRange.y; tile_idx++) {
-    int inTileX = tile_idx % INPUT_TILE_X;
-    int inTileY = tile_idx / INPUT_TILE_X;
-    int inTileId = tile_idx % INPUT_TILE_CHUNK_SIZE; // normalized input tile idx, used to index the kernel
-
-    int kernelIdx = OUTPUT_TILE_CHUNK_SIZE * inTileId + tileNum % OUTPUT_TILE_CHUNK_SIZE;
-    ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
-
-    int outputChunkIdx = tileNum / OUTPUT_TILE_CHUNK_SIZE;
-    if (outputChunkIdx == 0) {
-      CONVOLUTION(0);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 1
-    else if (outputChunkIdx == 1) {
-      CONVOLUTION(1);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 2
-    else if (outputChunkIdx == 2) {
-      CONVOLUTION(2);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 3
-    else if (outputChunkIdx == 3) {
-      CONVOLUTION(3);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 4
-    else if (outputChunkIdx == 4) {
-      CONVOLUTION(4);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 5
-    else if (outputChunkIdx == 5) {
-      CONVOLUTION(5);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 6
-    else if (outputChunkIdx == 6) {
-      CONVOLUTION(6);
-    }
-#if OUTPUT_TILE_BATCH_SIZE > 7
-    else if (outputChunkIdx == 7) {
-      CONVOLUTION(7);
-    }
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-  }
-
-  vec4 biasValue = (tileNum % 2 == 0) ? unpackHalf4x16(bias[tileNum/2].xy) : unpackHalf4x16(bias[tileNum/2].zw);
-  vec4 prevData = TEXTURE_LOAD(previousData[0], texelCoord);
-  vec4 value = sum + (accumulate ? prevData : biasValue);
-
-  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
-
-  vec4 o0 = fusePRelu ? mix(value * preluValue, value, vec4(greaterThan(value, vec4(0)))) : value;
-  outputData0 = TEXTURE_STORE(o0);
-}
-
-#else
-
-// batched convolution
-
-uniform ivec2 outputSize;
-uniform bool accumulate;
-uniform bool fusePRelu;
-
-TEXTURE_INPUT(inputData[INPUT_BATCH_SIZE]);
-TEXTURE_INPUT(previousData[OUTPUT_BATCH_SIZE]);
-
-struct packedKernel {
-  highp uvec4 packed_data[2];
-};
-
-struct kernel {
-  packedKernel data[kernel_size.x * kernel_size.y];
-};
-
-layout (std140) uniform Kernel_block {
-  kernel kernel_data[OUTPUT_BATCH_SIZE];
-} kernel_block[INPUT_BATCH_SIZE];
-
-layout (std140) uniform bias_block {
-  highp uvec4 bias[(OUTPUT_BATCH_SIZE + 1) / 2];
-};
-
-layout (std140) uniform prelu_scale_block {
-  highp uvec4 scale[(OUTPUT_BATCH_SIZE + 1) / 2];
-};
-
-TEXTURE_OUTPUT(0, outputData0);
-#if OUTPUT_BATCH_SIZE > 1
-TEXTURE_OUTPUT(1, outputData1);
-#if OUTPUT_BATCH_SIZE > 2
-TEXTURE_OUTPUT(2, outputData2);
-#if OUTPUT_BATCH_SIZE > 3
-TEXTURE_OUTPUT(3, outputData3);
-#endif
-#endif
-#endif
-
-#if TRANSPOSED_CONVOLUTION
-#define CONVOLUTION(ib) { \
-  ivec2 p0 = (input_padding + input_stride - texelCoord % input_stride) % input_stride; \
-  for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
-    for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
-      int i = y * kernel_size.x + x; \
-      ivec2 idx = texelCoord + ivec2(x, y) - input_padding; \
-      if IN_BOUNDS(idx, ivec2(0), inputSize * input_stride) { \
-        vec4 data = TEXTURE_LOAD(inputData[ib], idx / input_stride); \
-        for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
-          mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
-          sum[ob] += k * data; \
-        } \
-      } \
-    } \
-  } \
-}
-
-#else
-
-#define CONVOLUTION(ib) { \
-  for (int y = 0, i = 0; y < kernel_size.y; y++) { \
-    for (int x = 0; x < kernel_size.x; x++, i++) { \
-      ivec2 idx = coord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputSize) { \
-        vec4 data = TEXTURE_LOAD(inputData[ib], idx); \
-        for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
-          mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
-          sum[ob] += k * data; \
-        } \
-      } \
-    } \
-  } \
-}
-
-#endif // TRANSPOSED_CONVOLUTION
-
-void main() {
-  ivec2 inputSize = textureSize(inputData[0], 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
-#if !TRANSPOSED_CONVOLUTION
-  ivec2 coord = input_stride * texelCoord - input_padding;
-#endif
-
-  highp vec4 sum[OUTPUT_BATCH_SIZE] = vec4[OUTPUT_BATCH_SIZE](vec4(0)
-#if OUTPUT_BATCH_SIZE > 1
-                                                                       , vec4(0)
-#if OUTPUT_BATCH_SIZE > 2
-                                                                       , vec4(0)
-#if OUTPUT_BATCH_SIZE > 3
-                                                                       , vec4(0)
-#endif
-#endif
-#endif
-                                                                       );
-
-      CONVOLUTION(0);
-#if INPUT_BATCH_SIZE > 1
-      CONVOLUTION(1);
-#if INPUT_BATCH_SIZE > 2
-      CONVOLUTION(2);
-#if INPUT_BATCH_SIZE > 3
-      CONVOLUTION(3);
-#if INPUT_BATCH_SIZE > 4
-      CONVOLUTION(4);
-#if INPUT_BATCH_SIZE > 5
-      CONVOLUTION(5);
-#if INPUT_BATCH_SIZE > 6
-      CONVOLUTION(6);
-#if INPUT_BATCH_SIZE > 7
-      CONVOLUTION(7);
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
-  vec4 prev0 = TEXTURE_LOAD(previousData[0], texelCoord);
-  vec4 value = sum[0] + (accumulate ? prev0: unpackHalf4x16(bias[0].xy));
-  vec4 o0 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
-  outputData0 = TEXTURE_STORE(o0);
-#if OUTPUT_BATCH_SIZE > 1
-  vec4 prev1 = TEXTURE_LOAD(previousData[1], texelCoord);
-  value = sum[1] + (accumulate ? prev1 : unpackHalf4x16(bias[0].zw));
-  vec4 o1 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
-  outputData1 = TEXTURE_STORE(o1);
-#if OUTPUT_BATCH_SIZE > 2
-  vec4 prev2 = TEXTURE_LOAD(previousData[2], texelCoord);
-  value = sum[2] + (accumulate ? prev2 : unpackHalf4x16(bias[1].xy));
-  vec4 o2 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
-  outputData2 = TEXTURE_STORE(o2);
-#if OUTPUT_BATCH_SIZE > 3
-  vec4 prev3 = TEXTURE_LOAD(previousData[3], texelCoord);
-  value = sum[3] + (accumulate ? prev3: unpackHalf4x16(bias[1].zw));
-  vec4 o3 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
-  outputData3 = TEXTURE_STORE(o3);
-#endif
-#endif
-#endif
-}
-
-#endif // TILED_CONVOLUTION
-
-)GLSL";
-
-// Fills `data` with the packed float16 kernel block used by the batched
-// convolution path for input batch entry `ib`.
-//
-// data            - destination, reinterpreted as
-//                   [output_batch_size][kernel_size.y][kernel_size.x][4][4].
-// size            - not read in this function.
-// input_channels,
-// output_channels - dimensions used to index the source `kernel` tensor.
-// is, os          - input / output slice offsets: source channels are
-//                   4 * (is + ib) + in and 4 * (os + ob) + out.
-// ib              - index of the input entry within the current batch.
-//
-// For transposed convolutions `kernel` is indexed [in][out] and read with both
-// spatial axes mirrored; otherwise it is indexed [out][in]. Only the channels
-// that exist (at most 4 per group) are written; the remaining entries of
-// `data` are left untouched here.
-// NOTE(review): the typedefs below use run-time array bounds, which looks like
-// a reliance on the variable-length-array compiler extension - confirm.
-void GLConvolution::pack_kernel_data_for_bached_conv(
-    float16_t* data,
-    size_t size,
-    int input_channels,
-    int output_channels,
-    int is,
-    int os,
-    int ib) {
-  typedef float16_t(packedKernel)[output_batch_size][geometry.kernel_size.y]
-                                 [geometry.kernel_size.x][4][4];
-  packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);
-
-  // Number of real input channels in this group of four.
-  const int batch_input_channels = std::min(4, input_channels - 4 * (is + ib));
-  for (int ob = 0; ob < output_batch_size; ob++) {
-    // Number of real output channels in this group of four.
-    const int batch_output_channels =
-        std::min(4, output_channels - 4 * (os + ob));
-    for (int out = 0; out < batch_output_channels; out++) {
-      for (int in = 0; in < batch_input_channels; in++) {
-        for (int y = 0; y < geometry.kernel_size.y; y++) {
-          for (int x = 0; x < geometry.kernel_size.x; x++) {
-            // clang-format off
-            if (geometry.transposed) {
-              typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
-              const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
-              packed_kernel_data[ob][y][x][in][out] =
-              kernel_data[4 * (is + ib) + in][4 * (os + ob) + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
-            } else {
-              typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
-              const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
-              packed_kernel_data[ob][y][x][in][out] = kernel_data[4 * (os + ob) + out][4 * (is + ib) + in][y][x];
-            }
-            // clang-format on
-          }
-        }
-      }
-    }
-  }
-}
-
-// Fills `data` with the packed float16 kernel block used by the tiled
-// convolution path for one chunk of input tiles and one chunk of output tiles.
-//
-// data              - destination, reinterpreted as
-//                     [input_tile_chunk_size][output_tile_chunk_size]
-//                     [kernel_size.y][kernel_size.x][4][4].
-// size              - not read in this function.
-// input_channels,
-// output_channels   - dimensions used to index the source `kernel` tensor.
-// input_tile_range,
-// output_tile_range - tile indices to pack, iterated as x <= tile < y; the
-//                     destination is indexed relative to each range's x.
-//
-// For transposed convolutions `kernel` is indexed [in][out] and read with both
-// spatial axes mirrored; otherwise it is indexed [out][in]. Only the channels
-// that exist (at most 4 per tile) are written; the remaining entries of `data`
-// are left untouched here.
-// NOTE(review): the typedefs below use run-time array bounds, which looks like
-// a reliance on the variable-length-array compiler extension - confirm.
-void GLConvolution::pack_kernel_data_for_tiled_conv(
-    float16_t* data, // destination
-    size_t size,
-    int input_channels,
-    int output_channels,
-    point input_tile_range,
-    point output_tile_range) {
-  typedef float16_t(
-      packedKernel)[input_tile_chunk_size][output_tile_chunk_size]
-                   [geometry.kernel_size.y][geometry.kernel_size.x][4][4];
-  packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);
-
-  for (int it = input_tile_range.x; it < input_tile_range.y; it++) {
-    for (int ot = output_tile_range.x; ot < output_tile_range.y; ot++) {
-      for (int y = 0; y < geometry.kernel_size.y; y++) {
-        for (int x = 0; x < geometry.kernel_size.x; x++) {
-          // The min() bounds skip channels past the end of the tensor in the
-          // last, partially filled tile.
-          for (int out = 0; out < std::min(4, (output_channels - ot * 4));
-               out++) {
-            for (int in = 0; in < std::min(4, (input_channels - it * 4));
-                 in++) {
-              // clang-format off
-              if (geometry.transposed) {
-                typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
-                const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
-                packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
-                kernel_data[4 * it + in] [4 * ot + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
-              } else {
-                typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
-                const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
-                packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
-                kernel_data[4 * ot + out][4 * it + in][y][x];
-              }
-              // clang-format on
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-// Entry point: runs the convolution on `input_images`, writing into
-// `output_images`, using the tiled path when `tiling` is set and the batched
-// path otherwise.
-template <typename T>
-void GLConvolution::convolution(
-    const GLImageVector<T>& input_images,
-    const GLImageVector<T>& output_images) {
-  if (!tiling) {
-    run_batched_conv(input_images, output_images);
-    return;
-  }
-  run_tiled_conv(input_images, output_images);
-}
-
-// Non-tiled convolution path.
-//
-// For every image pair (input_images[i], output_images[i]) this walks the
-// input slices in steps of `input_batch_size` and the output slices in steps
-// of `output_batch_size`. Each (is, os) step uploads the bias, the packed
-// kernel weights and (on the last input step only) the PRelu scale as uniform
-// buffers, then calls run() once. The `accumulate` uniform is set for every
-// input step but the first, and the output textures are also attached as
-// `previousData` inputs.
-//
-// NOTE(review): `is == input_slices - input_batch_size` is only ever true when
-// input_slices is a multiple of input_batch_size, and `textures[is + ib]` /
-// `textures[os + ob]` are not range-checked here — presumably the caller
-// guarantees the batch sizes divide the slice counts; confirm.
-template <typename T>
-void GLConvolution::run_batched_conv(
-    const GLImageVector<T>& input_images,
-    const GLImageVector<T>& output_images) {
-  for (int i = 0; i < input_images.size(); i++) {
-    GLImage<T>* input_image = input_images[i];
-    GLImage<T>* output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is += input_batch_size) {
-      for (int os = 0; os < output_slices; os += output_batch_size) {
-        // Channels covered by this output batch, clipped at the last batch.
-        const int output_channels_per_batch =
-            std::min(4 * output_batch_size, geometry.output_channels - 4 * os);
-
-        gl_log(
-            GL_VERBOSE,
-            "GLConvolution::convolution - is: %d, os: %d\n",
-            is,
-            os);
-
-        // Note the order of the binding point needs to be the same as in the
-        // constructor
-        // NOTE(review): uniform_blocks_bindings() in GLConvolution.h lists
-        // prelu_scale_block before the kernel blocks, while this function
-        // attaches it after them — confirm which ordering is authoritative.
-        int binding_point = 0;
-
-        // bias
-        attach_uniform_buffer<float16_t>(
-            bias_block, binding_point++, [&](float16_t* data, size_t size) {
-              CAFFE_ENFORCE_GE(
-                  size,
-                  output_channels_per_batch * sizeof(float16_t),
-                  "Bias buffer size too small");
-              // Bias values for output channels [4 * os, 4 * os + per_batch).
-              for (int ob = 0; ob < output_channels_per_batch; ob++) {
-                data[ob] = bias[4 * os + ob];
-              }
-            });
-
-        // kernel weights
-        // One uniform block per input slice in the batch.
-        for (int ib = 0; ib < input_batch_size; ib++) {
-          attach_uniform_buffer<float16_t>(
-              kernel_block[ib],
-              binding_point++,
-              [&](float16_t* data, size_t size) {
-                CAFFE_ENFORCE_EQ(
-                    size,
-                    4 * (4 * output_batch_size) * geometry.kernel_size.y *
-                        geometry.kernel_size.x * sizeof(float16_t),
-                    "Kernel size mismatch");
-                pack_kernel_data_for_bached_conv(
-                    data,
-                    size,
-                    input_image->channels,
-                    output_image->channels,
-                    is,
-                    os,
-                    ib);
-              });
-        }
-
-        // PRelu scale
-        // Only uploaded on the last input batch, matching the fusePRelu
-        // uniform set below.
-        if (prelu_scale != nullptr && is == input_slices - input_batch_size) {
-          attach_uniform_buffer<float16_t>(
-              prelu_scale_block,
-              binding_point++,
-              [&](float16_t* data, size_t size) {
-                CAFFE_ENFORCE_GE(
-                    size,
-                    output_channels_per_batch * sizeof(float16_t),
-                    "PRelu buffer size too small");
-                // Per-channel scale when one value per output channel was
-                // supplied; otherwise prelu_scale[0] is broadcast.
-                for (int ob = 0; ob < output_channels_per_batch; ob++) {
-                  data[ob] = prelu_scale_size == geometry.output_channels
-                      ? prelu_scale[4 * os + ob]
-                      : prelu_scale[0];
-                }
-              });
-        }
-
-        // Input textures first, then the output textures bound as
-        // previousData.
-        std::vector<texture_attachment> input_attachments;
-        for (int ib = 0; ib < input_batch_size; ib++) {
-          input_attachments.push_back(
-              {input_image->textures[is + ib], inputData[ib]});
-        }
-        for (int ob = 0; ob < output_batch_size; ob++) {
-          input_attachments.push_back(
-              {output_image->textures[os + ob], previousData[ob]});
-        }
-
-        run(input_attachments,
-            {output_image->textures.begin() + os,
-             output_image->textures.begin() + os + output_batch_size},
-            [&]() {
-              glUniform2i(
-                  outputSize->location,
-                  output_image->texture_width,
-                  output_image->texture_height);
-              // The tile range is fixed to [0, 1) on the non-tiled path.
-              glUniform2i(inputTileRange->location, 0, 1);
-              glUniform1i(accumulate->location, is != 0);
-              glUniform1i(
-                  fusePRelu->location,
-                  prelu_scale != nullptr &&
-                      (is == input_slices - input_batch_size));
-            },
-            output_image->texture_width,
-            output_image->texture_height);
-      }
-    }
-  }
-}
-
-template <typename T>
-void GLConvolution::run_tiled_conv(
-    const GLImageVector<T>& input_images,
-    const GLImageVector<T>& output_images) {
-  for (int i = 0; i < input_images.size(); i++) {
-    GLImage<T>* input_image = input_images[i];
-    GLImage<T>* output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-    int input_tile_x = input_image->tile_x;
-    int input_tile_y = input_image->tile_y;
-    int input_tiles = input_image->tile_x * input_image->tile_y;
-    int output_tiles = output_image->tile_x * output_image->tile_y;
-
-    for (int ib = 0, it = 0; it < input_tiles;
-         ib++, it += input_tile_chunk_size) {
-      // Note the order of the binding point needs to be the same as in the
-      // constructor
-      int binding_point = 0;
-
-      // bias
-      attach_uniform_buffer<float16_t>(
-          bias_block, binding_point++, [&](float16_t* data, size_t size) {
-            CAFFE_ENFORCE_GE(
-                size,
-                geometry.output_channels * sizeof(float16_t),
-                "Bias buffer size too small");
-            for (int ob = 0; ob < geometry.output_channels; ob++) {
-              data[ob] = bias[ob];
-            }
-          });
-
-      // kernel weights
-      for (int ob = 0, ot = 0; ot < output_tiles;
-           ob++, ot += output_tile_chunk_size) {
-        attach_uniform_buffer<float16_t>(
-            kernel_block[ob],
-            binding_point++,
-            [&](float16_t* data, size_t size) {
-              CAFFE_ENFORCE_EQ(
-                  size,
-                  (4 * input_tile_chunk_size) * (4 * output_tile_chunk_size) *
-                      geometry.kernel_size.y * geometry.kernel_size.x *
-                      sizeof(float16_t),
-                  "Kernel size mismatch");
-              pack_kernel_data_for_tiled_conv(
-                  data,
-                  size,
-                  input_image->channels,
-                  output_image->channels,
-                  {it, std::min(it + input_tile_chunk_size, input_tiles)},
-                  {ot, std::min(ot + output_tile_chunk_size, output_tiles)});
-            });
-      }
-
-      // PRelu scale
-      if (prelu_scale != nullptr && ib == input_tile_batch_size - 1) {
-        attach_uniform_buffer<float16_t>(
-            prelu_scale_block,
-            binding_point++,
-            [&](float16_t* data, size_t size) {
-              CAFFE_ENFORCE_GE(
-                  size,
-                  geometry.output_channels * sizeof(float16_t),
-                  "PRelu buffer size too small");
-              for (int ob = 0; ob < geometry.output_channels; ob++) {
-                data[ob] = prelu_scale_size == geometry.output_channels
-                    ? prelu_scale[ob]
-                    : prelu_scale[0];
-              }
-            });
-      }
-
-      std::vector<texture_attachment> input_attachments(
-          {{input_image->textures[0], inputData[0]},
-           {output_image->textures[0], previousData[0]}});
-
-      run(input_attachments,
-          {output_image->textures[0]},
-          [&]() {
-            glUniform2i(
-                outputSize->location,
-                output_image->texture_width,
-                output_image->texture_height);
-            // [inputTileFrom, inputTileTo)
-            glUniform2i(
-                inputTileRange->location,
-                it,
-                std::min(it + input_tile_chunk_size, input_tiles));
-
-            glUniform1i(accumulate->location, it != 0);
-            glUniform1i(
-                fusePRelu->location,
-                prelu_scale != nullptr && (ib == input_tile_batch_size - 1));
-          },
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-
-// Asks `op` for the spatial output size produced by an H x W input.
-// A 1 x 1 x H x W CPU tensor is used as a shape probe; the resulting output
-// tensor must be 4-D, and its dims 2 and 3 are written to *OH and *OW.
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
-  Tensor<CPUContext> probe;
-  Tensor<CPUContext> result;
-
-  probe.Resize(1, 1, H, W);
-  op->SetOutputSize(probe, &result, 1);
-
-  CAFFE_ENFORCE_EQ(result.ndim(), 4);
-  *OH = result.dim(2);
-  *OW = result.dim(3);
-}
-
-static int computeOutputTileChunkSize(int output_tile_x,
-                                      int output_tile_y,
-                                      int kernel_width,
-                                      int kernel_height) {
-  static const int maxUniformBlockBufferSize = 16 * 1024;
-  return std::min(
-      output_tile_x * output_tile_y,
-      maxUniformBlockBufferSize / 4 /
-          (4 * kernel_width * kernel_height * (int)sizeof(float16_t)));
-}
-
-// Returns how many input tiles are handled per chunk: the total number of
-// input tiles, capped by how many
-// 4 x output_tile_chunk_size x kernel_width x kernel_height float16_t groups
-// fit in a quarter of a 16 KiB budget.
-static int computeInputTileChunkSize(
-    int input_tile_x,
-    int input_tile_y,
-    int output_tile_chunk_size,
-    int kernel_width,
-    int kernel_height) {
-  static const int maxUniformBlockBufferSize = 16 * 1024;
-
-  const int total_tiles = input_tile_x * input_tile_y;
-  const int budget = maxUniformBlockBufferSize / 4;
-  const int bytes_per_tile = 4 * output_tile_chunk_size * kernel_width *
-      kernel_height * (int)sizeof(float16_t);
-
-  return std::min(total_tiles, budget / bytes_per_tile);
-}
-
-// Todo: optimize input/output batch size and use of uniforms/textures for
-// kernel data
-static void computeBatchSizes(
-    GLConvolution::descriptor& geometry,
-    int& input_batch_size,
-    int& output_batch_size) {
-  int kernel_size = std::max(geometry.kernel_size.x, geometry.kernel_size.y);
-  int input_slices = (geometry.input_channels + 3) / 4;
-  int output_slices = (geometry.output_channels + 3) / 4;
-
-#if CAFFE2_ANDROID
-  input_batch_size = input_slices % 2 == 0 ? 2 : 1;
-  output_batch_size = output_slices % 2 == 0 ? 2 : 1;
-#else
-  if (iPhoneVersion() >= 8) {
-    // iPhone 6S and up
-    input_batch_size =
-        /* input_slices % 8 == 0 ? 8 : */ input_slices % 4 == 0
-            ? 4
-            : input_slices % 3 == 0 ? 3 : input_slices % 2 == 0 ? 2 : 1;
-    output_batch_size = output_slices % 4 == 0
-        ? 4
-        : output_slices % 3 == 0 ? 3 : output_slices % 2 == 0 ? 2 : 1;
-  }
-#endif
-}
-
-template <class T, bool fusePRelu, bool fuseRelu>
-class OpenGLConvOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
- public:
-  USE_OPERATOR_BASE_FUNCTIONS;
-  OpenGLConvOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-    OPERATOR_NEEDS_FEATURE(group_ == 1, "OpenGL only supports group == 1");
-    OPERATOR_NEEDS_FEATURE(
-        dilation_h() == 1 && dilation_w() == 1,
-        "OpenGL only supports dialation == 1");
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override {
-    const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-    auto& filter = Input(FILTER);
-    auto& bias = Input(BIAS);
-
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    CAFFE_ENFORCE(filter.ndim(), 4);
-    const int M = filter.dim32(0);
-    const int kernel_width = filter.dim32(2);
-    const int kernel_height = filter.dim32(3);
-
-    CAFFE_ENFORCE(filter.dim32(1) == input_channels, "");
-    CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
-    CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
-    CAFFE_ENFORCE(bias.ndim() == 1, "");
-    CAFFE_ENFORCE(bias.dim32(0) == M, "");
-
-    int output_height;
-    int output_width;
-    const int output_channels = M;
-    computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-    float val = 0;
-    const float* prelu_scale = nullptr;
-    int prelu_scale_size = 0;
-    if (fusePRelu) {
-      auto& prelu = Input(PRELU);
-      prelu_scale = prelu.template data<float>();
-      prelu_scale_size = prelu.size();
-    } else if (fuseRelu) {
-      prelu_scale = &val;
-      prelu_scale_size = 1;
-    }
-
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    int output_tile_x = 1, output_tile_y = 1;
-    int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
-    int input_tile_chunk_size = 1, output_tile_chunk_size = 1;
-    int input_tile_batch_size = 1, output_tile_batch_size = 1;
-
-    const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
-
-    if (tiling) {
-      // Turn on tiling
-      CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
-      computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-      output_tiles = output_tile_x * output_tile_y;
-
-      output_tile_chunk_size = computeOutputTileChunkSize(
-          output_tile_x, output_tile_y, kernel_width, kernel_height);
-      output_tile_batch_size = std::max(
-          MaxOutputTileBatchSize,
-          (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
-      output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
-      output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
-
-      input_tile_chunk_size = computeInputTileChunkSize(
-          input_tile_x,
-          input_tile_y,
-          output_tile_chunk_size,
-          kernel_width,
-          kernel_height);
-      input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
-      // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
-      // input_tile_batch_size;
-    }
-    CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
-    CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
-    CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
-
-    int is_last = GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images,
-        output_width,
-        output_height,
-        output_channels,
-        output_tile_x,
-        output_tile_y,
-        is_last);
-
-    // TODO: figure out the dilation business
-    GLConvolution::descriptor geometry{input_channels,
-                                       output_channels,
-                                       {kernel_width, kernel_height},
-                                       {input_width, input_height},
-                                       {output_width, output_height},
-                                       {input_tile_x, input_tile_y},
-                                       {output_tile_x, output_tile_y},
-                                       {pad_l(), pad_t()},
-                                       {stride_w(), stride_h()},
-                                       false};
-
-    if (!conv) {
-      int input_batch_size = 1, output_batch_size = 1;
-      if (!tiling) {
-        computeBatchSizes(geometry, input_batch_size, output_batch_size);
-        input_batch_size =
-            GetSingleArgument<int>("input_batch_size", input_batch_size);
-        output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
-      }
-
-      LOG(INFO) << input_channels << ": " << input_height << " X "
-                << input_width << " => " << output_channels << ": "
-                << output_height << " X " << output_width
-                << " Kernel: " << kernel_width << "X" << kernel_height;
-      if (tiling) {
-        LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
-                  << " => " << output_tile_x << " X " << output_tile_y
-                  << ", Texture size: " << input_width * input_tile_x << " X "
-                  << input_height * input_tile_y << " => "
-                  << output_width * output_tile_x << " X "
-                  << output_height * output_tile_y
-                  << ", Input tile batch size: " << input_tile_batch_size;
-      } else {
-        LOG(INFO) << "input_batch_size = " << input_batch_size
-                  << ", output_batch_size = " << output_batch_size;
-      }
-
-      conv.reset(new GLConvolution(geometry,
-                                   filter.template data<float>(),
-                                   bias.template data<float>(),
-                                   prelu_scale,
-                                   prelu_scale_size,
-                                   input_batch_size,
-                                   output_batch_size,
-                                   input_tiles,
-                                   output_tiles,
-                                   input_tile_chunk_size,
-                                   output_tile_chunk_size,
-                                   input_tile_batch_size,
-                                   output_tile_batch_size,
-                                   tiling));
-    }
-
-    conv->convolution(input, *output);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLConvolution> conv;
-
-  INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
-};
-
-// Plain convolution: no fused activation; inputs are input, filter, bias.
-REGISTER_CPU_OPERATOR(OpenGLConv, OpenGLConvOp<float16_t, false, false>);
-OPERATOR_SCHEMA(OpenGLConv).NumInputs(3).NumOutputs(1);
-
-// Convolution with fused PRelu: takes the PRelu scale as a fourth input.
-REGISTER_CPU_OPERATOR(OpenGLConvPRelu, OpenGLConvOp<float16_t, true, false>);
-OPERATOR_SCHEMA(OpenGLConvPRelu).NumInputs(4).NumOutputs(1);
-
-// Convolution with fused Relu: same three inputs as the plain operator.
-REGISTER_CPU_OPERATOR(OpenGLConvRelu, OpenGLConvOp<float16_t, false, true>);
-OPERATOR_SCHEMA(OpenGLConvRelu).NumInputs(3).NumOutputs(1);
-
-template <class T, bool fusePRelu, bool fuseRelu>
-class OpenGLConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext>, ImageAllocator<T> {
- public:
-  USE_OPERATOR_BASE_FUNCTIONS;
-  OpenGLConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvTransposeUnpoolBase<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-    OPERATOR_NEEDS_FEATURE(
-        adj_h() == 0 && adj_w() == 0,
-        "OpenGL only supports adj_h == 1 and adj_w == 1");
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override {
-    const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-    auto& filter = Input(FILTER);
-    auto& bias = Input(BIAS);
-
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
-    const int M = filter.dim32(0);
-    const int C = filter.dim32(1);
-    const int kernel_width = filter.dim32(2);
-    const int kernel_height = filter.dim32(3);
-
-    CAFFE_ENFORCE(input_channels == M, "filter number must be equal to input channel number");
-    CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "filter height must be equal to kernel height");
-    CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "filter width must be equal to kernel width");
-    CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
-    CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
-
-    int output_height;
-    int output_width;
-    const int output_channels = C;
-    computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-    float val = 0;
-    const float* prelu_scale = nullptr;
-    int prelu_scale_size = 0;
-    if (fusePRelu) {
-      auto& prelu = Input(PRELU);
-      prelu_scale = prelu.template data<float>();
-      prelu_scale_size = prelu.size();
-    } else if (fuseRelu) {
-      prelu_scale = &val;
-      prelu_scale_size = 1;
-    }
-
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    int output_tile_x = 1, output_tile_y = 1;
-    int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
-    int input_tile_chunk_size = 1, output_tile_chunk_size = 1,
-        input_tile_batch_size = 1, output_tile_batch_size = 1;
-
-    const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
-
-    if (tiling) {
-      // Turn on tiling
-      CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
-      computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-      output_tiles = output_tile_x * output_tile_y;
-
-      output_tile_chunk_size = computeOutputTileChunkSize(
-          output_tile_x, output_tile_y, kernel_width, kernel_height);
-      output_tile_batch_size = std::max(
-          MaxOutputTileBatchSize,
-          (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
-      output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
-      output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
-
-      input_tile_chunk_size = computeInputTileChunkSize(
-          input_tile_x,
-          input_tile_y,
-          output_tile_chunk_size,
-          kernel_width,
-          kernel_height);
-      input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
-      // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
-      // input_tile_batch_size;
-    }
-    CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
-    CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
-    CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
-
-    int is_last = GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images,
-        output_width,
-        output_height,
-        output_channels,
-        output_tile_x,
-        output_tile_y,
-        is_last);
-
-    // TODO: figure out the adj business
-    GLConvolution::descriptor geometry{input_channels,
-                                       output_channels,
-                                       {kernel_width, kernel_height},
-                                       {input_width, input_height},
-                                       {output_width, output_height},
-                                       {input_tile_x, input_tile_y},
-                                       {output_tile_x, output_tile_y},
-                                       {pad_l(), pad_t()},
-                                       {stride_w(), stride_h()},
-                                       true};
-
-    if (!conv) {
-      int input_batch_size = 1, output_batch_size = 1;
-      if (!tiling) {
-        computeBatchSizes(geometry, input_batch_size, output_batch_size);
-        input_batch_size =
-            GetSingleArgument<int>("input_batch_size", input_batch_size);
-        output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
-      }
-
-      LOG(INFO) << input_channels << ": " << input_height << " X "
-                << input_width << " => " << output_channels << ": "
-                << output_height << " X " << output_width
-                << " Kernel: " << kernel_width << "X" << kernel_height;
-
-      if (tiling) {
-        LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
-                  << " => " << output_tile_x << " X " << output_tile_y
-                  << ", Texture size: " << input_width * input_tile_x << " X "
-                  << input_height * input_tile_y << " => "
-                  << output_width * output_tile_x << " X "
-                  << output_height * output_tile_y
-                  << ", Input tile batch size: " << input_tile_batch_size;
-      } else {
-        LOG(INFO) << "input_batch_size = " << input_batch_size
-                  << ", output_batch_size = " << output_batch_size;
-      }
-
-      conv.reset(new GLConvolution(geometry,
-                                   filter.template data<float>(),
-                                   bias.template data<float>(),
-                                   prelu_scale,
-                                   prelu_scale_size,
-                                   input_batch_size,
-                                   output_batch_size,
-                                   input.tile_x() * input.tile_y(),
-                                   output->tile_x() * output->tile_y(),
-                                   input_tile_chunk_size,
-                                   output_tile_chunk_size,
-                                   input_tile_batch_size,
-                                   output_tile_batch_size,
-                                   tiling));
-    }
-
-    conv->convolution(input, *output);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLConvolution> conv;
-
-  INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
-};
-
-// Plain transposed convolution: no fused activation; inputs are input,
-// filter, bias.
-REGISTER_CPU_OPERATOR(OpenGLConvTranspose, OpenGLConvTransposeOp<float16_t, false, false>);
-OPERATOR_SCHEMA(OpenGLConvTranspose).NumInputs(3).NumOutputs(1);
-
-// Transposed convolution with fused PRelu: takes the PRelu scale as a fourth
-// input.
-REGISTER_CPU_OPERATOR(OpenGLConvTransposePRelu, OpenGLConvTransposeOp<float16_t, true, false>);
-OPERATOR_SCHEMA(OpenGLConvTransposePRelu).NumInputs(4).NumOutputs(1);
-
-// Transposed convolution with fused Relu: same three inputs as the plain
-// operator.
-REGISTER_CPU_OPERATOR(OpenGLConvTransposeRelu, OpenGLConvTransposeOp<float16_t, false, true>);
-OPERATOR_SCHEMA(OpenGLConvTransposeRelu).NumInputs(3).NumOutputs(1);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLConvolution.h b/caffe2/mobile/contrib/opengl/operators/GLConvolution.h
deleted file mode 100644 (file)
index e6713a8..0000000
+++ /dev/null
@@ -1,232 +0,0 @@
-#pragma once
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "gl_tiling_utils.h"
-
-// GLFilter that implements (optionally transposed) 2-D convolution with an
-// optional fused PRelu, in either a batched or a tiled mode.
-//
-// The kernel, bias and PRelu-scale pointers passed to the constructor are
-// stored as-is and read again on every convolution() call, so the caller must
-// keep that memory alive for the lifetime of this object.
-class GLConvolution : public GLFilter {
- public:
-  // Capacities of the fixed-size binding arrays below.
-  static constexpr int MaxInputBatchSize = 8;
-  static constexpr int MaxOutputBatchSize = 4;
-
-  // Static description of one convolution; every `point` is an (x, y) pair.
-  struct descriptor {
-    int input_channels;
-    int output_channels;
-    point kernel_size;
-    point input_tile_size;
-    point output_tile_size;
-    point input_tile_grid_size;
-    point output_tile_grid_size;
-    point input_padding;
-    point input_stride;
-    bool transposed;
-  };
-
-  // Non-owning pointers to the float source data.
-  const float* kernel;
-  const float* bias;
-  const float* prelu_scale;
-
-  // Shader bindings. The array entries are filled in by input_bindings() and
-  // uniform_blocks_bindings(), which run while the GLFilter base is being
-  // initialized.
-  binding* inputData[MaxInputBatchSize];
-  binding* previousData[MaxOutputBatchSize];
-  binding* outputSize;
-  binding* accumulate;
-  binding* fusePRelu;
-  binding* kernel_block[MaxInputBatchSize];
-  binding* bias_block;
-  binding* prelu_scale_block;
-  binding* inputTileRange;
-
-  const descriptor geometry;
-  const int prelu_scale_size;
-  const int input_batch_size;
-  const int output_batch_size;
-  const int input_tiles;
-  const int output_tiles;
-  const int input_tile_chunk_size;
-  const int output_tile_chunk_size;
-  const int input_tile_batch_size;
-  const int output_tile_batch_size;
-  const bool tiling;
-
-  static const char* fragment_shader;
-
-  // Builds the filter. All sizes are forwarded to the shader as string
-  // replacements, so they are fixed for the lifetime of the object.
-  // For transposed convolutions the padding given to the shader is
-  // kernel_size - 1 - input_padding on each axis.
-  GLConvolution(
-      const descriptor& _geometry,
-      const float* _kernel,
-      const float* _bias,
-      const float* _prelu_scale = nullptr,
-      int _prelu_scale_size = 0,
-      int _input_batch_size = 1,
-      int _output_batch_size = 1,
-      int _input_tiles = 1,
-      int _output_tiles = 1,
-      int _input_tile_chunk_size = 1,
-      int _output_tile_chunk_size = 1,
-      int _input_tile_batch_size = 1,
-      int _output_tile_batch_size = 1,
-      bool _tiling = false)
-      : GLFilter(
-            "GLConvolution",
-            vertex_shader,
-            fragment_shader,
-            input_bindings(_input_batch_size, _output_batch_size),
-            uniform_blocks_bindings(
-                _input_batch_size,
-                _output_batch_size,
-                _output_tile_batch_size,
-                _prelu_scale != nullptr),
-            {/* no attributes */},
-            {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
-             {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
-             {"INPUT_BATCH_SIZE", c10::to_string(_input_batch_size)},
-             {"OUTPUT_BATCH_SIZE", c10::to_string(_output_batch_size)},
-             {"INPUT_TILES", c10::to_string(_input_tiles)},
-             {"OUTPUT_TILES", c10::to_string(_output_tiles)},
-             {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
-             {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
-             {"OUTPUT_TILE_WIDTH",
-              c10::to_string(_geometry.output_tile_size.x)},
-             {"OUTPUT_TILE_HEIGHT",
-              c10::to_string(_geometry.output_tile_size.y)},
-             {"INPUT_TILE_X", c10::to_string(_geometry.input_tile_grid_size.x)},
-             {"OUTPUT_TILE_X",
-              c10::to_string(_geometry.output_tile_grid_size.x)},
-             {"INPUT_TILE_CHUNK_SIZE", c10::to_string(_input_tile_chunk_size)},
-             {"OUTPUT_TILE_CHUNK_SIZE",
-              c10::to_string(_output_tile_chunk_size)},
-             {"OUTPUT_TILE_BATCH_SIZE",
-              c10::to_string(_output_tile_batch_size)},
-             {"TILED_CONVOLUTION", c10::to_string(_tiling)},
-             {"INPUT_PADDING_X",
-              c10::to_string(
-                  _geometry.transposed
-                      ? _geometry.kernel_size.x - 1 - _geometry.input_padding.x
-                      : _geometry.input_padding.x)},
-             {"INPUT_PADDING_Y",
-              c10::to_string(
-                  _geometry.transposed
-                      ? _geometry.kernel_size.y - 1 - _geometry.input_padding.y
-                      : _geometry.input_padding.y)},
-             {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
-             {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
-             {"TRANSPOSED_CONVOLUTION", c10::to_string(_geometry.transposed)},
-             {"BOUNDS_CHECK_MODE",
-              c10::to_string(bounds_check_mode(_tiling, _geometry))}}),
-        kernel(_kernel),
-        bias(_bias),
-        prelu_scale(_prelu_scale),
-        geometry(_geometry),
-        prelu_scale_size(_prelu_scale_size),
-        input_batch_size(_input_batch_size),
-        output_batch_size(_output_batch_size),
-        input_tiles(_input_tiles),
-        output_tiles(_output_tiles),
-        input_tile_chunk_size(_input_tile_chunk_size),
-        output_tile_chunk_size(_output_tile_chunk_size),
-        input_tile_batch_size(_input_tile_batch_size),
-        output_tile_batch_size(_output_tile_batch_size),
-        tiling(_tiling) {}
-
-  ~GLConvolution() {}
-
-  // Runs the convolution, choosing the tiled or batched path from `tiling`.
-  template <typename T>
-  void convolution(
-      const GLImageVector<T>& input_images,
-      const GLImageVector<T>& output_images);
-
- private:
-  /*
-   * Computes BOUNDS_CHECK_MODE for the convolution parameters.
-   *
-   * Tiled convolutions always check bounds. Otherwise the check is skipped
-   * when the GL context reports GL_EXT_texture_border_clamp, or when the
-   * effective padding (kernel_size - 1 - padding for transposed
-   * convolutions) is zero on both axes.
-   *
-   * @retval 0 if bounds check can be skipped
-   * @retval non-zero if bounds check can not be skipped
-   */
-  inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
-    if (tiling) {
-      return 1;
-    }
-
-    int input_padding_x = geometry.input_padding.x,
-        input_padding_y = geometry.input_padding.y;
-    if (geometry.transposed) {
-      input_padding_x = geometry.kernel_size.x - 1 - input_padding_x;
-      input_padding_y = geometry.kernel_size.y - 1 - input_padding_y;
-    }
-
-    if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
-        (input_padding_x == 0 && input_padding_y == 0)) {
-      return 0;
-    } else {
-      return 1;
-    }
-  }
-
-  // Builds the list of uniform/sampler bindings handed to GLFilter: the four
-  // scalar uniforms followed by one `inputData[i]` per input batch entry and
-  // one `previousData[i]` per output batch entry. The per-entry bindings are
-  // allocated with `new` and also stored in the member arrays.
-  // NOTE(review): no check here that input_batch_size <= MaxInputBatchSize or
-  // output_batch_size <= MaxOutputBatchSize, and ownership of the allocated
-  // bindings is not visible in this header — confirm against GLFilter.
-  const std::vector<binding*> input_bindings(
-      int input_batch_size,
-      int output_batch_size) {
-    std::vector<binding*> bindings({BINDING(outputSize),
-                                    BINDING(accumulate),
-                                    BINDING(fusePRelu),
-                                    BINDING(inputTileRange)});
-
-    for (int i = 0; i < input_batch_size; i++) {
-      bindings.push_back(
-          inputData[i] = new binding{"inputData[" + c10::to_string(i) + "]"});
-    }
-
-    for (int i = 0; i < output_batch_size; i++) {
-      bindings.push_back(
-          previousData[i] =
-              new binding{"previousData[" + c10::to_string(i) + "]"});
-    }
-
-    return bindings;
-  }
-
-  // Builds the list of uniform-block bindings handed to GLFilter: the bias
-  // block, the PRelu scale block when `fuse_prelu` is set, then
-  // max(input_batch_size, output_tile_batch_size) kernel blocks.
-  // `output_batch_size` is accepted but not read.
-  // NOTE(review): kernel_block has MaxInputBatchSize entries and the loop
-  // bound is not checked against it here — presumably callers keep both
-  // sizes <= 8; confirm.
-  const std::vector<binding*> uniform_blocks_bindings(
-      int input_batch_size,
-      int output_batch_size,
-      int output_tile_batch_size,
-      bool fuse_prelu) {
-    std::vector<binding*> bindings({BINDING(bias_block)});
-    if (fuse_prelu) {
-      bindings.push_back(BINDING(prelu_scale_block));
-    }
-
-    for (int i = 0; i < std::max(input_batch_size, output_tile_batch_size);
-         i++) {
-      bindings.push_back(
-          kernel_block[i] =
-              new binding{"Kernel_block[" + c10::to_string(i) + "]"});
-    }
-
-    return bindings;
-  }
-
-  // Packs the kernel weights for one (is, os, ib) step of the batched path
-  // into `data`.
-  void pack_kernel_data_for_bached_conv(
-      float16_t* data,
-      size_t size,
-      int input_channels,
-      int output_channels,
-      int is,
-      int os,
-      int ib);
-
-  // Packs the kernel weights for the given half-open input/output tile
-  // ranges of the tiled path into `data`.
-  void pack_kernel_data_for_tiled_conv(
-      float16_t* data, // destination
-      size_t size,
-      int input_channels,
-      int output_channels,
-      point input_tile_range,
-      point output_tile_range);
-
-  // Batched (non-tiled) implementation behind convolution().
-  template <typename T>
-  void run_batched_conv(
-      const GLImageVector<T>& input_images,
-      const GLImageVector<T>& output_images);
-
-  // Tiled implementation behind convolution().
-  template <typename T>
-  void run_tiled_conv(
-      const GLImageVector<T>& input_images,
-      const GLImageVector<T>& output_images);
-};
diff --git a/caffe2/mobile/contrib/opengl/operators/GLCopyOps.cc b/caffe2/mobile/contrib/opengl/operators/GLCopyOps.cc
deleted file mode 100644 (file)
index 1d32c49..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-
-#include "../core/DataTransfer.h"
-#include "../core/GLContext.h"
-#include "../core/GLImage.h"
-#include "../core/GLPlainTexture.h"
-#include "../core/ImageAllocator.h"
-
-#include <algorithm>
-
-namespace caffe2 {
-template <class T>
-class CopyToOpenGLOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  CopyToOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    // caffe2::Timer timer;
-    const TensorCPU& X = Input(0);
-    const int num_images = X.dim32(0);
-    const int input_channels = X.dim32(1);
-    const int input_width = X.dim32(3);
-    const int input_height = X.dim32(2);
-    const int input_size = input_width * input_height;
-
-    // set up the OpenGL context
-    GLContext::getGLContext()->set_context();
-
-    const float* input = X.template data<float>();
-
-    int tile_x = GetSingleArgument<int>("tile_x", 1);
-    int tile_y = GetSingleArgument<int>("tile_y", 1);
-
-    GLImageVector<T>* output_image = ImageAllocator<T>::newImage(num_images,
-                                                                 input_width,
-                                                                 input_height,
-                                                                 input_channels,
-                                                                 tile_x,
-                                                                 tile_y,
-#if CAFFE2_IOS
-                                                                 true
-#else
-                                                                 false
-#endif
-    );
-
-    if (output_image->tile_x() > 1 || output_image->tile_y() > 1) {
-      LOG(INFO) << "CopyToOpenGLOp tiling: " << output_image->tile_x() << ":"
-                << output_image->tile_y();
-    }
-
-    Outputs()[0]->Reset(output_image);
-
-    for (int i = 0; i < num_images; i++) {
-      const auto textures = (*output_image)[i]->textures;
-      for (int slice = 0; slice < textures.size(); slice++) {
-        // timer.Start();
-
-        textures[slice]->map_load([&](void* buffer,
-                                      size_t width,
-                                      size_t height,
-                                      size_t stride,
-                                      size_t channels,
-                                      const GLTexture::Type& type) {
-          for (int y = 0; y < tile_y; y++) {
-            for (int x = 0; x < tile_x; x++) {
-              const int tiles = slice * tile_x * tile_y + y * tile_x + x;
-              const int slice_channels = std::min(4, input_channels - 4 * tiles);
-              interleaveSlice(
-                  (float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
-                  &input[i * input_channels * input_size + 4 * tiles * input_size],
-                  input_width,
-                  input_height,
-                  stride, // texture stride
-                  slice_channels);
-            }
-          }
-        });
-        // LOG(INFO) << "Texture uploading takes " << timer.MilliSeconds() << " ms";
-      }
-    }
-
-    return true;
-  }
-};
-
-REGISTER_CPU_OPERATOR(CopyToOpenGL, CopyToOpenGLOp<float16_t>);
-OPERATOR_SCHEMA(CopyToOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-
-template <class T>
-class CopyFromOpenGLOp final : public Operator<CPUContext> {
- public:
-  CopyFromOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    caffe2::Timer timer;
-    const GLImageVector<T>& X = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = X.size();
-    const int input_channels = X.channels();
-    const int input_width = X.width();
-    const int input_height = X.height();
-
-    TensorCPU* Y = Output(0);
-    Y->Resize(num_images, input_channels, input_height, input_width);
-    const int output_width = input_width;
-    const int output_height = input_height;
-    const int output_size = input_width * input_height;
-
-    float* output = Y->mutable_data<float>();
-
-    const int tile_x = X.tile_x();
-    const int tile_y = X.tile_y();
-    for (int i = 0; i < num_images; i++) {
-      for (int slice = 0; slice < X[i]->slices; slice++) {
-        timer.Start();
-        const GLTexture* texture = X[i]->textures[slice];
-
-        texture->map_read([&](const void* buffer,
-                              size_t width,
-                              size_t height,
-                              size_t stride,
-                              size_t channels,
-                              const GLTexture::Type& type) {
-          //#if CAFFE2_ANDROID && defined(__ARM_NEON__)
-          //        if (static_cast<AndroidGLContext*>(GLContext::getGLContext())->get_platform() ==
-          //        Mali) {
-          //          caffe2::Timer timer;
-          //          timer.Start();
-          //          float16_t* copy_buffer = (float16_t*)malloc(_capacity);
-          //          arm_memcpy(
-          //              (volatile unsigned char*)copy_buffer, (volatile unsigned char*)buffer,
-          //              _capacity);
-          //          deInterleaveSlice(
-          //              output + 4 * slice * output_size, copy_buffer, width, height, stride,
-          //              slice_channels);
-          //          free(copy_buffer);
-          //          LOG(INFO) << "memcpy takes " << timer.MilliSeconds() << " ms";
-          //        } else
-          //#endif
-          {
-            gl_log(GL_VERBOSE,
-                   "calling deInterleaveSlice width: %d, height: %d, stride: %d, channels: %d\n",
-                   width,
-                   height,
-                   stride,
-                   channels);
-
-            for (int y = 0; y < tile_y; y++) {
-              for (int x = 0; x < tile_x; x++) {
-                const int tiles = slice * tile_x * tile_y + y * tile_x + x;
-                const int slice_channels = std::min(4, input_channels - 4 * tiles);
-                deInterleaveSlice(
-                    output + i * input_channels * output_size + 4 * tiles * output_size,
-                    (float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
-                    input_width,
-                    input_height,
-                    stride,
-                    slice_channels);
-              }
-            }
-          }
-        });
-      }
-    }
-    return true;
-  }
-};
-
-REGISTER_CPU_OPERATOR(CopyFromOpenGL, CopyFromOpenGLOp<float16_t>);
-OPERATOR_SCHEMA(CopyFromOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc b/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc
deleted file mode 100644 (file)
index 4927923..0000000
+++ /dev/null
@@ -1,462 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLReduce : public GLFilter {
- public:
-  binding* inputSize;
-  binding* outputSize;
-  binding* tileSize;
-  binding* inv_pixel_count;
-  binding* epsilon;
-  binding* inputData;
-  binding* averageData;
-
-  bool compute_inv_stdev;
-  bool compute_norm;
-
-  const std::vector<binding*> input_bindings(bool compute_norm_) {
-    std::vector<binding*> bindings({BINDING(inputSize),
-                                    BINDING(outputSize),
-                                    BINDING(tileSize),
-                                    BINDING(inv_pixel_count),
-                                    BINDING(epsilon),
-                                    BINDING(inputData)});
-    if (compute_norm_) {
-      bindings.push_back(BINDING(averageData));
-    }
-    return bindings;
-  }
-
-  GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
-      : GLFilter(
-            "GLReduce",
-            vertex_shader,
-            fragment_shader,
-            input_bindings(compute_norm_),
-            {/* no uniform_blocks_bindings */},
-            {/* no attributes */},
-            {{"COMPUTE_INV_STDEV", c10::to_string((int)compute_inv_stdev_)},
-             {"COMPUTE_NORM", c10::to_string((int)compute_norm_)}}),
-        compute_inv_stdev(compute_inv_stdev_),
-        compute_norm(compute_norm_) {}
-
-  template <typename T>
-  void reduce(const GLImage<T>* input_image,
-              const GLImage<T>* output_image,
-              int tile_size_x,
-              int tile_size_y,
-              float inv_pixel_count_ = 1.0,
-              float epsilon_ = 0.0);
-
-  template <typename T>
-  void norm(const GLImage<T>* input_image,
-            const GLImage<T>* avg_image,
-            const GLImage<T>* output_image,
-            int tile_size_x,
-            int tile_size_y,
-            float inv_pixel_count_);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLReduce::fragment_shader = R"GLSL(#version 300 es
-
-#define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV)
-#define COMPUTE_NORM $(COMPUTE_NORM)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 inputSize;
-uniform ivec2 outputSize;
-uniform ivec2 tileSize;
-uniform float inv_pixel_count;
-uniform float epsilon;
-
-#if COMPUTE_NORM
-TEXTURE_INPUT(averageData);
-#endif
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize));
-  ivec2 texelCoord = outputCoord * tileSize;
-  ivec2 sumArea = min(tileSize, inputSize - texelCoord);
-  highp vec4 sum = vec4(0.0);
-
-#if COMPUTE_NORM
-  vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
-#endif
-
-  for (int y = 0; y < sumArea.y; y++) {
-    for (int x = 0; x < sumArea.x; x++) {
-      ivec2 idx = texelCoord + ivec2(x, y);
-      vec4 val = TEXTURE_LOAD(inputData, idx);
-#if COMPUTE_NORM
-      val -= avg;
-      sum += val * val;
-#else
-      sum += val;
-#endif
-    }
-  }
-
-#if COMPUTE_INV_STDEV
-  outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon)));
-#elif COMPUTE_NORM
-  outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count));
-#else
-  outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon));
-#endif
-}
-
-)GLSL";
-
-template <typename T>
-void GLReduce::reduce(const GLImage<T>* input_image,
-                      const GLImage<T>* output_image,
-                      int tile_size_x,
-                      int tile_size_y,
-                      float inv_pixel_count_,
-                      float epsilon_) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
-    run(input_attachments,
-        {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(inputSize->location, input_image->width, input_image->height);
-          glUniform2i(outputSize->location, output_image->width, output_image->height);
-          glUniform2i(tileSize->location, tile_size_x, tile_size_y);
-          glUniform1f(inv_pixel_count->location, inv_pixel_count_);
-          glUniform1f(epsilon->location, epsilon_);
-        },
-        output_image->width,
-        output_image->height);
-  }
-}
-
-template <typename T>
-void GLReduce::norm(const GLImage<T>* input_image,
-                    const GLImage<T>* avg_image,
-                    const GLImage<T>* output_image,
-                    int tile_size_x,
-                    int tile_size_y,
-                    float inv_pixel_count_) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments(
-        {{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
-
-    run(input_attachments,
-        {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(inputSize->location, input_image->width, input_image->height);
-          glUniform2i(outputSize->location, output_image->width, output_image->height);
-          glUniform2i(tileSize->location, tile_size_x, tile_size_y);
-          glUniform1f(inv_pixel_count->location, inv_pixel_count_);
-        },
-        output_image->width,
-        output_image->height);
-  }
-}
-
-class GLScale : public GLFilter {
- public:
-  binding* outputSize;
-  binding* inputData;
-  binding* averageData;
-  binding* normData;
-
-  binding* scale_factor;
-  binding* bias_factor;
-  binding* prelu_scale_factor;
-
-  const int channels;
-  const float* scale;
-  const float* bias;
-  const float* prelu_scale;
-  const int prelu_size;
-
-  const std::vector<binding*> input_bindings(bool fuse_prelu) {
-    std::vector<binding*> bindings({BINDING(outputSize),
-                                    BINDING(scale_factor),
-                                    BINDING(bias_factor),
-                                    BINDING(inputData),
-                                    BINDING(averageData),
-                                    BINDING(normData)});
-    if (fuse_prelu) {
-      bindings.push_back(prelu_scale_factor = new binding({"prelu_scale_factor"}));
-    }
-    return bindings;
-  }
-
-  GLScale(
-      const int _channels,
-      const float* _scale,
-      const float* _bias,
-      const float* _prelu_scale = nullptr,
-      const int _prelu_size = 0)
-      : GLFilter(
-            "GLScale",
-            vertex_shader,
-            fragment_shader,
-            input_bindings(_prelu_scale != nullptr),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"FUSE_PRELU", c10::to_string(_prelu_scale != nullptr)}}),
-        channels(_channels),
-        scale(_scale),
-        bias(_bias),
-        prelu_scale(_prelu_scale),
-        prelu_size(_prelu_size) {}
-
-  template <typename T>
-  void scale_and_shift(const GLImage<T>* input_image,
-                       const GLImage<T>* avg_image,
-                       const GLImage<T>* norm_image,
-                       const GLImage<T>* output_image);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLScale::fragment_shader = R"GLSL(#version 300 es
-
-#define FUSE_PRELU $(FUSE_PRELU)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-uniform ivec2 outputSize;
-uniform vec4 scale_factor;
-uniform vec4 bias_factor;
-
-#if FUSE_PRELU
-uniform vec4 prelu_scale_factor;
-#endif
-
-TEXTURE_INPUT(inputData);
-TEXTURE_INPUT(averageData);
-TEXTURE_INPUT(normData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
-  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
-  vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
-  vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0));
-
-#if FUSE_PRELU
-  vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor;
-  vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0))));
-  outputData = TEXTURE_STORE(o);
-#else
-  vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor;
-  outputData = TEXTURE_STORE(o);
-#endif
-}
-
-)GLSL";
-
-template <typename T>
-void GLScale::scale_and_shift(const GLImage<T>* input_image,
-                              const GLImage<T>* avg_image,
-                              const GLImage<T>* norm_image,
-                              const GLImage<T>* output_image) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
-                                                       {avg_image->textures[is], averageData},
-                                                       {norm_image->textures[is], normData}});
-
-    run(input_attachments,
-        {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(outputSize->location, output_image->width, output_image->height);
-          glUniform4f(scale_factor->location,
-                      scale[4 * is],
-                      channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
-                      channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
-                      channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
-          glUniform4f(bias_factor->location,
-                      bias[4 * is],
-                      channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
-                      channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
-                      channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
-          if (prelu_scale != nullptr) {
-            glUniform4f(prelu_scale_factor->location,
-                        prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
-                        channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
-                                                                        : prelu_scale[0],
-                        channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
-                                                                        : prelu_scale[0],
-                        channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
-                                                                        : prelu_scale[0]);
-          }
-        },
-        output_image->width,
-        output_image->height);
-  }
-}
-
-namespace caffe2 {
-template <class T, bool FUSE_PRELU>
-class OpenGLInstanceNormPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    const int tile_size_x = 16;
-    const int tile_size_y = 16;
-    int avg_buf_width = input_width;
-    int avg_buf_height = input_height;
-
-    vector<GLImageVector<T>*> reduce_buf;
-    while (reduce_buf.size() == 0 ||
-           (avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
-      avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
-      avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
-
-      reduce_buf.push_back(
-          ImageAllocator<T>::newImage(1, avg_buf_width, avg_buf_height, output_channels));
-    }
-
-    GLImageVector<T>* avg = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
-    GLImageVector<T>* inv_stdev = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-    const float* prelu_data = nullptr;
-    int prelu_size = 0;
-    if (FUSE_PRELU) {
-      DCHECK_EQ(InputSize(), 4);
-      const auto& prelu_scale = Input(PRELU);
-      prelu_data = prelu_scale.template data<float>();
-      prelu_size = prelu_scale.size();
-    } else {
-      DCHECK_EQ(InputSize(), 3);
-    }
-
-    const auto& scale = Input(SCALE);
-    const auto& bias = Input(BIAS);
-
-    if (!f_reduce) {
-      f_reduce.reset(new GLReduce());
-      f_norm.reset(new GLReduce(false, true));
-      f_stdDev.reset(new GLReduce(true, false));
-      f_scale.reset(new GLScale(input_channels,
-                                scale.template data<float>(),
-                                bias.template data<float>(),
-                                prelu_data,
-                                prelu_size));
-    }
-
-    for (int i = 0; i < num_images; i++) {
-      for (int k = 0; k < reduce_buf.size() + 1; k++) {
-        const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
-        GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
-
-        float norm = k < reduce_buf.size()
-                         ? 1.0 / (tile_size_x * tile_size_y)
-                         : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
-                               (float)(input_width * input_height);
-        const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
-        const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
-        f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
-      }
-
-      for (int k = 0; k < reduce_buf.size() + 1; k++) {
-        const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
-        GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
-
-        float norm = k < reduce_buf.size()
-                         ? 1.0 / (tile_size_x * tile_size_y)
-                         : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
-                               (float)(input_width * input_height);
-
-        if (k == 0) {
-          f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
-        } else if (k < reduce_buf.size()) {
-          f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
-        } else {
-          const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
-          const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
-          f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
-        }
-      }
-
-      f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
-    }
-    Outputs()[OUTPUT]->Reset(output);
-    if (OutputSize() > 1) {
-      Outputs()[MEAN]->Reset(avg);
-      Outputs()[INV_STDEV]->Reset(inv_stdev);
-    } else {
-      delete avg;
-      delete inv_stdev;
-    }
-    for (auto&& rb : reduce_buf) {
-      delete rb;
-    }
-
-    return true;
-  }
-
- private:
-  float epsilon_;
-  StorageOrder order_;
-  std::unique_ptr<GLReduce> f_reduce;
-  std::unique_ptr<GLReduce> f_norm;
-  std::unique_ptr<GLReduce> f_stdDev;
-  std::unique_ptr<GLScale> f_scale;
-
-  INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
-  OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLInstanceNorm, OpenGLInstanceNormPReluOp<float16_t, false>);
-OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-REGISTER_CPU_OPERATOR(OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp<float16_t, true>);
-OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLMul.cc b/caffe2/mobile/contrib/opengl/operators/GLMul.cc
deleted file mode 100644 (file)
index 0d41fce..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-
-class GLMul : public GLFilter {
- public:
-  binding* outputSize;
-  binding* inputData;
-  binding* B;
-
-  GLMul()
-      : GLFilter("GLMul",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(outputSize), BINDING(inputData), BINDING(B)}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {/* no replacements */}) {}
-
-  template <typename T>
-  void mul(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images, float b);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLMul::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform vec4 B;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 A = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE(A * B);
-}
-
-)GLSL";
-
-template <typename T>
-void GLMul::mul(const GLImageVector<T>& input_images,
-                const GLImageVector<T>& output_images,
-                float b) {
-  for (int i = 0; i < input_images.size(); i++) {
-    auto input_image = input_images[i];
-    auto output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() {
-            glUniform2i(outputSize->location, output_image->width, output_image->height);
-            glUniform4f(B->location, b, b, b, b);
-          },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-
-namespace caffe2 {
-template <class T>
-class OpenGLMulOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLMulOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
-                           "OpenGLMul only supports broadcast");
-
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
-                           "OpenGLMul does not support axis");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-    const auto& B = Input(1);
-    CAFFE_ENFORCE_EQ(B.size(), 1); // only scalar is supported
-
-    const int num_images = input.size();
-    const auto output_height = input.height();
-    const auto output_width = input.width();
-    const int output_channels = input.channels();
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-
-    if (!_mult) {
-      _mult.reset(new GLMul());
-    }
-
-    _mult->mul(input, *output, B.template data<float>()[0]);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLMul> _mult;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLMul, OpenGLMulOp<float16_t>);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLNormPlanarYUV.cc b/caffe2/mobile/contrib/opengl/operators/GLNormPlanarYUV.cc
deleted file mode 100644 (file)
index 39468f6..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLNormPlanarYUV : public GLFilter {
- public:
-  const float* mean;
-  const float* std;
-
-  binding* inputData;
-  binding* outputSize;
-  binding* mean_data;
-  binding* std_data;
-
-  GLNormPlanarYUV(const float* _mean, const float* _std)
-      : GLFilter("GLNormPlanarYUV",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(inputData),
-                                        BINDING(outputSize),
-                                        BINDING(mean_data),
-                                        BINDING(std_data)}), // input bindings
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {}),
-        mean(_mean),
-        std(_std) {}
-
-  template <typename T>
-  void normalize(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLNormPlanarYUV::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform vec4 mean_data;
-uniform vec4 std_data;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE((value - mean_data) / std_data);
-}
-
-)GLSL";
-
-template <class T>
-void GLNormPlanarYUV::normalize(const GLImageVector<T>& input_images,
-                                const GLImageVector<T>& output_images) {
-  int num_images = input_images.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image = input_images[i];
-    GLImage<T>* output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-
-      std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() {
-            glUniform2i(outputSize->location, output_image->width, output_image->height);
-            glUniform4f(mean_data->location, mean[0], mean[1], mean[2], 0.0);
-            glUniform4f(std_data->location, std[0], std[1], std[2], 1.0);
-          },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-
-namespace caffe2 {
-template <typename T>
-class GLNormPlanarYUVOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  GLNormPlanarYUVOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-
-    const auto& M = Input(1); // mean
-    const auto& S = Input(2); // standard deviation
-    CAFFE_ENFORCE(input_channels == M.dim(1));
-    CAFFE_ENFORCE(input_channels == S.dim(1));
-
-    if (!_normPlanarYUV) {
-      _normPlanarYUV.reset(new GLNormPlanarYUV(M.template data<float>(), S.template data<float>()));
-    }
-
-    _normPlanarYUV->normalize(input, *output);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  StorageOrder order_;
-  std::unique_ptr<GLNormPlanarYUV> _normPlanarYUV;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLNormalizePlanarYUV, GLNormPlanarYUVOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLNormalizePlanarYUV).NumInputs(3).NumOutputs(1);
-
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc b/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc
deleted file mode 100644 (file)
index 833c6ff..0000000
+++ /dev/null
@@ -1,273 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLPRelu : public GLFilter {
- public:
-  typedef enum { PRelu = 0, Relu = 1 } ReluType;
-
-  const float* scale;
-
-  binding* inputData;
-  binding* scale_block;
-
-  const int scale_size;
-  const int channels;
-  const int output_tile_x;
-  const int output_tile_y;
-  const int output_tile_width;
-  const int output_tile_height;
-
-  GLPRelu(
-      const float* _scale,
-      const int _scale_size,
-      const int _channels,
-      int _output_tile_x,
-      int _output_tile_y,
-      int _output_tile_width,
-      int _output_tile_height)
-      : GLFilter(
-            "GLPRelu",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(inputData)}),
-            std::vector<binding*>({BINDING(scale_block)}),
-            {/* no attributes */},
-            {{"USE_RELU", c10::to_string(PRelu)},
-             {"OUTPUT_TILES", c10::to_string(_output_tile_x * _output_tile_y)},
-             {"OUTPUT_TILE_X", c10::to_string(_output_tile_x)},
-             {"OUTPUT_TILE_WIDTH", c10::to_string(_output_tile_width)},
-             {"OUTPUT_TILE_HEIGHT", c10::to_string(_output_tile_height)},
-             {"TILED_PRELU",
-              c10::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
-        scale(_scale),
-        scale_size(_scale_size),
-        channels(_channels),
-        output_tile_x(_output_tile_x),
-        output_tile_y(_output_tile_y),
-        output_tile_width(_output_tile_width),
-        output_tile_height(_output_tile_height) {}
-
-  GLPRelu(const int _channels)
-      : GLFilter(
-            "GLRelu",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(inputData)}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"USE_RELU", c10::to_string(Relu)},
-             {"OUTPUT_TILES", c10::to_string(1)},
-             {"OUTPUT_TILE_X", c10::to_string(1)},
-             {"OUTPUT_TILE_WIDTH", c10::to_string(1)},
-             {"OUTPUT_TILE_HEIGHT", c10::to_string(1)},
-             {"TILED_PRELU", c10::to_string(0)}}),
-        scale(nullptr),
-        scale_block(nullptr),
-        scale_size(0),
-        channels(_channels),
-        output_tile_x(1),
-        output_tile_y(1),
-        output_tile_width(1),
-        output_tile_height(1) {}
-
-  template <typename T>
-  void prelu(const GLImageVector<T>& input_images,
-             const GLImageVector<T>& output_images,
-             GLPRelu::ReluType reluType);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
-#define TILED_PRELU                 $(TILED_PRELU)
-#define USE_RELU                    $(USE_RELU)
-
-// tiling
-#define OUTPUT_TILES                $(OUTPUT_TILES)
-#define OUTPUT_TILE_X               $(OUTPUT_TILE_X)
-#define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
-
-// common
-precision mediump float;
-precision highp int;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-in highp vec2 v_texCoord;
-
-#if USE_RELU
-
-// Relu
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE(max(value, vec4(0.0)));
-}
-
-#else
-
-#if TILED_PRELU
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-layout (std140) uniform scale_block {
-  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
-};
-
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
-  // outputData = value > 0 ? value : value * weight;
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
-  value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
-  outputData = TEXTURE_STORE(value);
-}
-#else
-layout (std140) uniform scale_block {
-  highp uvec4 scale;
-};
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-
-  // outputData = value > 0 ? value : value * weight;
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
-  outputData = TEXTURE_STORE(value);
-}
-#endif // TILED_PRELU
-
-#endif // USE_RELU
-
-)GLSL";
-
-template <typename T>
-void GLPRelu::prelu(const GLImageVector<T>& input_images,
-                    const GLImageVector<T>& output_images,
-                    GLPRelu::ReluType reluType) {
-  int num_images = input_images.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image = input_images[i];
-    GLImage<T>* output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      if (reluType == PRelu) {
-        attach_uniform_buffer<float16_t>(scale_block, 0, [&](float16_t* data, size_t size) {
-          int output_tiles = output_tile_x * output_tile_y;
-          for (int j = 0, k = 4 * is * output_tiles;
-               k < std::min(channels, 4 * (is + 1) * output_tiles);
-               j++, k++) {
-            data[j] = scale_size == channels ? scale[k] : scale[0];
-          }
-        });
-      }
-
-      std::vector<texture_attachment> input_attachments;
-
-      input_attachments.push_back({input_image->textures[is], inputData});
-
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() {},
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-template <typename T, GLPRelu::ReluType reluType>
-class OpenGLPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLPReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-    if (input_tile_x > 1 || input_tile_y > 1) {
-      CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
-    }
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(num_images,
-                                                           output_width,
-                                                           output_height,
-                                                           output_channels,
-                                                           output_tile_x,
-                                                           output_tile_y,
-                                                           is_last);
-
-    const auto* scale = reluType == GLPRelu::PRelu ? &Input(1) : nullptr;
-
-    if (!_prelu) {
-      if (reluType == GLPRelu::PRelu) {
-        _prelu.reset(new GLPRelu(scale->template data<float>(),
-                                 scale->size(),
-                                 input_channels,
-                                 output_tile_x,
-                                 output_tile_y,
-                                 output_width,
-                                 output_height));
-      } else {
-        _prelu.reset(new GLPRelu(input_channels));
-      }
-    }
-
-    _prelu->prelu(input, *output, reluType);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  StorageOrder order_;
-  std::unique_ptr<GLPRelu> _prelu;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLPRelu, OpenGLPReluOp<float16_t, GLPRelu::PRelu>);
-OPERATOR_SCHEMA(OpenGLPRelu)
-    .NumInputs(2)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape();
-REGISTER_CPU_OPERATOR(OpenGLRelu, OpenGLPReluOp<float16_t, GLPRelu::Relu>);
-OPERATOR_SCHEMA(OpenGLRelu)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape();
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLPadImage.cc b/caffe2/mobile/contrib/opengl/operators/GLPadImage.cc
deleted file mode 100644 (file)
index c0e7261..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/conv_pool_op_base.h"
-
-class GLPadImage : public GLFilter {
- public:
-  binding* padSize;
-  binding* inputSize;
-  binding* outputSize;
-  binding* inputData;
-
-  GLPadImage()
-      : GLFilter(
-            "GLPadImage",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>(
-                {BINDING(padSize), BINDING(inputSize), BINDING(outputSize), BINDING(inputData)}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {/* no replacements */}) {}
-
-  template <typename T>
-  void pad(const GLImageVector<T>& input_images,
-           const GLImageVector<T>& output_images,
-           const int pad_l,
-           const int pad_t);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLPadImage::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 padSize;
-uniform ivec2 inputSize;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)) - padSize;
-  texelCoord = max(texelCoord, -texelCoord);
-  texelCoord = min(texelCoord, ivec2(2) * (inputSize - 1) - texelCoord);
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE(value);
-}
-
-)GLSL";
-
-template <typename T>
-void GLPadImage::pad(const GLImageVector<T>& input_images,
-                     const GLImageVector<T>& output_images,
-                     const int pad_l,
-                     const int pad_t) {
-  for (int i = 0; i < input_images.size(); i++) {
-    auto input_image = input_images[i];
-    auto output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() {
-            glUniform2i(inputSize->location, input_image->width, input_image->height);
-            glUniform2i(outputSize->location, output_image->width, output_image->height);
-            glUniform2i(padSize->location, pad_l, pad_t);
-          },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-
-namespace caffe2 {
-
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
-  Tensor<CPUContext> input, output;
-  input.Resize(1, 1, H, W);
-  op->SetOutputSize(input, &output, 1);
-  CAFFE_ENFORCE_EQ(output.ndim(), 4);
-  *OH = output.dim(2);
-  *OW = output.dim(3);
-}
-
-template <class T>
-class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLPadImageOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvPoolOpBase<CPUContext>(operator_def, ws),
-        mode_(OperatorBase::GetSingleArgument<string>("mode", "")) {
-    OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-    OPERATOR_NEEDS_FEATURE(mode_ == "reflect", "OpenGL only supports reflection");
-
-    CAFFE_ENFORCE(legacy_pad_ == LegacyPadding::NOTSET,
-                  "Padding layer only supports explicit pad values.");
-    CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
-                  "Pooling op does not support dilation right now.");
-    CAFFE_ENFORCE(stride_h() == 1 && stride_w() == 1,
-                  "Pooling op does not support stride right now.");
-    // Pad op does not use kernel sizes, so we set it to 1 for computing the
-    // output size.
-    kernel_.assign(pads_.size() / 2, 1);
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-
-    const int num_images = input.size();
-    const int input_width = input.width();
-    const int input_height = input.height();
-    const int input_channels = input.channels();
-    const int output_channels = input_channels;
-
-    int output_height, output_width;
-    computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-
-    if (!padImage_) {
-      padImage_.reset(new GLPadImage());
-      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
-                << output_channels << ": " << output_height << " X " << output_width;
-      LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
-                << ", pad_b = " << pad_b();
-    }
-
-    padImage_->pad(input, *output, pad_l(), pad_t());
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::string mode_;
-  std::unique_ptr<GLPadImage> padImage_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLPadImage, OpenGLPadImageOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLPadImage).NumInputs(1).NumOutputs(1);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLPool.cc b/caffe2/mobile/contrib/opengl/operators/GLPool.cc
deleted file mode 100644 (file)
index d293745..0000000
+++ /dev/null
@@ -1,339 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/pool_op.h"
-
-class GLPool : public GLFilter {
- public:
-  typedef enum { AveragePool, MaxPool } PoolType;
-
-  struct point {
-    int x;
-    int y;
-  };
-
-  struct descriptor {
-    int channels;
-    point kernel_size;
-    point input_padding;
-    point input_stride;
-    point input_tile_size;
-    point output_tile_size;
-  };
-
-  binding* inputData;
-  binding* kernelSize;
-  binding* outputSize;
-
-  const descriptor geometry;
-
-  GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
-      : GLFilter(
-            "GLPool",
-            vertex_shader,
-            fragment_shader,
-            {
-                BINDING(inputData),
-                BINDING(kernelSize),
-                BINDING(outputSize),
-            },
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
-             {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
-             {"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
-             {"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
-             {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
-             {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
-             {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
-             {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
-             {"OUTPUT_TILE_WIDTH",
-              c10::to_string(_geometry.output_tile_size.x)},
-             {"OUTPUT_TILE_HEIGHT",
-              c10::to_string(_geometry.output_tile_size.y)},
-             {"TILED_POOLING", c10::to_string(_tiling)},
-             {"MAX_POOL", c10::to_string(poolType == MaxPool)},
-             {"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
-        geometry(_geometry) {}
-  ~GLPool() {}
-
-  void pool(const GLImageVector<float16_t>& input_images,
-            const GLImageVector<float16_t>& output_images) {
-    for (int i = 0; i < input_images.size(); i++) {
-      auto input_image = input_images[i];
-      auto output_image = output_images[i];
-      int input_slices = input_image->slices;
-      int output_slices = output_image->slices;
-
-      for (int is = 0; is < input_slices; is++) {
-        run({{input_image->textures[is], inputData}},
-            {output_image->textures[is]},
-            [&]() {
-              glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
-              glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
-            },
-            output_image->texture_width,
-            output_image->texture_height);
-      }
-    }
-  }
-
- private:
-  /*
-   * Computes BOUNDS_CHECK_MODE for the convolution parameters.
-   *
-   * @retval 0 if bounds check can be skipped
-   * @retval non-zero if bounds check can not be skipped
-   */
-  inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
-    if (tiling) {
-      return 1;
-    }
-
-    if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
-        (geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
-      return 0;
-    } else {
-      return 1;
-    }
-  }
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-const char* GLPool::fragment_shader = R"GLSL(#version 300 es
-#define TILED_POOLING           $(TILED_POOLING)
-#define MAX_POOL                $(MAX_POOL)
-
-// tiling
-#define INPUT_TILE_WIDTH            $(INPUT_TILE_WIDTH)
-#define INPUT_TILE_HEIGHT           $(INPUT_TILE_HEIGHT)
-#define OUTPUT_TILE_WIDTH           $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT          $(OUTPUT_TILE_HEIGHT)
-
-#define BOUNDS_CHECK_MODE           $(BOUNDS_CHECK_MODE)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
-const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
-const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
-
-uniform ivec2 kernelSize;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-#if BOUNDS_CHECK_MODE == 0
-  #define IN_BOUNDS(p, p0, p1) (true)
-#else
-  #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
-#endif
-
-// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
-const float MIN_FLOAT = -exp2(14.0);
-
-#if TILED_POOLING
-
-const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-// tiled pooling
-#if MAX_POOL
-
-#define POOL { \
-  pool = vec4(MIN_FLOAT); \
-  for (int y = 0; y < kernelSize.y; y++) { \
-    for (int x = 0; x < kernelSize.x; x++) { \
-      ivec2 idx = tileCoord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
-        vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
-        pool = max(pool, data); \
-      } \
-    } \
-  } \
-}
-
-#else
-
-#define POOL { \
-  int count = 0; \
-  for (int y = 0; y < kernelSize.y; y++) { \
-    for (int x = 0; x < kernelSize.x; x++) { \
-      ivec2 idx = tileCoord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
-        vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
-        pool += data;\
-        count += 1; \
-      } \
-    } \
-  } \
-  pool = pool / float(count); \
-}
-
-#endif // MAX_POOL
-
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
-  tileCoord = input_stride * tileCoord - input_padding;
-
-  ivec2 inputTileOffset = tile * inputTileSize;
-
-#if MAX_POOL
-  vec4 pool = vec4(0);
-#else
-  highp vec4 pool = vec4(0);
-#endif
-
-  POOL;
-
-  outputData = TEXTURE_STORE(pool);
-}
-
-#else
-
-// no tiling
-#if MAX_POOL
-
-#define POOL { \
-  pool = vec4(MIN_FLOAT); \
-  for (int y = 0; y < kernelSize.y; y++) { \
-    for (int x = 0; x < kernelSize.x; x++) { \
-      ivec2 idx = texelCoord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputSize) { \
-        vec4 data = TEXTURE_LOAD(inputData, idx); \
-        pool = max(pool, data); \
-      } \
-    } \
-  } \
-}
-
-#else
-
-#define POOL { \
-  int count = 0; \
-  for (int y = 0; y < kernelSize.y; y++) { \
-    for (int x = 0; x < kernelSize.x; x++) { \
-      ivec2 idx = texelCoord + ivec2(x, y); \
-      if IN_BOUNDS(idx, ivec2(0), inputSize) { \
-        vec4 data = TEXTURE_LOAD(inputData, idx); \
-        pool += data; \
-        count += 1; \
-      } \
-    } \
-  } \
-  pool = pool / float(count); \
-}
-
-#endif // MAX_POOL
-
-void main() {
-  ivec2 inputSize = textureSize(inputData, 0);
-  ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
-#if MAX_POOL
-  vec4 pool = vec4(0);
-#else
-  highp vec4 pool = vec4(0);
-#endif
-
-  POOL;
-
-  outputData = TEXTURE_STORE(pool);
-}
-#endif // TILED_POOLING
-
-)GLSL";
-
-namespace caffe2 {
-
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
-  Tensor<CPUContext> input, output;
-  input.Resize(1, 1, H, W);
-  op->SetOutputSize(input, &output, 1);
-  CAFFE_ENFORCE_EQ(output.ndim(), 4);
-  *OH = output.dim(2);
-  *OW = output.dim(3);
-}
-
-template <typename T, GLPool::PoolType poolType>
-class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
- public:
-  GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-    CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
-                  "Pooling op does not support dilation right now.");
-    if (!global_pooling_) {
-      CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
-                        pad_r() < kernel_w(),
-                    "Pad should be smaller than kernel.");
-    }
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override {
-    const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    int output_height;
-    int output_width;
-    const int output_channels = input_channels;
-
-    computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
-    GLPool::descriptor geometry{input_channels,
-                                {kernel_w(), kernel_h()},
-                                {pad_l(), pad_t()},
-                                {stride_w(), stride_h()},
-                                {input_width, input_height},
-                                {output_height, output_width}};
-
-    if (!glPool_) {
-      LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
-                << output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
-                << " Tiling: " << input_tile_x << "X" << input_tile_y;
-
-      glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
-    }
-
-    glPool_->pool(input, *output);
-
-    OperatorBase::Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLPool> glPool_;
-};
-
-namespace {
-REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
-REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
-OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
-OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
-}; // namespace
-}; // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLResize.cc b/caffe2/mobile/contrib/opengl/operators/GLResize.cc
deleted file mode 100644 (file)
index 2f5a47c..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLResizeNearest : public GLFilter {
- public:
-  binding* inputData;
-  binding* outputSize;
-  binding* scale_reverse;
-
-  GLResizeNearest()
-      : GLFilter("GLResizeNearest",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
-                 {/* no uniform blocks*/},
-                 {/* no attributes */},
-                 {/* replacements */}) {}
-
-  template <typename T>
-  void resize(const GLImageVector<T>& input_images,
-              const GLImageVector<T>& output_images,
-              float width_scale_rev,
-              float height_scale_rev);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform highp vec2 scale_reverse;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  // it clamps to the edge by default
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
-  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = TEXTURE_STORE(value);
-}
-)GLSL";
-
-template <typename T>
-void GLResizeNearest::resize(const GLImageVector<T>& input_images,
-                             const GLImageVector<T>& output_images,
-                             float width_scale_rev,
-                             float height_scale_rev) {
-  for (int i = 0; i < input_images.size(); i++) {
-    auto input_image = input_images[i];
-    auto output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() {
-            glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
-            glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
-          },
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-
-template <class T>
-class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLResizeNearestOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
-    if (HasArgument("width_scale")) {
-      width_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("width_scale", 1));
-    }
-    if (HasArgument("height_scale")) {
-      height_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("height_scale", 1));
-    }
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_width = input.width();
-    const int input_height = input.height();
-    const int input_channels = input.channels();
-
-    const int output_width = input_width * width_scale_;
-    const int output_height = input_height * height_scale_;
-    const int output_channels = input_channels;
-
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
-    if (!resizeNearest_) {
-      resizeNearest_.reset(new GLResizeNearest());
-    }
-    resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- protected:
-  float width_scale_;
-  float height_scale_;
-  std::unique_ptr<GLResizeNearest> resizeNearest_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLResizeNearest, OpenGLResizeNearestOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLResizeNearest).NumInputs(1).NumOutputs(1);
-
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc b/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc
deleted file mode 100644 (file)
index 0188fab..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-typedef enum { Sigmoid, Tanh } OpType;
-
-class GLSigmoid : public GLFilter {
- public:
-  binding* inputData;
-  binding* outputSize;
-
-  GLSigmoid(OpType opType)
-      : GLFilter(
-            "GLSigmoid",
-            vertex_shader,
-            fragment_shader,
-            {BINDING(outputSize), BINDING(inputData)},
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"SIGMOID", c10::to_string(opType == Sigmoid)},
-             {"TANH", c10::to_string(opType == Tanh)}}) {}
-
-  template <typename T>
-  void sigmoid(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLSigmoid::fragment_shader = R"GLSL(#version 300 es
-#define SIGMOID $(SIGMOID)
-#define TANH $(TANH)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 value = TEXTURE_LOAD(inputData, ivec2(texelCoord));
-#if SIGMOID
-  value = vec4(1.0) / (vec4(1.0) + exp(-value));
-  outputData = TEXTURE_STORE(value);
-#elif TANH
-  value = tanh(value);
-  outputData = TEXTURE_STORE(value);
-#endif
-}
-
-)GLSL";
-
-template <typename T>
-void GLSigmoid::sigmoid(const GLImageVector<T>& input_images,
-                        const GLImageVector<T>& output_images) {
-  for (int i = 0; i < input_images.size(); i++) {
-    auto input_image = input_images[i];
-    auto output_image = output_images[i];
-    int input_slices = input_image->slices;
-    int output_slices = output_image->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-
-namespace caffe2 {
-template <typename T, OpType opType>
-class OpenGLSigmoidOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-
-    if (!_sigmoid) {
-      _sigmoid.reset(new GLSigmoid(opType));
-    }
-
-    _sigmoid->sigmoid(input, *output);
-
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLSigmoid> _sigmoid;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLSigmoid, OpenGLSigmoidOp<float16_t, Sigmoid>);
-OPERATOR_SCHEMA(OpenGLSigmoid)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape();
-
-REGISTER_CPU_OPERATOR(OpenGLTanh, OpenGLSigmoidOp<float16_t, Tanh>);
-OPERATOR_SCHEMA(OpenGLTanh)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape();
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc b/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc
deleted file mode 100644 (file)
index 0eb3d59..0000000
+++ /dev/null
@@ -1,434 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLSoftmaxReduce : public GLFilter {
- public:
-  binding* inputTileSize;
-  binding* outputSize;
-  binding* outputTileSize;
-  binding* tileSize;
-  binding* spatialTileSize;
-  binding* inputTileRange;
-  binding* inputData;
-  binding* maxData;
-  binding* sumData;
-
-  const std::vector<binding*> input_bindings() {
-    std::vector<binding*> bindings({BINDING(inputTileSize),
-                                    BINDING(outputSize),
-                                    BINDING(outputTileSize),
-                                    BINDING(tileSize),
-                                    BINDING(spatialTileSize),
-                                    BINDING(inputTileRange),
-                                    BINDING(inputData),
-                                    BINDING(maxData),
-                                    BINDING(sumData)});
-    return bindings;
-  }
-
-  GLSoftmaxReduce(
-      bool compute_sum_ = false,
-      bool tiled = false,
-      int input_tile_x = 1)
-      : GLFilter(
-            "GLSoftmaxReduce",
-            vertex_shader,
-            fragment_shader,
-            input_bindings(),
-            {/* no uniform_blocks_bindings */},
-            {/* no attributes */},
-            {{"COMPUTE_SUM", c10::to_string((int)compute_sum_)},
-             {"INPUT_TILE_X", c10::to_string(input_tile_x)},
-             {"TILED_SOFTMAX", c10::to_string(int(tiled))}}) {}
-
-  template <typename T>
-  void reduce(const GLImage<T>* input_image,
-              const GLImage<T>* output_image,
-              int tile_size_x,
-              int tile_size_y);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLSoftmaxReduce::fragment_shader = R"GLSL(#version 300 es
-
-#define TILED_SOFTMAX $(TILED_SOFTMAX)
-#define INPUT_TILE_X $(INPUT_TILE_X)
-// Compute sum or max
-#define COMPUTE_SUM $(COMPUTE_SUM)
-
-precision highp float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 inputTileSize;
-uniform ivec2 outputSize;
-uniform ivec2 outputTileSize;
-uniform ivec2 spatialTileSize;
-uniform ivec2 tileSize;
-uniform ivec2 inputTileRange;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-#if TILED_SOFTMAX
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
-  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
-  ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize);
-
-  vec4 result = vec4(0.0);
-  for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) {
-    int inTileX = tileIdx % INPUT_TILE_X;
-    int inTileY = tileIdx / INPUT_TILE_X;
-    ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
-    for (int y = 0; y < sumArea.y; y++) {
-      for (int x = 0; x < sumArea.x; x++) {
-        ivec2 idx = tileCoord + ivec2(x, y);
-        vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx);
-  #if COMPUTE_SUM
-        result += val;
-  #else
-        result = max(result, val);
-  #endif
-      }
-    }
-  }
-
-  outputData = TEXTURE_STORE(result);
-}
-#else
-void main() {
-  ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize));
-  ivec2 texelCoord = outputCoord * spatialTileSize;
-  ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord);
-  vec4 result = vec4(0.0);
-
-  for (int y = 0; y < sumArea.y; y++) {
-    for (int x = 0; x < sumArea.x; x++) {
-      ivec2 idx = texelCoord + ivec2(x, y);
-      vec4 val = TEXTURE_LOAD(inputData, idx);
-#if COMPUTE_SUM
-      result += val;
-#else
-      result = max(result, val);
-#endif
-    }
-  }
-
-  outputData = TEXTURE_STORE(result);
-}
-#endif
-)GLSL";
-
-template <typename T>
-void GLSoftmaxReduce::reduce(const GLImage<T>* input_image,
-                             const GLImage<T>* output_image,
-                             int tile_size_x,
-                             int tile_size_y) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-    run(input_attachments,
-        {output_image->textures.begin() + is,
-         output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(
-              inputTileSize->location, input_image->width, input_image->height);
-          glUniform2i(
-              outputSize->location,
-              output_image->texture_width,
-              output_image->texture_height);
-          glUniform2i(
-              outputTileSize->location,
-              output_image->width,
-              output_image->height);
-          glUniform2i(
-              tileSize->location, input_image->tile_x, input_image->tile_y);
-          glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
-          glUniform2i(
-              inputTileRange->location,
-              0,
-              std::min(
-                  (input_image->channels + 3) / 4,
-                  input_image->tile_x * input_image->tile_y));
-        },
-        output_image->texture_width,
-        output_image->texture_height);
-  }
-}
-
-class GLSoftmaxScale : public GLFilter {
- public:
-  binding* outputSize;
-  binding* inputData;
-  binding* maxData;
-  binding* sumData;
-
-  const std::vector<binding*> input_bindings() {
-    std::vector<binding*> bindings(
-        {BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
-    return bindings;
-  }
-
-  GLSoftmaxScale(bool _compute_exp = false, bool tiled = false)
-      : GLFilter(
-            "GLSoftmaxScale",
-            vertex_shader,
-            fragment_shader,
-            input_bindings(),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"COMPUTE_EXP", c10::to_string((int)_compute_exp)},
-             {"TILED_SOFTMAX", c10::to_string((int)tiled)}}) {}
-
-  template <typename T>
-  void scale(const GLImage<T>* input_image,
-             const GLImage<T>* max_image,
-             const GLImage<T>* sum_image,
-             const GLImage<T>* output_image);
-
-  static const char* fragment_shader;
-};
-
-template <typename T>
-void GLSoftmaxScale::scale(const GLImage<T>* input_image,
-                           const GLImage<T>* max_image,
-                           const GLImage<T>* sum_image,
-                           const GLImage<T>* output_image) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
-                                                       {max_image->textures[is], maxData},
-                                                       {sum_image->textures[is], sumData}});
-    run(input_attachments,
-        {output_image->textures.begin() + is,
-         output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(
-              outputSize->location,
-              output_image->texture_width,
-              output_image->texture_height);
-        },
-        output_image->texture_width,
-        output_image->texture_height);
-  }
-}
-
-// MARK: GLSL
-
-const char* GLSoftmaxScale::fragment_shader = R"GLSL(#version 300 es
-
-#define COMPUTE_EXP $(COMPUTE_EXP)
-#define TILED_SOFTMAX $(TILED_SOFTMAX)
-
-precision highp float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_INPUT(maxData);
-TEXTURE_INPUT(sumData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
-#if COMPUTE_EXP
-  vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0));
-  #if TILED_SOFTMAX
-    float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w);
-    maxVal = vec4(singleMax, singleMax, singleMax, singleMax);
-    outputData = TEXTURE_STORE(exp(val - maxVal));
-  #else
-    outputData = TEXTURE_STORE(exp(val - maxVal));
-  #endif
-
-#else
-  vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0));
-  #if TILED_SOFTMAX
-    float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w;
-    sumVal = vec4(singleSum, singleSum, singleSum, singleSum);
-    outputData = TEXTURE_STORE(val / sumVal);
-  #else
-    outputData = TEXTURE_STORE(val / sumVal);
-  #endif
-#endif
-
-}
-)GLSL";
-
-#include "../core/ImageAllocator.h"
-#include "caffe2/core/operator.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-namespace caffe2 {
-template <class T>
-class OpenGLSoftmax final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLSoftmax(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-    const int num_images = input.size();
-    const int input_channels = input.channels();
-    const int input_width = input.width();
-    const int input_height = input.height();
-
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-    // For tiling
-    const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-    const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-    const bool tiled = input_tile_x > 1 || input_tile_y > 1;
-    if (tiled) {
-      CAFFE_ENFORCE_EQ(
-          input.slices(), 1, "Input needs to be tiled in a single texture");
-    }
-
-    CAFFE_ENFORCE(
-        tiled || input_channels == 1,
-        "Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
-
-    // for spatial dimension
-    const int tile_size_x = 16;
-    const int tile_size_y = 16;
-
-    int max_buf_width = input_width;
-    int max_buf_height = input_height;
-    int max_buf_channels = input_channels;
-    vector<GLImageVector<T>*> reduce_buf;
-
-    while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
-      max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
-      max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
-      if (tiled) {
-        // since we are summing over all the channels within a channel tile
-        max_buf_channels =
-            (max_buf_channels + input_tile_x * input_tile_y - 1) /
-            (input_tile_x + input_tile_y);
-      }
-      reduce_buf.push_back(ImageAllocator<T>::newImage(
-          1,
-          max_buf_width,
-          max_buf_height,
-          max_buf_channels,
-          output_tile_x,
-          output_tile_y));
-    }
-
-    GLImageVector<T>* max = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
-    GLImageVector<T>* sum = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
-    GLImageVector<T>* after_exp = ImageAllocator<T>::newImage(
-        num_images,
-        output_width,
-        output_height,
-        output_channels,
-        output_tile_x,
-        output_tile_y);
-    GLImageVector<T>* output_images = ImageAllocator<T>::newImage(
-        num_images,
-        output_width,
-        output_height,
-        output_channels,
-        output_tile_x,
-        output_tile_y,
-        is_last);
-
-    if (!f_max) {
-      f_max.reset(new GLSoftmaxReduce(false, tiled, input_tile_x));
-      f_exp.reset(new GLSoftmaxScale(true, tiled));
-      f_sum.reset(new GLSoftmaxReduce(true, tiled, input_tile_x));
-      f_scale.reset(new GLSoftmaxScale(false, tiled));
-    }
-
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = input[i];
-      auto max_image = (*max)[i];
-      auto sum_image = (*sum)[i];
-      auto after_exp_image = (*after_exp)[i];
-      auto output_image = (*output_images)[i];
-      // Get Max
-      for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
-        const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
-        GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
-
-        const int running_tile_size_x =
-            ir < reduce_buf.size() ? tile_size_x : in->width;
-        const int running_tile_size_y =
-            ir < reduce_buf.size() ? tile_size_y : in->height;
-        f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
-      }
-      // scale vals by exp(x - max)
-      f_exp->scale(input_image, max_image, sum_image, after_exp_image);
-
-      // Get sum of the exp
-      for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
-        const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
-        GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
-        const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
-        const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
-        f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
-      }
-
-      // Scale(softmax)
-      f_scale->scale(after_exp_image, max_image, sum_image, output_image);
-    }
-
-    Outputs()[OUTPUT]->Reset(output_images);
-
-    delete sum;
-    delete max;
-    delete after_exp;
-    for (auto&& rb : reduce_buf) {
-      delete rb;
-    }
-    return true;
-  }
-
- private:
-  StorageOrder order_;
-  std::unique_ptr<GLSoftmaxReduce> f_max;
-  std::unique_ptr<GLSoftmaxScale> f_exp;
-  std::unique_ptr<GLSoftmaxReduce> f_sum;
-  std::unique_ptr<GLSoftmaxScale> f_scale;
-
-  INPUT_TAGS(INPUT, FILTER, BIAS);
-  OUTPUT_TAGS(OUTPUT);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
-OPERATOR_SCHEMA(OpenGLSoftmax)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape();
-} // namespace caffe2
-#endif // CAFFE2_MOBILE
diff --git a/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc b/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc
deleted file mode 100644 (file)
index a6c32a5..0000000
+++ /dev/null
@@ -1,397 +0,0 @@
-
-#include "../core/GLContext.h"
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/operator.h"
-
-enum InputFormat { BGRA = 0, RGBA = 1 };
-
-class GLStylizer : public GLFilter {
-  binding* inputData;
-  binding* outputSize;
-  binding* mean;
-  binding* noise_std;
-  bool deprocess;
-
- public:
-  GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
-      : GLFilter(
-            _deprocess ? "GLDeStylizer" : "GLStylizer",
-            vertex_shader,
-            fragment_shader,
-            std::vector<binding*>({BINDING(inputData),
-                                   BINDING(mean),
-                                   BINDING(noise_std),
-                                   BINDING(outputSize)}),
-            {/* no uniform blocks */},
-            {/* no attributes */},
-            {{"DEPROCESS", c10::to_string(_deprocess)},
-             {"RGBAINPUT", c10::to_string(input_format)}}),
-        deprocess(_deprocess) {}
-
-  template <typename T1, typename T2>
-  void stylize(const GLImage<T1>* input_image,
-               const GLImage<T2>* output_image,
-               const float mean_values[3],
-               float noise_std_value);
-
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLStylizer::fragment_shader = R"GLSL(#version 300 es
-
-#define DEPROCESS         $(DEPROCESS)
-#define RGBAINPUT         $(RGBAINPUT)
-
-precision mediump float;
-precision mediump int;
-precision mediump sampler2D;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-uniform vec3 mean;
-uniform float noise_std;
-
-#if DEPROCESS
-TEXTURE_INPUT(inputData);
-layout(location = 0) out mediump vec4 outputData;
-#else
-uniform sampler2D inputData;
-TEXTURE_OUTPUT(0, outputData);
-#endif
-
-#if !DEPROCESS
-// http://byteblacksmith.com/improvements-to-the-canonical-one-liner-glsl-rand-for-opengl-es-2-0/
-
-highp float rand(vec2 co) {
-  highp float a = 12.9898;
-  highp float b = 78.233;
-  highp float c = 43758.5453;
-  highp float dt = dot(co.xy, vec2(a, b));
-  highp float sn = mod(dt, 3.14);
-  return fract(sin(sn) * c);
-}
-#endif
-
-// In AR Engine, input/output a RBGA texture; otherwise, BGRA tensor => texture
-#if RGBAINPUT
-void main() {
-#if DEPROCESS
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = vec4((val.rgb + mean) / 255.0, 1.0).bgra;
-#else
-  outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).bgr - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
-#endif
-}
-#else
-void main() {
-#if DEPROCESS
-  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
-  outputData = vec4((val.rgb + mean) / 255.0, 1.0);
-#else
-  outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).rgb - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
-#endif
-}
-#endif
-)GLSL";
-
-template <typename T1, typename T2>
-void GLStylizer::stylize(const GLImage<T1>* input_image,
-                         const GLImage<T2>* output_image,
-                         const float mean_values[3],
-                         float noise_std_value) {
-  int input_slices = input_image->slices;
-  int output_slices = output_image->slices;
-
-  run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
-      {output_image->textures[0]},
-      [&]() {
-        glUniform2i(outputSize->location, output_image->width, output_image->height);
-        glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
-        if (!deprocess) {
-          glUniform1f(noise_std->location, noise_std_value);
-        }
-      },
-      output_image->width,
-      output_image->height);
-}
-
-namespace caffe2 {
-class OpenGLTensorToTextureStylizerPreprocessOp : public Operator<CPUContext>,
-                                                  ImageAllocator<uint8_t>,
-                                                  ImageAllocator<float16_t> {
- public:
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 4;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 3;
-
-  USE_OPERATOR_BASE_FUNCTIONS;
-
-  OpenGLTensorToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() {
-    const auto& input = Input(0);
-    const auto& mean = Input(1);
-
-    CAFFE_ENFORCE(input.ndim() == 4);
-
-    const int num_images = input.dim32(0);
-    const int input_height = input.dim32(1);
-    const int input_width = input.dim32(2);
-    const int input_channels = input.dim32(3);
-
-    CAFFE_ENFORCE(input.dim32(0) == 1); // N == 1
-    CAFFE_ENFORCE(input_channels == kInputChannels);
-    CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
-
-    // get the buffers from input tensors
-    const float* mean_buffer = mean.template data<float>();
-    const uint8_t* input_buffer = input.template data<uint8_t>();
-
-    // set up the OpenGL context
-    GLContext::getGLContext()->set_context();
-
-    GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(num_images,
-                                                                                  input_width,
-                                                                                  input_height,
-                                                                                  kOutputChannels,
-#if CAFFE2_IOS
-                                                                                  true
-#else
-                                                                                  false
-#endif
-    );
-    const int tile_x = 1, tile_y = 1;
-    GLImageVector<uint8_t>* input_images = ImageAllocator<uint8_t>::newImage(
-        num_images, input_width, input_height, kInputChannels, tile_x, tile_y, false);
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = (*input_images)[i];
-      auto output_image = (*output_images)[i];
-      const GLTexture* inputTexture = input_image->textures[0];
-      inputTexture->loadData(input_buffer);
-
-      if (!glStylizer_) {
-        glStylizer_.reset(new GLStylizer());
-      }
-
-      glStylizer_->stylize(
-          input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
-    }
-    delete input_images;
-    Outputs()[0]->Reset(output_images);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-
-template <InputFormat inputFormat>
-class OpenGLTextureToTextureStylizerPreprocessOp : public Operator<CPUContext>,
-                                                   ImageAllocator<uint8_t>,
-                                                   ImageAllocator<float16_t> {
- public:
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 4;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 3;
-
-  USE_OPERATOR_BASE_FUNCTIONS;
-
-  OpenGLTextureToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() {
-    const GLImageVector<uint8_t>& input = Inputs()[0]->template Get<GLImageVector<uint8_t>>();
-    const auto& mean = Input(1);
-
-    const int num_images = input.size();
-    const int input_height = input.height();
-    const int input_width = input.width();
-    const int input_channels = input.channels();
-
-    CAFFE_ENFORCE_GT(num_images, 0);
-    CAFFE_ENFORCE(input[0]->slices == 1); // N == 1
-    CAFFE_ENFORCE(input_channels == kInputChannels);
-    CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
-
-    // get the buffers from input tensors
-    const float* mean_buffer = mean.template data<float>();
-
-    GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(
-        num_images, input_width, input_height, kOutputChannels, false);
-
-    if (!glStylizer_) {
-      glStylizer_.reset(new GLStylizer(false, inputFormat));
-    }
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = input[i];
-      auto output_image = (*output_images)[i];
-      glStylizer_->stylize(
-          input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
-    }
-    Outputs()[0]->Reset(output_images);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLTensorToTextureStylizerPreprocess,
-                      OpenGLTensorToTextureStylizerPreprocessOp);
-OPERATOR_SCHEMA(OpenGLTensorToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerPreprocess,
-                      OpenGLTextureToTextureStylizerPreprocessOp<RGBA>);
-OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
-
-class OpenGLTextureToTensorStylizerDeprocessOp : public Operator<CPUContext>,
-                                                 ImageAllocator<uint8_t> {
- public:
-  using Operator<CPUContext>::Operator;
-
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 3;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 4;
-
-  bool RunOnDevice() {
-    const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
-    const auto& mean = Input(1);
-    auto* output = Output(0);
-
-    const int num_images = input.size(), channels = input.channels(), height = input.height(),
-              width = input.width();
-    // Assume BGR or BGRA
-    CAFFE_ENFORCE(mean.size() == kInputChannels);
-    CAFFE_ENFORCE(channels == kInputChannels);
-    // RGB
-    output->Resize(num_images, height, width, kOutputChannels);
-
-    const auto* mean_data = mean.template data<float>();
-    auto* output_buffer = output->template mutable_data<uint8_t>();
-
-    GLImageVector<uint8_t>* output_images =
-        ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels, true);
-
-    if (!glStylizer_) {
-      glStylizer_.reset(new GLStylizer(true));
-    }
-
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = input[i];
-      auto output_image = (*output_images)[i];
-      glStylizer_->stylize(input_image, output_image, mean_data, 0);
-
-      output_image->textures[0]->map_read([&](const void* buffer,
-                                              size_t width,
-                                              size_t height,
-                                              size_t stride,
-                                              size_t channels,
-                                              const GLTexture::Type& type) {
-        if (width == stride) {
-          memcpy(output_buffer, buffer, channels * width * height);
-        } else {
-          typedef uint8_t(input_data_t)[height][stride][channels];
-          typedef uint8_t(output_data_t)[height][width][channels];
-
-          const input_data_t& input_data = *reinterpret_cast<const input_data_t*>(buffer);
-          output_data_t& output_data = *reinterpret_cast<output_data_t*>(output_buffer);
-
-          for (int y = 0; y < height; y++) {
-            memcpy(output_data[y], input_data[y], channels * width);
-          }
-        }
-      });
-    }
-    delete output_images;
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-
-template <InputFormat inputFormat>
-class OpenGLTextureToTextureStylizerDeprocessOp : public Operator<CPUContext>,
-                                                  ImageAllocator<uint8_t> {
- public:
-  using Operator<CPUContext>::Operator;
-
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 3;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 4;
-
-  bool RunOnDevice() {
-    const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
-    const auto& mean = Input(1);
-
-    const int num_images = input.size(), channels = input.channels(), height = input.height(),
-              width = input.width();
-
-    CAFFE_ENFORCE(mean.size() == kInputChannels);
-    CAFFE_ENFORCE(channels == kInputChannels);
-
-    const auto* mean_data = mean.template data<float>();
-
-    // Use foreignTextureAllocator inside GLContext
-    // glDeleteTexture will not be called from inside caffe2 for this texture
-    GLImageVector<uint8_t>* output_images;
-    auto textureAllocator = GLContext::getGLContext()->getTextureAllocator();
-    const int tile_x = 1, tile_y = 1;
-    if (textureAllocator != nullptr) {
-      output_images = ImageAllocator<uint8_t>::newImage(
-          num_images, width, height, kOutputChannels, tile_x, tile_y, textureAllocator);
-    } else {
-      // fallback when textureAllocator is not set
-      output_images = ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels);
-    }
-
-    if (!glStylizer_) {
-      glStylizer_.reset(new GLStylizer(true, inputFormat));
-    }
-
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = input[i];
-      auto output_image = (*output_images)[i];
-      glStylizer_->stylize(input_image, output_image, mean_data, 0);
-    }
-
-    Outputs()[0]->Reset(output_images);
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLTextureToTensorStylizerDeprocess,
-                      OpenGLTextureToTensorStylizerDeprocessOp);
-OPERATOR_SCHEMA(OpenGLTextureToTensorStylizerDeprocess).NumInputs(2).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerDeprocess,
-                      OpenGLTextureToTextureStylizerDeprocessOp<RGBA>);
-OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerDeprocess).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/GLSub.cc b/caffe2/mobile/contrib/opengl/operators/GLSub.cc
deleted file mode 100644 (file)
index a35e1b8..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-// GLFilter subclass whose fragment shader computes the per-texel difference of
-// two input textures (value = A - B).
-class GLSub : public GLFilter {
- public:
-  // Bindings for the two input textures attached in sub().
-  binding* inputData[2];
-  // Binding for the `outputSize` uniform, written with glUniform2i in sub().
-  binding* outputSize;
-
-  // Hands the filter name, both shaders and the three bindings to GLFilter; no
-  // uniform blocks, attributes or replacements are supplied.
-  // NOTE(review): vertex_shader is not declared in this class -- presumably it
-  // comes from GLFilter; confirm.
-  GLSub()
-      : GLFilter("GLSub",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {/* no replacements */}) {}
-
-  // Runs the subtraction for every image and slice of the given image vectors.
-  template <typename T>
-  void sub(const GLImageVector<T>& input_image0,
-           const GLImageVector<T>& input_image1,
-           const GLImageVector<T>& output_image);
-
-  // GLSL source of the subtraction fragment shader.
-  static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-// Fragment shader for GLSub: converts the interpolated texture coordinate into
-// an integer texel coordinate, loads that texel from both inputs and stores
-// A - B.
-// NOTE(review): TEXTURE_INPUT / TEXTURE_OUTPUT / TEXTURE_LOAD / TEXTURE_STORE
-// are not GLSL built-ins; presumably GLFilter expands them before compilation
-// -- confirm.
-const char* GLSub::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData[2]);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
-    ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-    vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
-    vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
-    vec4 value = A - B;
-    outputData = TEXTURE_STORE(value);}
-
-)GLSL";
-
-// Subtracts input_images1 from input_images0, slice by slice, rendering the
-// result into output_images.
-//
-// For image i and slice `is`, the `is`-th texture of each input image is
-// attached to inputData[0] / inputData[1] and the filter renders into the
-// `is`-th texture of the output image. The image count and the slice count are
-// both taken from input_images0; this function does not verify that the second
-// input and the output have matching counts, so callers must guarantee it.
-template <typename T>
-void GLSub::sub(const GLImageVector<T>& input_images0,
-                const GLImageVector<T>& input_images1,
-                const GLImageVector<T>& output_images) {
-  const int num_images = input_images0.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image0 = input_images0[i];
-    GLImage<T>* input_image1 = input_images1[i];
-    GLImage<T>* output_image = output_images[i];
-    const int input_slices = input_image0->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      std::vector<texture_attachment> input_attachments;
-      input_attachments.push_back({input_image0->textures[is], inputData[0]});
-      input_attachments.push_back({input_image1->textures[is], inputData[1]});
-
-      // The output range covers exactly one texture: the one at index `is`.
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-
-namespace caffe2 {
-// Operator that subtracts its second GLImageVector input from its first using
-// the GLSub filter and publishes the result as output 0.
-template <typename T>
-class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  // Rejects operator definitions that carry a "broadcast" or "axis" argument;
-  // only same-shape subtraction is implemented.
-  OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
-
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
-  }
-
-  // Checks that both inputs agree in image count, channels, width and height,
-  // allocates an output of the same geometry, runs the filter and returns true.
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
-    const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
-
-    CAFFE_ENFORCE_EQ(input0.size(), input1.size());
-
-    const int num_images = input0.size();
-    const int input_channels = input0.channels();
-    const int input_width = input0.width();
-    const int input_height = input0.height();
-    CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
-    CAFFE_ENFORCE_EQ(input1.width(), input_width);
-    CAFFE_ENFORCE_EQ(input1.height(), input_height);
-
-    // Element-wise op: the output geometry equals the input geometry.
-    const int output_channels = input_channels;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    // Optional "is_last" argument (default 0) is forwarded to the allocator.
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, is_last);
-
-    // The filter is created on first use and kept for later runs.
-    if (!_sub) {
-      _sub.reset(new GLSub());
-    }
-
-    _sub->sub(input0, input1, *output);
-
-    // NOTE(review): Reset(output) presumably transfers ownership of the raw
-    // pointer to the output blob -- confirm.
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  // Lazily constructed subtraction filter.
-  std::unique_ptr<GLSub> _sub;
-};
-
-// Register OpenGLSub with the float16_t instantiation of OpenGLSubOp; the
-// schema declares two inputs and one output.
-REGISTER_CPU_OPERATOR(OpenGLSub, OpenGLSubOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLSub).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/operators/gl_tiling_utils.h b/caffe2/mobile/contrib/opengl/operators/gl_tiling_utils.h
deleted file mode 100644 (file)
index 8b0c24d..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-#include <cmath>
-
-// Pair of integer coordinates / extents.
-struct point {
-  int x;
-  int y;
-};
-
-// Describes a tiling as two integer pairs plus a tile count.
-// NOTE(review): this header never reads these fields; presumably tile_dims is
-// the tile grid extent, tile_size the extent of a single tile and tiles the
-// number of tiles in use -- confirm against the users of this struct.
-struct tile_descriptor {
-  point tile_dims;
-  point tile_size;
-  int tiles;
-};
-
-namespace caffe2 {
-// Splits N into two integer factors r1 >= r2 with r1 * r2 == N that are as
-// close to each other as possible.
-//
-// The search starts at floor(sqrt(N)) and walks downwards to the first divisor
-// of N, so r2 is the largest divisor not exceeding sqrt(N) and r1 = N / r2.
-// Examples: 16 -> (4, 4), 12 -> (4, 3), 7 -> (7, 1).
-//
-// For N <= 0 there is no meaningful factorization and both outputs are set to
-// 0. This matches the previous result for N == 0 and avoids taking the square
-// root of a negative number, whose conversion to int is undefined.
-inline static void squareFactors(int N, int& r1, int& r2) {
-  if (N <= 0) {
-    r1 = r2 = 0;
-    return;
-  }
-
-  // N >= 1 here, hence f >= 1 and the loop stops at f == 1 at the latest.
-  int f = static_cast<int>(std::sqrt(static_cast<double>(N)));
-  while (N % f != 0) {
-    f--;
-  }
-  r1 = N / f;
-  r2 = f;
-}
-
-// Computes the tile grid for `output_channels` channels: the channel count is
-// divided by four (rounding up) and that quotient is factored by squareFactors
-// into output_tile_x and output_tile_y.
-inline static void computeOutputTiles(int output_channels, int& output_tile_x, int& output_tile_y) {
-  const int channel_groups = (output_channels + 3) / 4;
-  squareFactors(channel_groups, output_tile_x, output_tile_y);
-}
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc b/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc
deleted file mode 100644 (file)
index cb175c5..0000000
+++ /dev/null
@@ -1,381 +0,0 @@
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/utils/math.h"
-
-#include "../core/GL.h"
-#include "../core/GLLogging.h"
-#include "../core/arm_neon_support.h"
-#include "../operators/gl_tiling_utils.h"
-#include "TestGLConvolution.h"
-
-#include <vector>
-
-// Obtains the blob `name` from `ws` via CreateBlob, resizes its CPU tensor to
-// `shape` and fills it with values from math::RandGaussian, called with the
-// arguments 0.0f and 10.0f.
-void AddNoiseInput(const std::vector<int64_t>& shape,
-                   const std::string& name,
-                   caffe2::Workspace* ws) {
-  caffe2::CPUContext context;
-  auto* tensor = ws->CreateBlob(name)->GetMutable<caffe2::TensorCPU>();
-  tensor->Resize(shape);
-
-  const auto element_count = tensor->size();
-  float* values = tensor->mutable_data<float>();
-  caffe2::math::RandGaussian<float, caffe2::CPUContext>(
-      element_count, 0.0f, 10.0f, values, &context);
-}
-
-// Benchmarks one CPU operator of type `typ` and returns the average time per
-// run in milliseconds.
-//
-// The operator gets noise inputs X {1, inputC, inH, inW}, W and B (W is laid
-// out {inputC, outputC, kH, kW} when `transposed`, {outputC, inputC, kH, kW}
-// otherwise), zero padding, the given kernel and stride, and the engine
-// "MOBILE" (transposed) or "NNPACK". A single run is timed first to size the
-// warm-up (~200 ms) and the measured loop (~1000 ms).
-//
-// `ws` may be null, in which case a function-local workspace is used.
-// The value that is logged is the same value that is returned.
-double BenchOp(const std::string& typ,
-               int inputC,
-               int outputC,
-               int kW,
-               int kH,
-               int stride,
-               int inW,
-               int inH,
-               bool transposed,
-               caffe2::Workspace* ws = nullptr) {
-  caffe2::Workspace localWs;
-  if (!ws) {
-    ws = &localWs;
-  }
-
-  const char* engine = transposed ? "MOBILE" : "NNPACK";
-
-  caffe2::OperatorDef def1;
-  def1.set_name("test");
-  def1.set_type(typ);
-  def1.set_engine(engine);
-  def1.add_input("X");
-  def1.add_input("W");
-  def1.add_input("B");
-  def1.add_output("Y");
-
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_h", kH));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_w", kW));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_h", stride));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_w", stride));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_t", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
-
-  AddNoiseInput(std::vector<int64_t>{1, inputC, inH, inW}, "X", ws);
-  if (transposed) {
-    AddNoiseInput(std::vector<int64_t>{inputC, outputC, kH, kW}, "W", ws);
-  } else {
-    AddNoiseInput(std::vector<int64_t>{outputC, inputC, kH, kW}, "W", ws);
-  }
-  AddNoiseInput(std::vector<int64_t>{outputC}, "B", ws);
-
-  std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
-
-  // Measure one iteration
-  caffe2::Timer timer;
-  timer.Start();
-
-  op1->Run();
-
-  float one_iteration = timer.MilliSeconds();
-  if (one_iteration <= 0.0f) {
-    // A zero reading would make the divisions below produce infinity, and
-    // converting infinity to int is undefined; fall back to one microsecond.
-    one_iteration = 1e-3f;
-  }
-
-  int target_iterations = std::max((int)(1000 / one_iteration), 1);
-  int warmup_iterations = std::max((int)(200 / one_iteration), 1);
-
-  // warm up
-  for (int i = 0; i < warmup_iterations; i++) {
-    op1->Run();
-  }
-
-  timer.Start();
-
-  int runs = target_iterations;
-  for (int i = 0; i < runs; i++) {
-    op1->Run();
-  }
-
-  // Read the timer exactly once so the logged and the returned value agree.
-  const auto total_t = timer.MilliSeconds();
-  const double ms_per_iter = double(total_t) / runs;
-
-  gl_log(GL_LOG,
-         "%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
-         typ.c_str(),
-         inputC,
-         outputC,
-         inW,
-         inH,
-         kW,
-         kH,
-         engine,
-         ms_per_iter);
-  return ms_per_iter;
-}
-
-// Benchmarks OpenGLConv (or OpenGLConvTranspose when `transposed`) and returns
-// the average time per run in milliseconds.
-//
-// A two-operator net is assembled: CopyToOpenGL uploads the noise input
-// "X_cpu" using input tile factors from squareFactors, then the convolution
-// consumes "X_gl", "W" and "b". Only the convolution is timed; glFinish() is
-// called before each timer reading. `ws` may be null, in which case a
-// function-local workspace is used.
-template <typename T>
-static double BenchGLConvolution(int input_channels,
-                                 int output_channels,
-                                 int kernel_width,
-                                 int kernel_height,
-                                 int input_width,
-                                 int input_height,
-                                 int input_padding,
-                                 int input_stride,
-                                 bool transposed,
-                                 caffe2::Workspace* ws = nullptr) {
-  int tile_x = 1, tile_y = 1;
-  caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
-
-  gl_log(GL_LOG, "Input Tiles Factors: %d, %d\n", tile_x, tile_y);
-
-  caffe2::Workspace localWs;
-  if (!ws) {
-    ws = &localWs;
-  }
-
-  // Inputs: image, weights (layout depends on `transposed`) and bias.
-  AddNoiseInput(
-      std::vector<int64_t>{1, input_channels, input_height, input_width}, "X_cpu", ws);
-  const std::vector<int64_t> weight_shape = transposed
-      ? std::vector<int64_t>{input_channels, output_channels, kernel_height, kernel_width}
-      : std::vector<int64_t>{output_channels, input_channels, kernel_height, kernel_width};
-  AddNoiseInput(weight_shape, "W", ws);
-  AddNoiseInput(std::vector<int64_t>{output_channels}, "b", ws);
-
-  caffe2::NetDef netdef;
-  {
-    // Upload operator.
-    auto& copy_op = *(netdef.add_op());
-    const auto set_int_arg = [&copy_op](const char* name, int value) {
-      auto& arg = *(copy_op.add_arg());
-      arg.set_name(name);
-      arg.set_i(value);
-    };
-    copy_op.set_type("CopyToOpenGL");
-    copy_op.add_input("X_cpu");
-    copy_op.add_output("X_gl");
-    set_int_arg("tile_x", tile_x);
-    set_int_arg("tile_y", tile_y);
-  }
-
-  {
-    // Operator under test.
-    auto& conv_op = *(netdef.add_op());
-    const auto set_int_arg = [&conv_op](const char* name, int value) {
-      auto& arg = *(conv_op.add_arg());
-      arg.set_name(name);
-      arg.set_i(value);
-    };
-    conv_op.set_type(transposed ? "OpenGLConvTranspose" : "OpenGLConv");
-    conv_op.add_input("X_gl");
-    conv_op.add_input("W");
-    conv_op.add_input("b");
-    {
-      auto& arg = *(conv_op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    // Only kernel_height is forwarded as the (square) "kernel" argument.
-    set_int_arg("kernel", kernel_height);
-    set_int_arg("pad", input_padding);
-    set_int_arg("stride", input_stride);
-    set_int_arg("is_last", 1);
-    conv_op.add_output("Y_gl");
-  }
-
-  std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
-  for (auto& op_def : netdef.op()) {
-    ops.push_back(CreateOperator(op_def, ws));
-  }
-  auto& copy_operator = ops[0];
-  auto& conv_operator = ops[1];
-
-  // Upload the input once.
-  copy_operator->Run();
-
-  // First run makes sure the tested operator is precompiled.
-  conv_operator->Run();
-  glFinish();
-
-  // Time a single iteration to size the loops below.
-  caffe2::Timer timer;
-  timer.Start();
-
-  conv_operator->Run();
-  glFinish();
-
-  float one_iteration = timer.MilliSeconds();
-
-  int target_iterations = std::max((int)(1000 / one_iteration), 1);
-  int warmup_iterations = std::max((int)(200 / one_iteration), 1);
-
-  // Warm-up runs.
-  for (int iter = 0; iter < warmup_iterations; iter++) {
-    conv_operator->Run();
-  }
-  glFinish();
-
-  timer.Start();
-
-  // Measured runs.
-  int runs = target_iterations;
-  for (int iter = 0; iter < runs; iter++) {
-    conv_operator->Run();
-  }
-  glFinish();
-
-  const double gpuIterTime = double(timer.MilliSeconds()) / runs;
-
-  gl_log(GL_LOG,
-         "%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
-         transposed ? "ConvTranspose" : "Conv",
-         input_channels,
-         output_channels,
-         input_width,
-         input_height,
-         kernel_width,
-         kernel_height,
-         gpuIterTime);
-
-  return gpuIterTime;
-}
-
-// Runs the GPU (BenchGLConvolution) and CPU (BenchOp) convolution benchmarks
-// over a grid of square input sizes, channel counts and kernel sizes, and
-// prints the achieved GFLOPS of both plus the CPU/GPU time ratio.
-//
-// Input and output channel counts are kept equal. To benchmark other
-// configurations, edit the `sizes`, `channels` and `kernels` lists below.
-void TestGLConvolution() {
-  caffe2::Workspace ws;
-  ws.GetThreadPool()->setMinWorkSize(0);
-
-  const std::vector<int> sizes({14, 26, 52, 104, 208});
-  const std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
-  const std::vector<int> kernels({3});
-
-  const bool transposed = false;
-  const int stride = 1;
-
-  for (const auto& space : sizes) {
-    for (const auto& input_channel : channels) {
-      const int output_channel = input_channel;
-      for (const auto& kernel : kernels) {
-        const double gpuIterTime = BenchGLConvolution<float16_t>(
-            input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
-        const double cpuIterTime = BenchOp(transposed ? "ConvTranspose" : "Conv",
-                                           input_channel,
-                                           output_channel,
-                                           kernel,
-                                           kernel,
-                                           stride,
-                                           space,
-                                           space,
-                                           transposed,
-                                           &ws);
-
-        // Output extent of an unpadded, non-transposed convolution. This equals
-        // `space` for kernel 1 and `space - 2` for kernel 3 at stride 1, and
-        // stays correct for other kernel sizes and strides.
-        const int output_size = (space - kernel) / stride + 1;
-        // Two floating-point operations (multiply + add) per kernel tap.
-        const double flops = double(input_channel) * output_channel * kernel * kernel *
-                             output_size * output_size * 2;
-        printf(
-            "Conv: X: %ix%i  \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
-            "%.2f\tratio: "
-            "%.2f\n",
-            space,
-            space,
-            input_channel,
-            output_channel,
-            kernel,
-            kernel,
-            flops / gpuIterTime / 1E6,
-            flops / cpuIterTime / 1E6,
-            cpuIterTime / gpuIterTime);
-      }
-    }
-  }
-}
diff --git a/caffe2/mobile/contrib/opengl/test/TestGLConvolution.h b/caffe2/mobile/contrib/opengl/test/TestGLConvolution.h
deleted file mode 100644 (file)
index 63f5d3b..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-
-#pragma once
-
-void TestGLConvolution();
diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
deleted file mode 100644 (file)
index c8e5891..0000000
+++ /dev/null
@@ -1,2924 +0,0 @@
-
-#include "opengl_test.h"
-
-#include "../core/GLContext.h"
-#include "../core/GLImageAllocator.h"
-#include "../core/GLLogging.h"
-#include "../core/ImageAllocator.h"
-#include "../core/arm_neon_support.h"
-#include "../core/rewrite_net.h"
-#include "../operators/gl_tiling_utils.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/utils/math.h"
-#include "caffe2/utils/proto_utils.h"
-
-#ifdef CAFFE2_USE_MPSCNN
-#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
-#endif
-
-#define DEBUGGING false
-
-namespace caffe2 {
-
-// Returns |t1 - t2|, computed in float.
-template <class T>
-float absolute_error(T t1, T t2) {
-  return std::abs((float)t1 - (float)t2);
-}
-
-// Returns |t1 - t2| / |t2|, or 1 when the reference value t2 is zero.
-//
-// The denominator is the magnitude of t2: dividing by a signed, negative
-// reference would yield a negative "error" that passes every
-// `relative_error(...) <= tolerance` check regardless of how far apart the
-// values are.
-template <class T>
-float relative_error(T t1, T t2) {
-  return t2 != 0 ? absolute_error(t1, t2) / std::abs((float)t2) : 1;
-}
-
-// Compares an OpenGL result (t1) against a CPU reference (t2) element by
-// element. Both tensors must hold the same number of elements.
-//
-// With DEBUGGING enabled the two tensors are only printed. Otherwise, for float
-// tensors, an element mismatches when its absolute error exceeds `error` and
-// its relative error exceeds 0.08; each mismatch is logged and the eleventh
-// one raises "--- Test Failed ---". Non-float tensors are not compared.
-void checkError1D(const TensorCPU& t1, const TensorCPU& t2, float error) {
-  CAFFE_ENFORCE_EQ(t1.size(), t2.size());
-#if DEBUGGING
-  gl_log(GL_LOG, "OpenGL output:\n");
-  for (int i = 0; i < t1.size(); i++) {
-    gl_log(GL_LOG, "%.5f\t", t1.template data<float>()[i]);
-  }
-  gl_log(GL_LOG, "\n");
-  gl_log(GL_LOG, "CPU output:\n");
-  for (int i = 0; i < t2.size(); i++) {
-    gl_log(GL_LOG, "%.5f\t", t2.template data<float>()[i]);
-  }
-  gl_log(GL_LOG, "\n");
-
-#else
-  if (!t1.template IsType<float>()) {
-    return;
-  }
-
-  int mismatches = 0;
-  for (auto idx = 0; idx < t1.size(); ++idx) {
-    const float gl_value = t1.template data<float>()[idx];
-    const float cpu_value = t2.template data<float>()[idx];
-
-    const bool within_tolerance = absolute_error(gl_value, cpu_value) <= error ||
-        relative_error(gl_value, cpu_value) <= 0.08;
-    if (within_tolerance) {
-      continue;
-    }
-
-    gl_log(GL_ERR,
-           "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n",
-           idx,
-           gl_value,
-           cpu_value,
-           absolute_error(gl_value, cpu_value),
-           relative_error(gl_value, cpu_value) * 100);
-    // Ten mismatches are tolerated in the log; the next one aborts the test.
-    if (mismatches++ == 10) {
-      CAFFE_THROW("--- Test Failed ---");
-    }
-  }
-#endif
-}
-
-// OpenGL: t1, CPU: t2
-//
-// Compares an OpenGL result (t1) against a CPU reference (t2); both tensors
-// must have identical dims. With DEBUGGING enabled the tensors are only
-// printed. Otherwise float and uint8_t tensors are compared element by
-// element: an element mismatches when its absolute error exceeds `error` and
-// its relative error exceeds 0.08. The first ten mismatches are logged, the
-// eleventh raises "--- Test Failed ---". A summary with the mismatch count and
-// the largest positive / negative signed difference is logged at the end.
-void checkError(const TensorCPU& t1, const TensorCPU& t2, float error) {
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-#if DEBUGGING
-  gl_log(GL_LOG, "opengl_test output\n");
-  gl_log(GL_LOG, "\nOpenGL output:\n");
-  for (int i = 0; i < t1.size(); i++) {
-    // Line break after every t1.dim(2) values.
-    if (t1.ndim() > 2 && i % t1.dim(2) == 0) {
-      gl_log(GL_LOG, "\n");
-    }
-    // Extra break after every 4 * dim(2) * dim(3) values.
-    // NOTE(review): this reads t2's dims while printing t1 (equal by the
-    // enforce above) and accesses dim(3) after checking only ndim() > 2.
-    if (t1.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0) {
-      gl_log(GL_LOG, "\n");
-    }
-    if (t1.template IsType<float>()) {
-      const float t1_i = t1.template data<float>()[i];
-      gl_log(GL_LOG, "%.3f\t", t1_i);
-    } else if (t1.template IsType<uint8_t>()) {
-      const uint8_t t1_i = t1.template data<uint8_t>()[i];
-      gl_log(GL_LOG, "%.3d\t", (int)t1_i);
-    }
-  }
-
-  gl_log(GL_LOG, "\nCPU output:\n");
-  for (int i = 0; i < t2.size(); i++) {
-    if (t2.ndim() > 2 && i % t2.dim(2) == 0)
-      gl_log(GL_LOG, "\n");
-    if (t2.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0)
-      gl_log(GL_LOG, "\n");
-    if (t2.template IsType<float>()) {
-      const float t2_i = t2.template data<float>()[i];
-      gl_log(GL_LOG, "%.3f\t", t2_i);
-    } else if (t2.template IsType<uint8_t>()) {
-      const uint8_t t2_i = t2.template data<uint8_t>()[i];
-      gl_log(GL_LOG, "%.3d\t", (int)t2_i);
-    }
-  }
-  gl_log(GL_LOG, "\n");
-#else
-
-  // count: number of mismatching elements seen so far.
-  // maxError / minError: largest positive / most negative value of t1 - t2.
-  int count = 0;
-  float maxError = 0, minError = 0;
-  if (t1.template IsType<float>()) {
-    for (auto i = 0; i < t1.size(); ++i) {
-      const float t1_i = t1.template data<float>()[i];
-      const float t2_i = t2.template data<float>()[i];
-      // An element passes if either the absolute or the relative bound holds.
-      if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) {
-        if (count < 10) {
-          gl_log(GL_ERR,
-                 "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n",
-                 i,
-                 t1_i,
-                 t2_i,
-                 absolute_error(t1_i, t2_i),
-                 relative_error(t1_i, t2_i) * 100);
-        } else {
-          CAFFE_THROW("--- Test Failed ---");
-        }
-        count++;
-      }
-      float err = t1_i - t2_i;
-      if (err > maxError) {
-        maxError = err;
-      } else if (err < minError) {
-        minError = err;
-      }
-    }
-  } else if (t1.template IsType<uint8_t>()) {
-    // Same comparison for 8-bit data; values are logged as integers.
-    for (auto i = 0; i < t1.size(); ++i) {
-      const uint8_t t1_i = t1.template data<uint8_t>()[i];
-      const uint8_t t2_i = t2.template data<uint8_t>()[i];
-      if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) {
-        if (count < 10) {
-          gl_log(GL_ERR,
-                 "i: %d, GL: %d, CPU: %d, absolute error: %.2f, relative error: %.2f%%\n",
-                 i,
-                 t1_i,
-                 t2_i,
-                 absolute_error(t1_i, t2_i),
-                 relative_error(t1_i, t2_i) * 100);
-        } else {
-          CAFFE_THROW("--- Test Failed ---");
-        }
-        count++;
-      }
-      float err = t1_i - t2_i;
-      if (err > maxError) {
-        maxError = err;
-      } else if (err < minError) {
-        minError = err;
-      }
-    }
-  }
-  gl_log(GL_LOG,
-         "#errors = %d in %d, maxError = %f, minError = %f\n",
-         count,
-         (int)t1.size(),
-         maxError,
-         minError);
-#endif
-}
-
-// Round-trip test for the copy operators: fills an N x C x H x W CPU tensor
-// with Gaussian noise, sends it through CopyToOpenGL (with the given tile
-// factors) and CopyFromOpenGL, and compares the result with the original
-// tensor using checkError and the tolerance `error`.
-void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, int tile_y = 1) {
-  LOG(INFO) << "OPENGLCopyFrom/To Test";
-  Workspace ws;
-  {
-    // Random input is used rather than an index ramp, which may overflow for
-    // half precision.
-    auto* input = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    input->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(
-        input->size(), 0, 1, input->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    // CPU -> OpenGL
-    auto& upload = *(netdef.add_op());
-    const auto set_int_arg = [&upload](const char* name, int value) {
-      auto& arg = *(upload.add_arg());
-      arg.set_name(name);
-      arg.set_i(value);
-    };
-    upload.set_type("CopyToOpenGL");
-    upload.add_input("X_cpu");
-    upload.add_output("X_gl");
-    set_int_arg("tile_x", tile_x);
-    set_int_arg("tile_y", tile_y);
-  }
-
-  {
-    // OpenGL -> CPU
-    auto& download = *(netdef.add_op());
-    download.set_type("CopyFromOpenGL");
-    download.add_input("X_gl");
-    download.add_output("Y_cpu");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
-  const auto& t2 = ws.GetBlob("X_cpu")->Get<TensorCPU>(); // CPU
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
-  checkError(t1, t2, error);
-}
-
-// Operator kinds exercised by the tests. Despite the name, the enum covers
-// convolution variants as well as pooling. The enumerator values index the two
-// name tables below, so the order of all three must stay in sync.
-typedef enum {
-  AveragePool,
-  MaxPool,
-  Conv,
-  ConvTranspose,
-  ConvPRelu,
-  ConvTransposePRelu,
-  ConvRelu,
-  ConvTransposeRelu
-} PoolOp;
-
-// OpenGL operator type name for each PoolOp value.
-const char* glPoolOperationName[] = {"OpenGLAveragePool",
-                                     "OpenGLMaxPool",
-                                     "OpenGLConv",
-                                     "OpenGLConvTranspose",
-                                     "OpenGLConvPRelu",
-                                     "OpenGLConvTransposePRelu",
-                                     "OpenGLConvRelu",
-                                     "OpenGLConvTransposeRelu"};
-
-// CPU reference operator type name for each PoolOp value. The fused
-// PRelu / Relu variants map to the plain Conv / ConvTranspose operator here.
-const char* cpuPoolOperationName[] = {"AveragePool",
-                                      "MaxPool",
-                                      "Conv",
-                                      "ConvTranspose",
-                                      "Conv",
-                                      "ConvTranspose",
-                                      "Conv",
-                                      "ConvTranspose"};
-
-void testOpenGLConv(int N,
-                    int C,
-                    int H,
-                    int W,
-                    int K, // output_channels
-                    int kernel_h,
-                    int kernel_w,
-                    int pad,
-                    int stride,
-                    PoolOp poolOp,
-                    float error,
-                    bool random_input     = true,
-                    int input_batch_size  = 1,
-                    int output_batch_size = 1,
-                    int input_tile_x      = 1,
-                    int input_tile_y      = 1,
-                    bool tiling           = false) {
-  LOG(INFO) << "OpenGL Conv Test: "
-            << "input C: " << C << ", output C: " << K << ", H: " << H << ", W: " << W
-            << ", K: " << kernel_w << "x" << kernel_h << ", P: " << pad << ", S: " << stride
-            << " Op: " << glPoolOperationName[poolOp];
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-#if 0
-  gl_log(GL_LOG, "Input tensor:");
-  for (int i = 0; i < t->size(); i++) {
-    const float t1_i = t->data<float>()[i];
-    if (i % t->dim(3) == 0)
-      gl_log(GL_LOG, "\n");
-    if (i % (4 * t->dim(2) * t->dim(3)) == 0)
-      gl_log(GL_LOG, "-------------------------------\n");
-    gl_log(GL_LOG, "%.3f\t", t1_i);
-  }
-  gl_log(GL_LOG, "\n\n");
-#endif
-  }
-
-  if (poolOp != AveragePool && poolOp != MaxPool) {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
-    if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
-      t->Resize(K, C, kernel_h, kernel_w);
-    } else {
-      t->Resize(C, K, kernel_h, kernel_w);
-    }
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      // Set the weights to all 1s
-      //      for (int i = 0; i < t->size(); i++) {
-      //        data[i] = 1;
-      //      }
-
-      // Set the weights to 1s, 2s, 3s... for channel 0, 1, 2, 3...
-      int j = 0;
-      for (int i = 0; i < t->size(); i++) {
-        if (i % (C * kernel_h * kernel_w) == 0) {
-          j++;
-        }
-        data[i] = j;
-      }
-    }
-
-#if 0
-    gl_log(GL_LOG, "Kernel (printing only the first line for each output channel):");
-    for (int i = 0; i < t->size(); i++) {
-      if (i == 0 || i % (t->dim(1) * t->dim(2) * t->dim(3)) == 0) {
-        gl_log(GL_LOG, "\n");
-        for (int j = 0; j < t->dim(3); j++) {
-          const float t1_i = t->data<float>()[i + j];
-          gl_log(GL_LOG, "%.3f\t", t1_i);
-        }
-      }
-    }
-    gl_log(GL_LOG, "\n");
-#endif
-
-    // bias
-    {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
-      t->Resize(K);
-      CPUContext ctx;
-      if (random_input) {
-        math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-      } else {
-        // Set bias to 1
-        float* data = t->mutable_data<float>();
-        for (int i = 0; i < t->size(); i++) {
-          data[i] = i + 1;
-        }
-      }
-#if 0
-    gl_log(GL_LOG, "Bias:\n");
-    for (int i = 0; i < t->size(); i++) {
-      const float t1_i = t->data<float>()[i];
-      gl_log(GL_LOG, "%.3f\t", t1_i);
-    }
-    gl_log(GL_LOG, "\n");
-#endif
-    }
-  }
-
-  if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
-    t->Resize(K);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      // Set prelu scale to i + 1
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = -0.5;
-      }
-    }
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_x");
-      arg.set_i(input_tile_x);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_y");
-      arg.set_i(input_tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type(glPoolOperationName[poolOp]);
-    op.add_input("X_gl");
-    if (poolOp != AveragePool && poolOp != MaxPool) {
-      op.add_input("W");
-      op.add_input("b");
-    }
-    if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-      op.add_input("p");
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("stride");
-      arg.set_i(stride);
-    }
-    if (poolOp != AveragePool && poolOp != MaxPool) {
-      if (tiling) {
-        {
-          auto& arg = *(op.add_arg());
-          arg.set_name("tiling");
-          arg.set_i(1);
-        }
-      } else {
-        {
-          auto& arg = *(op.add_arg());
-          arg.set_name("input_batch_size");
-          arg.set_i(input_batch_size);
-        }
-        {
-          auto& arg = *(op.add_arg());
-          arg.set_name("output_batch_size");
-          arg.set_i(output_batch_size);
-        }
-      }
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("is_last");
-      arg.set_i(1);
-    }
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type(cpuPoolOperationName[poolOp]);
-
-    op.add_input("X_cpu");
-    if (poolOp != AveragePool && poolOp != MaxPool) {
-      op.add_input("W");
-      op.add_input("b");
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("stride");
-      arg.set_i(stride);
-    }
-    op.add_output("Y_ref");
-  }
-  if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto& op = *(netdef.add_op());
-    op.set_type("PRelu");
-    op.add_input("Y_ref");
-    op.add_input("p");
-    op.add_output("Y_ref");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-  } else if (poolOp == ConvRelu || poolOp == ConvTransposeRelu) {
-    auto& op = *(netdef.add_op());
-    op.set_type("Relu");
-    op.add_input("Y_ref");
-    op.add_output("Y_ref");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
-  const auto& t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(t1, t2, error);
-}
-
-void testOpenGLPRelu(
-    int N, int C, int H, int W, int prelu_size, int input_tile_x, int input_tile_y, float error) {
-  LOG(INFO) << "OpenGL PRelu Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
-  }
-
-  // prelu scale
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
-    t->Resize(prelu_size);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_x");
-      arg.set_i(input_tile_x);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_y");
-      arg.set_i(input_tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLPRelu");
-    op.add_input("X_gl");
-    op.add_input("p");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("PRelu");
-    op.add_input("X_cpu");
-    op.add_input("p");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile_y, float error) {
-  LOG(INFO) << "OpenGL Relu Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_x");
-      arg.set_i(input_tile_x);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_y");
-      arg.set_i(input_tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLRelu");
-    op.add_input("X_gl");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Relu");
-    op.add_input("X_cpu");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile_x = 1, int input_tile_y = 1) {
-  LOG(INFO) << "OpenGL Add Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
-    t0->Resize(N, C, H, W);
-    CPUContext ctx0;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-
-    auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
-    t1->Resize(N, C, H, W);
-    CPUContext ctx1;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu0");
-    op.add_output("X_gl0");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_x");
-      arg.set_i(input_tile_x);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_y");
-      arg.set_i(input_tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu1");
-    op.add_output("X_gl1");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_x");
-      arg.set_i(input_tile_x);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("tile_y");
-      arg.set_i(input_tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLAdd");
-    op.add_input("X_gl0");
-    op.add_input("X_gl1");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Add");
-    op.add_input("X_cpu0");
-    op.add_input("X_cpu1");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(t1, t2, error);
-}
-
-void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
-  LOG(INFO) << "OpenGL Sub Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-
-  Workspace ws;
-  {
-    auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
-    t0->Resize(N, C, H, W);
-    CPUContext ctx0;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-
-    auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
-    t1->Resize(N, C, H, W);
-    CPUContext ctx1;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu0");
-    op.add_output("X_gl0");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu1");
-    op.add_output("X_gl1");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLSub");
-    op.add_input("X_gl0");
-    op.add_input("X_gl1");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Sub");
-    op.add_input("X_cpu0");
-    op.add_input("X_cpu1");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-  checkError(t2, t1, error);
-}
-
-void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = false, float error = 0.1) {
-  LOG(INFO) << "OpenGL Concat Test "
-            << "H: " << H << ", W: " << W;
-  Workspace ws;
-  for (int i = 0; i < Cs.size(); i++) {
-    auto* t =
-        BlobGetMutableTensor(ws.CreateBlob("X_cpu" + c10::to_string(i)), CPU);
-    t->Resize(N, Cs[i], H, W);
-    CPUContext ctx0;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx0);
-  }
-
-  NetDef netdef;
-  for (int i = 0; i < Cs.size(); i++) {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu" + c10::to_string(i));
-    op.add_output("X_gl" + c10::to_string(i));
-    if (tiling) {
-      int tile_x = 1, tile_y = 1;
-      computeOutputTiles(Cs[i], tile_x, tile_y);
-      printf("Cs[i] = %d, tile_x = %d, tile_y = %d\n", Cs[i], tile_x, tile_y);
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("tile_x");
-        arg.set_i(tile_x);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("tile_y");
-        arg.set_i(tile_y);
-      }
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLConcat");
-    for (int i = 0; i < Cs.size(); i++) {
-      op.add_input("X_gl" + c10::to_string(i));
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    op.add_output("Y_gl");
-    op.add_output("Y_gl_mask");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Concat");
-    for (int i = 0; i < Cs.size(); i++) {
-      op.add_input("X_cpu" + c10::to_string(i));
-    }
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-    op.add_output("Y_ref_mask");
-  }
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL Sigmoid Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLSigmoid");
-    op.add_input("X_gl");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Sigmoid");
-    op.add_input("X_cpu");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLTanh(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL Tanh Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLTanh");
-    op.add_input("X_gl");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Tanh");
-    op.add_input("X_cpu");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLMul(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL Mul Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU);
-    t->Resize(1);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLMul");
-    op.add_input("X_gl");
-    op.add_input("B");
-    op.add_output("Y_gl");
-
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("broadcast");
-      arg.set_i(1);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Mul");
-    op.add_input("X_cpu");
-    op.add_input("B");
-
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("broadcast");
-      arg.set_i(1);
-    }
-
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
-  LOG(INFO) << "OpenGL Softmax Test "
-            << "N: " << N << " D: " << D << " Tiled:" << tiled;
-  Workspace ws;
-  auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-  {
-    t->Resize(N, D);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(
-        t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Reshape");
-    op.add_input("X_cpu");
-    op.add_output("X_reshaped");
-    op.add_output("old_shape");
-    auto& arg = *(op.add_arg());
-    arg.set_name("shape");
-    if (tiled) {
-      arg.add_ints(N);
-      arg.add_ints(D);
-      arg.add_ints(1);
-      arg.add_ints(1);
-    } else {
-      arg.add_ints(N);
-      arg.add_ints(1);
-      arg.add_ints(D);
-      arg.add_ints(1);
-    }
-  }
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_reshaped");
-    op.add_output("X_gl");
-    if (tiled) {
-      int tile_x = 1, tile_y = 1;
-      squareFactors((D + 3) / 4, tile_x, tile_y);
-      auto& argx = *(op.add_arg());
-      argx.set_name("tile_x");
-      argx.set_i(tile_x);
-      auto& argy = *(op.add_arg());
-      argy.set_name("tile_y");
-      argy.set_i(tile_y);
-    }
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLSoftmax");
-    op.add_input("X_gl");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu0");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Reshape");
-    op.add_input("Y_cpu0");
-    op.add_output("Y_cpu");
-    op.add_output("old_shape");
-    auto& arg = *(op.add_arg());
-    arg.set_name("shape");
-    arg.add_ints(N);
-    arg.add_ints(D);
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("Softmax");
-    op.add_input("X_cpu");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL InstanceNorm Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
-    //    for (auto i = 0; i < t->size(); ++i) {
-    //      t->mutable_data<float>()[i] = 0.001;
-    //    }
-  }
-
-  // scale
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = (i + 1) / t->size();
-    }
-  }
-  // bias
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = 8 - 2 * i;
-    }
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLInstanceNorm");
-    op.add_input("X_gl");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_output("Y_gl");
-    op.add_output("Mean_gl");
-    op.add_output("InvStdev_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Mean_gl");
-    op.add_output("Mean_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("InvStdev_gl");
-    op.add_output("InvStdev_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("InstanceNorm");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-    op.add_output("Mean_ref");
-    op.add_output("InvStdev_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  LOG(INFO) << "Check mean";
-  checkError1D(
-      ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001);
-  LOG(INFO) << "Check inv_stdev";
-  checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(),
-               ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(),
-               0.001);
-  LOG(INFO) << "Check instance norm";
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL InstanceNormPRelu Test "
-            << "C: " << C << ", H: " << H << ", W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
-    //    for (auto i = 0; i < t->size(); ++i) {
-    //      t->mutable_data<float>()[i] = 0.001;
-    //    }
-  }
-
-  // scale
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = (i + 1) / t->size();
-    }
-  }
-  // bias
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = 8 - 2 * i;
-    }
-  }
-  // prelu scale
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
-    t->Resize(C);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLInstanceNormPRelu");
-    op.add_input("X_gl");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_input("p");
-    op.add_output("Y_gl");
-    op.add_output("Mean_gl");
-    op.add_output("InvStdev_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Mean_gl");
-    op.add_output("Mean_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("InvStdev_gl");
-    op.add_output("InvStdev_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("InstanceNorm");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-    op.add_output("Mean_ref");
-    op.add_output("InvStdev_ref");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("PRelu");
-    op.add_input("Y_ref");
-    op.add_input("p");
-    auto& arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
-  LOG(INFO) << "Check mean";
-  checkError1D(
-      ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001);
-  LOG(INFO) << "Check inv_stdev";
-  checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(),
-               ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(),
-               0.001);
-  LOG(INFO) << "Check instance norm";
-  checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void OpenGL_speedtest(int N,
-                      int C,
-                      int H,
-                      int W,
-                      int K,
-                      int kernel_h,
-                      int kernel_w,
-                      int pad,
-                      float error,
-                      bool random_input = true) {
-  LOG(INFO) << "OpenGL Conv Speed Test "
-            << " C: " << C << " H: " << H << " W: " << W;
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
-    t->Resize(K, C, kernel_h, kernel_w);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
-    t->Resize(K);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  NetDef netdef;
-  netdef.set_name("Test net");
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLConv");
-    op.add_input("X_gl");
-    op.add_input("W");
-    op.add_input("b");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  CAFFE_ENFORCE(ws.RunNetOnce(netdef));
-  caffe2::NetBase* net = ws.CreateNet(netdef);
-  CHECK_NOTNULL(net);
-  CAFFE_ENFORCE(net->Run());
-  net->TEST_Benchmark(1, 4, true);
-}
-
-void testOpenGLPadImage(
-    int N, int C, int H, int W, int pad_l, int pad_r, int pad_t, int pad_b, float error) {
-  LOG(INFO) << "OpenGLPadImage Test";
-  {
-    Workspace ws;
-    {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-      t->Resize(N, C, H, W);
-      CPUContext ctx;
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-      //      for (auto i = 0; i < t->size(); ++i) {
-      //        t->mutable_data<float>()[i] = i + 1;
-      //      }
-    }
-
-    NetDef netdef;
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("CopyToOpenGL");
-      op.add_input("X_cpu");
-      op.add_output("X_gl");
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("OpenGLPadImage");
-      op.add_input("X_gl");
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_l");
-        arg.set_i(pad_l);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_r");
-        arg.set_i(pad_r);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_t");
-        arg.set_i(pad_t);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_b");
-        arg.set_i(pad_b);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("mode");
-        arg.set_s("reflect");
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("is_last");
-        arg.set_i(1);
-      }
-      op.add_output("Y_gl");
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("CopyFromOpenGL");
-      op.add_input("Y_gl");
-      op.add_output("Y_cpu");
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("PadImage");
-      op.add_input("X_cpu");
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_l");
-        arg.set_i(pad_l);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_r");
-        arg.set_i(pad_r);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_t");
-        arg.set_i(pad_t);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("pad_b");
-        arg.set_i(pad_b);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("mode");
-        arg.set_s("reflect");
-      }
-      op.add_output("Y_ref");
-    }
-
-    ws.RunNetOnce(netdef);
-
-    const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl
-    const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
-    checkError(t2, t1, error);
-  }
-}
-
-void testOpenGLResize(int N,
-                      int C,
-                      int H,
-                      int W,
-                      int width_scale,
-                      int height_scale,
-                      float error,
-                      int input_tile_x = 1,
-                      int input_tile_y = 1) {
-  LOG(INFO) << "OpenGLResize Test";
-  {
-    Workspace ws;
-    {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-      t->Resize(N, C, H, W);
-      CPUContext ctx;
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    }
-
-    NetDef netdef;
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("CopyToOpenGL");
-      op.add_input("X_cpu");
-      op.add_output("X_gl");
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("tile_x");
-        arg.set_i(input_tile_x);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("tile_y");
-        arg.set_i(input_tile_y);
-      }
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("OpenGLResizeNearest");
-      op.add_input("X_gl");
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("width_scale");
-        arg.set_f(width_scale);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("height_scale");
-        arg.set_f(height_scale);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("is_last");
-        arg.set_i(1);
-      }
-      op.add_output("Y_gl");
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("CopyFromOpenGL");
-      op.add_input("Y_gl");
-      op.add_output("Y_cpu");
-    }
-
-    {
-      auto& op = *(netdef.add_op());
-      op.set_type("ResizeNearest");
-      op.add_input("X_cpu");
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("width_scale");
-        arg.set_f(width_scale);
-      }
-      {
-        auto& arg = *(op.add_arg());
-        arg.set_name("height_scale");
-        arg.set_f(height_scale);
-      }
-      op.add_output("Y_ref");
-    }
-
-    ws.RunNetOnce(netdef);
-
-    const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl
-    const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
-    checkError(t2, t1, error);
-  }
-}
-
-void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGL Preprocess Test";
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, H, W, C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<uint8_t>()[i] = rand() % 255;
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
-    t->Resize(3);
-    CPUContext ctx;
-    t->mutable_data<float>()[0] = 100;
-    t->mutable_data<float>()[1] = 50;
-    t->mutable_data<float>()[2] = 150;
-  }
-
-  NetDef netdef;
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLTensorToTextureStylizerPreprocess");
-    op.add_input("X_cpu");
-    op.add_input("mean");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("noise_std");
-      arg.set_f(0.00001);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("noise_size");
-      arg.set_i(512);
-    }
-
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
-    op.add_input("X_cpu");
-    op.add_input("mean");
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("noise_std");
-      arg.set_f(0.00001);
-    }
-    {
-      auto& arg = *(op.add_arg());
-      arg.set_name("noise_size");
-      arg.set_i(512);
-    }
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
-  checkError(t2, t1, error);
-}
-
-void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGLDeprocess Test";
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = rand() % 1000 - 500;
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
-    t->Resize(3);
-    CPUContext ctx;
-    t->mutable_data<float>()[0] = 30;
-    t->mutable_data<float>()[1] = 40;
-    t->mutable_data<float>()[2] = 50;
-  }
-
-  NetDef netdef;
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLTextureToTensorStylizerDeprocess");
-    op.add_input("X_gl");
-    op.add_input("mean");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
-    op.add_input("X_cpu");
-    op.add_input("mean");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-  checkError(t2, t1, error);
-}
-
-void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "OpenGLNormPlanarYUV Test";
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, 3, H, W);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = rand() % 1000 - 500;
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
-    t->Resize(1, 3);
-    CPUContext ctx;
-    t->mutable_data<float>()[0] = 30;
-    t->mutable_data<float>()[1] = 40;
-    t->mutable_data<float>()[2] = 50;
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU);
-    t->Resize(1, 3);
-    CPUContext ctx;
-    t->mutable_data<float>()[0] = 6;
-    t->mutable_data<float>()[1] = 7;
-    t->mutable_data<float>()[2] = 8;
-  }
-
-  NetDef netdef;
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("OpenGLNormalizePlanarYUV");
-    op.add_input("X_gl");
-    op.add_input("mean");
-    op.add_input("stdev");
-    op.add_output("Y_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("Y_gl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("NormalizePlanarYUV");
-    op.add_input("X_cpu");
-    op.add_input("mean");
-    op.add_input("stdev");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-  checkError(t2, t1, error);
-}
-
-void OpenGL_copyops_speedtest(int N,
-                              int C,
-                              int H,
-                              int W,
-                              int K,
-                              int kernel_h,
-                              int kernel_w,
-                              int pad,
-                              float error,
-                              bool random_input = true) {
-  LOG(INFO) << "OpenGL CopyOps Speed Test";
-  Workspace ws;
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
-    t->Resize(K, C, kernel_h, kernel_w);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  {
-    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
-    t->Resize(K);
-    CPUContext ctx;
-    if (random_input) {
-      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-    } else {
-      float* data = t->mutable_data<float>();
-      for (int i = 0; i < t->size(); i++) {
-        data[i] = 1;
-      }
-    }
-  }
-
-  NetDef netdef;
-  netdef.set_name("Test net");
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu");
-    op.add_output("X_gl");
-  }
-
-  {
-    auto& op = *(netdef.add_op());
-    op.set_type("CopyFromOpenGL");
-    op.add_input("X_gl");
-    op.add_output("Y_cpu");
-  }
-
-  caffe2::NetBase* net = ws.CreateNet(netdef);
-  CHECK_NOTNULL(net);
-  net->TEST_Benchmark(1, 4, true);
-}
-
-static NetDef truncateAfter(NetDef def, size_t idx) {
-  // idx = 0, net = 10 -> remove 9
-  // idx = 0, net = 1 -> remove 0
-  const auto toRemove = def.op_size() - idx - 1;
-  for (auto i = 0; i < toRemove; ++i) {
-    def.mutable_op()->RemoveLast();
-  }
-  CHECK_EQ(def.op_size(), idx + 1);
-  return def;
-}
-
-void compareModelsForOpenGL(std::string name,
-                            const NetDef& initNet,
-                            NetDef predictNet,
-                            int width,
-                            int height,
-                            int channel,
-                            std::string input_type,
-                            std::string input_order) {
-
-  if (name == "styleTransfer") {
-    for (int i = 0; i < predictNet.mutable_op(0)->arg_size(); i++) {
-      auto* arg = predictNet.mutable_op(0)->mutable_arg(i);
-      if (arg->name() == "noise_std") {
-        arg->set_f(0);
-      }
-    }
-  }
-
-  for (auto i = 0; i < predictNet.op_size(); ++i) {
-    auto truncatedPredictNet = truncateAfter(predictNet, i);
-
-    // Change the last blob to external_output(0) for the predict net
-    auto output_blob = "_OUTPUT_BLOB__";
-    truncatedPredictNet.set_external_output(0, output_blob);
-    truncatedPredictNet.mutable_op(truncatedPredictNet.op_size() - 1)->set_output(0, output_blob);
-
-    NetDef truncatedOpenGLPredictNet = rewritePredictNetForOpenGL(truncatedPredictNet);
-
-    //    LOG(INFO) << "truncatedPredictNet";
-    //    dumpDefForOpenGL(truncatedPredictNet);
-    //
-    LOG(INFO) << "truncatedOpenGLPredictNet";
-    dumpDefForOpenGL(truncatedOpenGLPredictNet);
-
-    CPUContext ctx;
-    Workspace cws;
-    cws.RunNetOnce(initNet);
-
-    auto* t_cpu = BlobGetMutableTensor(
-        cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU);
-    if (name == "styleTransfer") {
-      CAFFE_ENFORCE_EQ(input_order, "NHWC");
-      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
-      t_cpu->Resize(1, height, width, channel);
-      for (auto i = 0; i < t_cpu->size(); ++i) {
-        t_cpu->mutable_data<uint8_t>()[i] = i % 255;
-      }
-    } else if (name == "segmentation") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_cpu->Resize(1, channel, height, width);
-      float* input = t_cpu->mutable_data<float>();
-      const int size = width * height;
-      // Limit input range to YUV
-      math::RandGaussian<float, CPUContext>(size, 0.5, 0.15, input, &ctx); // Y: 0 ~ 1
-      math::RandGaussian<float, CPUContext>(size, 0, 0.12, input + size, &ctx); // U: -0.436 ~ 0.436
-      math::RandGaussian<float, CPUContext>(
-          size, 0, 0.2, input + 2 * size, &ctx); // V: -0.615 ~ 0.615
-    } else if (name == "denoiser") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_cpu->Resize(1, channel, height, width);
-      float* input = t_cpu->mutable_data<float>();
-      const int spatial_size = width * height;
-      math::RandGaussian<float, CPUContext>(spatial_size, 0, 0.33, input, &ctx); // R Channel
-      math::RandGaussian<float, CPUContext>(
-          spatial_size, 0, 0.33, input + spatial_size, &ctx); // G Channel
-      math::RandGaussian<float, CPUContext>(
-          spatial_size, 0, 0.33, input + 2 * spatial_size, &ctx); // B Channel
-      // Clamp Range of input [-1, +1]
-      for (auto i = 0; i < t_cpu->size(); ++i) {
-        input[i] = input[i] > 1 ? 1 : input[i] < -1 ? -1 : input[i];
-      }
-    } else {
-      CAFFE_THROW("CompareModels only works with style transfer and segmentation now");
-    }
-
-    Workspace mws;
-    mws.RunNetOnce(initNet);
-
-    auto* t_gl = BlobGetMutableTensor(
-        mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU);
-    if (name == "styleTransfer") {
-      CAFFE_ENFORCE_EQ(input_order, "NHWC");
-      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
-      t_gl->Resize(1, height, width, channel);
-      uint8_t* input = t_gl->mutable_data<uint8_t>();
-      memcpy(input, t_cpu->mutable_data<uint8_t>(), t_cpu->storage().capacity());
-    } else if (name == "segmentation") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_gl->Resize(1, channel, height, width);
-      float* input = t_gl->mutable_data<float>();
-      memcpy(input, t_cpu->mutable_data<float>(), t_cpu->storage().capacity());
-    } else if (name == "denoiser") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_gl->Resize(1, channel, height, width);
-      float* input = t_gl->mutable_data<float>();
-      memcpy(input, t_cpu->mutable_data<float>(), t_cpu->storage().capacity());
-    }
-
-    cws.RunNetOnce(truncatedPredictNet);
-    mws.RunNetOnce(truncatedOpenGLPredictNet);
-
-    const auto m_name =
-        truncatedOpenGLPredictNet.op(truncatedOpenGLPredictNet.op_size() - 1).output(0);
-    const auto c_name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
-
-    LOG(INFO) << "Checking correspondence for name: " << m_name << ", idx: " << i;
-    {
-      const auto& mt = mws.GetBlob(m_name)->Get<TensorCPU>(); // GPU
-      const auto& ct = cws.GetBlob(c_name)->Get<TensorCPU>(); // CPU
-      if (name == "denoiser") {
-        checkError(mt, ct, 0.02); // 1% of Scale
-        LOG(INFO) << "Error Check Completed for Denoiser Layer: " << i;
-      } else {
-        checkError(mt, ct, 1);
-      }
-    }
-  }
-}
-
-void compareBatchedToTiledModels(std::string name,
-                                 const NetDef& initNet,
-                                 NetDef predictNet,
-                                 int width,
-                                 int height,
-                                 int channel,
-                                 std::string input_type,
-                                 std::string input_order) {
-
-  if (name == "styleTransfer") {
-    for (int i = 0; i < predictNet.mutable_op(0)->arg_size(); i++) {
-      auto* arg = predictNet.mutable_op(0)->mutable_arg(i);
-      if (arg->name() == "noise_std") {
-        arg->set_f(0);
-      }
-    }
-  }
-
-  for (auto i = 19; i < predictNet.op_size(); ++i) {
-    auto truncatedPredictNet = truncateAfter(predictNet, i);
-
-    // Change the last blob to external_output(0) for the predict net
-    auto output_blob = "_OUTPUT_BLOB__";
-    truncatedPredictNet.set_external_output(0, output_blob);
-    truncatedPredictNet.mutable_op(truncatedPredictNet.op_size() - 1)->set_output(0, output_blob);
-
-    NetDef bachedNet = rewritePredictNetForOpenGL(truncatedPredictNet, false, false);
-    NetDef tiledNet = rewritePredictNetForOpenGL(truncatedPredictNet, false, true);
-
-    LOG(INFO) << "truncatedPredictNet";
-    dumpDefForOpenGL(truncatedPredictNet);
-
-    LOG(INFO) << "truncatedOpenGLPredictNet";
-    dumpDefForOpenGL(bachedNet);
-
-    CPUContext ctx;
-
-    Workspace tws;
-    tws.RunNetOnce(initNet);
-
-    auto* t_batch =
-        BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU);
-    if (name == "styleTransfer") {
-      CAFFE_ENFORCE_EQ(input_order, "NHWC");
-      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
-      t_batch->Resize(1, height, width, channel);
-      for (auto i = 0; i < t_batch->size(); ++i) {
-        t_batch->mutable_data<uint8_t>()[i] = i % 255;
-      }
-    } else if (name == "segmentation") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_batch->Resize(1, channel, height, width);
-      float* input = t_batch->mutable_data<float>();
-      const int size = width * height;
-      // Limit input range to YUV
-      math::RandGaussian<float, CPUContext>(size, 0.5, 0.15, input, &ctx); // Y: 0 ~ 1
-      math::RandGaussian<float, CPUContext>(size, 0, 0.12, input + size, &ctx); // U: -0.436 ~ 0.436
-      math::RandGaussian<float, CPUContext>(
-          size, 0, 0.2, input + 2 * size, &ctx); // V: -0.615 ~ 0.615
-    } else {
-      CAFFE_THROW("CompareModels only works with style transfer and segmentation now");
-    }
-
-    Workspace bws;
-    bws.RunNetOnce(initNet);
-
-    auto* t_tiling =
-        BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU);
-    if (name == "styleTransfer") {
-      CAFFE_ENFORCE_EQ(input_order, "NHWC");
-      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
-      t_tiling->Resize(1, height, width, channel);
-      uint8_t* input = t_tiling->mutable_data<uint8_t>();
-      memcpy(input, t_batch->mutable_data<uint8_t>(), t_batch->storage().capacity());
-
-    } else if (name == "segmentation") {
-      CAFFE_ENFORCE_EQ(input_order, "NCHW");
-      CAFFE_ENFORCE_EQ(input_type, "float");
-      t_tiling->Resize(1, channel, height, width);
-      float* input = t_tiling->mutable_data<float>();
-      memcpy(input, t_batch->mutable_data<float>(), t_batch->storage().capacity());
-    }
-
-    bws.RunNetOnce(bachedNet);
-    tws.RunNetOnce(tiledNet);
-
-    const auto batch_name = bachedNet.op(bachedNet.op_size() - 1).output(0);
-    const auto tile_name = tiledNet.op(tiledNet.op_size() - 1).output(0);
-
-    LOG(INFO) << "Checking correspondence for name: " << batch_name << ", idx: " << i;
-    {
-      const auto& bt = bws.GetBlob(batch_name)->Get<TensorCPU>(); // GPU
-      const auto& tt = tws.GetBlob(tile_name)->Get<TensorCPU>(); // CPU
-      checkError(bt, tt, 0.01);
-    }
-  }
-}
-
-int runModelBenchmarks(caffe2::NetDef& init_net,
-                       caffe2::NetDef& predict_net,
-                       int warm_up_runs,
-                       int main_runs,
-                       int channel,
-                       int height,
-                       int width,
-                       std::string input_type,
-                       std::string input_order,
-                       std::string engine, // "CPU", "OPENGL", or "MPSCNN"
-                       bool run_individual,
-                       bool use_texture_input,
-                       bool use_tiling,
-                       bool run_fusion) {
-  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
-
-  // caffe2::dumpDefForOpenGL(init_net);
-  caffe2::dumpDefForOpenGL(predict_net);
-
-  CAFFE_ENFORCE(workspace->RunNetOnce(init_net));
-  caffe2::NetDef net_def;
-
-  // rewrite network
-  if (engine == "CPU") {
-    net_def.CopyFrom(predict_net);
-  } else if (engine == "OPENGL") {
-    if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input, use_tiling, run_fusion)) {
-      CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run");
-      return -1;
-    }
-  } else if (engine == "MPSCNN") {
-#ifdef CAFFE2_USE_MPSCNN
-    if (!caffe2::tryConvertToMPSCNN(init_net, predict_net, &net_def)) {
-      CAFFE_THROW("Failed to convert to MPSCNN. Benchmark failed to run");
-      return -1;
-    }
-#else
-    CAFFE_THROW("MPSCNN not enabled. Benchmark failed to run");
-    return -1;
-#endif
-  } else {
-    CAFFE_THROW("Unsupported engine. Benchmark failed to run");
-    return -1;
-  }
-
-  if (!net_def.has_name()) {
-    net_def.set_name("benchmark");
-  }
-  caffe2::NetBase* net = workspace->CreateNet(net_def);
-
-  // create input blob
-  if (engine == "CPU" || engine == "MPSCNN" || !use_texture_input) {
-    caffe2::TensorCPU* b;
-    if (!net_def.external_input_size()) {
-      b = workspace->CreateBlob("data")->GetMutable<caffe2::TensorCPU>();
-    } else {
-      b = workspace->CreateBlob(net_def.external_input(0))->GetMutable<caffe2::TensorCPU>();
-    }
-
-    if (input_order == "NCHW") {
-      b->Resize(std::vector<int32_t>(
-          {1, static_cast<int>(channel), static_cast<int>(height), static_cast<int>(width)}));
-    } else if (input_order == "NHWC") {
-      b->Resize(std::vector<int32_t>(
-          {1, static_cast<int>(height), static_cast<int>(width), static_cast<int>(channel)}));
-    } else {
-      CAFFE_THROW("Unknown input order: ", input_order);
-    }
-    if (input_type == "uint8_t") {
-      b->mutable_data<uint8_t>();
-    } else if (input_type == "float") {
-      b->mutable_data<float>();
-    } else {
-      CAFFE_THROW("Unknown input type: ", input_type);
-    }
-  } else {
-    const int tile_x = 1, tile_y = 1;
-    Blob* blob = nullptr;
-    if (!net_def.external_input_size()) {
-      blob = workspace->CreateBlob("data");
-    } else {
-      blob = workspace->CreateBlob(net_def.external_input(0));
-    }
-    if (input_type == "float") {
-      ImageAllocator<float16_t> allocator;
-      GLImageVector<float16_t>* output_image = allocator.newImage(1,
-                                                                  width,
-                                                                  height,
-                                                                  channel,
-                                                                  tile_x,
-                                                                  tile_y,
-#if CAFFE2_IOS
-                                                                  true
-#else
-                                                                  false
-#endif
-      );
-      blob->Reset(output_image);
-      for (auto& texture : (*output_image)[0]->textures) {
-        texture->map_load([&](void* buffer,
-                              size_t width,
-                              size_t height,
-                              size_t stride,
-                              size_t channels,
-                              const GLTexture::Type& type) {});
-      }
-    } else {
-      ImageAllocator<uint8_t> allocator;
-      GLImageVector<uint8_t>* output_image = allocator.newImage(1,
-                                                                width,
-                                                                height,
-                                                                channel,
-                                                                tile_x,
-                                                                tile_y,
-#if CAFFE2_IOS
-                                                                true
-#else
-                                                                false
-#endif
-      );
-      blob->Reset(output_image);
-      for (auto& texture : (*output_image)[0]->textures) {
-        texture->map_load([&](void* buffer,
-                              size_t width,
-                              size_t height,
-                              size_t stride,
-                              size_t channels,
-                              const GLTexture::Type& type) {});
-      }
-    }
-  }
-
-  // run benchmark
-  if (engine == "CPU" || engine == "MPSCNN") {
-    CHECK_NOTNULL(net);
-    CAFFE_ENFORCE(net->Run());
-    net->TEST_Benchmark(warm_up_runs, main_runs, run_individual);
-  } else if (engine == "OPENGL") {
-    CHECK_NOTNULL(net);
-    CAFFE_ENFORCE(net->Run());
-
-    for (int i = 0; i < warm_up_runs; i++) {
-      net->Run();
-    }
-    glFinish();
-
-    Timer timer;
-    timer.Start();
-    for (int i = 0; i < main_runs; i++) {
-      net->Run();
-    }
-    if (use_texture_input) {
-      glFinish();
-    }
-
-    double iter_time = (double)timer.MilliSeconds() / main_runs;
-    LOG(INFO) << "Main run finished. Milliseconds per iter: " << iter_time
-              << ". Iters per second: " << 1000.0 / iter_time;
-
-    if (run_individual) {
-      std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
-
-      for (auto& op : net_def.op()) {
-        ops.push_back(CreateOperator(op, workspace.get()));
-        ops.back()->Run(); // warm up
-      }
-
-      for (int k = 0; k < ops.size(); k++) {
-        timer.Start();
-        for (int i = 0; i < main_runs; i++) {
-          ops[k]->Run();
-        }
-        glFinish();
-
-        LOG(INFO) << "Operator #" << k << " " << net_def.op(k).type() << ": "
-                  << (double)timer.MilliSeconds() / main_runs;
-      }
-    }
-  }
-
-  return 0;
-}
-
-template <typename T>
-void testGLTextureTypes() {
-  gl_log(GL_LOG, "Executing %s...\n", __PRETTY_FUNCTION__);
-
-  GLImageAllocator<T>* allocator = GLImageAllocator<T>::newGLImageAllocator();
-
-  GLImageVector<T>* image = allocator->newImage(1, 10, 10, 4, 1, 1, true);
-
-  const GLTexture* texture = (*image)[0]->textures[0];
-
-  texture->map_load([&](void* buffer,
-                        size_t width,
-                        size_t height,
-                        size_t stride,
-                        size_t channels,
-                        const GLTexture::Type& type) {
-    T* buffer_data = (T*)buffer;
-
-    for (int y = 0; y < height; y++) {
-      for (int x = 0; x < width; x++) {
-        for (int c = 0; c < channels; c++) {
-          buffer_data[channels * (y * stride + x) + c] = x + y;
-        }
-      }
-    }
-  });
-
-  texture->map_read([&](const void* buffer,
-                        size_t width,
-                        size_t height,
-                        size_t stride,
-                        size_t channels,
-                        const GLTexture::Type& type) {
-    const T* buffer_data = (const T*)buffer;
-
-    for (int y = 0; y < height; y++) {
-      for (int x = 0; x < width; x++) {
-        gl_log(GL_LOG, "%d, ", (int)buffer_data[channels * (y * stride + x) + 0]);
-      }
-      gl_log(GL_LOG, "\n");
-    }
-  });
-  delete image;
-  delete allocator;
-  gl_log(GL_LOG, "...done with %s\n", __PRETTY_FUNCTION__);
-}
-
-void testOpenGL() {
-  {
-    // Test a bunch of different tiled convolutions
-    std::vector<int> channels({3, 4, 6, 8, 12, 16, 32, 64, 128, 256, 512});
-
-    for (const auto& input_channels : channels) {
-      int tile_x = 1, tile_y = 1;
-      squareFactors((input_channels + 3) / 4, tile_x, tile_y);
-
-      for (const auto& output_channels : channels) {
-        for (int size = 5; size < 8; size *= 2) {
-          testOpenGLConv(1,
-                         input_channels,
-                         size,
-                         size,
-                         output_channels,
-                         3,
-                         3,
-                         0,
-                         1,
-                         Conv,
-                         0.1 * input_channels / 8,
-                         true,
-                         1,
-                         1,
-                         tile_x,
-                         tile_y,
-                         true);
-        }
-
-        for (int size = 5; size < 16; size *= 2) {
-          testOpenGLConv(1,
-                         input_channels,
-                         size,
-                         size,
-                         output_channels,
-                         3,
-                         3,
-                         0,
-                         1,
-                         ConvTranspose,
-                         0.1 * input_channels / 8,
-                         true,
-                         1,
-                         1,
-                         tile_x,
-                         tile_y,
-                         true);
-        }
-      }
-    }
-
-    // Test various paddings and strides with tiled convolution
-    for (int kernel_size = 1; kernel_size <= 5; kernel_size++) {
-      for (int pad = 0; pad < kernel_size; pad++) {
-        for (int stride = 1; stride <= 8; stride++) {
-          testOpenGLConv(1,
-                         16,
-                         100,
-                         100,
-                         16,
-                         kernel_size,
-                         kernel_size,
-                         pad,
-                         stride,
-                         Conv,
-                         0.5,
-                         true,
-                         1,
-                         1,
-                         2,
-                         2,
-                         true);
-        }
-
-        for (int stride = 1; stride <= 8; stride++) {
-          testOpenGLConv(1,
-                         16,
-                         100,
-                         100,
-                         16,
-                         kernel_size,
-                         kernel_size,
-                         pad,
-                         stride,
-                         ConvTranspose,
-                         0.5,
-                         true,
-                         1,
-                         1,
-                         2,
-                         2,
-                         true);
-        }
-      }
-    }
-
-    // Test a bunch of batched convolutions
-    for (int kernel_size = 1; kernel_size <= 8; kernel_size++) {
-      for (int stride = 1; stride <= 8; stride++) {
-        testOpenGLConv(1,
-                       16,
-                       10,
-                       10,
-                       16,
-                       kernel_size,
-                       kernel_size,
-                       0,
-                       stride,
-                       ConvTranspose,
-                       0.5 * (1 + kernel_size / 3.0),
-                       true,
-                       1,
-                       1);
-      }
-
-      for (int stride = 1; stride <= 8; stride++) {
-        testOpenGLConv(1,
-                       16,
-                       10,
-                       10,
-                       16,
-                       kernel_size,
-                       kernel_size,
-                       0,
-                       stride,
-                       Conv,
-                       0.5 * (1 + kernel_size / 3.0),
-                       true,
-                       1,
-                       1);
-      }
-    }
-    for (const auto& channel : channels) {
-      int tile_x = 1, tile_y = 1;
-      squareFactors((channel + 3) / 4, tile_x, tile_y);
-      // clang-format off
-      testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvPRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
-      testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvTransposePRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
-      testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
-      testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvTransposeRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
-
-      testOpenGLPRelu(1, channel, 13, 4, channel, tile_x, tile_y, 0.1);
-      testOpenGLRelu(1, channel, 4, 17, tile_x, tile_y, 0.1);
-      testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, MaxPool, 0.01, true, 1, 1, tile_x, tile_y, true);
-      testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, AveragePool, 0.01, true, 1, 1, tile_x, tile_y, true);
-      testOpenGLAdd(1, channel, 14, 8, 0.1, tile_x, tile_y);
-      testOpenGLResize(1, channel, 16, 16, 2, 2, 0.1, tile_x, tile_y);
-      // clang-format on
-    }
-  }
-
-  {
-    testGLTextureTypes<uint8_t>();
-    testGLTextureTypes<float16_t>();
-
-    testOpenGLCopyOps(1, 4, 4, 4, 1e-2);
-    testOpenGLCopyOps(1, 3, 4, 4, 1e-2);
-    testOpenGLCopyOps(1, 2, 4, 4, 1e-2);
-    testOpenGLCopyOps(1, 1, 4, 4, 1e-2);
-    testOpenGLCopyOps(1, 4, 2, 2, 1e-2);
-    testOpenGLCopyOps(1, 4, 4, 4, 1e-2);
-    testOpenGLCopyOps(1, 4, 1, 1, 1e-2);
-    testOpenGLCopyOps(1, 4, 8, 8, 1e-2);
-    testOpenGLCopyOps(1, 6, 8, 3, 1e-2);
-    testOpenGLCopyOps(1, 4, 1, 2, 1e-2);
-    testOpenGLCopyOps(1, 8, 6, 1, 1e-2);
-    testOpenGLCopyOps(1, 8, 13, 18, 1e-2);
-    testOpenGLCopyOps(1, 16, 13, 18, 1e-2);
-    testOpenGLCopyOps(1, 13, 128, 90, 1e-2);
-    testOpenGLCopyOps(1, 16, 1280, 720, 1e-2);
-
-    testOpenGLCopyOps(1, 16, 4, 4, 1e-2, 2, 2);
-    testOpenGLCopyOps(1, 64, 16, 16, 1e-2, 2, 2);
-    testOpenGLCopyOps(1, 48, 13, 17, 1e-2, 3, 2);
-    testOpenGLCopyOps(1, 512, 1, 1, 1e-2, 4, 16);
-    testOpenGLCopyOps(1, 256, 7, 7, 1e-2, 8, 8);
-    testOpenGLCopyOps(1, 20, 13, 17, 1e-2, 5, 1);
-
-    // Test pooling operators
-    LOG(INFO) << "Test pooling operators";
-    testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true);
-    testOpenGLConv(1, 4, 5, 5, 4, 5, 5, 0, 1, AveragePool, 0.5, true);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, AveragePool, 0.01, true);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, AveragePool, 0.01, true);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, AveragePool, 0.01, true);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, MaxPool, 0.01, true);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, MaxPool, 0.01, true);
-
-    // Test strided convolution
-    LOG(INFO) << "Test strided convolution";
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, Conv, 0.5, true, 1, 1);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 3, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 3, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 3, Conv, 0.5, true, 1, 1);
-
-    // Test input batching
-    LOG(INFO) << "Test input batching";
-    testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
-    testOpenGLConv(1, 8, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 2, 1);
-    testOpenGLConv(1, 12, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 3, 1);
-    testOpenGLConv(1, 16, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 4, 1);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input
-    testOpenGLConv(1, 8, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 2, 1); // use random input
-    testOpenGLConv(1, 12, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 3, 1); // use random input
-    testOpenGLConv(1, 16, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 4, 1); // use random input
-    testOpenGLConv(1, 32, 10, 10, 4, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
-
-    // Test output batching
-    LOG(INFO) << "Test output batching";
-    testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
-    testOpenGLConv(1, 4, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 1, 2);
-    testOpenGLConv(1, 4, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 1, 3);
-    testOpenGLConv(1, 4, 5, 5, 16, 3, 3, 0, 1, Conv, 0.5, false, 1, 4);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input
-    testOpenGLConv(1, 4, 10, 10, 8, 3, 3, 0, 1, Conv, 1.5, true, 1, 2); // use random input
-    testOpenGLConv(1, 4, 10, 10, 12, 3, 3, 0, 1, Conv, 0.5, true, 1, 3); // use random input
-    testOpenGLConv(1, 4, 10, 10, 16, 3, 3, 0, 1, Conv, 0.5, true, 1, 4); // use random input
-
-    // Test both
-    LOG(INFO) << "Test both input and output batching";
-    testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
-    testOpenGLConv(1, 8, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 2, 2);
-    testOpenGLConv(1, 12, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 3, 3);
-
-    testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input
-    testOpenGLConv(1, 8, 10, 10, 8, 3, 3, 0, 1, Conv, 1, true, 2, 2); // use random input
-    testOpenGLConv(1, 12, 10, 10, 12, 3, 3, 0, 1, Conv, 2, true, 3, 3); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
-    // Test different combination of batching
-    LOG(INFO) << "Test mixed input and output batching sizes";
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 2);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 2);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 4);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 4);
-
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 2);
-
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2); // use random input
-
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1);
-    testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2);
-
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 2); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 2); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
-    testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 4); // use random input
-
-    // Test input/output channels
-    for (int i = 0; i < 4; i++) {
-      testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
-      testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
-    }
-
-    // Test large input size
-    LOG(INFO) << "Test large input size";
-    testOpenGLConv(1, 4, 1280, 720, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
-    // Test non standard input size
-    testOpenGLConv(1, 16, 125, 73, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
-    testOpenGLConv(1, 16, 127, 71, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
-    // Test for different kernel size
-    LOG(INFO) << "Test kernel sizes 4 to 6";
-    for (int w = 4; w < 7; w++) {
-      testOpenGLConv(1, 4, 128, 72, 4, w, w, 0, 1, Conv, 4 * (w / 3.0) * (w / 3.0), true, 1, 1);
-    }
-
-    // Test for random failures
-    for (int i = 0; i < 10; i++) {
-      testOpenGLConv(1, 6, 111, 111, 3, 3, 3, 0, 2, ConvTranspose, 0.5, true, 2, 1);
-      testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2);
-    }
-
-    LOG(INFO) << "Test OpenGL ConvPRelu";
-    testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1);
-    testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvPRelu, 1, true, 1, 1);
-    testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvPRelu, 2, true, 2, 2);
-    testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvPRelu, 4, true, 3, 1);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 1, 1);
-
-    LOG(INFO) << "Test OpenGL ConvTransposePRelu";
-    testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1);
-    testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposePRelu, 1, true, 1, 1);
-    testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 2, 2);
-    testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 3, 1);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 1, 1);
-
-    LOG(INFO) << "Test OpenGL ConvRelu";
-    testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvRelu, 2, true, 1, 1);
-    testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvRelu, 1, true, 1, 1);
-    testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvRelu, 2, true, 2, 2);
-    testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvRelu, 4, true, 3, 1);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 1, 1);
-
-    LOG(INFO) << "Test OpenGL ConvTransposeRelu";
-    testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 1, 1);
-    testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposeRelu, 1, true, 1, 1);
-    testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 2, 2);
-    testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 3, 1);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4);
-    testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 1, 1);
-
-    LOG(INFO) << "Test OpenGL PRelu";
-    testOpenGLPRelu(1, 4, 16, 16, 4, 1, 1, 0.1);
-    testOpenGLPRelu(1, 16, 16, 16, 1, 1, 1, 0.1);
-    testOpenGLPRelu(1, 12, 16, 16, 1, 1, 1, 0.1);
-    testOpenGLPRelu(1, 6, 640, 360, 6, 1, 1, 0.1);
-
-    LOG(INFO) << "Test OpenGL Relu";
-    testOpenGLRelu(1, 4, 16, 16, 1, 1, 0.1);
-    testOpenGLRelu(1, 16, 16, 16, 1, 1, 0.1);
-    testOpenGLRelu(1, 6, 640, 360, 1, 1, 0.1);
-
-    LOG(INFO) << "Test OpenGL Add";
-    testOpenGLAdd(1, 16, 640, 360, 0.1);
-    testOpenGLAdd(1, 12, 640, 360, 0.1);
-
-    LOG(INFO) << "Test OpenGL Sub";
-    testOpenGLSub(1, 16, 640, 360, 0.1);
-    testOpenGLSub(1, 12, 640, 360, 0.1);
-
-    LOG(INFO) << "Test OpenGL Sigmoid";
-    testOpenGLSigmoid(1, 4, 16, 16, 0.1);
-    testOpenGLSigmoid(1, 12, 64, 48, 0.1);
-    testOpenGLSigmoid(1, 6, 640, 360, 0.1);
-
-    LOG(INFO) << "Test OpenGL Tanh";
-    testOpenGLTanh(1, 4, 16, 16, 0.1);
-    testOpenGLTanh(1, 12, 64, 48, 0.1);
-    testOpenGLTanh(1, 6, 640, 360, 0.1);
-
-    LOG(INFO) << "Test OpenGL Mul";
-    testOpenGLMul(1, 4, 16, 16, 0.1);
-    testOpenGLMul(1, 12, 64, 48, 0.1);
-    testOpenGLMul(1, 6, 640, 360, 0.1);
-
-    LOG(INFO) << "Test OpenGL Concat";
-    testOpenGLConcat(1, std::vector<int>{4, 4}, 16, 16);
-    testOpenGLConcat(1, std::vector<int>{4, 4, 4}, 16, 16);
-    testOpenGLConcat(1, std::vector<int>{4, 4, 4, 4}, 16, 16);
-    testOpenGLConcat(1, std::vector<int>{8, 4, 12}, 16, 16);
-    testOpenGLConcat(1, std::vector<int>{12, 16, 8}, 16, 16);
-    testOpenGLConcat(1, std::vector<int>{60, 24, 36}, 16, 16);
-
-    testOpenGLConcat(1, std::vector<int>{12, 16, 8}, 16, 16, true);
-    testOpenGLConcat(1, std::vector<int>{60, 24, 36}, 16, 16, true);
-
-    LOG(INFO) << "Test OpenGL Softmax";
-    testOpenGLSoftmax(1, 100, 0.1);
-    testOpenGLSoftmax(1, 500, 0.1);
-    testOpenGLSoftmax(1, 1000, 0.1);
-    testOpenGLSoftmax(1, 5000, 0.1);
-
-    LOG(INFO) << "Test OpenGL InstanceNorm";
-    testOpenGLInstanceNorm(1, 4, 16, 16, 0.2);
-    testOpenGLInstanceNorm(1, 4, 20, 20, 0.2);
-    testOpenGLInstanceNorm(1, 4, 128, 128, 0.2);
-    testOpenGLInstanceNorm(1, 12, 120, 140, 0.3);
-    testOpenGLInstanceNorm(1, 3, 120, 140, 0.2);
-    testOpenGLInstanceNorm(1, 4, 192, 192, 0.2);
-
-    testOpenGLInstanceNorm(1, 4, 258, 198, 0.2);
-    testOpenGLInstanceNorm(1, 8, 338, 198, 0.2);
-    testOpenGLInstanceNorm(1, 12, 334, 194, 0.2);
-    testOpenGLInstanceNorm(1, 16, 324, 184, 0.2);
-    testOpenGLInstanceNorm(1, 6, 640, 360, 0.2);
-
-    LOG(INFO) << "Test OpenGL InstanceNormPRelu";
-    testOpenGLInstanceNormPRelu(1, 4, 16, 16, 0.2);
-    testOpenGLInstanceNormPRelu(1, 4, 20, 20, 0.2);
-    testOpenGLInstanceNormPRelu(1, 4, 128, 128, 0.2);
-    testOpenGLInstanceNormPRelu(1, 12, 120, 140, 0.3);
-    testOpenGLInstanceNormPRelu(1, 3, 120, 140, 0.2);
-    testOpenGLInstanceNormPRelu(1, 4, 192, 192, 0.2);
-
-    testOpenGLInstanceNormPRelu(1, 4, 258, 198, 0.2);
-    testOpenGLInstanceNormPRelu(1, 8, 338, 198, 0.2);
-    testOpenGLInstanceNormPRelu(1, 12, 334, 194, 0.2);
-    testOpenGLInstanceNormPRelu(1, 16, 324, 184, 0.2);
-    testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2);
-
-    LOG(INFO) << "Test OpenGL ResizeNearest";
-    testOpenGLResize(1, 4, 16, 16, 1, 1, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 2, 2, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 3, 3, 0.1);
-    testOpenGLResize(1, 4, 16, 16, 4, 4, 0.1);
-    testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
-    testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
-    testOpenGLResize(1, 12, 25, 25, 3, 3, 0.1);
-    testOpenGLResize(1, 4, 720, 1280, 3, 3, 0.1);
-
-    // debug style transfer
-    // conv
-    testOpenGLConv(1, 3, 82, 82, 8, 9, 9, 0, 1, Conv, 4, true, 1, 1);
-    testOpenGLConv(1, 8, 74, 74, 8, 3, 3, 0, 1, Conv, 4, true, 1, 1);
-    testOpenGLConv(1, 8, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1);
-    testOpenGLConv(1, 12, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1);
-
-    // convtranspose
-    testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2);
-    testOpenGLConv(1, 6, 112, 112, 3, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 1);
-
-    LOG(INFO) << "Test OpenGL PadImage";
-    testOpenGLPadImage(1, 3, 11, 11, 0, 1, 0, 1, 0.001);
-    testOpenGLPadImage(1, 3, 50, 80, 0, 1, 0, 1, 0.001);
-    testOpenGLPadImage(1, 12, 50, 80, 10, 9, 10, 9, 0.001);
-
-    LOG(INFO) << "Test OpenGL Preprocess";
-    testOpenGLPreprocess(1, 4, 8, 8, 0.20);
-    testOpenGLPreprocess(1, 4, 1280, 720, 0.20);
-
-    LOG(INFO) << "Test OpenGL Deprocess";
-    testOpenGLDeprocess(1, 3, 8, 8, 0.01);
-    testOpenGLDeprocess(1, 3, 1280, 720, 0.01);
-
-    LOG(INFO) << "Test OpenGL NormalizePlanarYUV";
-    testOpenGLNormPlanarYUV(1, 3, 8, 8, 0.01);
-    testOpenGLNormPlanarYUV(1, 3, 192, 192, 0.01);
-
-    //  for (int i = 0; i < 4; i += 1) {
-    //    LOG(INFO) << "C: " << 4 << ", H: " << 1280 + i << ", W: " << 720 + i;
-    //    OpenGL_copyops_speedtest(1, 4, 1280, 720 + i, 4, 3, 3, 0, 0.5);
-    //  }
-
-    //  for (int i = 0; i < 1; i += 1) {
-    //    LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i;
-    //    OpenGL_copyops_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5);
-    //  }
-    //
-    //  for (int i = 0; i < 9; i += 1) {
-    //    LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i;
-    //    OpenGL_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5);
-    //  }
-
-    // Multi-Batch Tests
-    LOG(INFO) << "Test OpenGL Multi-batch Support";
-    testOpenGLCopyOps(2, 4, 4, 4, 1e-2);
-    testOpenGLCopyOps(3, 4, 4, 4, 1e-2);
-    testOpenGLCopyOps(5, 4, 4, 4, 1e-2);
-    testOpenGLConv(2, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true);
-    testOpenGLConv(2, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true);
-    testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(5, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(7, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(11, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(12, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(21, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(50, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
-    testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, ConvTranspose, 0.5, true, 1, 1);
-    testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1);
-    testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1);
-
-    testOpenGLPRelu(3, 4, 16, 16, 4, 1, 1, 0.1);
-    testOpenGLPRelu(5, 4, 16, 16, 4, 1, 1, 0.1);
-
-    testOpenGLRelu(3, 4, 16, 16, 1, 1, 0.1);
-    testOpenGLRelu(7, 4, 16, 16, 1, 1, 0.1);
-
-    testOpenGLAdd(3, 16, 640, 360, 0.1);
-    testOpenGLAdd(9, 16, 640, 360, 0.1);
-
-    testOpenGLSigmoid(3, 4, 16, 16, 0.1);
-    testOpenGLSigmoid(11, 4, 16, 16, 0.1);
-
-    testOpenGLInstanceNorm(3, 4, 16, 16, 0.2);
-    testOpenGLInstanceNorm(13, 4, 16, 16, 0.2);
-
-    testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2);
-    testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2);
-
-    testOpenGLResize(3, 4, 16, 16, 1, 1, 0.1);
-    testOpenGLResize(16, 4, 16, 16, 1, 1, 0.1);
-
-    testOpenGLPadImage(3, 3, 4, 4, 0, 1, 0, 1, 0.01);
-    testOpenGLPadImage(23, 3, 4, 4, 0, 1, 0, 1, 0.01);
-
-    testOpenGLSoftmax(3, 1000, 0.1);
-    testOpenGLSoftmax(27, 100, 0.1);
-
-    testOpenGLNormPlanarYUV(4, 3, 192, 192, 0.01);
-
-    // Test Tiling
-    testOpenGLSoftmax(3, 1000, 0.1, true);
-    testOpenGLSoftmax(9, 523, 0.1, true);
-    testOpenGLSoftmax(27, 100, 0.1, true);
-  }
-
-  LOG(INFO) << "End of OpenGL tests";
-}
-} // namespace caffe2
diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.h b/caffe2/mobile/contrib/opengl/test/opengl_test.h
deleted file mode 100644 (file)
index 5938d00..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-void testOpenGL();
-void compareModelsForOpenGL(std::string name,
-                            const NetDef& initNet,
-                            NetDef predictNet,
-                            int width,
-                            int height,
-                            int channel,
-                            std::string input_type,
-                            std::string input_order);
-
-void compareBatchedToTiledModels(std::string name,
-                                 const NetDef& initNet,
-                                 NetDef predictNet,
-                                 int width,
-                                 int height,
-                                 int channel,
-                                 std::string input_type,
-                                 std::string input_order);
-
-int runModelBenchmarks(caffe2::NetDef& init_net,
-                       caffe2::NetDef& predict_net,
-                       int warm_up_runs,
-                       int main_runs,
-                       int channel,
-                       int height,
-                       int width,
-                       std::string input_type,
-                       std::string input_order,
-                       std::string engine,
-                       bool run_individual    = false,
-                       bool use_texture_input = false,
-                       bool use_tiling        = false,
-                       bool run_fusion        = true);
-} // namespace caffe2
index 222ee4b..9f378a7 100644 (file)
@@ -834,23 +834,13 @@ if(USE_PROF)
   endif()
 endif()
 
-if (USE_MOBILE_OPENGL)
-  if (ANDROID)
-    list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
-  elseif (IOS)
-    message(STATUS "TODO item for adding ios opengl dependency")
-  else()
-    message(WARNING "mobile opengl is only used in android or ios builds.")
-    caffe2_update_option(USE_MOBILE_OPENGL OFF)
-  endif()
-endif()
-
 # ---[ ARM Compute Library: check compatibility.
 if (USE_ACL)
   if (NOT ANDROID)
     message(WARNING "ARM Compute Library is only supported for Android builds.")
     caffe2_update_option(USE_ACL OFF)
   else()
+    list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
     if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
       # 32-bit ARM (armv7, armv7-a, armv7l, etc)
       set(ACL_ARCH "armv7a")
index 7707946..d687c6d 100644 (file)
@@ -98,7 +98,6 @@ function (caffe2_print_configuration_summary)
   message(STATUS "  USE_METAL             : ${USE_METAL}")
   message(STATUS "  USE_MKL               : ${CAFFE2_USE_MKL}")
   message(STATUS "  USE_MKLDNN            : ${CAFFE2_USE_MKLDNN}")
-  message(STATUS "  USE_MOBILE_OPENGL     : ${USE_MOBILE_OPENGL}")
   message(STATUS "  USE_NCCL              : ${USE_NCCL}")
   if(${USE_NCCL})
     message(STATUS "    USE_SYSTEM_NCCL     : ${USE_SYSTEM_NCCL}")
index 6c353f6..eacba45 100755 (executable)
@@ -85,9 +85,6 @@ CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK")
 CMAKE_ARGS+=("-DANDROID_ABI=armeabi-v7a with NEON")
 CMAKE_ARGS+=("-DANDROID_NATIVE_API_LEVEL=21")
 CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=rtti exceptions")
-# TODO: As the toolchain file doesn't support NEON-FP16 extension,
-# we disable USE_MOBILE_OPENGL for now, it will be re-enabled in the future.
-CMAKE_ARGS+=("-DUSE_MOBILE_OPENGL=OFF")
 
 # Use-specified CMake arguments go last to allow overridding defaults
 CMAKE_ARGS+=($@)