# User-facing build switches; each one can be overridden with -D<NAME>=ON|OFF
# at configure time.
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_LMDB "Use LMDB" ON)
option(USE_METAL "Use Metal for iOS build" ON)
-option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
# Off by default: -march=native tunes the code for the build machine's CPU, so
# the resulting binaries may not run on other machines.
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
add_subdirectory(ios)
-# TODO(fix or remove): the OpenGL code is expected to break with the tensor
-# refactoring, so it is kept out of the build (and therefore out of CI) to
-# unblock that work.
-if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
- # add_subdirectory(opengl)
-endif()
# NOTE: the arm-compute subdirectory is commented out, so USE_ACL currently
# adds nothing here.
if (USE_ACL)
# add_subdirectory(arm-compute)
endif()
+++ /dev/null
-# core/ and operators/ are added unconditionally; android/ and ios/ are added
-# only when building for the matching platform.
-add_subdirectory(core)
-add_subdirectory(operators)
-
-if (ANDROID)
- add_subdirectory(android)
-endif()
-
-if (IOS)
- add_subdirectory(ios)
-endif()
-# Forward Caffe2_CPU_SRCS one scope up.
-# NOTE(review): presumably the subdirectories above append their sources to
-# this list - confirm against their CMakeLists.
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
+++ /dev/null
-#include "AndroidGLContext.h"
-#include "caffe2/core/logging.h"
-#include "gl3stub.h"
-#include <regex>
-
-namespace {
-
-using RendererTable = std::unordered_map<std::string, GL_Renderer>;
-
-// Maps a renderer name prefix to its GL_Renderer value. The table is built
-// once, on first use, and shared by every caller. PowerVR is listed but
-// disabled.
-static const RendererTable& renderer_map() {
-  static RendererTable table = {
-      {"Adreno", Adreno},
-      {"Mali", Mali},
-      {"NVIDIA", Tegra},
-      /* {"PowerVR", PowerVR}, */
-  };
-  return table;
-}
-
-} // namespace
-
-// Creates an OpenGL ES context (client version 2) for the calling thread,
-// backed by a private 1x1 pbuffer surface, and makes it current.
-//
-// Returns the new context. Every EGL failure is reported with
-// CAFFE_THROW("Problem with OpenGL context") after the objects created so far
-// have been released.
-EGLContext AndroidGLContext::create_opengl_thread_context() {
-  EGLSurface surface = EGL_NO_SURFACE;
-  EGLContext context = EGL_NO_CONTEXT;
-  EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-  if (display == EGL_NO_DISPLAY) {
-    // We failed to get a display
-    CAFFE_THROW("Problem with OpenGL context");
-    return context;
-  }
-
-  // Releases whatever has been created so far, then reports the failure.
-  // NOTE(review): CAFFE_THROW is expected not to return; the `return context`
-  // statements after fail() only matter if it ever does.
-  auto fail = [&]() {
-    eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
-    if (surface != EGL_NO_SURFACE) {
-      eglDestroySurface(display, surface);
-      surface = EGL_NO_SURFACE;
-    }
-    if (context != EGL_NO_CONTEXT) {
-      eglDestroyContext(display, context);
-      context = EGL_NO_CONTEXT;
-    }
-    eglTerminate(display);
-    CAFFE_THROW("Problem with OpenGL context");
-  };
-
-  EGLint major;
-  EGLint minor;
-  if (!eglInitialize(display, &major, &minor)) {
-    fail();
-    return context;
-  }
-
-  const EGLint configAttr[] = {EGL_RENDERABLE_TYPE,
-                               EGL_OPENGL_ES2_BIT,
-                               EGL_SURFACE_TYPE,
-                               EGL_PBUFFER_BIT, // we create a pixelbuffer surface
-                               EGL_NONE};
-
-  EGLint numConfig = 0;
-  EGLConfig eglConfig;
-  // eglChooseConfig can succeed and still match nothing, which would leave
-  // eglConfig unset, so the match count is checked as well.
-  if (!eglChooseConfig(display, configAttr, &eglConfig, 1, &numConfig) ||
-      numConfig < 1) {
-    // We failed to find a suitable config
-    fail();
-    return context;
-  }
-
-  // Select the client API before creating the context: eglCreateContext
-  // creates a context for the API that is bound at the time of the call.
-  if (!eglBindAPI(EGL_OPENGL_ES_API)) {
-    fail();
-    return context;
-  }
-
-  const EGLint ctxAttr[] = {EGL_CONTEXT_CLIENT_VERSION,
-                            2, // very important!
-                            EGL_NONE};
-
-  // Create an EGL context based on the chosen configuration.
-  context = eglCreateContext(display, eglConfig, EGL_NO_CONTEXT, ctxAttr);
-  if (context == EGL_NO_CONTEXT) {
-    fail();
-    return context;
-  }
-
-  // We need a surface. For most mixed JNI/Java based apps it is suggested
-  // that we pass a Java surface through JNI and extract the surface.
-  // Pure NDK apps get passed the android_app structure, which includes a
-  // surface. We want our own OpenGL context for the current thread, so here
-  // we create a fake 1x1 'pixel buffer' surface.
-  // We don't expect to run vertex or fragment shaders.
-  const EGLint surfaceAttr[] = {EGL_WIDTH, 1, EGL_HEIGHT, 1, EGL_NONE};
-
-  surface = eglCreatePbufferSurface(display, eglConfig, surfaceAttr);
-  if (surface == EGL_NO_SURFACE) {
-    fail();
-    return context;
-  }
-
-  // Bind context, draw and read surface to the current thread
-  if (!eglMakeCurrent(display, surface, surface, context)) {
-    fail();
-    return context;
-  }
-
-  return context;
-}
-
-// True when the calling thread already has a current EGL context.
-bool AndroidGLContext::opengl_thread_context_exists() {
-  const EGLContext current = eglGetCurrentContext();
-  return current != EGL_NO_CONTEXT;
-}
-
-// Destroys the context held in _eglcontext and the surfaces current on this
-// thread, unbinds them, terminates the display connection and releases the
-// per-thread EGL state. Always returns true.
-bool AndroidGLContext::release_opengl_thread_context() {
-  // eglGetCurrentDisplay() returns an EGLDisplay, not an EGLContext.
-  EGLDisplay display = eglGetCurrentDisplay();
-  if (display != EGL_NO_DISPLAY) {
-    // Look up both bindings first: the same surface may be bound for drawing
-    // and reading, and it must be destroyed only once.
-    const EGLSurface drawSurface = eglGetCurrentSurface(EGL_DRAW);
-    const EGLSurface readSurface = eglGetCurrentSurface(EGL_READ);
-
-    if (_eglcontext != EGL_NO_CONTEXT) {
-      eglDestroyContext(display, _eglcontext);
-      _eglcontext = EGL_NO_CONTEXT;
-    }
-    if (drawSurface != EGL_NO_SURFACE) {
-      eglDestroySurface(display, drawSurface);
-    }
-    if (readSurface != EGL_NO_SURFACE && readSurface != drawSurface) {
-      eglDestroySurface(display, readSurface);
-    }
-    eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
-    eglTerminate(display);
-  }
-  eglReleaseThread();
-  return true;
-}
-
-// Runs gl3stubInit(); logs on success, throws via CAFFE_THROW on failure.
-void AndroidGLContext::init_gles3() {
-  const bool loaded = gl3stubInit();
-  if (loaded) {
-    LOG(INFO) << "OpenGL ES 3 successfully enabled";
-  } else {
-    CAFFE_THROW("OpenGL ES 3 not initialized");
-  }
-}
-
-// Identifies the GPU family from the leading alphabetic word of the
-// GL_RENDERER string.
-//
-// Returns the matching GL_Renderer value. Throws via
-// CAFFE_THROW("Unsupported GPU renderer") when the string is unavailable or
-// its first word is not in renderer_map().
-GL_Renderer AndroidGLContext::get_platform() {
-  // glGetString returns NULL on error (for example with no current context);
-  // constructing a std::string from a null pointer is undefined behaviour.
-  const char* raw = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
-  if (raw != nullptr) {
-    const std::string rendererStr(raw);
-    // Compiled once; the pattern never changes.
-    static const std::regex regStr("^[A-Za-z]*");
-    std::smatch matchs;
-    if (std::regex_search(rendererStr, matchs, regStr)) {
-      const std::string renderer = matchs.str();
-      auto found = renderer_map().find(renderer);
-      if (found != renderer_map().end()) {
-        return found->second;
-      }
-    }
-  }
-  CAFFE_THROW("Unsupported GPU renderer");
-}
-
-// Ensures the calling thread has an OpenGL ES context and that the ES 3 entry
-// points are loaded.
-//
-// If the thread has no current context, one is created and owned by this
-// object (_eglcontext); otherwise the existing context is reused and
-// _eglcontext stays EGL_NO_CONTEXT so the destructor leaves it alone.
-// Throws via CAFFE_THROW when ES 3 is unsupported or cannot be initialized.
-AndroidGLContext::AndroidGLContext() {
-  if (!opengl_thread_context_exists()) {
-    _eglcontext = create_opengl_thread_context();
-    LOG(INFO) << "New EGLContext created";
-
-    if (!supportOpenGLES3(&half_float_supported)) {
-      // The destructor does not run when a constructor throws, so give the
-      // freshly created context back before reporting the error.
-      release_opengl_thread_context();
-      CAFFE_THROW("OpenGL ES 3 not supported");
-    }
-
-    if (!isSupportedDevice()) {
-      LOG(ERROR) << "Device not fully supported";
-    }
-  } else {
-    _eglcontext = EGL_NO_CONTEXT;
-    LOG(INFO) << "Reusing EGLContext, make sure OpenGL ES 3 is supported";
-  }
-  static std::once_flag once;
-  try {
-    std::call_once(once, [&]() { init_gles3(); });
-  } catch (...) {
-    // Same reasoning as above: release an owned context before propagating.
-    if (_eglcontext != EGL_NO_CONTEXT) {
-      release_opengl_thread_context();
-    }
-    throw;
-  }
-}
-
-// Releases the EGL context only when _eglcontext holds one.
-AndroidGLContext::~AndroidGLContext() {
-  const bool owns_context = (_eglcontext != EGL_NO_CONTEXT);
-  if (owns_context) {
-    release_opengl_thread_context();
-  }
-}
-
-// Context hooks: all three are no-ops in this implementation.
-void AndroidGLContext::set_context() {}
-
-void AndroidGLContext::reset_context() {}
-
-void AndroidGLContext::flush_context() {}
+++ /dev/null
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-#include <unordered_map>
-
-// GPU renderer identifiers; PowerVR is listed but commented out.
-enum GL_Renderer { Adreno, Mali, Tegra /*, PowerVR */ };
-
-// GLContext implementation built on EGL.
-class AndroidGLContext : public GLContext {
- public:
-  AndroidGLContext();
-  ~AndroidGLContext();
-
-  void set_context();
-  void reset_context();
-  void flush_context();
-  void init_gles3();
-  GL_Renderer get_platform();
-
- private:
-  // Helpers managing the EGL context of the calling thread.
-  EGLContext create_opengl_thread_context();
-  bool opengl_thread_context_exists();
-  bool release_opengl_thread_context();
-
-  // EGL context handle held by this object.
-  EGLContext _eglcontext;
-};
+++ /dev/null
-# Collect every .cc and .c file below this directory.
-# NOTE: the glob is evaluated at configure time only; re-run CMake after
-# adding or removing source files.
-file(GLOB_RECURSE tmp *.cc *.c)
-# Append the collected files to Caffe2_CPU_SRCS in the parent scope.
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
+++ /dev/null
-
-#include "AndroidGLContext.h"
-
-// Process-wide GLContext instance; empty until first requested.
-std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
-
-// Creates the shared AndroidGLContext unless one already exists.
-void GLContext::initGLContext() {
-  if (_glcontext != nullptr) {
-    return;
-  }
-  _glcontext.reset(new AndroidGLContext());
-}
-
-// Returns the shared context, creating it on first use.
-GLContext* GLContext::getGLContext() {
-  // initGLContext() is a no-op when the instance already exists.
-  initGLContext();
-  return _glcontext.get();
-}
-
-// Destroys the shared context, if any.
-void GLContext::deleteGLContext() {
-  _glcontext.reset();
-}
+++ /dev/null
-
-#include "../core/GLImageAllocator.h"
-#include "../core/arm_neon_support.h"
-
-// Factory: returns a heap-allocated GLImageAllocator<T> created with `new`.
-template <typename T>
-GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
-  auto* allocator = new GLImageAllocator<T>();
-  return allocator;
-}
-
-// Explicit instantiations for the two element types used here.
-template GLImageAllocator<float16_t>*
-GLImageAllocator<float16_t>::newGLImageAllocator();
-template GLImageAllocator<uint8_t>*
-GLImageAllocator<uint8_t>::newGLImageAllocator();
+++ /dev/null
-
-#pragma once
-
-#include <arm_neon.h>
-// Half-precision float alias for the compiler's __fp16 type.
-// NOTE(review): some <arm_neon.h> versions already declare float16_t -
-// confirm this typedef does not clash on the supported toolchains.
-typedef __fp16 float16_t;
+++ /dev/null
-
-// clang-format off
-
-#include <EGL/egl.h>
-#include "gl3stub.h"
-
-// Looks up every OpenGL ES 3.0 entry point (plus the
-// GL_EXT_texture_border_clamp functions) through eglGetProcAddress and stores
-// the results in the global function pointers.
-//
-// Returns GL_TRUE when all ES 3.0 entry points were found, GL_FALSE otherwise.
-// The extension functions are looked up but do not affect the result.
-GLboolean gl3stubInit() {
-  // Cleared as soon as one required entry point is missing.
-  GLboolean all_required_found = GL_TRUE;
-
-  #define FIND_PROC(s) s = (void*)eglGetProcAddress(#s)
-  #define FIND_REQUIRED_PROC(s)        \
-    do {                               \
-      FIND_PROC(s);                    \
-      if (!s) {                        \
-        all_required_found = GL_FALSE; \
-      }                                \
-    } while (0)
-
-  FIND_REQUIRED_PROC(glReadBuffer);
-  FIND_REQUIRED_PROC(glDrawRangeElements);
-  FIND_REQUIRED_PROC(glTexImage3D);
-  FIND_REQUIRED_PROC(glTexSubImage3D);
-  FIND_REQUIRED_PROC(glCopyTexSubImage3D);
-  FIND_REQUIRED_PROC(glCompressedTexImage3D);
-  FIND_REQUIRED_PROC(glCompressedTexSubImage3D);
-  FIND_REQUIRED_PROC(glGenQueries);
-  FIND_REQUIRED_PROC(glDeleteQueries);
-  FIND_REQUIRED_PROC(glIsQuery);
-  FIND_REQUIRED_PROC(glBeginQuery);
-  FIND_REQUIRED_PROC(glEndQuery);
-  FIND_REQUIRED_PROC(glGetQueryiv);
-  FIND_REQUIRED_PROC(glGetQueryObjectuiv);
-  FIND_REQUIRED_PROC(glUnmapBuffer);
-  FIND_REQUIRED_PROC(glGetBufferPointerv);
-  FIND_REQUIRED_PROC(glDrawBuffers);
-  FIND_REQUIRED_PROC(glUniformMatrix2x3fv);
-  FIND_REQUIRED_PROC(glUniformMatrix3x2fv);
-  FIND_REQUIRED_PROC(glUniformMatrix2x4fv);
-  FIND_REQUIRED_PROC(glUniformMatrix4x2fv);
-  FIND_REQUIRED_PROC(glUniformMatrix3x4fv);
-  FIND_REQUIRED_PROC(glUniformMatrix4x3fv);
-  FIND_REQUIRED_PROC(glBlitFramebuffer);
-  FIND_REQUIRED_PROC(glRenderbufferStorageMultisample);
-  FIND_REQUIRED_PROC(glFramebufferTextureLayer);
-  FIND_REQUIRED_PROC(glMapBufferRange);
-  FIND_REQUIRED_PROC(glFlushMappedBufferRange);
-  FIND_REQUIRED_PROC(glBindVertexArray);
-  FIND_REQUIRED_PROC(glDeleteVertexArrays);
-  FIND_REQUIRED_PROC(glGenVertexArrays);
-  FIND_REQUIRED_PROC(glIsVertexArray);
-  FIND_REQUIRED_PROC(glGetIntegeri_v);
-  FIND_REQUIRED_PROC(glBeginTransformFeedback);
-  FIND_REQUIRED_PROC(glEndTransformFeedback);
-  FIND_REQUIRED_PROC(glBindBufferRange);
-  FIND_REQUIRED_PROC(glBindBufferBase);
-  FIND_REQUIRED_PROC(glTransformFeedbackVaryings);
-  FIND_REQUIRED_PROC(glGetTransformFeedbackVarying);
-  FIND_REQUIRED_PROC(glVertexAttribIPointer);
-  FIND_REQUIRED_PROC(glGetVertexAttribIiv);
-  FIND_REQUIRED_PROC(glGetVertexAttribIuiv);
-  FIND_REQUIRED_PROC(glVertexAttribI4i);
-  FIND_REQUIRED_PROC(glVertexAttribI4ui);
-  FIND_REQUIRED_PROC(glVertexAttribI4iv);
-  FIND_REQUIRED_PROC(glVertexAttribI4uiv);
-  FIND_REQUIRED_PROC(glGetUniformuiv);
-  FIND_REQUIRED_PROC(glGetFragDataLocation);
-  FIND_REQUIRED_PROC(glUniform1ui);
-  FIND_REQUIRED_PROC(glUniform2ui);
-  FIND_REQUIRED_PROC(glUniform3ui);
-  FIND_REQUIRED_PROC(glUniform4ui);
-  FIND_REQUIRED_PROC(glUniform1uiv);
-  FIND_REQUIRED_PROC(glUniform2uiv);
-  FIND_REQUIRED_PROC(glUniform3uiv);
-  FIND_REQUIRED_PROC(glUniform4uiv);
-  FIND_REQUIRED_PROC(glClearBufferiv);
-  FIND_REQUIRED_PROC(glClearBufferuiv);
-  FIND_REQUIRED_PROC(glClearBufferfv);
-  FIND_REQUIRED_PROC(glClearBufferfi);
-  FIND_REQUIRED_PROC(glGetStringi);
-  FIND_REQUIRED_PROC(glCopyBufferSubData);
-  FIND_REQUIRED_PROC(glGetUniformIndices);
-  FIND_REQUIRED_PROC(glGetActiveUniformsiv);
-  FIND_REQUIRED_PROC(glGetUniformBlockIndex);
-  FIND_REQUIRED_PROC(glGetActiveUniformBlockiv);
-  FIND_REQUIRED_PROC(glGetActiveUniformBlockName);
-  FIND_REQUIRED_PROC(glUniformBlockBinding);
-  FIND_REQUIRED_PROC(glDrawArraysInstanced);
-  FIND_REQUIRED_PROC(glDrawElementsInstanced);
-  FIND_REQUIRED_PROC(glFenceSync);
-  FIND_REQUIRED_PROC(glIsSync);
-  FIND_REQUIRED_PROC(glDeleteSync);
-  FIND_REQUIRED_PROC(glClientWaitSync);
-  FIND_REQUIRED_PROC(glWaitSync);
-  FIND_REQUIRED_PROC(glGetInteger64v);
-  FIND_REQUIRED_PROC(glGetSynciv);
-  FIND_REQUIRED_PROC(glGetInteger64i_v);
-  FIND_REQUIRED_PROC(glGetBufferParameteri64v);
-  FIND_REQUIRED_PROC(glGenSamplers);
-  FIND_REQUIRED_PROC(glDeleteSamplers);
-  FIND_REQUIRED_PROC(glIsSampler);
-  FIND_REQUIRED_PROC(glBindSampler);
-  FIND_REQUIRED_PROC(glSamplerParameteri);
-  FIND_REQUIRED_PROC(glSamplerParameteriv);
-  FIND_REQUIRED_PROC(glSamplerParameterf);
-  FIND_REQUIRED_PROC(glSamplerParameterfv);
-  FIND_REQUIRED_PROC(glGetSamplerParameteriv);
-  FIND_REQUIRED_PROC(glGetSamplerParameterfv);
-  FIND_REQUIRED_PROC(glVertexAttribDivisor);
-  FIND_REQUIRED_PROC(glBindTransformFeedback);
-  FIND_REQUIRED_PROC(glDeleteTransformFeedbacks);
-  FIND_REQUIRED_PROC(glGenTransformFeedbacks);
-  FIND_REQUIRED_PROC(glIsTransformFeedback);
-  FIND_REQUIRED_PROC(glPauseTransformFeedback);
-  FIND_REQUIRED_PROC(glResumeTransformFeedback);
-  FIND_REQUIRED_PROC(glGetProgramBinary);
-  FIND_REQUIRED_PROC(glProgramBinary);
-  FIND_REQUIRED_PROC(glProgramParameteri);
-  FIND_REQUIRED_PROC(glInvalidateFramebuffer);
-  FIND_REQUIRED_PROC(glInvalidateSubFramebuffer);
-  FIND_REQUIRED_PROC(glTexStorage2D);
-  FIND_REQUIRED_PROC(glTexStorage3D);
-  FIND_REQUIRED_PROC(glGetInternalformativ);
-
-  // Bind GL_EXT_texture_border_clamp (not part of the required set)
-
-  FIND_PROC(glTexParameterIivEXT);
-  FIND_PROC(glTexParameterIuivEXT);
-  FIND_PROC(glGetTexParameterIivEXT);
-  FIND_PROC(glGetTexParameterIuivEXT);
-  FIND_PROC(glSamplerParameterIivEXT);
-  FIND_PROC(glSamplerParameterIuivEXT);
-  FIND_PROC(glGetSamplerParameterIivEXT);
-  FIND_PROC(glGetSamplerParameterIuivEXT);
-
-  #undef FIND_REQUIRED_PROC
-  #undef FIND_PROC
-
-  return all_required_found;
-}
-
-/* Function pointer definitions.
- *
- * One global pointer per OpenGL ES 3.0 entry point. They have no initializer
- * and are assigned by gl3stubInit() from eglGetProcAddress(). */
-GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
-GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
-GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
-GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
-GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
-GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
-GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
-GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
-GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
-GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
-GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
-GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
-GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
-GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
-GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
-GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
-GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
-GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
-GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
-GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
-GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
-GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
-GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
-GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
-GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
-GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
-GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
-GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
-GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
-GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
-GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
-GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
-GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
-GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
-GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
-GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
-GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
-GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
-GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
-GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
-GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
-GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
-GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
-GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
-GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
-GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
-GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
-GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
-GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
-GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
-GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
-GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
-GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
-GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
-GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
-GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
-GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
-GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
-GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
-GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
-GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
-GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
-GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
-GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
-GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
-GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
-GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
-GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
-GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
-GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
-GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
-GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
-GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
-GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
-GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
-GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
-GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
-GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
-GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
-GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
-GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
-GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
-GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
-GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
-GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
-GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
-
-// GL_EXT_texture_border_clamp
-// These extension pointers are looked up by gl3stubInit() as well, but they
-// are not part of its success check, so test them for null before use.
-
-GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params);
-GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
-GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
-GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
-GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
-GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
-GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
-GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
-
-// End GL_EXT_texture_border_clamp
-
-// clang-format on
+++ /dev/null
-
-#ifndef __gl3_h_
-#define __gl3_h_
-
-/*
- * stub gl3.h for dynamic loading, based on:
- * gl3.h last updated on $Date: 2013-02-12 14:37:24 -0800 (Tue, 12 Feb 2013) $
- *
- * Changes:
- * - Added #include <GLES2/gl2.h>
- * - Removed duplicate OpenGL ES 2.0 declarations
- * - Converted OpenGL ES 3.0 function prototypes to function pointer
- * declarations
- * - Added gl3stubInit() declaration
- */
-
-#include <GLES2/gl2.h>
-#include <android/api-level.h>
-
-// clang-format off
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Call this function before calling any OpenGL ES 3.0 functions. It will
- * return GL_TRUE if the OpenGL ES 3.0 was successfully initialized, GL_FALSE
- * otherwise. */
-GLboolean gl3stubInit();
-
-/*-------------------------------------------------------------------------
- * Data type definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES 3.0 */
-
-typedef unsigned short GLhalf;
-#if __ANDROID_API__ <= 19
-typedef khronos_int64_t GLint64;
-typedef khronos_uint64_t GLuint64;
-typedef struct __GLsync *GLsync;
-#endif
-
-/*-------------------------------------------------------------------------
- * Token definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES core versions */
-#define GL_ES_VERSION_3_0 1
-
-/* OpenGL ES 3.0 */
-
-#define GL_READ_BUFFER 0x0C02
-#define GL_UNPACK_ROW_LENGTH 0x0CF2
-#define GL_UNPACK_SKIP_ROWS 0x0CF3
-#define GL_UNPACK_SKIP_PIXELS 0x0CF4
-#define GL_PACK_ROW_LENGTH 0x0D02
-#define GL_PACK_SKIP_ROWS 0x0D03
-#define GL_PACK_SKIP_PIXELS 0x0D04
-#define GL_COLOR 0x1800
-#define GL_DEPTH 0x1801
-#define GL_STENCIL 0x1802
-#define GL_RED 0x1903
-#define GL_RGB8 0x8051
-#define GL_RGBA8 0x8058
-#define GL_RGB10_A2 0x8059
-#define GL_TEXTURE_BINDING_3D 0x806A
-#define GL_UNPACK_SKIP_IMAGES 0x806D
-#define GL_UNPACK_IMAGE_HEIGHT 0x806E
-#define GL_TEXTURE_3D 0x806F
-#define GL_TEXTURE_WRAP_R 0x8072
-#define GL_MAX_3D_TEXTURE_SIZE 0x8073
-#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368
-#define GL_MAX_ELEMENTS_VERTICES 0x80E8
-#define GL_MAX_ELEMENTS_INDICES 0x80E9
-#define GL_TEXTURE_MIN_LOD 0x813A
-#define GL_TEXTURE_MAX_LOD 0x813B
-#define GL_TEXTURE_BASE_LEVEL 0x813C
-#define GL_TEXTURE_MAX_LEVEL 0x813D
-#define GL_MIN 0x8007
-#define GL_MAX 0x8008
-#define GL_DEPTH_COMPONENT24 0x81A6
-#define GL_MAX_TEXTURE_LOD_BIAS 0x84FD
-#define GL_TEXTURE_COMPARE_MODE 0x884C
-#define GL_TEXTURE_COMPARE_FUNC 0x884D
-#define GL_CURRENT_QUERY 0x8865
-#define GL_QUERY_RESULT 0x8866
-#define GL_QUERY_RESULT_AVAILABLE 0x8867
-#define GL_BUFFER_MAPPED 0x88BC
-#define GL_BUFFER_MAP_POINTER 0x88BD
-#define GL_STREAM_READ 0x88E1
-#define GL_STREAM_COPY 0x88E2
-#define GL_STATIC_READ 0x88E5
-#define GL_STATIC_COPY 0x88E6
-#define GL_DYNAMIC_READ 0x88E9
-#define GL_DYNAMIC_COPY 0x88EA
-#define GL_MAX_DRAW_BUFFERS 0x8824
-#define GL_DRAW_BUFFER0 0x8825
-#define GL_DRAW_BUFFER1 0x8826
-#define GL_DRAW_BUFFER2 0x8827
-#define GL_DRAW_BUFFER3 0x8828
-#define GL_DRAW_BUFFER4 0x8829
-#define GL_DRAW_BUFFER5 0x882A
-#define GL_DRAW_BUFFER6 0x882B
-#define GL_DRAW_BUFFER7 0x882C
-#define GL_DRAW_BUFFER8 0x882D
-#define GL_DRAW_BUFFER9 0x882E
-#define GL_DRAW_BUFFER10 0x882F
-#define GL_DRAW_BUFFER11 0x8830
-#define GL_DRAW_BUFFER12 0x8831
-#define GL_DRAW_BUFFER13 0x8832
-#define GL_DRAW_BUFFER14 0x8833
-#define GL_DRAW_BUFFER15 0x8834
-#define GL_MAX_FRAGMENT_UNIFORM_COMPONENTS 0x8B49
-#define GL_MAX_VERTEX_UNIFORM_COMPONENTS 0x8B4A
-#define GL_SAMPLER_3D 0x8B5F
-#define GL_SAMPLER_2D_SHADOW 0x8B62
-#define GL_FRAGMENT_SHADER_DERIVATIVE_HINT 0x8B8B
-#define GL_PIXEL_PACK_BUFFER 0x88EB
-#define GL_PIXEL_UNPACK_BUFFER 0x88EC
-#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED
-#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF
-#define GL_FLOAT_MAT2x3 0x8B65
-#define GL_FLOAT_MAT2x4 0x8B66
-#define GL_FLOAT_MAT3x2 0x8B67
-#define GL_FLOAT_MAT3x4 0x8B68
-#define GL_FLOAT_MAT4x2 0x8B69
-#define GL_FLOAT_MAT4x3 0x8B6A
-#define GL_SRGB 0x8C40
-#define GL_SRGB8 0x8C41
-#define GL_SRGB8_ALPHA8 0x8C43
-#define GL_COMPARE_REF_TO_TEXTURE 0x884E
-#define GL_MAJOR_VERSION 0x821B
-#define GL_MINOR_VERSION 0x821C
-#define GL_NUM_EXTENSIONS 0x821D
-#define GL_RGBA32F 0x8814
-#define GL_RGB32F 0x8815
-#define GL_RGBA16F 0x881A
-#define GL_RGB16F 0x881B
-#define GL_VERTEX_ATTRIB_ARRAY_INTEGER 0x88FD
-#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF
-#define GL_MIN_PROGRAM_TEXEL_OFFSET 0x8904
-#define GL_MAX_PROGRAM_TEXEL_OFFSET 0x8905
-#define GL_MAX_VARYING_COMPONENTS 0x8B4B
-#define GL_TEXTURE_2D_ARRAY 0x8C1A
-#define GL_TEXTURE_BINDING_2D_ARRAY 0x8C1D
-#define GL_R11F_G11F_B10F 0x8C3A
-#define GL_UNSIGNED_INT_10F_11F_11F_REV 0x8C3B
-#define GL_RGB9_E5 0x8C3D
-#define GL_UNSIGNED_INT_5_9_9_9_REV 0x8C3E
-#define GL_TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH 0x8C76
-#define GL_TRANSFORM_FEEDBACK_BUFFER_MODE 0x8C7F
-#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS 0x8C80
-#define GL_TRANSFORM_FEEDBACK_VARYINGS 0x8C83
-#define GL_TRANSFORM_FEEDBACK_BUFFER_START 0x8C84
-#define GL_TRANSFORM_FEEDBACK_BUFFER_SIZE 0x8C85
-#define GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN 0x8C88
-#define GL_RASTERIZER_DISCARD 0x8C89
-#define GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 0x8C8A
-#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS 0x8C8B
-#define GL_INTERLEAVED_ATTRIBS 0x8C8C
-#define GL_SEPARATE_ATTRIBS 0x8C8D
-#define GL_TRANSFORM_FEEDBACK_BUFFER 0x8C8E
-#define GL_TRANSFORM_FEEDBACK_BUFFER_BINDING 0x8C8F
-#define GL_RGBA32UI 0x8D70
-#define GL_RGB32UI 0x8D71
-#define GL_RGBA16UI 0x8D76
-#define GL_RGB16UI 0x8D77
-#define GL_RGBA8UI 0x8D7C
-#define GL_RGB8UI 0x8D7D
-#define GL_RGBA32I 0x8D82
-#define GL_RGB32I 0x8D83
-#define GL_RGBA16I 0x8D88
-#define GL_RGB16I 0x8D89
-#define GL_RGBA8I 0x8D8E
-#define GL_RGB8I 0x8D8F
-#define GL_RED_INTEGER 0x8D94
-#define GL_RGB_INTEGER 0x8D98
-#define GL_RGBA_INTEGER 0x8D99
-#define GL_SAMPLER_2D_ARRAY 0x8DC1
-#define GL_SAMPLER_2D_ARRAY_SHADOW 0x8DC4
-#define GL_SAMPLER_CUBE_SHADOW 0x8DC5
-#define GL_UNSIGNED_INT_VEC2 0x8DC6
-#define GL_UNSIGNED_INT_VEC3 0x8DC7
-#define GL_UNSIGNED_INT_VEC4 0x8DC8
-#define GL_INT_SAMPLER_2D 0x8DCA
-#define GL_INT_SAMPLER_3D 0x8DCB
-#define GL_INT_SAMPLER_CUBE 0x8DCC
-#define GL_INT_SAMPLER_2D_ARRAY 0x8DCF
-#define GL_UNSIGNED_INT_SAMPLER_2D 0x8DD2
-#define GL_UNSIGNED_INT_SAMPLER_3D 0x8DD3
-#define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4
-#define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7
-#define GL_BUFFER_ACCESS_FLAGS 0x911F
-#define GL_BUFFER_MAP_LENGTH 0x9120
-#define GL_BUFFER_MAP_OFFSET 0x9121
-#define GL_DEPTH_COMPONENT32F 0x8CAC
-#define GL_DEPTH32F_STENCIL8 0x8CAD
-#define GL_FLOAT_32_UNSIGNED_INT_24_8_REV 0x8DAD
-#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING 0x8210
-#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE 0x8211
-#define GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE 0x8212
-#define GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE 0x8213
-#define GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE 0x8214
-#define GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE 0x8215
-#define GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE 0x8216
-#define GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE 0x8217
-#define GL_FRAMEBUFFER_DEFAULT 0x8218
-#define GL_FRAMEBUFFER_UNDEFINED 0x8219
-#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
-#define GL_DEPTH_STENCIL 0x84F9
-#define GL_UNSIGNED_INT_24_8 0x84FA
-#define GL_DEPTH24_STENCIL8 0x88F0
-#define GL_UNSIGNED_NORMALIZED 0x8C17
-#define GL_DRAW_FRAMEBUFFER_BINDING GL_FRAMEBUFFER_BINDING
-#define GL_READ_FRAMEBUFFER 0x8CA8
-#define GL_DRAW_FRAMEBUFFER 0x8CA9
-#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA
-#define GL_RENDERBUFFER_SAMPLES 0x8CAB
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER 0x8CD4
-#define GL_MAX_COLOR_ATTACHMENTS 0x8CDF
-#define GL_COLOR_ATTACHMENT1 0x8CE1
-#define GL_COLOR_ATTACHMENT2 0x8CE2
-#define GL_COLOR_ATTACHMENT3 0x8CE3
-#define GL_COLOR_ATTACHMENT4 0x8CE4
-#define GL_COLOR_ATTACHMENT5 0x8CE5
-#define GL_COLOR_ATTACHMENT6 0x8CE6
-#define GL_COLOR_ATTACHMENT7 0x8CE7
-#define GL_COLOR_ATTACHMENT8 0x8CE8
-#define GL_COLOR_ATTACHMENT9 0x8CE9
-#define GL_COLOR_ATTACHMENT10 0x8CEA
-#define GL_COLOR_ATTACHMENT11 0x8CEB
-#define GL_COLOR_ATTACHMENT12 0x8CEC
-#define GL_COLOR_ATTACHMENT13 0x8CED
-#define GL_COLOR_ATTACHMENT14 0x8CEE
-#define GL_COLOR_ATTACHMENT15 0x8CEF
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE 0x8D56
-#define GL_MAX_SAMPLES 0x8D57
-#define GL_HALF_FLOAT 0x140B
-#define GL_MAP_READ_BIT 0x0001
-#define GL_MAP_WRITE_BIT 0x0002
-#define GL_MAP_INVALIDATE_RANGE_BIT 0x0004
-#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008
-#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010
-#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020
-#define GL_RG 0x8227
-#define GL_RG_INTEGER 0x8228
-#define GL_R8 0x8229
-#define GL_RG8 0x822B
-#define GL_R16F 0x822D
-#define GL_R32F 0x822E
-#define GL_RG16F 0x822F
-#define GL_RG32F 0x8230
-#define GL_R8I 0x8231
-#define GL_R8UI 0x8232
-#define GL_R16I 0x8233
-#define GL_R16UI 0x8234
-#define GL_R32I 0x8235
-#define GL_R32UI 0x8236
-#define GL_RG8I 0x8237
-#define GL_RG8UI 0x8238
-#define GL_RG16I 0x8239
-#define GL_RG16UI 0x823A
-#define GL_RG32I 0x823B
-#define GL_RG32UI 0x823C
-#define GL_VERTEX_ARRAY_BINDING 0x85B5
-#define GL_R8_SNORM 0x8F94
-#define GL_RG8_SNORM 0x8F95
-#define GL_RGB8_SNORM 0x8F96
-#define GL_RGBA8_SNORM 0x8F97
-#define GL_SIGNED_NORMALIZED 0x8F9C
-#define GL_PRIMITIVE_RESTART_FIXED_INDEX 0x8D69
-#define GL_COPY_READ_BUFFER 0x8F36
-#define GL_COPY_WRITE_BUFFER 0x8F37
-#define GL_COPY_READ_BUFFER_BINDING GL_COPY_READ_BUFFER
-#define GL_COPY_WRITE_BUFFER_BINDING GL_COPY_WRITE_BUFFER
-#define GL_UNIFORM_BUFFER 0x8A11
-#define GL_UNIFORM_BUFFER_BINDING 0x8A28
-#define GL_UNIFORM_BUFFER_START 0x8A29
-#define GL_UNIFORM_BUFFER_SIZE 0x8A2A
-#define GL_MAX_VERTEX_UNIFORM_BLOCKS 0x8A2B
-#define GL_MAX_FRAGMENT_UNIFORM_BLOCKS 0x8A2D
-#define GL_MAX_COMBINED_UNIFORM_BLOCKS 0x8A2E
-#define GL_MAX_UNIFORM_BUFFER_BINDINGS 0x8A2F
-#define GL_MAX_UNIFORM_BLOCK_SIZE 0x8A30
-#define GL_MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS 0x8A31
-#define GL_MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS 0x8A33
-#define GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT 0x8A34
-#define GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH 0x8A35
-#define GL_ACTIVE_UNIFORM_BLOCKS 0x8A36
-#define GL_UNIFORM_TYPE 0x8A37
-#define GL_UNIFORM_SIZE 0x8A38
-#define GL_UNIFORM_NAME_LENGTH 0x8A39
-#define GL_UNIFORM_BLOCK_INDEX 0x8A3A
-#define GL_UNIFORM_OFFSET 0x8A3B
-#define GL_UNIFORM_ARRAY_STRIDE 0x8A3C
-#define GL_UNIFORM_MATRIX_STRIDE 0x8A3D
-#define GL_UNIFORM_IS_ROW_MAJOR 0x8A3E
-#define GL_UNIFORM_BLOCK_BINDING 0x8A3F
-#define GL_UNIFORM_BLOCK_DATA_SIZE 0x8A40
-#define GL_UNIFORM_BLOCK_NAME_LENGTH 0x8A41
-#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORMS 0x8A42
-#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES 0x8A43
-#define GL_UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER 0x8A44
-#define GL_UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER 0x8A46
-#define GL_INVALID_INDEX 0xFFFFFFFFu
-#define GL_MAX_VERTEX_OUTPUT_COMPONENTS 0x9122
-#define GL_MAX_FRAGMENT_INPUT_COMPONENTS 0x9125
-#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111
-#define GL_OBJECT_TYPE 0x9112
-#define GL_SYNC_CONDITION 0x9113
-#define GL_SYNC_STATUS 0x9114
-#define GL_SYNC_FLAGS 0x9115
-#define GL_SYNC_FENCE 0x9116
-#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117
-#define GL_UNSIGNALED 0x9118
-#define GL_SIGNALED 0x9119
-#define GL_ALREADY_SIGNALED 0x911A
-#define GL_TIMEOUT_EXPIRED 0x911B
-#define GL_CONDITION_SATISFIED 0x911C
-#define GL_WAIT_FAILED 0x911D
-#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001
-#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull
-#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR 0x88FE
-#define GL_ANY_SAMPLES_PASSED 0x8C2F
-#define GL_ANY_SAMPLES_PASSED_CONSERVATIVE 0x8D6A
-#define GL_SAMPLER_BINDING 0x8919
-#define GL_RGB10_A2UI 0x906F
-#define GL_TEXTURE_SWIZZLE_R 0x8E42
-#define GL_TEXTURE_SWIZZLE_G 0x8E43
-#define GL_TEXTURE_SWIZZLE_B 0x8E44
-#define GL_TEXTURE_SWIZZLE_A 0x8E45
-#define GL_GREEN 0x1904
-#define GL_BLUE 0x1905
-#define GL_INT_2_10_10_10_REV 0x8D9F
-#define GL_TRANSFORM_FEEDBACK 0x8E22
-#define GL_TRANSFORM_FEEDBACK_PAUSED 0x8E23
-#define GL_TRANSFORM_FEEDBACK_ACTIVE 0x8E24
-#define GL_TRANSFORM_FEEDBACK_BINDING 0x8E25
-#define GL_PROGRAM_BINARY_RETRIEVABLE_HINT 0x8257
-#define GL_PROGRAM_BINARY_LENGTH 0x8741
-#define GL_NUM_PROGRAM_BINARY_FORMATS 0x87FE
-#define GL_PROGRAM_BINARY_FORMATS 0x87FF
-#define GL_COMPRESSED_R11_EAC 0x9270
-#define GL_COMPRESSED_SIGNED_R11_EAC 0x9271
-#define GL_COMPRESSED_RG11_EAC 0x9272
-#define GL_COMPRESSED_SIGNED_RG11_EAC 0x9273
-#define GL_COMPRESSED_RGB8_ETC2 0x9274
-#define GL_COMPRESSED_SRGB8_ETC2 0x9275
-#define GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9276
-#define GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2 0x9277
-#define GL_COMPRESSED_RGBA8_ETC2_EAC 0x9278
-#define GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC 0x9279
-#define GL_TEXTURE_IMMUTABLE_FORMAT 0x912F
-#define GL_MAX_ELEMENT_INDEX 0x8D6B
-#define GL_NUM_SAMPLE_COUNTS 0x9380
-#define GL_TEXTURE_IMMUTABLE_LEVELS 0x82DF
-
-/*-------------------------------------------------------------------------
- * Entrypoint definitions
- *-----------------------------------------------------------------------*/
-
-/* OpenGL ES 3.0 */
-
-extern GL_APICALL void (* GL_APIENTRY glReadBuffer) (GLenum mode);
-extern GL_APICALL void (* GL_APIENTRY glDrawRangeElements) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid* indices);
-extern GL_APICALL void (* GL_APIENTRY glTexImage3D) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid* pixels);
-extern GL_APICALL void (* GL_APIENTRY glTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid* pixels);
-extern GL_APICALL void (* GL_APIENTRY glCopyTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
-extern GL_APICALL void (* GL_APIENTRY glCompressedTexImage3D) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid* data);
-extern GL_APICALL void (* GL_APIENTRY glCompressedTexSubImage3D) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid* data);
-extern GL_APICALL void (* GL_APIENTRY glGenQueries) (GLsizei n, GLuint* ids);
-extern GL_APICALL void (* GL_APIENTRY glDeleteQueries) (GLsizei n, const GLuint* ids);
-extern GL_APICALL GLboolean (* GL_APIENTRY glIsQuery) (GLuint id);
-extern GL_APICALL void (* GL_APIENTRY glBeginQuery) (GLenum target, GLuint id);
-extern GL_APICALL void (* GL_APIENTRY glEndQuery) (GLenum target);
-extern GL_APICALL void (* GL_APIENTRY glGetQueryiv) (GLenum target, GLenum pname, GLint* params);
-extern GL_APICALL void (* GL_APIENTRY glGetQueryObjectuiv) (GLuint id, GLenum pname, GLuint* params);
-extern GL_APICALL GLboolean (* GL_APIENTRY glUnmapBuffer) (GLenum target);
-extern GL_APICALL void (* GL_APIENTRY glGetBufferPointerv) (GLenum target, GLenum pname, GLvoid** params);
-extern GL_APICALL void (* GL_APIENTRY glDrawBuffers) (GLsizei n, const GLenum* bufs);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix2x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x2fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix3x4fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glUniformMatrix4x3fv) (GLint location, GLsizei count, GLboolean transpose, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glBlitFramebuffer) (GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-extern GL_APICALL void (* GL_APIENTRY glRenderbufferStorageMultisample) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-extern GL_APICALL void (* GL_APIENTRY glFramebufferTextureLayer) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
-extern GL_APICALL GLvoid* (* GL_APIENTRY glMapBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-extern GL_APICALL void (* GL_APIENTRY glFlushMappedBufferRange) (GLenum target, GLintptr offset, GLsizeiptr length);
-extern GL_APICALL void (* GL_APIENTRY glBindVertexArray) (GLuint array);
-extern GL_APICALL void (* GL_APIENTRY glDeleteVertexArrays) (GLsizei n, const GLuint* arrays);
-extern GL_APICALL void (* GL_APIENTRY glGenVertexArrays) (GLsizei n, GLuint* arrays);
-extern GL_APICALL GLboolean (* GL_APIENTRY glIsVertexArray) (GLuint array);
-extern GL_APICALL void (* GL_APIENTRY glGetIntegeri_v) (GLenum target, GLuint index, GLint* data);
-extern GL_APICALL void (* GL_APIENTRY glBeginTransformFeedback) (GLenum primitiveMode);
-extern GL_APICALL void (* GL_APIENTRY glEndTransformFeedback) (void);
-extern GL_APICALL void (* GL_APIENTRY glBindBufferRange) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
-extern GL_APICALL void (* GL_APIENTRY glBindBufferBase) (GLenum target, GLuint index, GLuint buffer);
-extern GL_APICALL void (* GL_APIENTRY glTransformFeedbackVaryings) (GLuint program, GLsizei count, const GLchar* const* varyings, GLenum bufferMode);
-extern GL_APICALL void (* GL_APIENTRY glGetTransformFeedbackVarying) (GLuint program, GLuint index, GLsizei bufSize, GLsizei* length, GLsizei* size, GLenum* type, GLchar* name);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribIPointer) (GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid* pointer);
-extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIiv) (GLuint index, GLenum pname, GLint* params);
-extern GL_APICALL void (* GL_APIENTRY glGetVertexAttribIuiv) (GLuint index, GLenum pname, GLuint* params);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4i) (GLuint index, GLint x, GLint y, GLint z, GLint w);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4ui) (GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4iv) (GLuint index, const GLint* v);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribI4uiv) (GLuint index, const GLuint* v);
-extern GL_APICALL void (* GL_APIENTRY glGetUniformuiv) (GLuint program, GLint location, GLuint* params);
-extern GL_APICALL GLint (* GL_APIENTRY glGetFragDataLocation) (GLuint program, const GLchar *name);
-extern GL_APICALL void (* GL_APIENTRY glUniform1ui) (GLint location, GLuint v0);
-extern GL_APICALL void (* GL_APIENTRY glUniform2ui) (GLint location, GLuint v0, GLuint v1);
-extern GL_APICALL void (* GL_APIENTRY glUniform3ui) (GLint location, GLuint v0, GLuint v1, GLuint v2);
-extern GL_APICALL void (* GL_APIENTRY glUniform4ui) (GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
-extern GL_APICALL void (* GL_APIENTRY glUniform1uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void (* GL_APIENTRY glUniform2uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void (* GL_APIENTRY glUniform3uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void (* GL_APIENTRY glUniform4uiv) (GLint location, GLsizei count, const GLuint* value);
-extern GL_APICALL void (* GL_APIENTRY glClearBufferiv) (GLenum buffer, GLint drawbuffer, const GLint* value);
-extern GL_APICALL void (* GL_APIENTRY glClearBufferuiv) (GLenum buffer, GLint drawbuffer, const GLuint* value);
-extern GL_APICALL void (* GL_APIENTRY glClearBufferfv) (GLenum buffer, GLint drawbuffer, const GLfloat* value);
-extern GL_APICALL void (* GL_APIENTRY glClearBufferfi) (GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
-extern GL_APICALL const GLubyte* (* GL_APIENTRY glGetStringi) (GLenum name, GLuint index);
-extern GL_APICALL void (* GL_APIENTRY glCopyBufferSubData) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
-extern GL_APICALL void (* GL_APIENTRY glGetUniformIndices) (GLuint program, GLsizei uniformCount, const GLchar* const* uniformNames, GLuint* uniformIndices);
-extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformsiv) (GLuint program, GLsizei uniformCount, const GLuint* uniformIndices, GLenum pname, GLint* params);
-extern GL_APICALL GLuint (* GL_APIENTRY glGetUniformBlockIndex) (GLuint program, const GLchar* uniformBlockName);
-extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockiv) (GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint* params);
-extern GL_APICALL void (* GL_APIENTRY glGetActiveUniformBlockName) (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei* length, GLchar* uniformBlockName);
-extern GL_APICALL void (* GL_APIENTRY glUniformBlockBinding) (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-extern GL_APICALL void (* GL_APIENTRY glDrawArraysInstanced) (GLenum mode, GLint first, GLsizei count, GLsizei instanceCount);
-extern GL_APICALL void (* GL_APIENTRY glDrawElementsInstanced) (GLenum mode, GLsizei count, GLenum type, const GLvoid* indices, GLsizei instanceCount);
-extern GL_APICALL GLsync (* GL_APIENTRY glFenceSync) (GLenum condition, GLbitfield flags);
-extern GL_APICALL GLboolean (* GL_APIENTRY glIsSync) (GLsync sync);
-extern GL_APICALL void (* GL_APIENTRY glDeleteSync) (GLsync sync);
-extern GL_APICALL GLenum (* GL_APIENTRY glClientWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-extern GL_APICALL void (* GL_APIENTRY glWaitSync) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-extern GL_APICALL void (* GL_APIENTRY glGetInteger64v) (GLenum pname, GLint64* params);
-extern GL_APICALL void (* GL_APIENTRY glGetSynciv) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei* length, GLint* values);
-extern GL_APICALL void (* GL_APIENTRY glGetInteger64i_v) (GLenum target, GLuint index, GLint64* data);
-extern GL_APICALL void (* GL_APIENTRY glGetBufferParameteri64v) (GLenum target, GLenum pname, GLint64* params);
-extern GL_APICALL void (* GL_APIENTRY glGenSamplers) (GLsizei count, GLuint* samplers);
-extern GL_APICALL void (* GL_APIENTRY glDeleteSamplers) (GLsizei count, const GLuint* samplers);
-extern GL_APICALL GLboolean (* GL_APIENTRY glIsSampler) (GLuint sampler);
-extern GL_APICALL void (* GL_APIENTRY glBindSampler) (GLuint unit, GLuint sampler);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameteri) (GLuint sampler, GLenum pname, GLint param);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameteriv) (GLuint sampler, GLenum pname, const GLint* param);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameterf) (GLuint sampler, GLenum pname, GLfloat param);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameterfv) (GLuint sampler, GLenum pname, const GLfloat* param);
-extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameteriv) (GLuint sampler, GLenum pname, GLint* params);
-extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterfv) (GLuint sampler, GLenum pname, GLfloat* params);
-extern GL_APICALL void (* GL_APIENTRY glVertexAttribDivisor) (GLuint index, GLuint divisor);
-extern GL_APICALL void (* GL_APIENTRY glBindTransformFeedback) (GLenum target, GLuint id);
-extern GL_APICALL void (* GL_APIENTRY glDeleteTransformFeedbacks) (GLsizei n, const GLuint* ids);
-extern GL_APICALL void (* GL_APIENTRY glGenTransformFeedbacks) (GLsizei n, GLuint* ids);
-extern GL_APICALL GLboolean (* GL_APIENTRY glIsTransformFeedback) (GLuint id);
-extern GL_APICALL void (* GL_APIENTRY glPauseTransformFeedback) (void);
-extern GL_APICALL void (* GL_APIENTRY glResumeTransformFeedback) (void);
-extern GL_APICALL void (* GL_APIENTRY glGetProgramBinary) (GLuint program, GLsizei bufSize, GLsizei* length, GLenum* binaryFormat, GLvoid* binary);
-extern GL_APICALL void (* GL_APIENTRY glProgramBinary) (GLuint program, GLenum binaryFormat, const GLvoid* binary, GLsizei length);
-extern GL_APICALL void (* GL_APIENTRY glProgramParameteri) (GLuint program, GLenum pname, GLint value);
-extern GL_APICALL void (* GL_APIENTRY glInvalidateFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments);
-extern GL_APICALL void (* GL_APIENTRY glInvalidateSubFramebuffer) (GLenum target, GLsizei numAttachments, const GLenum* attachments, GLint x, GLint y, GLsizei width, GLsizei height);
-extern GL_APICALL void (* GL_APIENTRY glTexStorage2D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-extern GL_APICALL void (* GL_APIENTRY glTexStorage3D) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-extern GL_APICALL void (* GL_APIENTRY glGetInternalformativ) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint* params);
-
-#ifndef GL_EXT_texture_border_clamp /* declare the extension only if no earlier header already did */
-#define GL_EXT_texture_border_clamp 1 /* marks the extension's tokens and entry points as declared */
-#define GL_TEXTURE_BORDER_COLOR_EXT 0x1004
-#define GL_CLAMP_TO_BORDER_EXT 0x812D
-extern GL_APICALL void (* GL_APIENTRY glTexParameterIivEXT) (GLenum target, GLenum pname, const GLint *params); /* the eight entry points below are extern function pointers, not prototypes; their definitions are not visible in this header */
-extern GL_APICALL void (* GL_APIENTRY glTexParameterIuivEXT) (GLenum target, GLenum pname, const GLuint *params);
-extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIivEXT) (GLenum target, GLenum pname, GLint *params);
-extern GL_APICALL void (* GL_APIENTRY glGetTexParameterIuivEXT) (GLenum target, GLenum pname, GLuint *params);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIivEXT) (GLuint sampler, GLenum pname, const GLint *param);
-extern GL_APICALL void (* GL_APIENTRY glSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, const GLuint *param);
-extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIivEXT) (GLuint sampler, GLenum pname, GLint *params);
-extern GL_APICALL void (* GL_APIENTRY glGetSamplerParameterIuivEXT) (GLuint sampler, GLenum pname, GLuint *params);
-#endif /* GL_EXT_texture_border_clamp */
-
-#ifdef __cplusplus
-}
-#endif
-// clang-format on
-
-#endif
+++ /dev/null
-file(GLOB_RECURSE tmp *.cc)  # collect every .cc file under this directory, recursively; the glob is evaluated at configure time only
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)  # append them to the parent scope's Caffe2_CPU_SRCS list
+++ /dev/null
-
-#include "DataTransfer.h"
-#include "GLLogging.h"
-
-#include "caffe2/core/common.h"
-
-// De-interleaving 4x4 load of uint16 lanes from `address`.
-// The caller promises the compiler that `address` is 16-byte aligned;
-// passing a less aligned pointer is undefined behaviour.
-inline uint16x4x4_t vld4_u16_aligned16(const uint16_t* address) {
-  const void* aligned_src = __builtin_assume_aligned(address, 16);
-  return vld4_u16(static_cast<const uint16_t*>(aligned_src));
-}
-
-// Loads four uint16 lanes from `address`.
-// The caller promises the compiler that `address` is 8-byte aligned.
-inline uint16x4_t vld1_u16_aligned8(const uint16_t* address) {
-  const void* aligned_src = __builtin_assume_aligned(address, 8);
-  return vld1_u16(static_cast<const uint16_t*>(aligned_src));
-}
-
-// Interleaving 4x4 store of uint16 lanes to `address`.
-// The caller promises the compiler that `address` is 16-byte aligned;
-// passing a less aligned pointer is undefined behaviour.
-inline void vst4_u16_aligned16(uint16_t* address, uint16x4x4_t data) {
-  void* aligned_dst = __builtin_assume_aligned(address, 16);
-  vst4_u16(static_cast<uint16_t*>(aligned_dst), data);
-}
-
-// Stores four uint16 lanes to `address`.
-// The caller promises the compiler that `address` is 8-byte aligned.
-inline void vst1_u16_aligned8(uint16_t* address, uint16x4_t data) {
-  void* aligned_dst = __builtin_assume_aligned(address, 8);
-  vst1_u16(static_cast<uint16_t*>(aligned_dst), data);
-}
-
-// Reads the next four pixels from each of the first `input_channels` planar
-// float channels, converts them to half precision and returns them as the
-// four lane vectors expected by vst4 (r, g, b, a). Channels beyond
-// `input_channels` are filled with zero and their pointers are neither read
-// nor advanced. Every pointer that is read is advanced by four floats.
-template <int input_channels>
-static inline uint16x4x4_t loadFourPixelsAsHalf(
-    const float*& input_r, const float*& input_g, const float*& input_b, const float*& input_a) {
-  const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
-  input_r += 4;
-  uint16x4_t g, b, a;
-  g = b = a = vdup_n_u16(0);
-  if (input_channels >= 2) {
-    g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
-    input_g += 4;
-    if (input_channels >= 3) {
-      b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
-      input_b += 4;
-      if (input_channels >= 4) {
-        a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
-        input_a += 4;
-      }
-    }
-  }
-  return (uint16x4x4_t){{r, g, b, a}};
-}
-
-// Packs up to four planar float channels (each `height * width` floats, laid
-// out back to back starting at `input`) into interleaved half-precision RGBA
-// pixels at `output`.
-//
-//   output      destination; four uint16 values (8 bytes) per pixel
-//   input       first float of the first channel plane
-//   width       pixels per row
-//   height      number of rows
-//   row_stride  distance between consecutive output rows, in pixels
-//
-// Rows of at least four pixels are processed four pixels at a time; a
-// remainder of 1-3 pixels is handled by stepping back and re-converting the
-// last four pixels of the row, so part of the row is written twice with the
-// same values.
-template <int input_channels>
-static void interleaveSlice(
-    void* output, const float* input, size_t width, size_t height, size_t row_stride) {
-  const float* input_r = input;
-  const float* input_g = input_r + height * width;
-  const float* input_b = input_g + height * width;
-  const float* input_a = input_b + height * width;
-  uint16_t* output_f16 = static_cast<uint16_t*>(output);
-  if (width >= 4) {
-    for (size_t y = 0; y < height; y++) {
-      size_t nx = width;
-      for (; nx >= 4; nx -= 4) {
-        const uint16x4x4_t rgba =
-            loadFourPixelsAsHalf<input_channels>(input_r, input_g, input_b, input_a);
-        vst4_u16_aligned16(output_f16, rgba);
-        output_f16 += 4 * 4;
-      }
-      if (nx != 0) {
-        // Step back so that the final group of four ends exactly at the end
-        // of the row.
-        const size_t overlap = 4 - nx;
-        output_f16 -= overlap * 4;
-        input_r -= overlap;
-        if (input_channels >= 2) {
-          input_g -= overlap;
-          if (input_channels >= 3) {
-            input_b -= overlap;
-            if (input_channels >= 4) {
-              input_a -= overlap;
-            }
-          }
-        }
-
-        const uint16x4x4_t rgba =
-            loadFourPixelsAsHalf<input_channels>(input_r, input_g, input_b, input_a);
-        // The destination was moved back by `overlap` pixels of 8 bytes each,
-        // so for an odd overlap it is no longer 16-byte aligned. Use the plain
-        // store here instead of the variant that promises 16-byte alignment.
-        vst4_u16(output_f16, rgba);
-        output_f16 += 4 * 4;
-      }
-      // Skip the padding between the end of this row and the start of the next.
-      output_f16 += (row_stride - width) * 4;
-    }
-  } else {
-    // Narrow rows: build and store one pixel at a time.
-    for (size_t y = 0; y < height; y++) {
-      for (size_t x = 0; x < width; x++) {
-        float32x4_t rgba = vld1q_dup_f32(input_r++);
-        if (input_channels >= 2) {
-          rgba = vld1q_lane_f32(input_g++, rgba, 1);
-          if (input_channels >= 3) {
-            rgba = vld1q_lane_f32(input_b++, rgba, 2);
-            if (input_channels >= 4) {
-              rgba = vld1q_lane_f32(input_a++, rgba, 3);
-            }
-          }
-        }
-        vst1_u16_aligned8(output_f16, uint16x4_t(vcvt_f16_f32(rgba)));
-        output_f16 += 4;
-      }
-      output_f16 += (row_stride - width) * 4;
-    }
-  }
-}
-
-// Runtime entry point: forwards to the interleaveSlice<N> instantiation that
-// matches `input_channels`. Channel counts other than 1-4 are ignored and
-// leave `output` untouched.
-void interleaveSlice(void* output,
-                     const float* input,
-                     size_t width,
-                     size_t height,
-                     size_t row_stride,
-                     uint16_t input_channels) {
-  if (input_channels == 1) {
-    interleaveSlice<1>(output, input, width, height, row_stride);
-  } else if (input_channels == 2) {
-    interleaveSlice<2>(output, input, width, height, row_stride);
-  } else if (input_channels == 3) {
-    interleaveSlice<3>(output, input, width, height, row_stride);
-  } else if (input_channels == 4) {
-    interleaveSlice<4>(output, input, width, height, row_stride);
-  }
-}
-
-// Converts four de-interleaved half-precision pixels (`rgba.val[0..3]`) to
-// float and writes them to the first `output_channels` planar outputs.
-// Every pointer that is written is advanced by four floats; the remaining
-// pointers are left untouched.
-template <int output_channels>
-static inline void storeFourPixelsAsFloat(const uint16x4x4_t& rgba,
-                                          float*& output_r,
-                                          float*& output_g,
-                                          float*& output_b,
-                                          float*& output_a) {
-  const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
-  vst1q_f32(output_r, r);
-  output_r += 4;
-  if (output_channels >= 2) {
-    const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
-    vst1q_f32(output_g, g);
-    output_g += 4;
-    if (output_channels >= 3) {
-      const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
-      vst1q_f32(output_b, b);
-      output_b += 4;
-      if (output_channels >= 4) {
-        const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
-        vst1q_f32(output_a, a);
-        output_a += 4;
-      }
-    }
-  }
-}
-
-// Unpacks interleaved half-precision RGBA pixels at `input` into up to four
-// planar float channels (each `height * width` floats, laid out back to back
-// starting at `output`).
-//
-//   output      first float of the first channel plane
-//   input       source; four uint16 values (8 bytes) per pixel
-//   width       pixels per row
-//   height      number of rows
-//   row_stride  distance between consecutive input rows, in pixels
-//
-// Rows of at least four pixels are processed four pixels at a time; a
-// remainder of 1-3 pixels is handled by stepping back and re-converting the
-// last four pixels of the row, so part of the row is written twice with the
-// same values.
-template <int output_channels>
-static void deInterleaveSlice(
-    float* output, const void* input, size_t width, size_t height, size_t row_stride) {
-  float* output_r = output;
-  float* output_g = output_r + height * width;
-  float* output_b = output_g + height * width;
-  float* output_a = output_b + height * width;
-  const uint16_t* input_f16 = static_cast<const uint16_t*>(input);
-  if (width >= 4) {
-    for (size_t y = 0; y < height; y++) {
-      size_t nx = width;
-      for (; nx >= 4; nx -= 4) {
-        const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
-        input_f16 += 4 * 4;
-        storeFourPixelsAsFloat<output_channels>(rgba, output_r, output_g, output_b, output_a);
-      }
-      if (nx != 0) {
-        // Step back so that the final group of four ends exactly at the end
-        // of the row.
-        const size_t overlap = 4 - nx;
-        input_f16 -= overlap * 4;
-        output_r -= overlap;
-        if (output_channels >= 2) {
-          output_g -= overlap;
-          if (output_channels >= 3) {
-            output_b -= overlap;
-            if (output_channels >= 4) {
-              output_a -= overlap;
-            }
-          }
-        }
-
-        // The source was moved back by `overlap` pixels of 8 bytes each, so
-        // for an odd overlap it is no longer 16-byte aligned. Use the plain
-        // load here instead of the variant that promises 16-byte alignment.
-        const uint16x4x4_t rgba = vld4_u16(input_f16);
-        input_f16 += 4 * 4;
-        storeFourPixelsAsFloat<output_channels>(rgba, output_r, output_g, output_b, output_a);
-      }
-      // Skip the padding between the end of this row and the start of the next.
-      input_f16 += (row_stride - width) * 4;
-    }
-  } else {
-    // Narrow rows: load and scatter one pixel at a time.
-    for (size_t y = 0; y < height; y++) {
-      for (size_t x = 0; x < width; x++) {
-        const float32x4_t rgba = vcvt_f32_f16(float16x4_t(vld1_u16_aligned8(input_f16)));
-        input_f16 += 4;
-        vst1q_lane_f32(output_r++, rgba, 0);
-        if (output_channels >= 2) {
-          vst1q_lane_f32(output_g++, rgba, 1);
-          if (output_channels >= 3) {
-            vst1q_lane_f32(output_b++, rgba, 2);
-            if (output_channels >= 4) {
-              vst1q_lane_f32(output_a++, rgba, 3);
-            }
-          }
-        }
-      }
-      input_f16 += (row_stride - width) * 4;
-    }
-  }
-}
-
-// Runtime entry point: forwards to the deInterleaveSlice<N> instantiation
-// that matches `output_channels`. Channel counts other than 1-4 are ignored
-// and leave `output` untouched.
-void deInterleaveSlice(float* output,
-                       const void* input,
-                       size_t width,
-                       size_t height,
-                       size_t row_stride,
-                       uint32_t output_channels) {
-  if (output_channels == 1) {
-    deInterleaveSlice<1>(output, input, width, height, row_stride);
-  } else if (output_channels == 2) {
-    deInterleaveSlice<2>(output, input, width, height, row_stride);
-  } else if (output_channels == 3) {
-    deInterleaveSlice<3>(output, input, width, height, row_stride);
-  } else if (output_channels == 4) {
-    deInterleaveSlice<4>(output, input, width, height, row_stride);
-  }
-}
+++ /dev/null
-
-#pragma once
-
-#include "arm_neon_support.h"
-
-// Packs `input_channels` (1-4) planar float channels of `width` x `height`
-// pixels into interleaved half-precision RGBA pixels at `output`.
-// `row_stride` is the distance between consecutive output rows, in pixels.
-void interleaveSlice(void* output,
-                     const float* input,
-                     size_t width,
-                     size_t height,
-                     size_t row_stride,
-                     uint16_t input_channels);
-
-// Unpacks interleaved half-precision RGBA pixels at `input` into
-// `output_channels` (1-4) planar float channels of `width` x `height` pixels.
-// `row_stride` is the distance between consecutive input rows, in pixels
-// (named to match the definition).
-void deInterleaveSlice(float* output,
-                       const void* input,
-                       size_t width,
-                       size_t height,
-                       size_t row_stride,
-                       uint32_t output_channels);
+++ /dev/null
-
-#pragma once
-#include "caffe2/core/common.h"
-
-#if CAFFE2_IOS
-#include <OpenGLES/ES3/gl.h>
-#include <OpenGLES/ES3/glext.h>
-#elif CAFFE2_ANDROID
-#include <EGL/egl.h>
-#include <GLES2/gl2.h>
-#include "caffe2/mobile/contrib/opengl/android/gl3stub.h"
-#endif
+++ /dev/null
-
-#include "caffe2/core/logging.h"
-
-#include "GL.h"
-#include "GLContext.h"
-#include "GLLogging.h"
-
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-
-#if CAFFE2_IOS
-#include "sys/utsname.h"
-#include <regex>
-#endif
-
-void getOpenGLESVersion(int& major, int& minor) { // queries the GL version numbers into the two out-parameters
- glGetIntegerv(GL_MAJOR_VERSION, &major);
- glGetIntegerv(GL_MINOR_VERSION, &minor); // NOTE(review): presumably requires a current GL context and leaves the outputs untouched on GL error - confirm
-}
-
-// Returns true when `gl_ext_str` is one of the whitespace-separated names in
-// the GL_EXTENSIONS string.
-//
-// The extension list is parsed once, on the first successful query, and
-// cached in a function-local static for later calls. If glGetString returns
-// NULL the function logs an error, returns false and does not cache
-// anything, so a later call retries the query.
-bool checkOpenGLExtensions(std::string gl_ext_str) {
-  static std::unordered_set<std::string> extensions;
-  // Tracked separately from extensions.empty() so that an empty extension
-  // list is still parsed only once.
-  static bool extensions_loaded = false;
-  if (!extensions_loaded) {
-    const char* raw_extensions = reinterpret_cast<const char*>(glGetString(GL_EXTENSIONS));
-    if (raw_extensions == nullptr) {
-      // Constructing a std::string from a null pointer is undefined behaviour.
-      LOG(ERROR) << "glGetString(GL_EXTENSIONS) returned NULL";
-      return false;
-    }
-    const std::string extension_str(raw_extensions);
-    LOG(INFO) << "GL_EXTENSIONS: " << extension_str;
-
-    std::stringstream ss(extension_str);
-    std::string extension;
-    // Extracting in the loop condition stops at the first failed read, so no
-    // empty token is inserted for trailing whitespace or an empty string.
-    while (ss >> extension) {
-      extensions.insert(extension);
-    }
-    extensions_loaded = true;
-  }
-
-  return extensions.count(gl_ext_str) > 0;
-}
-
-// Reports whether border-clamp texture addressing is available: either the
-// context reports OpenGL ES 3.2, or one of the EXT/OES texture_border_clamp
-// extensions is listed. The version is queried until a non-zero major
-// version has been obtained and is then reused.
-bool GLContext::GL_EXT_texture_border_clamp_defined() {
-  static int cached_major = 0, cached_minor = 0;
-  if (cached_major == 0) {
-    getOpenGLESVersion(cached_major, cached_minor);
-  }
-
-  const bool is_es_3_2 = (cached_major == 3) && (cached_minor == 2);
-  if (is_es_3_2) {
-    return true;
-  }
-
-  // The EXT spelling is the most common one; check it first.
-  if (checkOpenGLExtensions("GL_EXT_texture_border_clamp")) {
-    return true;
-  }
-  return checkOpenGLExtensions("GL_OES_texture_border_clamp");
-}
-
-// Returns true when the context reports a major version of at least 3.
-//
-// When GL_EXT_color_buffer_half_float is missing, an error is logged and, if
-// `half_float_supported` is non-null, `*half_float_supported` is set to
-// false. The flag is never set to true here; it is left as the caller
-// initialised it.
-bool supportOpenGLES3(bool* half_float_supported) {
-  int gl_major = 0;
-  int gl_minor = 0;
-  getOpenGLESVersion(gl_major, gl_minor);
-
-  LOG(INFO) << "GL_VERSION: OpenGL ES " << gl_major << "." << gl_minor;
-
-  if (gl_major < 3) {
-    LOG(ERROR) << "OpenGL ES 3.0 not supported";
-    return false;
-  }
-
-  const bool has_half_float = checkOpenGLExtensions("GL_EXT_color_buffer_half_float");
-  if (!has_half_float) {
-    LOG(ERROR) << "GL_EXT_color_buffer_half_float is not available";
-    if (half_float_supported != nullptr) {
-      *half_float_supported = false;
-    }
-  }
-  return true;
-}
-
-#if CAFFE2_IOS
-// Returns the first run of decimal digits in the uname() machine string when
-// that string contains "iPhone", otherwise 0. The value is computed once
-// (guarded by std::call_once) and reused on later calls.
-int iPhoneVersion() {
-  static int version = 0;
-  static std::once_flag once;
-  std::call_once(once, [&]() {
-    struct utsname systemInfo;
-    uname(&systemInfo);
-    const std::string machine = systemInfo.machine;
-    LOG(INFO) << systemInfo.machine;
-
-    if (machine.find("iPhone") == std::string::npos) {
-      return;
-    }
-
-    const std::regex digits("([0-9]+)");
-    std::smatch found;
-    if (std::regex_search(machine, found, digits)) {
-      version = stoi(found[0]);
-    }
-  });
-  return version;
-}
-#endif
-
-#if CAFFE2_ANDROID
-// whitelist of supported GPUs
-// Returns true when the GL_RENDERER string, with leading and trailing spaces
-// removed, exactly matches one of the allow-listed GPU names.
-//
-// Returns false (instead of crashing or throwing) when glGetString returns
-// NULL or when the renderer string is empty or consists only of spaces.
-bool isSupportedRenderer() {
-  static std::unordered_set<std::string> supported_renderers = {
-      "Adreno (TM) 540",
-      "Adreno (TM) 530",
-      "Adreno (TM) 510",
-      "Adreno (TM) 430",
-      "Adreno (TM) 418",
-      "Mali-G71",
-      "Mali-T880",
-      "NVIDIA Tegra"};
-  const char* raw_renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
-  if (raw_renderer == nullptr) {
-    // Constructing a std::string from a null pointer is undefined behaviour.
-    LOG(ERROR) << "glGetString(GL_RENDERER) returned NULL";
-    return false;
-  }
-  std::string rendererStr(raw_renderer);
-  LOG(INFO) << "GL_RENDERER: " << rendererStr;
-
-  const std::string::size_type start = rendererStr.find_first_not_of(" ");
-  if (start == std::string::npos) {
-    // Nothing but spaces: substr(npos, ...) would throw std::out_of_range.
-    return false;
-  }
-  const std::string::size_type end = rendererStr.find_last_not_of(" ");
-  rendererStr = rendererStr.substr(start, end - start + 1);
-  return supported_renderers.count(rendererStr) > 0;
-}
-#endif
-
-bool isSupportedDevice() { // compile-time platform dispatch; exactly one return statement survives preprocessing
-#if CAFFE2_IOS
- return iPhoneVersion() >= 7; // iPhone 6 and up
-#elif CAFFE2_ANDROID
- return isSupportedRenderer(); // Android: decided by the GL_RENDERER allow-list
-#else
- return false; // neither iOS nor Android: reported as unsupported
-#endif
-}
+++ /dev/null
-
-#pragma once
-#include "GLTexture.h"
-#include "caffe2/core/common.h"
-#include <functional>
-
-class GLContext { // abstract base class: set/reset/flush are pure virtual and supplied by platform subclasses
- private:
- static std::unique_ptr<GLContext> _glcontext; // NOTE(review): presumably the shared instance behind initGLContext/getGLContext/deleteGLContext - their definitions are not visible here
- std::function<const GLTexture*(const int width, const int height)> foreignTextureAllocator =
- nullptr; // optional externally supplied texture allocator; empty until setTextureAllocator is called
-
- protected:
- bool half_float_supported = true; // defaults to true; exposed through halfFloatTextureSupported()
-
- public:
- virtual void set_context() = 0;
- virtual void reset_context() = 0;
- virtual void flush_context() = 0;
- virtual ~GLContext(){}; // virtual so that subclasses are destroyed correctly through a base pointer
-
- static void initGLContext();
- static GLContext* getGLContext();
- static void deleteGLContext();
-
- static bool GL_EXT_texture_border_clamp_defined();
-
- inline bool halfFloatTextureSupported() { return half_float_supported; }
-
- void setTextureAllocator( // replaces the stored allocator callback with a copy of the argument
- std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
- foreignTextureAllocator = textureAllocator;
- }
-
- std::function<const GLTexture*(const int width, const int height)> getTextureAllocator() { // returns a copy of the stored callback (empty if none was set)
- return foreignTextureAllocator;
- }
-};
-
-bool supportOpenGLES3(bool* hfs = nullptr);
-
-bool isSupportedDevice();
-
-#if CAFFE2_IOS
-int iPhoneVersion();
-#endif
+++ /dev/null
-
-#include "GLFilter.h"
-#include <sstream>
-
-GLFilter::GLFilter(const std::string _kernel_name,
- const std::string _vertex_shader,
- const std::string _fragment_shader,
- const std::vector<binding*> uniforms,
- const std::vector<binding*> uniform_blocks,
- const std::vector<binding*> attributes,
- const replacements_t& _replacements)
- : kernel_name(_kernel_name),
- uniforms_(uniforms),
- uniform_blocks_(uniform_blocks),
- attributes_(attributes) {
- // shader program
- if (createProgram(_vertex_shader.c_str(),
- process_replacements(_fragment_shader, _replacements).c_str(),
- &program)) {
- gl_log(GL_VERBOSE, "created program %d\n", program);
- } else {
- releaseBuffers();
-
- throwRuntimeError(
- [&](std::stringstream& errmsg) { errmsg << "Problem initializing OpenGL program"; });
- }
-}
-
-const char* shader_utils = R"GLSL(
-#define unpackHalf4x16(pd) vec4(unpackHalf2x16(pd.x), unpackHalf2x16(pd.y))
-#define packHalf4x16(pd) uvec2(packHalf2x16(pd.xy), packHalf2x16(pd.zw))
-)GLSL";
-
-const char* half_float_texture_utils = R"GLSL(
-precision mediump sampler2D;
-
-#define TEXTURE_OUTPUT(_loc, _var) \
- layout(location = _loc) out mediump vec4 _var
-#define TEXTURE_INPUT(_var) \
- uniform sampler2D _var
-#define TEXTURE_LOAD(_input, _coord) \
- texelFetch((_input), (_coord), 0)
-#define TEXTURE_STORE(_val) \
- (_val)
-)GLSL";
-
-const char* half_float_compat_texture_utils = R"GLSL(
-precision highp usampler2D;
-
-#define TEXTURE_OUTPUT(_loc, _var) \
- layout(location = _loc) out highp uvec2 _var
-#define TEXTURE_INPUT(_var) \
- uniform usampler2D _var
-#define TEXTURE_LOAD(_input, _coord) \
- unpackHalf4x16(texelFetch((_input), (_coord), 0).xy)
-#define TEXTURE_STORE(_val) \
- (uvec2(packHalf4x16((_val))))
-)GLSL";
-
-std::string GLFilter::process_replacements(std::string shader,
- const replacements_t& replacements) const {
- for (auto&& replacement : replacements) {
- std::string tag = "$(" + replacement.first + ")";
- std::string value = replacement.second;
-
- size_t position = shader.find(tag);
- if (position != std::string::npos) {
- shader.replace(position, tag.size(), value);
- } else {
- throwRuntimeError(
- [&](std::stringstream& errmsg) { errmsg << "Couldn't find replacement tag: " << tag; });
- }
- }
-
- // Add some #defines for convenience
- std::string version_tag = "#version 300 es";
- if (GLContext::getGLContext()->halfFloatTextureSupported()) {
- shader.insert(shader.find(version_tag) + version_tag.size(), half_float_texture_utils);
- } else {
- shader.insert(shader.find(version_tag) + version_tag.size(), half_float_compat_texture_utils);
- }
- shader.insert(shader.find(version_tag) + version_tag.size(), shader_utils);
- return shader;
-}
-
-template <typename T>
-void GLFilter::attach_uniform_buffer(const binding* block,
- GLuint bindingPoint,
- std::function<void(T*, size_t)> loader) {
- if (block->location >= 0) {
- if (bindingPoint < kMaxUniformBlocks) {
- if (uniformBlock[bindingPoint] == 0) {
- // Associate the uniform block index with a binding point
- glUniformBlockBinding(program, block->location, bindingPoint);
-
- // Get the size of block
- glGetActiveUniformBlockiv(program, block->location, GL_UNIFORM_BLOCK_DATA_SIZE, &blockSize[bindingPoint]);
-
- // Create and fill a buffer object
- glGenBuffers(1, &uniformBlock[bindingPoint]);
-
- gl_log(GL_VERBOSE, "created uniform buffer block %d\n", uniformBlock[bindingPoint]);
- }
-
- // Fill a buffer object
- glBindBuffer(GL_UNIFORM_BUFFER, uniformBlock[bindingPoint]);
- glBufferData(GL_UNIFORM_BUFFER, blockSize[bindingPoint], NULL, GL_DYNAMIC_DRAW);
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Unable to bind uniform buffer " << block->name << ":" << block->location
- << " at binding point " << bindingPoint;
- });
-
- T* blockData = (T*)glMapBufferRange(
- GL_UNIFORM_BUFFER, 0, blockSize[bindingPoint], GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
- if (blockData != NULL) {
- // Copy the data into the mapped buffer
- if (loader)
- loader(blockData, blockSize[bindingPoint]);
-
- // Unmap the buffer
- if (glUnmapBuffer(GL_UNIFORM_BUFFER) == GL_TRUE) {
- // Bind the buffer object to the uniform block binding point
- glBindBufferBase(GL_UNIFORM_BUFFER, bindingPoint, uniformBlock[bindingPoint]);
- } else {
- throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "Error unmapping element buffer object"; });
- }
- } else {
- throwRuntimeError([&](std::stringstream& errmsg) {
- errmsg << "Error mapping element buffer object, blockSize: " << blockSize;
- });
- }
-
- glBindBuffer(GL_UNIFORM_BUFFER, 0);
- } else {
- throwRuntimeError([&](std::stringstream& errmsg) {
- errmsg << "Uniform block binding point out of range: " << bindingPoint << ", should be < "
- << kMaxUniformBlocks;
- });
- }
- } else {
- throwRuntimeError([&](std::stringstream& errmsg) { errmsg << "unbound uniform block"; });
- }
-}
-
-template void GLFilter::attach_uniform_buffer<float16_t>(const binding* block,
- GLuint bindingPoint,
- std::function<void(float16_t*, size_t)> loader);
-
-static const GLenum unused_capability[] = {GL_CULL_FACE,
- GL_BLEND,
- GL_DITHER,
- GL_STENCIL_TEST,
- GL_DEPTH_TEST,
- GL_SCISSOR_TEST,
- GL_POLYGON_OFFSET_FILL,
- GL_SAMPLE_ALPHA_TO_COVERAGE,
- GL_SAMPLE_COVERAGE};
-
-void GLFilter::run(const std::vector<texture_attachment>& input,
- const std::vector<const GLTexture*>& output,
- std::function<void(void)> uniforms_initializer,
- int width,
- int height) {
- const int first_texture_id = GL_TEXTURE0;
-
- GLint defaultFramebuffer = 0;
- glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
-
- gl_log(GL_VERBOSE,
- "GLFilter::run %s - inputs: %d, outputs: %d, width: %d, height: %d\n",
- kernel_name.c_str(),
- input.size(),
- output.size(),
- width,
- height);
-
- if (output.size() > 4) {
- throwRuntimeError([&](std::stringstream& errmsg) {
- errmsg << "Too many output textures: " << output.size() << ", should be <= 4";
- });
- }
-
- if (frameBuffer == 0) {
- // create the frame buffer
- glGenFramebuffers(1, &frameBuffer);
- gl_log(GL_VERBOSE, "created frame buffer %d\n", frameBuffer);
- }
-
- glBindFramebuffer(GL_FRAMEBUFFER, frameBuffer);
- checkGLError([&](std::stringstream& errmsg) { errmsg << "glBindFramebuffer"; });
-
- // Set up the output textures
- for (int i = 0; i < output.size(); i++) {
- GLenum target = output[i]->target();
- GLuint texture = output[i]->name();
-
- glBindTexture(target, texture);
- glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, target, texture, 0);
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Unable to connect output texture " << texture << " at color attachment " << i;
- });
-
- gl_log(GL_VERBOSE, "connected output texture %d to color attachment %d\n", texture, i);
- }
-
- // Bind the output textures to the frame buffer attachments
- if (!frame_buffer_initialized) {
- const int attachments_number = output.size();
- const GLenum attachments[4] = {
- GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1, GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3};
-
- glDrawBuffers(attachments_number, attachments);
-
- int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
-
- if (fbs != GL_FRAMEBUFFER_COMPLETE) {
- throwRuntimeError(
- [&](std::stringstream& errmsg) { errmsg << "Frame buffer incomplete: " << fbs; });
- }
-
- frame_buffer_initialized = true;
- }
-
- glUseProgram(program);
- checkGLError([&](std::stringstream& errmsg) { errmsg << "glUseProgram"; });
-
- // Set up the input textures
- GLenum texture_idx = first_texture_id;
- for (int i = 0; i < input.size(); i++, texture_idx++) {
- if (input[i].uniform->location >= 0) {
- GLenum target = input[i].texture->target();
- GLuint texture = input[i].texture->name();
-
- glActiveTexture(texture_idx);
- glBindTexture(target, texture);
- glUniform1i(input[i].uniform->location, texture_idx - GL_TEXTURE0);
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << ": Unable to attach input texture " << texture << " to uniform "
- << input[i].uniform->name << ":" << input[i].uniform->location << " at index "
- << texture_idx - GL_TEXTURE0;
- });
-
- gl_log(GL_VERBOSE,
- "connected input texture %d to texture unit %d\n",
- texture,
- texture_idx - GL_TEXTURE0);
- } else {
- gl_log(GL_VERBOSE, "something wrong happened when i = %d\n", i);
- }
- }
-
- // Caller supplied uniforms initializer
- if (uniforms_initializer) {
- uniforms_initializer();
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "errors in the uniforms initializer callback";
- });
- }
-
- // Validate program
- if (check_opengl_errors && !validateProgram(program)) {
- throwRuntimeError(
- [&](std::stringstream& errmsg) { errmsg << "Couldn't validate OpenGL program"; });
- }
-
- glViewport(0, 0, width, height);
-
- // Disable stuff we don't need and make sure that we have all the channels ebabled
- for (int i = 0; i < sizeof(unused_capability) / sizeof(GLenum); i++) {
- glDisable(unused_capability[i]);
- }
- glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
-
- // glDrawElements should be more efficient, but on iOS glDrawArrays is faster.
-
- const bool useDrawArrays = true;
-
- if (useDrawArrays) {
- enum { ATTRIB_VERTEX, ATTRIB_TEXTUREPOSITON, NUM_ATTRIBUTES };
-
- static const GLfloat squareVertices[] = {
- -1.0f,
- -1.0f, // bottom left
- 1.0f,
- -1.0f, // bottom right
- -1.0f,
- 1.0f, // top left
- 1.0f,
- 1.0f, // top right
- };
-
- static const float textureVertices[] = {
- 0.0f,
- 0.0f, // bottom left
- 1.0f,
- 0.0f, // bottom right
- 0.0f,
- 1.0f, // top left
- 1.0f,
- 1.0f, // top right
- };
-
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, squareVertices);
- glEnableVertexAttribArray(ATTRIB_VERTEX);
- checkGLError(
- [&](std::stringstream& errmsg) { errmsg << "glEnableVertexAttribArray(ATTRIB_VERTEX)"; });
-
- glVertexAttribPointer(ATTRIB_TEXTUREPOSITON, 2, GL_FLOAT, 0, 0, textureVertices);
- glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON);
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "glEnableVertexAttribArray(ATTRIB_TEXTUREPOSITON)";
- });
-
- gl_log(GL_VERBOSE, "Calling glDrawArrays\n");
- glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-
- checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawArrays"; });
- } else {
- // Run the shaders on the output geometry
- static const GLfloat vVertices[] = {
- -1.0f, -1.0f, 0.0f, // Position 0
- 0.0f, 0.0f, // TexCoord 0
- -1.0f, 1.0f, 0.0f, // Position 1
- 0.0f, 1.0f, // TexCoord 1
- 1.0f, 1.0f, 0.0f, // Position 2
- 1.0f, 1.0f, // TexCoord 2
- 1.0f, -1.0f, 0.0f, // Position 3
- 1.0f, 0.0f // TexCoord 3
- };
- static const GLushort indices[] = {0, 1, 2, 0, 2, 3};
-
- // Load the vertex position
- glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), vVertices);
- // Load the texture coordinate
- glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(GLfloat), &vVertices[3]);
-
- glEnableVertexAttribArray(0);
- glEnableVertexAttribArray(1);
-
- gl_log(GL_VERBOSE, "Calling glDrawElements\n");
- glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, indices);
-
- checkGLError([&](std::stringstream& errmsg) { errmsg << "glDrawElements"; });
- }
-
-#if CAFFE2_ANDROID
- glFlush();
-#endif
-
- // Unbind the current texture - Man, this is expensive!
- for (int i = texture_idx - 1; i >= first_texture_id; i--) {
- gl_log(GL_VERBOSE, "unbinding texture unit %d\n", i - GL_TEXTURE0);
- glActiveTexture(i);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Error unbinding texture unit " << i - GL_TEXTURE0;
- });
- }
-
- glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
-}
-
-void GLFilter::releaseBuffers() {
- for (int i = 0; i < kMaxUniformBlocks; i++) {
- if (uniformBlock[i]) {
- gl_log(GL_VERBOSE, "deleting uniform buffer block %d\n", uniformBlock[i]);
- glDeleteBuffers(1, &uniformBlock[i]);
- uniformBlock[i] = 0;
- }
- }
- if (frameBuffer) {
- gl_log(GL_VERBOSE, "deleting frame buffer %d\n", frameBuffer);
- glDeleteFramebuffers(1, &frameBuffer);
- frameBuffer = 0;
- }
-}
-
-void GLFilter::deleteProgram() {
- if (program) {
- gl_log(GL_VERBOSE, "deleting program %d\n", program);
- glDeleteProgram(program);
- program = 0;
- }
-}
-
-void GLFilter::deleteBindings() {
- for (binding* uniform : uniforms_) {
- delete uniform;
- }
- for (binding* uniform_block : uniform_blocks_) {
- delete uniform_block;
- }
- for (binding* attribute : attributes_) {
- delete attribute;
- }
-}
-
-// Simple vertex shader setting up the coordinates system
-const char* GLFilter::vertex_shader = R"GLSL(#version 300 es
-
- layout(location = 0) in vec4 a_position;
- layout(location = 1) in vec2 a_texCoord;
- out vec2 v_texCoord;
-
- void main()
- {
- gl_Position = a_position;
- v_texCoord = a_texCoord;
- }
-)GLSL";
-
-bool GLFilter::createProgram(const GLchar* vertSource,
- const GLchar* fragSource,
- GLuint* program) const {
- GLuint vertShader = 0, fragShader = 0, prog = 0, status = 1;
-
- // Clear the error state. We check error state later in the function and
- // want to capture only errors in filter program initialization.
- glGetError();
-
- // Create shader program
- prog = glCreateProgram();
-
- // Create and compile vertex shader
- status *= compileShader(GL_VERTEX_SHADER, 1, &vertSource, &vertShader);
-
- // Create and compile fragment shader
- status *= compileShader(GL_FRAGMENT_SHADER, 1, &fragSource, &fragShader);
-
- // Attach vertex shader to program
- glAttachShader(prog, vertShader);
-
- // Attach fragment shader to program
- glAttachShader(prog, fragShader);
-
- // Bind attribute locations
- // This needs to be done prior to linking
- for (auto&& attribute : attributes_) {
- glBindAttribLocation(prog, attribute->location, attribute->name.c_str());
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Couldn't bind attribute: " << attribute->name << " at location "
- << attribute->location;
- });
- }
-
- // Link program
- status *= linkProgram(prog);
-
- // Get locations of uniforms
- if (status) {
- for (auto&& uniform : uniforms_) {
- uniform->location = glGetUniformLocation(prog, uniform->name.c_str());
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Couldn't resolve uniform: " << uniform->name;
- });
- }
-
- for (auto&& uniform_block : uniform_blocks_) {
- uniform_block->location = glGetUniformBlockIndex(prog, uniform_block->name.c_str());
- gl_log(GL_VERBOSE,
- "Getting location for uniform block: %s, location: %d\n",
- uniform_block->name.c_str(),
- uniform_block->location);
-
- checkGLError([&](std::stringstream& errmsg) {
- errmsg << "Couldn't resolve uniform block: " << uniform_block->name;
- });
- }
-
- *program = prog;
- }
-
- // Release vertex and fragment shaders
- if (vertShader) {
- glDetachShader(prog, vertShader);
- glDeleteShader(vertShader);
- }
- if (fragShader) {
- glDetachShader(prog, fragShader);
- glDeleteShader(fragShader);
- }
-
- return status == 1;
-}
-
-#include <stdlib.h>
-
-/* Compile a shader from the provided source(s) */
-GLint GLFilter::compileShader(GLenum target,
- GLsizei count,
- const GLchar** sources,
- GLuint* shader) const {
- GLint status = 1;
-
- *shader = glCreateShader(target);
- glShaderSource(*shader, count, sources, NULL);
- glCompileShader(*shader);
-
- GLint logLength = 0;
- glGetShaderiv(*shader, GL_INFO_LOG_LENGTH, &logLength);
- if (logLength > 0) {
- std::vector<GLchar> log(logLength);
- glGetShaderInfoLog(*shader, logLength, &logLength, &log[0]);
- gl_log(GL_ERR, "Shader compile log:\n%s", &log[0]);
- }
-
- glGetShaderiv(*shader, GL_COMPILE_STATUS, &status);
- if (status == 0) {
- int i;
-
- gl_log(GL_ERR, "Failed to compile shader:\n");
- for (i = 0; i < count; i++)
- gl_log(GL_ERR, "%s", sources[i]);
- }
-
- return status;
-}
-
-/* Link a program with all currently attached shaders */
-GLint GLFilter::linkProgram(GLuint program) const {
- GLint status = 1;
-
- glLinkProgram(program);
-
- GLint logLength = 0;
- glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
- if (logLength > 0) {
- std::vector<GLchar> log(logLength);
- glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
- gl_log(GL_ERR, "Program link log:\n%s", &log[0]);
- }
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (status == 0)
- gl_log(GL_ERR, "Failed to link program %d\n", program);
-
- return status;
-}
-
-/* Validate a program (for i.e. inconsistent samplers) */
-GLint GLFilter::validateProgram(GLuint program) const {
- GLint status = 1;
-
- glValidateProgram(program);
-
- GLint logLength = 0;
- glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
- if (logLength > 0) {
- std::vector<GLchar> log(logLength);
- glGetProgramInfoLog(program, logLength, &logLength, &log[0]);
- gl_log(GL_ERR, "Program validate log:\n%s", &log[0]);
- }
-
- glGetProgramiv(program, GL_VALIDATE_STATUS, &status);
- if (status == 0)
- gl_log(GL_ERR, "Failed to validate program %d\n", program);
-
- return status;
-}
+++ /dev/null
-
-#pragma once
-
-#include "GLContext.h"
-#include "GLTexture.h"
-#include "arm_neon_support.h"
-
-#include <functional>
-#include <string>
-#include <vector>
-
-#define BINDING(variableName) (variableName = new binding{#variableName})
-#define ATTRIBUTE(variableName, value) (variableName = new binding{#variableName, value})
-
-class GLFilter {
- protected:
- const std::string kernel_name;
- GLuint program = 0;
- GLuint frameBuffer = 0;
- static constexpr int kMaxUniformBlocks = 12;
- GLuint uniformBlock[kMaxUniformBlocks] = {0};
- GLint blockSize[kMaxUniformBlocks] = {0};
- bool frame_buffer_initialized = false;
-
- // glGetError() can be expensive, we should turn error checking off when we're done with debugging
-
- static constexpr bool check_opengl_errors = true;
-
-public:
- typedef std::vector<std::pair<std::string, std::string>> replacements_t;
-
- struct binding {
- const std::string name;
- GLint location;
- };
-
- struct texture_attachment {
- const GLTexture* texture;
- const binding* uniform;
- };
-
- GLFilter(const std::string kernel_name,
- const std::string vertex_shader,
- const std::string fragment_shader,
- const std::vector<binding*> uniforms,
- const std::vector<binding*> uniform_blocks = {},
- const std::vector<binding*> attributes = {},
- const replacements_t& replacements = {});
-
- // TODO: The set and reset context need to be commented out for unit testing
- ~GLFilter() {
- releaseBuffers();
- deleteProgram();
- deleteBindings();
- }
-
- void throwRuntimeError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
- std::stringstream errmsg;
- errmsg << kernel_name << ": ";
- error_formatter(errmsg);
- throw std::runtime_error(errmsg.str());
- }
-
- void checkGLError(std::function<void(std::stringstream& errmsg)> error_formatter) const {
- if (check_opengl_errors) {
- GLenum glError = glGetError();
- if (glError != GL_NO_ERROR) {
- throwRuntimeError([&](std::stringstream& errmsg) {
- error_formatter(errmsg);
- errmsg << ", " << glError;
- });
- }
- }
- }
-
- template <typename T>
- void attach_uniform_buffer(const binding* block,
- GLuint bindingPoint, std::function<void(T*, size_t)> loader);
-
- void run(const std::vector<texture_attachment>& input,
- const std::vector<const GLTexture*>& output,
- std::function<void(void)> uniforms_initializer,
- int width,
- int height);
-
- void releaseBuffers();
- void deleteProgram();
- void deleteBindings();
-
- static const char* vertex_shader;
-
- private:
- const std::vector<binding*> uniforms_;
- const std::vector<binding*> uniform_blocks_;
- const std::vector<binding*> attributes_;
-
- std::string process_replacements(std::string source, const replacements_t& replacements) const;
-
- bool createProgram(const GLchar* vertSource, const GLchar* fragSource, GLuint* program) const;
-
- GLint compileShader(GLenum target, GLsizei count, const GLchar** sources, GLuint* shader) const;
- GLint linkProgram(GLuint program) const;
- GLint validateProgram(GLuint program) const;
-};
+++ /dev/null
-
-#include "GLImage.h"
-#include "arm_neon_support.h"
-#include <c10/util/typeid.h>
-
-namespace caffe2 {
-CAFFE_KNOWN_TYPE(GLImage<float>);
-CAFFE_KNOWN_TYPE(GLImage<uint8_t>);
-CAFFE_KNOWN_TYPE(GLImageVector<float>);
-CAFFE_KNOWN_TYPE(GLImageVector<uint8_t>);
-#ifdef __ARM_NEON__
-CAFFE_KNOWN_TYPE(GLImage<float16_t>);
-CAFFE_KNOWN_TYPE(GLImageVector<float16_t>);
-#endif
-} // namespace caffe2
+++ /dev/null
-
-#pragma once
-
-#include "GLTexture.h"
-#include "caffe2/core/logging.h"
-
-#include <functional>
-#include <vector>
-
-template <typename T>
-class GLImage {
- public:
- const int width;
- const int height;
- const int channels;
- const int data_size;
-
- const int tile_x;
- const int tile_y;
- const int texture_width;
- const int texture_height;
- const int slices;
-
- const std::vector<const GLTexture*> textures;
-
- constexpr static int slice_channels = 4;
-
- static constexpr int channels_to_slices(int channels, int tile_x, int tile_y) {
- return ((channels + slice_channels - 1) / slice_channels + tile_x * tile_y - 1) /
- (tile_x * tile_y);
- }
-
- static const std::vector<const GLTexture*> allocate_textures(
- int slices, std::function<const GLTexture*(int slice)> texture_loader) {
- std::vector<const GLTexture*> textures;
- for (int i = 0; i < slices; i++) {
- textures.push_back(texture_loader(i));
- }
- return textures;
- }
-
- GLImage(int _width,
- int _height,
- int _channels,
- int _tile_x,
- int _tile_y,
- std::function<const GLTexture*(int slice)> texture_loader)
- : width(_width),
- height(_height),
- channels(_channels),
- data_size(sizeof(T)),
- tile_x(_tile_x),
- tile_y(_tile_y),
- texture_width(_width * _tile_x),
- texture_height(_height * _tile_y),
- slices(channels_to_slices(_channels, _tile_x, _tile_y)),
- textures(allocate_textures(slices, texture_loader)) {
- CAFFE_ENFORCE_EQ(
- slices, ((channels + 3) / 4 + tile_x * tile_y - 1) / (tile_x * tile_y));
- }
-
- GLImage(int _width,
- int _height,
- int _channels,
- int _tile_x,
- int _tile_y,
- bool _destroy,
- std::function<const GLTexture*(int slice)> texture_loader)
- : width(_width),
- height(_height),
- channels(_channels),
- data_size(sizeof(T)),
- tile_x(_tile_x),
- tile_y(_tile_y),
- texture_width(_width * _tile_x),
- texture_height(_height * _tile_y),
- slices(channels_to_slices(_channels, _tile_x, _tile_y)),
- textures(allocate_textures(slices, texture_loader)) {
- CAFFE_ENFORCE_EQ(slices * tile_x * tile_y, (channels + 3) / 4);
- }
-
- GLImage()
- : width(0),
- height(0),
- channels(0),
- data_size(sizeof(T)),
- tile_x(0),
- tile_y(0),
- texture_width(0),
- texture_height(0),
- slices(0){};
-
- virtual ~GLImage() {
- gl_log(GL_VERBOSE, "deleting GLImage\n");
- for (auto&& texture : textures) {
- delete texture;
- }
- }
-};
-
-template <typename T>
-class GLImageVector {
- private:
- std::vector<GLImage<T>*> images_;
- int num_images_ = 0;
- int width_ = 0;
- int height_ = 0;
- int channels_ = 0;
- int tile_x_ = 0;
- int tile_y_ = 0;
-
- public:
- GLImage<T>* operator[](int index) const {
- CAFFE_ENFORCE_LT(index, num_images_, "Out of bounds when accessing GLImageVector");
- return images_[index];
- }
-
- void push_back(GLImage<T>* image) {
- CAFFE_ENFORCE_EQ(image->channels, channels_);
- CAFFE_ENFORCE_EQ(image->width, width_);
- CAFFE_ENFORCE_EQ(image->height, height_);
- CAFFE_ENFORCE_EQ(image->tile_x, tile_x_);
- CAFFE_ENFORCE_EQ(image->tile_y, tile_y_);
- images_.push_back(image);
- CAFFE_ENFORCE_LE(images_.size(), num_images_);
- }
-
- int size() const { return images_.size(); }
- int channels() const { return channels_; }
- int width() const { return width_; }
- int height() const { return height_; }
- int tile_x() const { return tile_x_; }
- int tile_y() const { return tile_y_; }
- int slices() const { return size() > 0 ? images_[0]->slices : 0; }
-
- GLImageVector(int num_images, int width, int height, int channels, int tile_x = 1, int tile_y = 1)
- : num_images_(num_images),
- width_(width),
- height_(height),
- channels_(channels),
- tile_x_(tile_x),
- tile_y_(tile_y) {}
-
- GLImageVector() {}
-
- ~GLImageVector() {
- for (int i = 0; i < images_.size(); i++) {
- delete images_[i];
- }
- }
-};
+++ /dev/null
-
-#include "GLImageAllocator.h"
-#include "arm_neon_support.h"
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::newImage(
- int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output) {
- GLImageVector<T>* images =
- new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
- for (int i = 0; i < num_images; i++) {
- images->push_back(
- new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
- bool usePadding = is_output;
- return new GLPlainTexture(type, nullptr, width * tile_x, height * tile_y, usePadding);
- }));
- }
- return images;
-}
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::newImage(
- int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
- GLImageVector<T>* images =
- new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
- for (int i = 0; i < num_images; i++) {
- images->push_back(
- new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
- return textureAllocator(width, height);
- }));
- }
- return images;
-}
-
-template <class T>
-GLImageVector<T>* GLImageAllocator<T>::ShareTexture(const GLuint textureID,
- int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y) {
- GLImageVector<T>* images =
- new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
- for (int i = 0; i < num_images; i++) {
- images->push_back(
- new GLImage<T>(width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
- return new GLPlainTexture(
- GLImageAllocator<T>::type, textureID, width * tile_x, height * tile_y);
- }));
- }
- return images;
-}
-
-template <>
-const GLTexture::Type& GLImageAllocator<float16_t>::type = GLTexture::FP16;
-template <>
-const GLTexture::Type& GLImageAllocator<uint8_t>::type = GLTexture::UI8;
-
-template class GLImageAllocator<float16_t>;
-template class GLImageAllocator<uint8_t>;
+++ /dev/null
-
-#pragma once
-
-#include "GLImage.h"
-#include "GLPlainTexture.h"
-
-template <class T>
-class GLImageAllocator {
- public:
- static const GLTexture::Type& type;
-
- GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
- virtual ~GLImageAllocator() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
- virtual GLImageVector<T>* newImage(
- int num_images, int width, int height, int channels, int tile_x, int tile_y, bool is_output);
-
- virtual GLImageVector<T>* newImage(
- int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- std::function<const GLTexture*(const int width, const int height)> textureAllocator);
-
- virtual GLImageVector<T>* ShareTexture(const GLuint textureID,
- int num_images,
- int width,
- int height,
- int channels,
- int tile_x = 1,
- int tile_y = 1);
-
- static GLImageAllocator<T>* newGLImageAllocator();
-};
+++ /dev/null
-
-#pragma once
-
-#include <stdarg.h>
-#include <stdio.h>
-
-enum { GL_ERR = -1, GL_LOG = 0, GL_VERBOSE = 1 };
-
-static constexpr int GL_LOG_LEVEL = GL_LOG;
-
-static inline int gl_log(int level, const char* format, ...) {
- int r = 0;
- if (level <= GL_LOG_LEVEL) {
- va_list args;
- va_start(args, format);
- r = vfprintf(stderr, format, args);
- va_end(args);
- }
- return r;
-}
+++ /dev/null
-
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-
-GLPBO::~GLPBO() {
- if (pboId != 0) {
- gl_log(GL_LOG, "deleting PBO buffer %d\n", pboId);
- glDeleteBuffers(1, &pboId);
- pboId = 0;
- }
- if (pboFrameBuffer != 0) {
- gl_log(GL_LOG, "deleting PBO frame buffer %d\n", pboFrameBuffer);
- glDeleteFramebuffers(1, &pboFrameBuffer);
- pboFrameBuffer = 0;
- }
-}
-
-GLPBO* GLPBO::pboContext = NULL;
-
-GLPBO* GLPBO::getContext() {
- if (pboContext == NULL) {
- pboContext = new GLPBO();
- }
- return pboContext;
-}
-
-void GLPBO::mapTextureData(GLuint _textureId,
- GLsizei _width,
- GLsizei _height,
- GLsizei _stride,
- GLsizei _channels,
- const GLTexture::Type& _type,
- std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type)> process) {
- GLint defaultFramebuffer = 0;
- glGetIntegerv(GL_FRAMEBUFFER_BINDING, &defaultFramebuffer);
-
- if (pboFrameBuffer == 0) {
- glGenFramebuffers(1, &pboFrameBuffer);
- gl_log(GL_VERBOSE, "created PBO frame buffer %d\n", pboFrameBuffer);
- }
-
- glBindFramebuffer(GL_FRAMEBUFFER, pboFrameBuffer);
-
- glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _textureId, 0);
-
- int fbs = glCheckFramebufferStatus(GL_FRAMEBUFFER);
- if (fbs != GL_FRAMEBUFFER_COMPLETE) {
- std::stringstream errmsg;
- errmsg << ": Frame buffer incomplete: " << fbs;
- throw std::runtime_error(errmsg.str());
- }
-
- if (pboId == 0) {
- glGenBuffers(1, &pboId);
- gl_log(GL_VERBOSE, "created PBO buffer %d\n", pboId);
- }
- glBindBuffer(GL_PIXEL_PACK_BUFFER, pboId);
-
- size_t buffer_size = _stride * _height * _channels * _type.dataSize();
-
- if (buffer_size > pboSize) {
- LOG(INFO) << "Allocating PBO of capacity " << buffer_size;
-
- glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, NULL, GL_DYNAMIC_READ);
- pboSize = buffer_size;
- }
-
- glReadBuffer(GL_COLOR_ATTACHMENT0);
- glReadPixels(0, 0, _stride, _height, _type.format, _type.type, 0);
-
- GLhalf* ptr = (GLhalf*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, buffer_size, GL_MAP_READ_BIT);
-
- if (ptr) {
- process(ptr, _width, _height, _stride, _channels, _type);
- } else {
- std::stringstream errmsg;
- errmsg << ": glMapBufferRange using PBO incomplete";
- throw std::runtime_error(errmsg.str());
- }
-
- // Unmap buffer
- glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
- glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
-
- // Bind to the default FrameBuffer
- glBindFramebuffer(GL_FRAMEBUFFER, defaultFramebuffer);
-}
+++ /dev/null
-
-#pragma once
-
-#include "GLTexture.h"
-#include <functional>
-
-class GLPBO {
- GLuint pboId = 0;
- GLuint pboSize = 0;
- GLuint pboFrameBuffer = 0;
-
- ~GLPBO();
-
- static GLPBO* pboContext;
-
- public:
- void mapTextureData(GLuint _textureId,
- GLsizei _width,
- GLsizei _height,
- GLsizei _stride,
- GLsizei _channels,
- const GLTexture::Type& type,
- std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type)> process);
-
- static GLPBO* getContext();
-};
+++ /dev/null
-
-#include "GLPlainTexture.h"
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/timer.h"
-
-#define half_float_supported (GLContext::getGLContext()->halfFloatTextureSupported())
-
-#define FIXED_TYPE(_t) (((_t).type != GL_HALF_FLOAT || half_float_supported) ? (_t) : GLTexture::FP16_COMPAT)
-
-GLPlainTexture::GLPlainTexture(
- const Type& type, const void* input, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
- : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
- // caffe2::Timer timer;
- // timer.Start();
- glGenTextures(1, &_textureId);
- glBindTexture(GL_TEXTURE_2D, _textureId);
- glTexImage2D(GL_TEXTURE_2D, 0, _type.internalFormat, _stride, _height, 0, _type.format, _type.type, input);
-
- gl_log(
- GL_VERBOSE,
- "GLPlainTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
- _textureId,
- _type.internalFormat,
- _type.format,
- _type.type);
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, _filter);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, _filter);
-
-#if GL_EXT_texture_border_clamp
- GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
- glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
- // Set the texture to use the border clamp wrapping mode.
- _wrap = GL_CLAMP_TO_BORDER_EXT;
-#endif
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, _wrap);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, _wrap);
-
- glBindTexture(GL_TEXTURE_2D, 0);
- // LOG(INFO) << "glTexImage2D takes " << timer.MilliSeconds() << " ms";
-}
-
-GLPlainTexture::GLPlainTexture(
- const Type& type, const GLuint textureID, GLsizei width, GLsizei height, bool use_padding, GLint filter, GLint wrap)
- : GLTexture(FIXED_TYPE(type), width, height, use_padding, filter, wrap) {
- _textureId = textureID;
- isOwner = false;
- gl_log(
- GL_VERBOSE,
- "GLPlainTexture() - wrapped textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
- _textureId,
- _type.internalFormat,
- _type.format,
- _type.type);
-}
+++ /dev/null
-
-#pragma once
-
-#include "GLContext.h"
-#include "GLTexture.h"
-
-class GLPlainTexture : public GLTexture {
- private:
- bool isOwner = true;
-
- public:
- GLPlainTexture(const Type& type,
- const void* input,
- GLsizei width,
- GLsizei height,
- bool use_padding = false,
- GLint filter = GL_NEAREST,
- GLint wrap = GL_CLAMP_TO_EDGE);
-
- GLPlainTexture(const Type& type,
- const GLuint textureID,
- GLsizei width,
- GLsizei height,
- bool use_padding = false,
- GLint filter = GL_NEAREST,
- GLint wrap = GL_CLAMP_TO_EDGE);
-
- ~GLPlainTexture() {
- if (glIsTexture(_textureId)) {
- if (isOwner) {
- gl_log(GL_VERBOSE, "~GLPlainTexture() - deleting texture %d\n", _textureId);
- glDeleteTextures(1, &_textureId);
- }
- } else {
- gl_log(GL_ERR, "not deleting texture %d\n", _textureId);
- }
- }
-
- GLuint name() const { return _textureId; };
-
- GLenum target() const { return GL_TEXTURE_2D; };
-
- bool flipped() const { return false; };
-};
+++ /dev/null
-
-#include "GLPredictor.h"
-#include "GLContext.h"
-#include "rewrite_net.h"
-#include <vector>
-
-namespace caffe2 {
-
-template <class T>
-void shareInputGLImage(Workspace* ws, const std::string& name, GLImageVector<T>* input) {
- auto* blob = ws->GetBlob(name);
- CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
- blob->ShareExternal<GLImageVector<T>>(input);
-}
-
-template <class T>
-const GLImageVector<T>* extractOutputGLImage(Workspace* ws, const std::string& name) {
- auto* blob = ws->GetBlob(name);
- CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
- return &blob->template Get<GLImageVector<T>>();
-}
-
-const NetDef create_gl_run_net(const NetDef& init_net,
- const NetDef& run_net,
- bool use_texture_input) {
- NetDef gl_run_net;
- if (!tryConvertToOpenGL(init_net, run_net, &gl_run_net, use_texture_input)) {
- CAFFE_THROW("Failed to convert model to OpenGL");
- }
- return gl_run_net;
-}
-
-GLPredictor::GLPredictor(const NetDef& init_net,
- const NetDef& run_net,
- bool use_texture_input,
- Workspace* parent)
- : Predictor(init_net, create_gl_run_net(init_net, run_net, use_texture_input), parent) {}
-
-GLPredictor::~GLPredictor() {}
-
-template <class T>
-bool GLPredictor::run(std::vector<GLImageVector<T>*>& inputs,
- std::vector<const GLImageVector<T>*>* outputs) {
- const NetDef& run_net_ = Predictor::def();
- CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size());
- for (auto i = 0; i < inputs.size(); ++i) {
- shareInputGLImage<T>(Predictor::ws(), run_net_.external_input(i), inputs[i]);
- }
-
- if (!Predictor::ws()->RunNet(run_net_.name())) {
- return false;
- }
-
- for (auto i = 0; i < run_net_.external_output_size(); ++i) {
- outputs->push_back(extractOutputGLImage<T>(Predictor::ws(), run_net_.external_output(i)));
- }
-
- return true;
-}
-
-template bool GLPredictor::run(std::vector<GLImageVector<uint8_t>*>& inputs,
- std::vector<const GLImageVector<uint8_t>*>* outputs);
-} // namespace caffe2
+++ /dev/null
-
-#pragma once
-
-#include "GLImage.h"
-#include "caffe2/core/net.h"
-#include "caffe2/predictor/predictor.h"
-
-namespace caffe2 {
-class GLPredictor : public Predictor {
- public:
- GLPredictor(const NetDef& init_net,
- const NetDef& run_net,
- bool use_texture_input = false,
- Workspace* parent = nullptr);
-
- template <class T>
- bool run(std::vector<GLImageVector<T>*>& inputs, std::vector<const GLImageVector<T>*>* outputs);
-
- ~GLPredictor();
-};
-} // namespace caffe2
+++ /dev/null
-
-#include "GLTexture.h"
-#include "DataTransfer.h"
-#include "GLPBO.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/timer.h"
-
-#if CAFFE2_ANDROID && defined(__ARM_NEON__)
-
-#include "../android/AndroidGLContext.h"
-
-// https://community.arm.com/thread/10002
-void arm_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz) {
- if (sz & 63) {
- sz = (sz & -64) + 64;
- }
-
- asm volatile(
- "NEONCopyPLD: \n"
- " VLDM %[src]!,{d0-d7} \n"
- " VSTM %[dst]!,{d0-d7} \n"
- " SUBS %[sz],%[sz],#0x40 \n"
- " BGT NEONCopyPLD \n"
- : [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(sz)
- :
- : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
-}
-#endif
-
-const GLTexture::Type GLTexture::FP16 = {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT};
-const GLTexture::Type GLTexture::UI8 = {GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE};
-const GLTexture::Type GLTexture::FP16_COMPAT = {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT};
-
-void GLTexture::map_read(std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const {
- GLPBO* pbo = GLPBO::getContext();
- pbo->mapTextureData(_textureId, _width, _height, _stride, _channels, _type, process);
-}
-
-void GLTexture::map_load(std::function<void(void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const {
- const int alignment = 32; // 4 * _type.dataSize();
- void* buffer = nullptr;
- size_t buffer_size = _width * _height * _channels * _type.dataSize();
-
-#ifdef __ANDROID__
- buffer = (void*)memalign(alignment, buffer_size);
-#else
- posix_memalign((void**)&buffer, alignment, buffer_size);
-#endif
- CAFFE_ENFORCE(buffer);
-
- process(buffer, _width, _height, _width, _channels, _type);
- loadData(buffer);
- free(buffer);
-}
-
-void GLTexture::loadData(const void* pixels) const {
- glBindTexture(GL_TEXTURE_2D, _textureId);
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, _width, _height, _type.format, _type.type, pixels);
- glBindTexture(GL_TEXTURE_2D, 0);
-}
+++ /dev/null
-
-#pragma once
-#include "GL.h"
-#include "GLLogging.h"
-
-class GLTexture {
- public:
- struct Type {
- const GLenum internalFormat;
- const GLenum format;
- const GLenum type;
-
- int dataSize() const {
- switch (type) {
- case GL_UNSIGNED_INT:
- return 4;
- case GL_HALF_FLOAT:
- return 2;
- case GL_UNSIGNED_BYTE:
- return 1;
- default:
- throw std::runtime_error("Unknown Texture Type");
- }
- }
-
- int channels() const {
- switch (format) {
- case GL_R8:
- return 1;
- case GL_RG8:
- return 2;
- // case GL_BGRA:
- case GL_RG_INTEGER:
- case GL_RGBA:
- return 4;
- default:
- throw std::runtime_error("Unknown Texture Format");
- }
- }
- };
-
- static const Type FP16;
- static const Type FP16_COMPAT;
- static const Type UI8;
-
- protected:
- const Type& _type;
-
- const GLsizei _width;
- const GLsizei _height;
- const GLsizei _stride;
- const GLsizei _channels;
- const bool _use_padding;
-
- GLint _filter;
- GLint _wrap;
- GLuint _textureId;
-
- public:
- GLTexture(const Type& type,
- int width,
- int height,
- int stride,
- bool use_padding,
- GLint filter,
- GLint wrap)
- : _type(type),
- _width(width),
- _height(height),
- _stride(stride),
- _channels(type.channels()),
- _use_padding(use_padding),
- _filter(filter),
- _wrap(wrap) {}
-
- GLTexture(const Type& type, int width, int height, bool use_padding, GLint filter, GLint wrap)
- : GLTexture(type,
- width,
- height,
- use_padding ? (width + 7) / 8 * 8 : width,
- use_padding,
- filter,
- wrap) {}
-
- virtual ~GLTexture() {}
- virtual GLuint name() const = 0;
- virtual GLenum target() const = 0;
- virtual bool flipped() const = 0;
-
- virtual void map_read(std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const;
-
- virtual void map_load(std::function<void(void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const;
-
- void loadData(const void* pixels) const;
-};
+++ /dev/null
-
-#pragma once
-
-#include "GLImageAllocator.h"
-
-namespace caffe2 {
-
-template <class T>
-class ImageAllocator {
- GLImageAllocator<T>* glImageAllocator;
-
- public:
- ImageAllocator() : glImageAllocator(GLImageAllocator<T>::newGLImageAllocator()) {}
-
- virtual ~ImageAllocator() { delete glImageAllocator; }
-
- GLImageVector<T>* newImage(
- int num_images, int width, int height, int channels, bool is_output = false) {
- const int tile_x = 1, tile_y = 1;
- return glImageAllocator->newImage(
- num_images, width, height, channels, tile_x, tile_y, is_output);
- }
-
- GLImageVector<T>* newImage(int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- bool is_output = false) {
- return glImageAllocator->newImage(
- num_images, width, height, channels, tile_x, tile_y, is_output);
- }
-
- GLImageVector<T>* newImage(
- int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- std::function<const GLTexture*(const int width, const int height)> textureAllocator) {
- return glImageAllocator->newImage(
- num_images, width, height, channels, tile_x, tile_y, textureAllocator);
- }
-};
-} // namespace caffe2
+++ /dev/null
-
-#pragma once
-
-#include "caffe2/core/common.h"
-
-#ifdef __ARM_NEON__
-#if CAFFE2_IOS
-#include "arm_neon.h"
-#elif CAFFE2_ANDROID
-#include "caffe2/mobile/contrib/opengl/android/arm_neon_support.h"
-#endif
-#endif
+++ /dev/null
-
-#include "rewrite_net.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/proto_utils.h"
-#include <unordered_map>
-#include <unordered_set>
-
-#ifdef CAFFE2_ANDROID
-#include "../android/AndroidGLContext.h"
-#endif
-
-namespace caffe2 {
-
-struct Analysis {
- struct SSA {
- using BlobVersions = std::unordered_map<std::string, size_t>;
- BlobVersions inVersions;
- BlobVersions outVersions;
- };
- std::vector<SSA> ssa;
- std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
-};
-
-static Analysis analyzeNet(const NetDef& net) {
- Analysis::SSA::BlobVersions frontier;
- Analysis analysis;
-
- auto play = [&](size_t i, const OperatorDef& op) {
- Analysis::SSA::BlobVersions inVersions;
- for (const auto& s : op.input()) {
- inVersions[s] = frontier[s];
- analysis.inUsages[s][frontier[s]].push_back(i);
- }
- Analysis::SSA::BlobVersions outVersions;
- for (const auto& s : op.output()) {
- if (frontier.find(s) != frontier.end()) {
- frontier[s] += 1;
- }
- outVersions[s] = frontier[s];
- }
- analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
- };
-
- for (auto i = 0; i < net.op_size(); ++i) {
- play(i, net.op(i));
- }
- return analysis;
-}
-
-static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
- auto* op = predictNet.add_op();
- op->set_name("CopyToOpenGL");
- op->set_type("CopyToOpenGL");
- op->add_input(cpu_blob);
- op->add_output(cpu_blob + "_M");
-}
-
-static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) {
- // add argument "is_last" to the last op to signal this is the last operator before the
- // CopyFromOpenGL op
- auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1);
- auto* arg = last_op->add_arg();
- arg->set_name("is_last");
- arg->set_i(1);
-
- auto* op = predictNet.add_op();
- op->set_name("CopyFromOpenGL");
- op->set_type("CopyFromOpenGL");
- op->add_input(cpu_blob + "_M");
- op->add_output(cpu_blob);
-}
-
-static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) {
- // Do some validation of the outputs. For this version, we require:
- // - a single input (first element of external_input()) is consumed by the NetDef
- // - a single output (first element of external_output()) is produced by the NetDef.
- // - the input is consumed by def.op(0), and this is the only consumer.
- // - the output is produced by def.op(-1).
- CAFFE_ENFORCE_GE(def.external_input_size(), 1);
- CAFFE_ENFORCE_GE(def.external_output_size(), 1);
- auto analysis = analyzeNet(def);
- // enforce a single use of the input blob.
- CAFFE_ENFORCE_GE(def.op_size(), 1);
-
- const auto& inputBlob = def.external_input(0);
- // Enforce that the input blob has a single usage - in the first operator.
- CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
- // Enforce that the external_output(0) blob is produced by the last operator in this sequence.
- const auto& outputBlob = def.external_output(0);
- CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) !=
- analysis.ssa.back().outVersions.end());
- const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
- // This should hold true by definition of the SSA analysis.
- CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
- analysis.inUsages[outputBlob].end());
-
- NetDef mdef;
- mdef.CopyFrom(def);
- mdef.clear_op();
-
- std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
- cpu_blobs[def.external_input(0)].insert(0);
-
- for (auto i = 0; i < def.op_size(); i++) {
- const auto& currentOp = def.op(i);
- if (glOps.count(currentOp.type()) > 0) {
- // OpenGL Op
- // insert copyToOpenGLOp
- for (auto j = 0; j < currentOp.input_size(); j++) {
- auto& input = currentOp.input(j);
- auto version = analysis.ssa[i].inVersions[input];
- if (cpu_blobs[input].count(version) > 0) {
- insertCopyToGPUOp(mdef, input);
- gpu_blobs[input].insert(version);
- cpu_blobs[input].erase(version);
- }
- // Only the first input should be OpenGL texture
- // Otherwise, copyToOpenGLOp will be inserted for the weights,
- // which are outputs of QuantDecode
- if (currentOp.type().find("OpenGLConv") == 0) {
- if (j == 0) {
- break;
- }
- }
- }
-
- auto* op = mdef.add_op();
- op->CopyFrom(currentOp);
-
- // swap input blob
- for (auto j = 0; j < currentOp.input_size(); j++) {
- auto& input = currentOp.input(j);
- auto version = analysis.ssa[i].inVersions[input];
- if (gpu_blobs[input].count(version) > 0) {
- op->set_input(j, input + "_M");
- }
- }
-
- // swap output blob
- for (auto j = 0; j < currentOp.output_size(); j++) {
- auto& output = currentOp.output(j);
- auto version = analysis.ssa[i].outVersions[output];
- op->set_output(j, output + "_M");
- gpu_blobs[output].insert(version);
- }
- // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op
- if (i == def.op_size() - 1) {
- insertCopyFromGPUOp(mdef, currentOp.output(0));
- }
- } else {
- // CPU Op
- // insert copyFromOpenGLOp
- for (auto j = 0; j < currentOp.input_size(); j++) {
- auto& input = currentOp.input(j);
- auto version = analysis.ssa[i].inVersions[input];
- if (gpu_blobs[input].count(version) > 0) {
- insertCopyFromGPUOp(mdef, input);
- }
- }
- auto* op = mdef.add_op();
- op->CopyFrom(currentOp);
- for (auto j = 0; j < currentOp.output_size(); j++) {
- auto& output = currentOp.output(j);
- auto version = analysis.ssa[i].outVersions[output];
- cpu_blobs[output].insert(version);
- }
- }
- }
- return mdef;
-}
-
-static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
- const OperatorDef& nextOp,
- OperatorDef* fusedOp,
- std::unordered_set<std::string>& glOps) {
- // Check for possible invalid opportunities.
- if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
- return false;
- }
- // The fused op cannot be inplace
- if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
- return false;
- }
-
- static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
- {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"},
- {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"},
- {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"},
- {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}};
- auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
- if (it == fusionOpportunities.end()) {
- return false;
- }
-
- glOps.insert(it->second);
- fusedOp->CopyFrom(currentOp);
- fusedOp->set_output(0, nextOp.output(0));
- fusedOp->set_type(it->second);
- for (auto i = 1; i < nextOp.input_size(); i++) {
- fusedOp->add_input(nextOp.input(i));
- }
- return true;
-}
-
-static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
- CHECK_GE(def.op_size(), 1);
- NetDef mdef;
- mdef.CopyFrom(def);
- mdef.clear_op();
- auto i = 0;
-
- while (i < def.op_size()) {
- if (i == def.op_size() - 1) {
- VLOG(2) << "Last operator, skipping";
- auto* op = mdef.add_op();
- op->CopyFrom(def.op(i));
- i += 1;
- continue;
- }
-
- const auto& currentOp = def.op(i);
- const auto& nextOp = def.op(i + 1);
- OperatorDef fusedOp;
- if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
- VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
- // We can fuse.
- auto* op = mdef.add_op();
- op->CopyFrom(fusedOp);
- i += 2;
- continue;
- }
- VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
- // Just emit the current type.
- auto* op = mdef.add_op();
- op->CopyFrom(currentOp);
- i += 1;
- }
- return mdef;
-}
-
-void dumpDefForOpenGL(const NetDef& d) {
- for (const auto& op : d.op()) {
- LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0);
- }
-}
-
-// // For debugging
-// void dumpDefForOpenGL(const NetDef &net) {
-// for (const auto &op : net.op()) {
-// printf("***Operator: %s\n", op.type().c_str());
-// for (auto input : op.input()) {
-// printf("\tInput: %s\n", input.c_str());
-// }
-//
-// for (auto output : op.output()) {
-// printf("\tOutput: %s\n", output.c_str());
-// }
-// }
-//}
-
-NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) {
- CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
- NetDef net;
- net.CopyFrom(predictNet);
-
- std::unordered_map<std::string, std::string> replacements(
- {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess",
- useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess"
- : "OpenGLTensorToTextureStylizerPreprocess"},
- {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess",
- useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess"
- : "OpenGLTextureToTensorStylizerDeprocess"}});
-
- std::unordered_set<std::string> openGLOps; // Used to insert copy ops
- bool needCopyOps = false;
-
- const auto& opKeyList = CPUOperatorRegistry()->Keys();
- auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end());
-
-#ifdef CAFFE2_ANDROID
- // TODO: debug InstanceNorm models on Mali devices
- AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext();
- if (context->get_platform() == Mali) {
- opKeySet.erase("OpenGLInstanceNorm");
- opKeySet.erase("OpenGLInstanceNormPRelu");
- }
-#endif
- for (auto i = 0; i < net.op_size(); ++i) {
- auto* op = net.mutable_op(i);
- string openGLOp = std::string("OpenGL") + op->type();
- if (replacements.count(openGLOp) > 0) {
- openGLOp = replacements[openGLOp];
- }
-
- if (opKeySet.find(openGLOp) != opKeySet.end()) {
- op->set_type(openGLOp);
- openGLOps.insert(openGLOp);
-
- if (useTiling) {
- auto* arg = op->add_arg();
- arg->set_name("tiling");
- arg->set_i(1);
- }
- } else {
- needCopyOps = true;
- }
- }
-
- if (useTextureInput && needCopyOps) {
- CAFFE_THROW("OpenGL operator missing");
- }
-
- if (runFusion) {
- net = runOpenGLFusion(net, openGLOps);
- }
-
- if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) {
- // For end-to-end testing
- if (net.op(net.op_size() - 1).type() !=
- replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) {
- auto* last_op = net.mutable_op(net.op_size() - 1);
- auto output = last_op->output(0) + "M";
- last_op->set_output(0, output);
- auto* copy_op = net.add_op();
- copy_op->set_name("CopyFromOpenGL");
- copy_op->set_type("CopyFromOpenGL");
- copy_op->add_input(output);
- // rename output blob in case input and output blob has the same name
- copy_op->add_output(net.external_output(0));
- }
- } else {
- if (!useTextureInput) {
- needCopyOps = true;
- }
- }
-
- // copy ops are needed when the input is not a texture
- if (needCopyOps) {
- // For non style transfer cases
- net = insertInputOutputCopyOps(net, openGLOps);
- }
-
- return net;
-}
-
-bool tryConvertToOpenGL(const NetDef& initNet,
- const NetDef& predictNet,
- NetDef* glPredictNet,
- bool useTextureInput,
- bool useTiling,
- bool runFusion) {
- try {
- // Throws if unsupported operators are found.
- *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion);
- dumpDefForOpenGL(*glPredictNet);
- // Throws if unsupported parameters are found.
- Workspace ws;
- ws.RunNetOnce(initNet);
- ws.CreateNet(*glPredictNet);
- LOG(INFO) << "OpenGL is successfully enabled";
- return true;
- } catch (const std::exception& e) {
- LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
- return false;
- }
-}
-} // namespace caffe2
+++ /dev/null
-
-#pragma once
-#include "GLPredictor.h"
-#include "caffe2/predictor/predictor.h"
-
-namespace caffe2 {
-bool tryConvertToOpenGL(const NetDef& initNet,
- const NetDef& predictNet,
- NetDef* glPredictNet,
- bool useTextureInput = false,
- bool useTiling = false,
- bool runFusion = true);
-
-// Exposed for testing
-NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
- bool useTextureInput = false,
- bool useTiling = false,
- bool runFusion = true);
-void dumpDefForOpenGL(const NetDef& net);
-} // namespace caffe2
+++ /dev/null
-file(GLOB_RECURSE tmp *.mm *.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
+++ /dev/null
-
-#include "IOSGLContext.h"
-
-std::unique_ptr<GLContext> GLContext::_glcontext = nullptr;
-
-void GLContext::initGLContext() {
- if (_glcontext == nullptr) {
- _glcontext.reset(new IOSGLContext());
- }
-}
-
-GLContext* GLContext::getGLContext() {
- if (_glcontext == nullptr) {
- initGLContext();
- }
- return _glcontext.get();
-}
-
-void GLContext::deleteGLContext() { _glcontext.reset(nullptr); }
+++ /dev/null
-
-#include "IOSGLImageAllocator.h"
-#include <arm_neon.h>
-
-template <typename T>
-GLImageAllocator<T>* GLImageAllocator<T>::newGLImageAllocator() {
- return new IOSGLImageAllocator<T>();
-}
-
-template GLImageAllocator<float16_t>* GLImageAllocator<float16_t>::newGLImageAllocator();
-template GLImageAllocator<uint8_t>* GLImageAllocator<uint8_t>::newGLImageAllocator();
+++ /dev/null
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-class IOSGLContext : public GLContext {
- void* oglContext;
- void* oldContext;
- CVOpenGLESTextureCacheRef textureCache;
-
- public:
- IOSGLContext();
- ~IOSGLContext();
-
- const GLTexture* createNewTexture(CVPixelBufferRef pixelBuffer, const GLTexture::Type& type);
- void set_context();
- void reset_context();
- void flush_context();
-};
+++ /dev/null
-
-#include "IOSGLContext.h"
-#include "IOSGLTexture.h"
-#import <sstream>
-
-#import <OpenGLES/EAGL.h>
-
-IOSGLContext::IOSGLContext() {
- auto const currentContext = [EAGLContext currentContext];
- oldContext = (void*)CFBridgingRetain(currentContext);
-
- if (currentContext != nil && [currentContext API] == kEAGLRenderingAPIOpenGLES3) {
- oglContext = (void*)CFBridgingRetain(currentContext);
-
- gl_log(GL_LOG, "Reusing current context %p\n", oglContext);
- } else {
- oglContext =
- (void*)CFBridgingRetain([[EAGLContext alloc] initWithAPI:kEAGLRenderingAPIOpenGLES3]);
-
- gl_log(GL_LOG, "Created a new context %p\n", oglContext);
- }
-
- if (!oglContext) {
- throw std::runtime_error("Problem with OpenGL context");
- }
-
- set_context();
- textureCache = NULL;
- CVReturn err = CVOpenGLESTextureCacheCreate(
- kCFAllocatorDefault, NULL, (__bridge EAGLContext*)oglContext, NULL, &textureCache);
-
- if (err) {
- std::stringstream errmsg;
- errmsg << "Error at CVOpenGLESTextureCacheCreate " << err;
- throw std::runtime_error(errmsg.str());
- }
-}
-
-IOSGLContext::~IOSGLContext() {
- gl_log(GL_VERBOSE, "~IOSGLContext()");
-
- set_context();
- if (textureCache) {
- CFRelease(textureCache);
- textureCache = 0;
- }
- reset_context();
-
- // Explicitly release only after we `reset_context` since otherwise we are going to read from a
- // dangling pointer.
- if (oglContext) {
- CFBridgingRelease(oglContext);
- }
- if (oldContext) {
- CFBridgingRelease(oldContext);
- }
-}
-
-const GLTexture* IOSGLContext::createNewTexture(CVPixelBufferRef pixelBuffer,
- const GLTexture::Type& type) {
- return new IOSGLTexture(type, textureCache, pixelBuffer);
-}
-
-void IOSGLContext::set_context() {
- auto const currentContext = [EAGLContext currentContext];
-
- if ((__bridge void*)currentContext != oglContext) {
- if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oglContext]) {
- throw std::runtime_error("Problem setting OpenGL context");
- }
- GLenum glError = glGetError();
- if (glError != GL_NO_ERROR) {
- gl_log(GL_ERR, "There is an error: 0x%X\n", glError);
- }
- gl_log(GL_VERBOSE, "Set context to %p\n", oglContext);
- }
-}
-
-void IOSGLContext::reset_context() {
- EAGLContext* currentContext = [EAGLContext currentContext];
-
- if ((__bridge void*)currentContext != oldContext) {
- GLenum glError = glGetError();
- if (glError != GL_NO_ERROR) {
- gl_log(GL_ERR, "There is an error before: 0x%X\n", glError);
- }
- if (![EAGLContext setCurrentContext:(__bridge EAGLContext*)oldContext]) {
- throw std::runtime_error("Problem setting OpenGL context");
- }
- glError = glGetError();
- if (glError != GL_NO_ERROR) {
- gl_log(GL_ERR, "There is an error after: 0x%X\n", glError);
- }
- gl_log(GL_VERBOSE, "Reset context to %p\n", oldContext);
- }
-}
-
-void IOSGLContext::flush_context() { CVOpenGLESTextureCacheFlush(textureCache, 0); }
+++ /dev/null
-
-#include "IOSGLImageAllocator.h"
-
-#include "../core/GLImage.h"
-#include "../core/GLImageAllocator.h"
-#include "../core/GLPlainTexture.h"
-
-#include "IOSGLContext.h"
-#include "IOSGLTexture.h"
-
-#include "../core/arm_neon_support.h"
-
-template <class T>
-GLImageVector<T>* IOSGLImageAllocator<T>::newImage(int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- bool useCVPixelBuffer) {
- GLImageVector<T>* output_images =
- new GLImageVector<T>(num_images, width, height, channels, tile_x, tile_y);
- if (useCVPixelBuffer) {
- IOSGLContext* gl_context = (IOSGLContext*)GLContext::getGLContext();
- for (int i = 0; i < num_images; i++) {
- GLImage<T>* output_image = new GLImage<T>(
- width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
- gl_log(GL_VERBOSE,
- "%s pixelbuffers.size(): %ld\n",
- __PRETTY_FUNCTION__,
- pixelbuffers.size());
-
- CVPixelBufferRef buffer = NULL;
- int slices = (channels + 3) / 4;
- int slice_index = i * slices + slice;
- if (pixelbuffers.size() < slice_index + 1) {
- const int texture_width = width * tile_x;
- const int texture_height = height * tile_y;
- buffer =
- IOSGLTexture::createCVPixelBuffer(pixelFormat, texture_width, texture_height);
- gl_log(GL_VERBOSE,
- "created a new buffer %p for image %d slice %d of dimensions %dx%d\n",
- buffer,
- i,
- slice,
- texture_width,
- texture_height);
- pixelbuffers.push_back(buffer);
- } else {
- buffer = pixelbuffers[slice_index];
-
- gl_log(GL_VERBOSE, "reused buffer %p for image %d slice %d\n", buffer, i, slice);
- }
-
- return gl_context->createNewTexture(buffer, GLImageAllocator<T>::type);
- });
- output_images->push_back(output_image);
- }
- } else {
- for (int i = 0; i < num_images; i++) {
- GLImage<T>* image = new GLImage<T>(
- width, height, channels, tile_x, tile_y, [&](int slice) -> const GLTexture* {
- return new GLPlainTexture(
- GLImageAllocator<T>::type, nullptr, width * tile_x, height * tile_y);
- });
- output_images->push_back(image);
- }
- }
- return output_images;
-}
-
-template <>
-const FourCharCode IOSGLImageAllocator<float16_t>::pixelFormat = kCVPixelFormatType_64RGBAHalf;
-template <>
-const FourCharCode IOSGLImageAllocator<uint8_t>::pixelFormat = kCVPixelFormatType_32BGRA;
-
-template class IOSGLImageAllocator<float16_t>;
-template class IOSGLImageAllocator<uint8_t>;
+++ /dev/null
-
-#pragma once
-
-#include "../core/GLImageAllocator.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-template <class T>
-class IOSGLImageAllocator : public GLImageAllocator<T> {
- static const GLTexture::Type& type;
-
- std::vector<CVPixelBufferRef> pixelbuffers;
-
- public:
- static const FourCharCode pixelFormat;
-
- IOSGLImageAllocator() : GLImageAllocator<T>() { gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__); }
-
- ~IOSGLImageAllocator() {
- gl_log(GL_VERBOSE, "%s\n", __PRETTY_FUNCTION__);
-
- for (auto&& pixelbuffer : pixelbuffers) {
- CFRelease(pixelbuffer);
- }
- }
-
- GLImageVector<T>* newImage(int num_images,
- int width,
- int height,
- int channels,
- int tile_x,
- int tile_y,
- bool useCVPixelBuffer);
-};
+++ /dev/null
-
-#pragma once
-
-#include "../core/GLContext.h"
-#include "../core/GLTexture.h"
-
-#import <CoreVideo/CoreVideo.h>
-
-class IOSGLTexture : public GLTexture {
- CVOpenGLESTextureRef textureRef;
-
- IOSGLTexture(const Type& type,
- CVOpenGLESTextureCacheRef textureCache,
- CVPixelBufferRef sourceImage,
- GLint _filter = GL_NEAREST,
- GLint _wrap = GL_CLAMP_TO_EDGE);
-
- friend class IOSGLContext;
-
- public:
- const CVPixelBufferRef sourceImage;
-
- ~IOSGLTexture() { CFRelease(textureRef); }
-
- void map_buffer(std::function<void(void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const;
-
- virtual void map_read(std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const;
-
- virtual void map_load(std::function<void(void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const;
-
- GLuint name() const { return CVOpenGLESTextureGetName(textureRef); }
- GLenum target() const { return CVOpenGLESTextureGetTarget(textureRef); };
- bool flipped() const { return CVOpenGLESTextureIsFlipped(textureRef); };
-
- static CVPixelBufferRef createCVPixelBuffer(OSType pixelType, int32_t width, int32_t height);
-};
+++ /dev/null
-
-#include "IOSGLTexture.h"
-#include "../core/DataTransfer.h"
-
-// Builds a GL texture backed by the given CVPixelBuffer, created through the
-// supplied Core Video texture cache, and configures its filter and wrap modes.
-//
-// type         - texture format description (internalFormat / format / type are
-//                forwarded to the texture cache call).
-// textureCache - cache used to create the CVOpenGLESTexture.
-// _sourceImage - pixel buffer providing the storage; kept in `sourceImage`.
-// filter       - value used for both GL_TEXTURE_MIN_FILTER and GL_TEXTURE_MAG_FILTER.
-// wrap         - value used for GL_TEXTURE_WRAP_S/T, unless overridden below.
-IOSGLTexture::IOSGLTexture(const Type& type,
- CVOpenGLESTextureCacheRef textureCache,
- CVPixelBufferRef _sourceImage,
- GLint filter,
- GLint wrap)
- : GLTexture(type,
- CVPixelBufferGetWidth(_sourceImage),
- CVPixelBufferGetHeight(_sourceImage),
- // bytes per row divided by bytes per pixel (channels * element size)
- CVPixelBufferGetBytesPerRow(_sourceImage) / (type.channels() * type.dataSize()),
- false,
- filter,
- wrap),
- sourceImage(_sourceImage) {
- CVReturn err = CVOpenGLESTextureCacheCreateTextureFromImage(kCFAllocatorDefault,
- textureCache,
- _sourceImage,
- NULL,
- GL_TEXTURE_2D,
- _type.internalFormat,
- _width,
- _height,
- _type.format,
- _type.type,
- 0,
- &textureRef);
-
- // NOTE(review): a failure is only logged here; execution continues and
- // name() is then called on textureRef. Confirm whether gl_log(GL_ERR, ...)
- // aborts, otherwise this path uses an invalid texture reference.
- if (!textureRef || err) {
- gl_log(GL_ERR,
- "something went wrong, sourceImage: %p, width: %d, height: %d, filter: %d, wrap: %d\n",
- _sourceImage,
- _width,
- _height,
- filter,
- wrap);
- }
- _textureId = name();
- gl_log(
- GL_VERBOSE,
- "IOSGLTexture() - allocated textureId %d, internalFormat: 0x%X, format: 0x%X, type: 0x%X\n",
- _textureId,
- _type.internalFormat,
- _type.format,
- _type.type);
-
- // Bind on texture unit 0 so the parameter calls below apply to this texture.
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, _textureId);
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
-
-#if GL_EXT_texture_border_clamp
- // When the border-clamp extension is compiled in, the caller's `wrap`
- // argument is ignored in favour of clamping to a transparent black border.
- GLfloat borderColor[] = {0.0f, 0.0f, 0.0f, 0.0f};
- glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR_EXT, borderColor);
- // Set the texture to use the border clamp wrapping mode.
- wrap = GL_CLAMP_TO_BORDER_EXT;
-#endif
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, wrap);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, wrap);
-
- // Leave no texture bound to GL_TEXTURE_2D on exit.
- glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-// Creates a width x height CVPixelBuffer of the given pixel format, requesting
-// OpenGL ES compatibility and IOSurface backing. Returns NULL when
-// CVPixelBufferCreate leaves the output untouched; its status code is not inspected.
-CVPixelBufferRef IOSGLTexture::createCVPixelBuffer(OSType pixelFormat,
-                                                   int32_t width,
-                                                   int32_t height) {
-  // An empty IOSurface properties dictionary is passed under the IOSurface key.
-  NSDictionary* ioSurfaceProperties = @{/*empty dictionary*/};
-  NSDictionary* creationAttributes = @{
-    (id)kCVPixelBufferPixelFormatTypeKey : @(pixelFormat),
-    (id)kCVPixelFormatOpenGLESCompatibility : @YES,
-    (id)kCVPixelBufferIOSurfacePropertiesKey : ioSurfaceProperties
-  };
-  CFDictionaryRef attributesRef = (__bridge CFDictionaryRef)(creationAttributes);
-
-  CVPixelBufferRef created = NULL;
-  CVPixelBufferCreate(kCFAllocatorDefault, width, height, pixelFormat, attributesRef, &created);
-  return created;
-}
-
-// Locks the backing pixel buffer, hands its base address and geometry to
-// `process`, then unlocks it. If the lock cannot be taken, `process` is not
-// called and the function returns silently.
-void IOSGLTexture::map_buffer(std::function<void(void* buffer,
-                                                 size_t width,
-                                                 size_t height,
-                                                 size_t stride,
-                                                 size_t channels,
-                                                 const Type& type)> process) const {
-  if (CVPixelBufferLockBaseAddress(sourceImage, 0) != kCVReturnSuccess) {
-    return;
-  }
-
-  void* base_address = CVPixelBufferGetBaseAddress(sourceImage);
-  // Row pitch expressed in pixels: bytes per row over bytes per pixel.
-  int pixels_per_row = CVPixelBufferGetBytesPerRow(sourceImage) / (_channels * _type.dataSize());
-  process(base_address, _width, _height, pixels_per_row, _channels, _type);
-
-  CVPixelBufferUnlockBaseAddress(sourceImage, 0);
-}
-
-// Gives `process` writable access to the pixel buffer contents by forwarding
-// directly to map_buffer().
-void IOSGLTexture::map_load(std::function<void(void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const {
- map_buffer(process);
-}
-
-// Gives `process` read-only access to the pixel buffer contents. Issues a
-// glFlush() first, then forwards to map_buffer().
-void IOSGLTexture::map_read(std::function<void(const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const Type& type)> process) const {
- // TODO: why is glFlush() only necessary when running tests
- // NOTE(review): glFlush() only submits queued commands and does not wait for
- // them to finish - confirm this is sufficient before reading the buffer.
- glFlush();
-
- map_buffer(process);
-}
+++ /dev/null
-# Gather every .cc file below this directory and append the result to the
-# parent scope's Caffe2_CPU_SRCS list.
-file(GLOB_RECURSE opengl_operator_srcs *.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${opengl_operator_srcs} PARENT_SCOPE)
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-// GL filter that adds two input textures texel by texel (see fragment_shader).
-class GLAdd : public GLFilter {
- public:
- // Shader bindings: the two input samplers and the output size uniform.
- binding* inputData[2];
- binding* outputSize;
-
- // Registers the filter under the name "GLAdd" with the three bindings above
- // and no uniform blocks, attributes or shader text replacements.
- GLAdd()
- : GLFilter("GLAdd",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>(
- {BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {/* no replacements */}) {}
-
- // Runs the shader once per slice of every image pair, writing into the
- // corresponding output slice.
- template <typename T>
- void add(const GLImageVector<T>& input_image0,
- const GLImageVector<T>& input_image1,
- const GLImageVector<T>& output_image);
-
- // GLSL source of the fragment stage.
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-// Fragment shader for GLAdd: converts the interpolated texture coordinate to
-// an integer texel coordinate using outputSize, loads that texel from both
-// inputs and stores their sum. TEXTURE_INPUT / TEXTURE_OUTPUT / TEXTURE_LOAD /
-// TEXTURE_STORE are not defined in this string; presumably GLFilter supplies
-// them before compilation - TODO confirm.
-
-const char* GLAdd::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData[2]);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
- vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
- vec4 value = A + B;
- outputData = TEXTURE_STORE(value);
-}
-
-)GLSL";
-
-// Adds input_images0 and input_images1 slice by slice into output_images.
-//
-// The image count and the slice count are both taken from input_images0; the
-// other two vectors are indexed with the same indices and are not checked
-// here, so the caller must pass vectors with matching geometry.
-template <typename T>
-void GLAdd::add(const GLImageVector<T>& input_images0,
-                const GLImageVector<T>& input_images1,
-                const GLImageVector<T>& output_images) {
-  const int num_images = input_images0.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image0 = input_images0[i];
-    GLImage<T>* input_image1 = input_images1[i];
-    GLImage<T>* output_image = output_images[i];
-    const int input_slices = input_image0->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      // One texture from each input, bound to the matching shader sampler.
-      std::vector<texture_attachment> input_attachments{
-          {input_image0->textures[is], inputData[0]},
-          {input_image1->textures[is], inputData[1]}};
-
-      // Render into exactly one output texture: the slice with the same index.
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height); },
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-// Caffe2 operator "OpenGLAdd": element-wise sum of two GLImageVector inputs,
-// computed on the GPU through GLAdd. Both inputs must have the same number of
-// images, channels, width, height and tile layout; the output is a newly
-// allocated GLImageVector with that same geometry.
-template <typename T>
-class OpenGLAddOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- // Rejects operator definitions that carry a "broadcast" or "axis" argument.
- OpenGLAddOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {
- OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false,
- "OpenGLAdd does not support broadcast");
-
- OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLAdd does not support axis");
- }
-
- // Validates that the two inputs match, allocates the output, runs the add
- // filter and stores the result in output 0. Returns true on completion;
- // geometry mismatches are reported through CAFFE_ENFORCE_EQ.
- bool RunOnDevice() override {
- const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
- const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
-
- CAFFE_ENFORCE_EQ(input0.size(), input1.size());
-
- const int num_images = input0.size();
- const int input_channels = input0.channels();
- const int input_width = input0.width();
- const int input_height = input0.height();
- const int input_tile_x = input0.tile_x();
- const int input_tile_y = input0.tile_y();
-
- CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
- CAFFE_ENFORCE_EQ(input1.width(), input_width);
- CAFFE_ENFORCE_EQ(input1.height(), input_height);
- CAFFE_ENFORCE_EQ(input1.tile_x(), input_tile_x);
- CAFFE_ENFORCE_EQ(input1.tile_y(), input_tile_y);
-
- // The output geometry is identical to the input geometry.
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
- const int output_tile_x = input_tile_x;
- const int output_tile_y = input_tile_y;
-
- // Forwarded unchanged to the image allocator; its meaning is defined there.
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
- // The filter is created on first use and reused on later runs.
- if (!_add) {
- _add.reset(new GLAdd());
- }
-
- _add->add(input0, input1, *output);
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- // Lazily constructed add filter.
- std::unique_ptr<GLAdd> _add;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLAdd, OpenGLAddOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLAdd).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-#include "gl_tiling_utils.h"
-
-#include <iostream>
-#include <vector>
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/utils/math.h"
-
-// GL filter that copies input textures into the output image, either one
-// input slice per output texture or, when tiling is enabled, into a tile
-// range of a single output texture (see concat() and fragment_shader).
-class GLConcat : public GLFilter {
- public:
- // True when the tiled shader variant was requested at construction.
- bool tiling_;
- // Shader bindings for the input sampler and the three uniforms.
- binding* inputData;
- binding* outputSize;
- binding* inputTileRange;
- binding* input_tile_x;
-
- // Registers the filter as "GLConcat" and supplies the shader text
- // replacements TILING, OUTPUT_TILES, OUTPUT_TILE_X, OUTPUT_TILE_WIDTH and
- // OUTPUT_TILE_HEIGHT from `tiling` and `output_tile_geometries`.
- GLConcat(tile_descriptor output_tile_geometries, bool tiling = false)
- : GLFilter(
- "GLConcat",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(outputSize),
- BINDING(inputData),
- BINDING(inputTileRange),
- BINDING(input_tile_x)}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"TILING", c10::to_string(tiling)},
- {"OUTPUT_TILES", c10::to_string(output_tile_geometries.tiles)},
- {"OUTPUT_TILE_X",
- c10::to_string(output_tile_geometries.tile_dims.x)},
- {"OUTPUT_TILE_WIDTH",
- c10::to_string(output_tile_geometries.tile_size.x)},
- {"OUTPUT_TILE_HEIGHT",
- c10::to_string(output_tile_geometries.tile_size.y)}}),
- tiling_(tiling) {}
-
- // Concatenates `size` input image vectors into output_image.
- template <typename T>
- void concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_image, int size);
- // GLSL source of the fragment stage.
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-// Fragment shader for GLConcat. The $(NAME) placeholders correspond to the
-// replacement pairs passed to the GLFilter constructor.
-//  - TILING on: the output texel's tile index is derived from its coordinate;
-//    texels whose tile index lies in the half-open range
-//    [inputTileRange.x, inputTileRange.y) are copied from the matching tile of
-//    the input texture, all other texels are discarded.
-//  - TILING off: each texel is copied from the same coordinate of the input.
-
-const char* GLConcat::fragment_shader = R"GLSL(#version 300 es
-#define TILING $(TILING)
-
-// tiling
-#define OUTPUT_TILES $(OUTPUT_TILES)
-#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
-#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-uniform ivec2 outputSize;
-uniform ivec2 inputTileRange; // (]
-uniform int input_tile_x;
-
-#if TILING
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
- ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
- int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
- if (tileNum >= inputTileRange.x && tileNum < inputTileRange.y) {
- tileNum = tileNum - inputTileRange.x;
- texelCoord = ivec2(tileNum % input_tile_x, tileNum / input_tile_x) * ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT) + tileCoord;
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(value);
- } else {
- // early termination
- discard;
- }
-}
-
-#else
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(value);
-}
-#endif
-
-)GLSL";
-
-// Copies every slice of each of the `input_size` inputs into the output
-// images, one render pass per input slice.
-//
-// Without tiling each pass targets the next output texture. With tiling every
-// pass targets the same output texture and the tile range uniform selects
-// which tiles that pass is allowed to write.
-template <typename T>
-void GLConcat::concat(const GLImageVector<T>** input_images, const GLImageVector<T>& output_images, int input_size) {
-  for (int image_idx = 0; image_idx < output_images.size(); image_idx++) {
-    GLImage<T>* destination = output_images[image_idx];
-
-    int first_tile = 0; // running tile offset of the current input in the output
-    int target_slice = 0; // output texture the next pass renders into
-    for (int source_idx = 0; source_idx < input_size; source_idx++) {
-      for (int slice = 0; slice < input_images[source_idx]->slices(); slice++) {
-        GLImage<T>* source = (*input_images[source_idx])[image_idx];
-
-        std::vector<texture_attachment> attachments;
-        attachments.push_back({source->textures[slice], inputData});
-
-        run(attachments,
-            {destination->textures.begin() + target_slice, destination->textures.begin() + target_slice + 1},
-            [&]() {
-              glUniform2i(outputSize->location, destination->texture_width, destination->texture_height);
-              glUniform2i(inputTileRange->location, first_tile, first_tile + source->tile_x * source->tile_y);
-              glUniform1i(input_tile_x->location, source->tile_x);
-            },
-            destination->texture_width,
-            destination->texture_height);
-
-        // for tiling, you always write to the same texture
-        if (!tiling_) {
-          target_slice++;
-        }
-        first_tile += source->tile_x * source->tile_y;
-      }
-    }
-  }
-}
-
-namespace caffe2 {
-// Caffe2 operator "OpenGLConcat": concatenates 2 to 4 GLImageVector inputs
-// along the channel dimension using GLConcat.
-//
-// Requirements checked at run time: every input has the same image count,
-// width and height as input 0, and a channel count divisible by 4. Only NCHW
-// order is accepted. The output has the summed channel count and the spatial
-// size of input 0.
-template <typename T>
-class OpenGLConcatOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
-  OpenGLConcatOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-  }
-
-  // Validates the inputs, allocates the output, runs the concat filter and
-  // stores the result in output 0. Returns true on completion; violations are
-  // reported through CAFFE_ENFORCE_EQ.
-  bool RunOnDevice() override {
-    const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
-    const int num_images = input0.size();
-
-    // Held in a vector rather than a raw new[] array so the storage is
-    // released on every exit path, including when an enforce below throws.
-    std::vector<const GLImageVector<T>*> input_images(Inputs().size());
-    input_images[0] = &input0;
-    int channelCount = input0.channels();
-
-    bool tiling = OperatorBase::GetSingleArgument<int>("tiling", 0);
-
-    // Only supports input channels divisible by 4 for now
-    CAFFE_ENFORCE_EQ(input0.channels() % 4, 0);
-    for (auto i = 1; i < Inputs().size(); i++) {
-      const GLImageVector<T>& inputi = Inputs()[i]->template Get<GLImageVector<T>>();
-      channelCount += inputi.channels();
-      CAFFE_ENFORCE_EQ(num_images, inputi.size());
-      CAFFE_ENFORCE_EQ(inputi.channels() % 4, 0);
-      CAFFE_ENFORCE_EQ(input0.width(), inputi.width());
-      CAFFE_ENFORCE_EQ(input0.height(), inputi.height());
-      input_images[i] = &inputi;
-
-      // A tiled input forces the tiled code path even without the argument.
-      // NOTE(review): input 0 is not inspected here - confirm that a tiled
-      // first input combined with untiled others cannot occur.
-      if (inputi.tile_x() > 1 || inputi.tile_y() > 1) {
-        tiling = true;
-      }
-    }
-
-    const int input_width = input0.width();
-    const int input_height = input0.height();
-
-    const int output_channels = channelCount;
-    const int output_width = input_width;
-    const int output_height = input_height;
-
-    int output_tile_x = 1;
-    int output_tile_y = 1;
-    if (tiling) {
-      computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-    }
-
-    // Forwarded unchanged to the image allocator; its meaning is defined there.
-    int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
-    GLImageVector<T>* output = ImageAllocator<T>::newImage(
-        num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-    // The filter is built on the first run and reused afterwards, so it keeps
-    // the tile geometry seen on that first run.
-    if (!_concat) {
-      tile_descriptor output_tile_geometries{
-          {output_tile_x, output_tile_y}, {output_width, output_height}, output_tile_x * output_tile_y};
-      _concat.reset(new GLConcat(output_tile_geometries, tiling));
-    }
-
-    _concat->concat(input_images.data(), *output, Inputs().size());
-    Outputs()[0]->Reset(output);
-
-    return true;
-  }
-
- private:
-  StorageOrder order_;
-  // Lazily constructed concat filter.
-  std::unique_ptr<GLConcat> _concat;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLConcat, OpenGLConcatOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLConcat).NumInputs(2, 4).NumOutputs(1, 2);
-} // namespace caffe2
+++ /dev/null
-#include "GLConvolution.h"
-#include "../core/GLContext.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/conv_transpose_unpool_op_base.h"
-#include <iostream>
-#include <vector>
-
-#define MaxOutputTileBatchSize 2
-
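-// MARK: GLSL
-// Fragment shader source for GLConvolution. The $(NAME) placeholders are
-// filled in before the text is compiled. TILED_CONVOLUTION selects one of two
-// variants:
-//  - tiled: one input and one output texture; the shader loops over the input
-//    tiles in [inputTileRange.x, inputTileRange.y) and picks the kernel block
-//    from the output tile's chunk index.
-//  - batched: INPUT_BATCH_SIZE input textures and up to four output textures
-//    are processed per pass.
-// In both variants TRANSPOSED_CONVOLUTION switches the CONVOLUTION macro
-// between the regular and the transposed form. The result is the accumulated
-// sum plus either the previous output (`accumulate`) or the bias, optionally
-// followed by a PRelu (`fusePRelu`).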
-const char* GLConvolution::fragment_shader = R"GLSL(#version 300 es
-#define TILED_CONVOLUTION $(TILED_CONVOLUTION)
-#define TRANSPOSED_CONVOLUTION $(TRANSPOSED_CONVOLUTION)
-
-// batching
-#define INPUT_BATCH_SIZE $(INPUT_BATCH_SIZE)
-#define OUTPUT_BATCH_SIZE $(OUTPUT_BATCH_SIZE)
-
-// tiling
-#define INPUT_TILES $(INPUT_TILES)
-#define OUTPUT_TILES $(OUTPUT_TILES)
-#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
-#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
-#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
-#define INPUT_TILE_X $(INPUT_TILE_X)
-#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
-#define INPUT_TILE_CHUNK_SIZE $(INPUT_TILE_CHUNK_SIZE)
-#define OUTPUT_TILE_CHUNK_SIZE $(OUTPUT_TILE_CHUNK_SIZE)
-#define OUTPUT_TILE_BATCH_SIZE $(OUTPUT_TILE_BATCH_SIZE)
-
-#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)
-
-// common
-const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
-const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
-const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
-
-precision mediump float;
-precision mediump int;
-precision mediump sampler2D;
-
-in highp vec2 v_texCoord;
-
-#define unpackKernel(pk) \
- mat4(vec4(unpackHalf2x16(pk.packed_data[0].x), unpackHalf2x16(pk.packed_data[0].y)), \
- vec4(unpackHalf2x16(pk.packed_data[0].z), unpackHalf2x16(pk.packed_data[0].w)), \
- vec4(unpackHalf2x16(pk.packed_data[1].x), unpackHalf2x16(pk.packed_data[1].y)), \
- vec4(unpackHalf2x16(pk.packed_data[1].z), unpackHalf2x16(pk.packed_data[1].w)))
-
-#if BOUNDS_CHECK_MODE == 0
- #define IN_BOUNDS(p, p0, p1) (true)
-#else
- #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
-#endif
-
-#if TILED_CONVOLUTION
-// Tiled convolution
-const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-uniform ivec2 outputSize;
-uniform bool accumulate;
-uniform bool fusePRelu;
-
-uniform ivec2 inputTileRange;
-
-TEXTURE_INPUT(inputData[1]);
-TEXTURE_INPUT(previousData[1]);
-
-struct packedKernel {
- highp uvec4 packed_data[2];
-};
-
-struct kernel {
- packedKernel data[kernel_size.x * kernel_size.y];
-};
-
-layout (std140) uniform Kernel_block {
- kernel kernel_data[INPUT_TILE_CHUNK_SIZE * OUTPUT_TILE_CHUNK_SIZE];
-} kernel_block[OUTPUT_TILE_BATCH_SIZE];
-
-layout (std140) uniform bias_block {
- highp uvec4 bias[(OUTPUT_TILES + 1) / 2];
-};
-
-layout (std140) uniform prelu_scale_block {
- highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
-};
-
-TEXTURE_OUTPUT(0, outputData0);
-
-#if TRANSPOSED_CONVOLUTION
-
-#define CONVOLUTION(ib) { \
- ivec2 p0 = (input_padding + input_stride - tileCoord % input_stride) % input_stride; \
- for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
- for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
- int i = y * kernel_size.x + x; \
- ivec2 idx = tileCoord + ivec2(x, y) - input_padding; \
- if IN_BOUNDS(idx, ivec2(0), inputTileSize * input_stride) { \
- vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx / input_stride); \
- mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
- sum += k * data; \
- } \
- } \
- } \
-}
-
-#else
-
-#define CONVOLUTION(ib) { \
- for (int y = 0, i = 0; y < kernel_size.y; y++) { \
- for (int x = 0; x < kernel_size.x; x++, i++) { \
- ivec2 idx = tileCoord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
- vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx); \
- mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
- sum += k * data; \
- } \
- } \
- } \
-}
-#endif // TRANSPOSED_CONVOLUTION
-
-void main() {
- ivec2 inputSize = textureSize(inputData[0], 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
- ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
- ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
-
- int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
-#if !TRANSPOSED_CONVOLUTION
- tileCoord = input_stride * tileCoord - input_padding;
-#endif
-
- highp vec4 sum = vec4(0);
-
- for (int tile_idx = inputTileRange.x; tile_idx < inputTileRange.y; tile_idx++) {
- int inTileX = tile_idx % INPUT_TILE_X;
- int inTileY = tile_idx / INPUT_TILE_X;
- int inTileId = tile_idx % INPUT_TILE_CHUNK_SIZE; // normalized input tile idx, used to index the kernel
-
- int kernelIdx = OUTPUT_TILE_CHUNK_SIZE * inTileId + tileNum % OUTPUT_TILE_CHUNK_SIZE;
- ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
-
- int outputChunkIdx = tileNum / OUTPUT_TILE_CHUNK_SIZE;
- if (outputChunkIdx == 0) {
- CONVOLUTION(0);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 1
- else if (outputChunkIdx == 1) {
- CONVOLUTION(1);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 2
- else if (outputChunkIdx == 2) {
- CONVOLUTION(2);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 3
- else if (outputChunkIdx == 3) {
- CONVOLUTION(3);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 4
- else if (outputChunkIdx == 4) {
- CONVOLUTION(4);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 5
- else if (outputChunkIdx == 5) {
- CONVOLUTION(5);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 6
- else if (outputChunkIdx == 6) {
- CONVOLUTION(6);
- }
-#if OUTPUT_TILE_BATCH_SIZE > 7
- else if (outputChunkIdx == 7) {
- CONVOLUTION(7);
- }
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
- }
-
- vec4 biasValue = (tileNum % 2 == 0) ? unpackHalf4x16(bias[tileNum/2].xy) : unpackHalf4x16(bias[tileNum/2].zw);
- vec4 prevData = TEXTURE_LOAD(previousData[0], texelCoord);
- vec4 value = sum + (accumulate ? prevData : biasValue);
-
- vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
-
- vec4 o0 = fusePRelu ? mix(value * preluValue, value, vec4(greaterThan(value, vec4(0)))) : value;
- outputData0 = TEXTURE_STORE(o0);
-}
-
-#else
-
-// batched convolution
-
-uniform ivec2 outputSize;
-uniform bool accumulate;
-uniform bool fusePRelu;
-
-TEXTURE_INPUT(inputData[INPUT_BATCH_SIZE]);
-TEXTURE_INPUT(previousData[OUTPUT_BATCH_SIZE]);
-
-struct packedKernel {
- highp uvec4 packed_data[2];
-};
-
-struct kernel {
- packedKernel data[kernel_size.x * kernel_size.y];
-};
-
-layout (std140) uniform Kernel_block {
- kernel kernel_data[OUTPUT_BATCH_SIZE];
-} kernel_block[INPUT_BATCH_SIZE];
-
-layout (std140) uniform bias_block {
- highp uvec4 bias[(OUTPUT_BATCH_SIZE + 1) / 2];
-};
-
-layout (std140) uniform prelu_scale_block {
- highp uvec4 scale[(OUTPUT_BATCH_SIZE + 1) / 2];
-};
-
-TEXTURE_OUTPUT(0, outputData0);
-#if OUTPUT_BATCH_SIZE > 1
-TEXTURE_OUTPUT(1, outputData1);
-#if OUTPUT_BATCH_SIZE > 2
-TEXTURE_OUTPUT(2, outputData2);
-#if OUTPUT_BATCH_SIZE > 3
-TEXTURE_OUTPUT(3, outputData3);
-#endif
-#endif
-#endif
-
-#if TRANSPOSED_CONVOLUTION
-#define CONVOLUTION(ib) { \
- ivec2 p0 = (input_padding + input_stride - texelCoord % input_stride) % input_stride; \
- for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
- for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
- int i = y * kernel_size.x + x; \
- ivec2 idx = texelCoord + ivec2(x, y) - input_padding; \
- if IN_BOUNDS(idx, ivec2(0), inputSize * input_stride) { \
- vec4 data = TEXTURE_LOAD(inputData[ib], idx / input_stride); \
- for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
- mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
- sum[ob] += k * data; \
- } \
- } \
- } \
- } \
-}
-
-#else
-
-#define CONVOLUTION(ib) { \
- for (int y = 0, i = 0; y < kernel_size.y; y++) { \
- for (int x = 0; x < kernel_size.x; x++, i++) { \
- ivec2 idx = coord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputSize) { \
- vec4 data = TEXTURE_LOAD(inputData[ib], idx); \
- for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
- mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
- sum[ob] += k * data; \
- } \
- } \
- } \
- } \
-}
-
-#endif // TRANSPOSED_CONVOLUTION
-
-void main() {
- ivec2 inputSize = textureSize(inputData[0], 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
-#if !TRANSPOSED_CONVOLUTION
- ivec2 coord = input_stride * texelCoord - input_padding;
-#endif
-
- highp vec4 sum[OUTPUT_BATCH_SIZE] = vec4[OUTPUT_BATCH_SIZE](vec4(0)
-#if OUTPUT_BATCH_SIZE > 1
- , vec4(0)
-#if OUTPUT_BATCH_SIZE > 2
- , vec4(0)
-#if OUTPUT_BATCH_SIZE > 3
- , vec4(0)
-#endif
-#endif
-#endif
- );
-
- CONVOLUTION(0);
-#if INPUT_BATCH_SIZE > 1
- CONVOLUTION(1);
-#if INPUT_BATCH_SIZE > 2
- CONVOLUTION(2);
-#if INPUT_BATCH_SIZE > 3
- CONVOLUTION(3);
-#if INPUT_BATCH_SIZE > 4
- CONVOLUTION(4);
-#if INPUT_BATCH_SIZE > 5
- CONVOLUTION(5);
-#if INPUT_BATCH_SIZE > 6
- CONVOLUTION(6);
-#if INPUT_BATCH_SIZE > 7
- CONVOLUTION(7);
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
- vec4 prev0 = TEXTURE_LOAD(previousData[0], texelCoord);
- vec4 value = sum[0] + (accumulate ? prev0: unpackHalf4x16(bias[0].xy));
- vec4 o0 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
- outputData0 = TEXTURE_STORE(o0);
-#if OUTPUT_BATCH_SIZE > 1
- vec4 prev1 = TEXTURE_LOAD(previousData[1], texelCoord);
- value = sum[1] + (accumulate ? prev1 : unpackHalf4x16(bias[0].zw));
- vec4 o1 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
- outputData1 = TEXTURE_STORE(o1);
-#if OUTPUT_BATCH_SIZE > 2
- vec4 prev2 = TEXTURE_LOAD(previousData[2], texelCoord);
- value = sum[2] + (accumulate ? prev2 : unpackHalf4x16(bias[1].xy));
- vec4 o2 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
- outputData2 = TEXTURE_STORE(o2);
-#if OUTPUT_BATCH_SIZE > 3
- vec4 prev3 = TEXTURE_LOAD(previousData[3], texelCoord);
- value = sum[3] + (accumulate ? prev3: unpackHalf4x16(bias[1].zw));
- vec4 o3 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
- outputData3 = TEXTURE_STORE(o3);
-#endif
-#endif
-#endif
-}
-
-#endif // TILED_CONVOLUTION
-
-)GLSL";
-
-// Packs the kernel weights needed by one batched-convolution pass into `data`.
-//
-// `data` is viewed as float16 blocks indexed [ob][y][x][in][out], with one 4x4
-// block per kernel tap and output batch entry. The weights come from the
-// `kernel` member, read as a float tensor.
-//
-// data            - destination uniform buffer memory.
-// size            - size of `data`; not used inside this function.
-// input_channels  - total input channel count of the convolution.
-// output_channels - total output channel count of the convolution.
-// is, os          - first input / output slice of the current pass.
-// ib              - index of the input batch entry being packed.
-//
-// Only channels that exist are written (at most 4 per slice); the remaining
-// entries of a partially filled block are left untouched.
-void GLConvolution::pack_kernel_data_for_bached_conv(
- float16_t* data,
- size_t size,
- int input_channels,
- int output_channels,
- int is,
- int os,
- int ib) {
- // NOTE(review): the array bounds below are runtime values, so this appears to
- // rely on the compiler's variable-length array extension - confirm.
- typedef float16_t(packedKernel)[output_batch_size][geometry.kernel_size.y]
- [geometry.kernel_size.x][4][4];
- packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);
-
- // Number of valid input channels in slice (is + ib).
- const int batch_input_channels = std::min(4, input_channels - 4 * (is + ib));
- for (int ob = 0; ob < output_batch_size; ob++) {
- // Number of valid output channels in slice (os + ob).
- const int batch_output_channels =
- std::min(4, output_channels - 4 * (os + ob));
- for (int out = 0; out < batch_output_channels; out++) {
- for (int in = 0; in < batch_input_channels; in++) {
- for (int y = 0; y < geometry.kernel_size.y; y++) {
- for (int x = 0; x < geometry.kernel_size.x; x++) {
- // clang-format off
- if (geometry.transposed) {
- // Transposed: source is indexed [in][out] and read spatially flipped.
- typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
- const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
- packed_kernel_data[ob][y][x][in][out] =
- kernel_data[4 * (is + ib) + in][4 * (os + ob) + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
- } else {
- // Regular: source is indexed [out][in] and read in place.
- typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
- const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
- packed_kernel_data[ob][y][x][in][out] = kernel_data[4 * (os + ob) + out][4 * (is + ib) + in][y][x];
- }
- // clang-format on
- }
- }
- }
- }
- }
-}
-
-// Packs the kernel weights for one chunk of input tiles and one chunk of
-// output tiles into `data`.
-//
-// `data` is viewed as float16 blocks indexed
-// [input tile][output tile][y][x][in][out], where the tile indices are relative
-// to the start of the given ranges. The weights come from the `kernel` member,
-// read as a float tensor.
-//
-// size              - size of `data`; not used inside this function.
-// input_tile_range  - half-open range [x, y) of input tiles to pack.
-// output_tile_range - half-open range [x, y) of output tiles to pack.
-//
-// Only channels that exist are written (at most 4 per tile); the remaining
-// entries of a partially filled block are left untouched.
-void GLConvolution::pack_kernel_data_for_tiled_conv(
- float16_t* data, // destination
- size_t size,
- int input_channels,
- int output_channels,
- point input_tile_range,
- point output_tile_range) {
- // NOTE(review): the array bounds below are runtime values, so this appears to
- // rely on the compiler's variable-length array extension - confirm.
- typedef float16_t(
- packedKernel)[input_tile_chunk_size][output_tile_chunk_size]
- [geometry.kernel_size.y][geometry.kernel_size.x][4][4];
- packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);
-
- for (int it = input_tile_range.x; it < input_tile_range.y; it++) {
- for (int ot = output_tile_range.x; ot < output_tile_range.y; ot++) {
- for (int y = 0; y < geometry.kernel_size.y; y++) {
- for (int x = 0; x < geometry.kernel_size.x; x++) {
- for (int out = 0; out < std::min(4, (output_channels - ot * 4));
- out++) {
- for (int in = 0; in < std::min(4, (input_channels - it * 4));
- in++) {
- // clang-format off
- if (geometry.transposed) {
- // Transposed: source is indexed [in][out] and read spatially flipped.
- typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
- const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
- packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
- kernel_data[4 * it + in] [4 * ot + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
- } else {
- // Regular: source is indexed [out][in] and read in place.
- typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
- const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
- packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
- kernel_data[4 * ot + out][4 * it + in][y][x];
- }
- // clang-format on
- }
- }
- }
- }
- }
- }
-}
-
-// Entry point for running the convolution: dispatches to the batched
-// implementation unless the `tiling` member selects the tiled one.
-template <typename T>
-void GLConvolution::convolution(
-    const GLImageVector<T>& input_images,
-    const GLImageVector<T>& output_images) {
-  if (!tiling) {
-    run_batched_conv(input_images, output_images);
-    return;
-  }
-  run_tiled_conv(input_images, output_images);
-}
-
-// Batched convolution: for every image, walks the input slices in steps of
-// input_batch_size and the output slices in steps of output_batch_size and
-// issues one render pass per (input batch, output batch) pair.
-//
-// Passes for the first input batch (is == 0) start from the bias; later passes
-// add to what the output textures already hold (the `accumulate` uniform).
-// PRelu is fused into the pass whose `is` equals
-// input_slices - input_batch_size, when a PRelu scale is present.
-template <typename T>
-void GLConvolution::run_batched_conv(
- const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images) {
- for (int i = 0; i < input_images.size(); i++) {
- GLImage<T>* input_image = input_images[i];
- GLImage<T>* output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is += input_batch_size) {
- for (int os = 0; os < output_slices; os += output_batch_size) {
- // Output channels covered by this pass; the last batch may be partial.
- const int output_channels_per_batch =
- std::min(4 * output_batch_size, geometry.output_channels - 4 * os);
-
- gl_log(
- GL_VERBOSE,
- "GLConvolution::convolution - is: %d, os: %d\n",
- is,
- os);
-
- // Note the order of the binding point needs to be the same as in the
- // constructor
- int binding_point = 0;
-
- // bias
- attach_uniform_buffer<float16_t>(
- bias_block, binding_point++, [&](float16_t* data, size_t size) {
- CAFFE_ENFORCE_GE(
- size,
- output_channels_per_batch * sizeof(float16_t),
- "Bias buffer size too small");
- for (int ob = 0; ob < output_channels_per_batch; ob++) {
- data[ob] = bias[4 * os + ob];
- }
- });
-
- // kernel weights
- for (int ib = 0; ib < input_batch_size; ib++) {
- attach_uniform_buffer<float16_t>(
- kernel_block[ib],
- binding_point++,
- [&](float16_t* data, size_t size) {
- CAFFE_ENFORCE_EQ(
- size,
- 4 * (4 * output_batch_size) * geometry.kernel_size.y *
- geometry.kernel_size.x * sizeof(float16_t),
- "Kernel size mismatch");
- pack_kernel_data_for_bached_conv(
- data,
- size,
- input_image->channels,
- output_image->channels,
- is,
- os,
- ib);
- });
- }
-
- // PRelu scale
- if (prelu_scale != nullptr && is == input_slices - input_batch_size) {
- attach_uniform_buffer<float16_t>(
- prelu_scale_block,
- binding_point++,
- [&](float16_t* data, size_t size) {
- CAFFE_ENFORCE_GE(
- size,
- output_channels_per_batch * sizeof(float16_t),
- "PRelu buffer size too small");
- // A scale array whose length differs from the output channel
- // count is treated as a single shared value.
- for (int ob = 0; ob < output_channels_per_batch; ob++) {
- data[ob] = prelu_scale_size == geometry.output_channels
- ? prelu_scale[4 * os + ob]
- : prelu_scale[0];
- }
- });
- }
-
- // Inputs of the pass: the input slices of this batch, followed by the
- // current contents of the output slices (read as previousData).
- // NOTE(review): indexes textures[is + ib] and textures[os + ob] for the
- // full batch size - presumably the slice counts are multiples of the
- // batch sizes; confirm against computeBatchSizes.
- std::vector<texture_attachment> input_attachments;
- for (int ib = 0; ib < input_batch_size; ib++) {
- input_attachments.push_back(
- {input_image->textures[is + ib], inputData[ib]});
- }
- for (int ob = 0; ob < output_batch_size; ob++) {
- input_attachments.push_back(
- {output_image->textures[os + ob], previousData[ob]});
- }
-
- run(input_attachments,
- {output_image->textures.begin() + os,
- output_image->textures.begin() + os + output_batch_size},
- [&]() {
- glUniform2i(
- outputSize->location,
- output_image->texture_width,
- output_image->texture_height);
- glUniform2i(inputTileRange->location, 0, 1);
- glUniform1i(accumulate->location, is != 0);
- glUniform1i(
- fusePRelu->location,
- prelu_scale != nullptr &&
- (is == input_slices - input_batch_size));
- },
- output_image->texture_width,
- output_image->texture_height);
- }
- }
- }
-}
-
-// Tiled convolution: every image is a single texture holding all tiles. For
-// each image the input tiles are processed in chunks of input_tile_chunk_size,
-// one render pass per chunk, always reading and writing texture 0.
-//
-// The first pass (it == 0) starts from the bias; later passes add to what the
-// output texture already holds (the `accumulate` uniform). PRelu is fused into
-// the pass whose chunk index equals input_tile_batch_size - 1, when a PRelu
-// scale is present.
-template <typename T>
-void GLConvolution::run_tiled_conv(
-    const GLImageVector<T>& input_images,
-    const GLImageVector<T>& output_images) {
-  for (int i = 0; i < input_images.size(); i++) {
-    GLImage<T>* input_image = input_images[i];
-    GLImage<T>* output_image = output_images[i];
-    const int input_tiles = input_image->tile_x * input_image->tile_y;
-    const int output_tiles = output_image->tile_x * output_image->tile_y;
-
-    for (int ib = 0, it = 0; it < input_tiles;
-         ib++, it += input_tile_chunk_size) {
-      // End (exclusive) of the input tile range handled by this pass.
-      const int it_end = std::min(it + input_tile_chunk_size, input_tiles);
-      // Whether this pass uploads the PRelu scale and applies it.
-      const bool fuse_prelu =
-          prelu_scale != nullptr && ib == input_tile_batch_size - 1;
-
-      // Note the order of the binding point needs to be the same as in the
-      // constructor
-      int binding_point = 0;
-
-      // bias
-      attach_uniform_buffer<float16_t>(
-          bias_block, binding_point++, [&](float16_t* data, size_t size) {
-            CAFFE_ENFORCE_GE(
-                size,
-                geometry.output_channels * sizeof(float16_t),
-                "Bias buffer size too small");
-            for (int ob = 0; ob < geometry.output_channels; ob++) {
-              data[ob] = bias[ob];
-            }
-          });
-
-      // kernel weights: one uniform block per chunk of output tiles
-      for (int ob = 0, ot = 0; ot < output_tiles;
-           ob++, ot += output_tile_chunk_size) {
-        attach_uniform_buffer<float16_t>(
-            kernel_block[ob],
-            binding_point++,
-            [&](float16_t* data, size_t size) {
-              CAFFE_ENFORCE_EQ(
-                  size,
-                  (4 * input_tile_chunk_size) * (4 * output_tile_chunk_size) *
-                      geometry.kernel_size.y * geometry.kernel_size.x *
-                      sizeof(float16_t),
-                  "Kernel size mismatch");
-              pack_kernel_data_for_tiled_conv(
-                  data,
-                  size,
-                  input_image->channels,
-                  output_image->channels,
-                  {it, it_end},
-                  {ot, std::min(ot + output_tile_chunk_size, output_tiles)});
-            });
-      }
-
-      // PRelu scale
-      if (fuse_prelu) {
-        attach_uniform_buffer<float16_t>(
-            prelu_scale_block,
-            binding_point++,
-            [&](float16_t* data, size_t size) {
-              CAFFE_ENFORCE_GE(
-                  size,
-                  geometry.output_channels * sizeof(float16_t),
-                  "PRelu buffer size too small");
-              // A scale array whose length differs from the output channel
-              // count is treated as a single shared value.
-              for (int ob = 0; ob < geometry.output_channels; ob++) {
-                data[ob] = prelu_scale_size == geometry.output_channels
-                    ? prelu_scale[ob]
-                    : prelu_scale[0];
-              }
-            });
-      }
-
-      // The output texture is also bound as an input so the shader can read
-      // the result of the previous pass.
-      std::vector<texture_attachment> input_attachments(
-          {{input_image->textures[0], inputData[0]},
-           {output_image->textures[0], previousData[0]}});
-
-      run(input_attachments,
-          {output_image->textures[0]},
-          [&]() {
-            glUniform2i(
-                outputSize->location,
-                output_image->texture_width,
-                output_image->texture_height);
-            // [inputTileFrom, inputTileTo)
-            glUniform2i(inputTileRange->location, it, it_end);
-
-            glUniform1i(accumulate->location, it != 0);
-            glUniform1i(fusePRelu->location, fuse_prelu);
-          },
-          output_image->texture_width,
-          output_image->texture_height);
-    }
-  }
-}
-
-namespace caffe2 {
-
-// Asks `op` to size the output of a dummy 1 x 1 x H x W tensor and reports the
-// resulting dimensions 2 and 3 through *OH and *OW.
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
-  Tensor<CPUContext> probe_in;
-  Tensor<CPUContext> probe_out;
-  probe_in.Resize(1, 1, H, W);
-  op->SetOutputSize(probe_in, &probe_out, 1);
-  // The operator is expected to produce a 4-D shape for the probe.
-  CAFFE_ENFORCE_EQ(probe_out.ndim(), 4);
-  *OH = probe_out.dim(2);
-  *OW = probe_out.dim(3);
-}
-
-// Returns how many output tiles go into one chunk: the full tile count,
-// capped by a quarter of the 16 KiB uniform-block budget divided by the
-// per-tile kernel footprint (4 * kw * kh * sizeof(float16_t)).
-static int computeOutputTileChunkSize(int output_tile_x,
-                                      int output_tile_y,
-                                      int kernel_width,
-                                      int kernel_height) {
-  static const int maxUniformBlockBufferSize = 16 * 1024;
-  const int all_tiles = output_tile_x * output_tile_y;
-  const int per_tile_bytes =
-      4 * kernel_width * kernel_height * (int)sizeof(float16_t);
-  const int tiles_in_budget = maxUniformBlockBufferSize / 4 / per_tile_bytes;
-  return std::min(all_tiles, tiles_in_budget);
-}
-
-// Returns how many input tiles go into one chunk: the full tile count,
-// capped by a quarter of the 16 KiB uniform-block budget divided by the
-// footprint of one input tile against `output_tile_chunk_size` output tiles.
-static int computeInputTileChunkSize(
-    int input_tile_x,
-    int input_tile_y,
-    int output_tile_chunk_size,
-    int kernel_width,
-    int kernel_height) {
-  static const int maxUniformBlockBufferSize = 16 * 1024;
-  const int all_tiles = input_tile_x * input_tile_y;
-  const int per_tile_bytes = 4 * output_tile_chunk_size * kernel_width *
-      kernel_height * (int)sizeof(float16_t);
-  const int tiles_in_budget = maxUniformBlockBufferSize / 4 / per_tile_bytes;
-  return std::min(all_tiles, tiles_in_budget);
-}
-
-// Todo: optimize input/output batch size and use of uniforms/textures for
-// kernel data
-//
-// Picks how many 4-channel slices are processed per pass, based on the slice
-// counts derived from geometry.input_channels / geometry.output_channels.
-//  - Android: 2 when the slice count is even, otherwise 1.
-//  - Otherwise, when iPhoneVersion() >= 8: the largest of 4, 3, 2 that divides
-//    the slice count, else 1.
-//  - Otherwise both output arguments are left untouched, so callers must
-//    initialise them before the call.
-static void computeBatchSizes(
-    GLConvolution::descriptor& geometry,
-    int& input_batch_size,
-    int& output_batch_size) {
-  // Channels are packed four per slice; round up.
-  int input_slices = (geometry.input_channels + 3) / 4;
-  int output_slices = (geometry.output_channels + 3) / 4;
-
-#if CAFFE2_ANDROID
-  input_batch_size = input_slices % 2 == 0 ? 2 : 1;
-  output_batch_size = output_slices % 2 == 0 ? 2 : 1;
-#else
-  if (iPhoneVersion() >= 8) {
-    // iPhone 6S and up
-    input_batch_size =
-        /* input_slices % 8 == 0 ? 8 : */ input_slices % 4 == 0
-        ? 4
-        : input_slices % 3 == 0 ? 3 : input_slices % 2 == 0 ? 2 : 1;
-    output_batch_size = output_slices % 4 == 0
-        ? 4
-        : output_slices % 3 == 0 ? 3 : output_slices % 2 == 0 ? 2 : 1;
-  }
-#endif
-}
-
-template <class T, bool fusePRelu, bool fuseRelu>
-class OpenGLConvOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
- public:
- USE_OPERATOR_BASE_FUNCTIONS;
- // Rejects configurations the GL backend cannot run: only NCHW order,
- // group == 1 and unit dilation in both directions are accepted.
- OpenGLConvOp(const OperatorDef& operator_def, Workspace* ws)
-     : ConvPoolOpBase<CPUContext>(operator_def, ws) {
-   OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-   OPERATOR_NEEDS_FEATURE(group_ == 1, "OpenGL only supports group == 1");
-   OPERATOR_NEEDS_FEATURE(
-       dilation_h() == 1 && dilation_w() == 1,
-       "OpenGL only supports dilation == 1");
- }
-
- // Validates filter/bias shapes, allocates the output GLImageVector, builds
- // the GLConvolution filter on the first call (it is cached in `conv`), runs
- // it, and hands the output image to Outputs()[0]. Always returns true;
- // shape violations surface through the CAFFE_ENFORCE* checks.
- bool RunOnDeviceWithOrderNCHW() override {
-   const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-   auto& filter = Input(FILTER);
-   auto& bias = Input(BIAS);
-
-   const int num_images = input.size();
-   const int input_channels = input.channels();
-   const int input_width = input.width();
-   const int input_height = input.height();
-
-   // Previously CAFFE_ENFORCE(filter.ndim(), 4): that form treats 4 as the
-   // message and only checks that ndim() is non-zero.
-   CAFFE_ENFORCE_EQ(filter.ndim(), 4);
-   const int M = filter.dim32(0);
-   // NOTE(review): dim 2 is validated against kernel_h() below but is named
-   // kernel_width here (and dim 3 the reverse) -- confirm for non-square
-   // kernels.
-   const int kernel_width = filter.dim32(2);
-   const int kernel_height = filter.dim32(3);
-
-   CAFFE_ENFORCE(filter.dim32(1) == input_channels, "");
-   CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
-   CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
-   CAFFE_ENFORCE(bias.ndim() == 1, "");
-   CAFFE_ENFORCE(bias.dim32(0) == M, "");
-
-   int output_height;
-   int output_width;
-   const int output_channels = M;
-   computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-   // Slope used when fusing a plain Relu. It has static storage because the
-   // cached GLConvolution keeps the raw pointer and reads it on every later
-   // run; a stack local would dangle after the first call returns.
-   static const float relu_slope = 0;
-   const float* prelu_scale = nullptr;
-   int prelu_scale_size = 0;
-   if (fusePRelu) {
-     auto& prelu = Input(PRELU);
-     prelu_scale = prelu.template data<float>();
-     prelu_scale_size = prelu.size();
-   } else if (fuseRelu) {
-     prelu_scale = &relu_slope;
-     prelu_scale_size = 1;
-   }
-
-   const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-   int output_tile_x = 1, output_tile_y = 1;
-   int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
-   int input_tile_chunk_size = 1, output_tile_chunk_size = 1;
-   int input_tile_batch_size = 1, output_tile_batch_size = 1;
-
-   // Tiling defaults to on whenever the input itself is tiled.
-   const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
-
-   if (tiling) {
-     // Turn on tiling
-     CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
-     computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-     output_tiles = output_tile_x * output_tile_y;
-
-     output_tile_chunk_size = computeOutputTileChunkSize(
-         output_tile_x, output_tile_y, kernel_width, kernel_height);
-     output_tile_batch_size = std::max(
-         MaxOutputTileBatchSize,
-         (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
-     // Re-balance: spread the tiles evenly over the batches, then recompute
-     // the batch count from the rounded-up chunk size.
-     output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
-     output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
-
-     input_tile_chunk_size = computeInputTileChunkSize(
-         input_tile_x,
-         input_tile_y,
-         output_tile_chunk_size,
-         kernel_width,
-         kernel_height);
-     input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
-     // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
-     // input_tile_batch_size;
-   }
-   CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
-   CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
-   CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
-
-   int is_last = GetSingleArgument<int>("is_last", 0);
-
-   GLImageVector<T>* output = ImageAllocator<T>::newImage(
-       num_images,
-       output_width,
-       output_height,
-       output_channels,
-       output_tile_x,
-       output_tile_y,
-       is_last);
-
-   // TODO: figure out the dilation business
-   GLConvolution::descriptor geometry{input_channels,
-                                      output_channels,
-                                      {kernel_width, kernel_height},
-                                      {input_width, input_height},
-                                      {output_width, output_height},
-                                      {input_tile_x, input_tile_y},
-                                      {output_tile_x, output_tile_y},
-                                      {pad_l(), pad_t()},
-                                      {stride_w(), stride_h()},
-                                      false};
-
-   // The filter is built once and reused on subsequent runs.
-   if (!conv) {
-     int input_batch_size = 1, output_batch_size = 1;
-     if (!tiling) {
-       computeBatchSizes(geometry, input_batch_size, output_batch_size);
-       input_batch_size =
-           GetSingleArgument<int>("input_batch_size", input_batch_size);
-       output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
-     }
-
-     LOG(INFO) << input_channels << ": " << input_height << " X "
-               << input_width << " => " << output_channels << ": "
-               << output_height << " X " << output_width
-               << " Kernel: " << kernel_width << "X" << kernel_height;
-     if (tiling) {
-       LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
-                 << " => " << output_tile_x << " X " << output_tile_y
-                 << ", Texture size: " << input_width * input_tile_x << " X "
-                 << input_height * input_tile_y << " => "
-                 << output_width * output_tile_x << " X "
-                 << output_height * output_tile_y
-                 << ", Input tile batch size: " << input_tile_batch_size;
-     } else {
-       LOG(INFO) << "input_batch_size = " << input_batch_size
-                 << ", output_batch_size = " << output_batch_size;
-     }
-
-     conv.reset(new GLConvolution(geometry,
-                                  filter.template data<float>(),
-                                  bias.template data<float>(),
-                                  prelu_scale,
-                                  prelu_scale_size,
-                                  input_batch_size,
-                                  output_batch_size,
-                                  input_tiles,
-                                  output_tiles,
-                                  input_tile_chunk_size,
-                                  output_tile_chunk_size,
-                                  input_tile_batch_size,
-                                  output_tile_batch_size,
-                                  tiling));
-   }
-
-   conv->convolution(input, *output);
-
-   Outputs()[0]->Reset(output);
-
-   return true;
- }
-
- private:
- std::unique_ptr<GLConvolution> conv;
-
- INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLConv, OpenGLConvOp<float16_t, false, false>);
-OPERATOR_SCHEMA(OpenGLConv).NumInputs(3).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLConvPRelu, OpenGLConvOp<float16_t, true, false>);
-OPERATOR_SCHEMA(OpenGLConvPRelu).NumInputs(4).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLConvRelu, OpenGLConvOp<float16_t, false, true>);
-OPERATOR_SCHEMA(OpenGLConvRelu).NumInputs(3).NumOutputs(1);
-
-template <class T, bool fusePRelu, bool fuseRelu>
-class OpenGLConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext>, ImageAllocator<T> {
- public:
- USE_OPERATOR_BASE_FUNCTIONS;
- // Rejects configurations the GL backend cannot run: only NCHW order with
- // zero output adjustment (adj_h == 0 and adj_w == 0) is accepted.
- OpenGLConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
-     : ConvTransposeUnpoolBase<CPUContext>(operator_def, ws) {
-   OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
-   // The message used to claim "== 1" although the condition requires 0.
-   OPERATOR_NEEDS_FEATURE(
-       adj_h() == 0 && adj_w() == 0,
-       "OpenGL only supports adj_h == 0 and adj_w == 0");
- }
-
- // Validates filter/bias shapes, allocates the output GLImageVector, builds
- // the transposed GLConvolution filter on the first call (it is cached in
- // `conv`), runs it, and hands the output image to Outputs()[0]. Always
- // returns true; shape violations surface through the CAFFE_ENFORCE* checks.
- bool RunOnDeviceWithOrderNCHW() override {
-   const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
-   auto& filter = Input(FILTER);
-   auto& bias = Input(BIAS);
-
-   const int num_images = input.size();
-   const int input_channels = input.channels();
-   const int input_width = input.width();
-   const int input_height = input.height();
-
-   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
-   const int M = filter.dim32(0);
-   const int C = filter.dim32(1);
-   // NOTE(review): dim 2 is validated against kernel_h() below but is named
-   // kernel_width here (and dim 3 the reverse) -- confirm for non-square
-   // kernels.
-   const int kernel_width = filter.dim32(2);
-   const int kernel_height = filter.dim32(3);
-
-   CAFFE_ENFORCE(input_channels == M, "filter number must be equal to input channel number");
-   CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "filter height must be equal to kernel height");
-   CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "filter width must be equal to kernel width");
-   CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
-   CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
-
-   int output_height;
-   int output_width;
-   const int output_channels = C;
-   computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
-   // Slope used when fusing a plain Relu. It has static storage because the
-   // cached GLConvolution keeps the raw pointer and reads it on every later
-   // run; a stack local would dangle after the first call returns.
-   static const float relu_slope = 0;
-   const float* prelu_scale = nullptr;
-   int prelu_scale_size = 0;
-   if (fusePRelu) {
-     auto& prelu = Input(PRELU);
-     prelu_scale = prelu.template data<float>();
-     prelu_scale_size = prelu.size();
-   } else if (fuseRelu) {
-     prelu_scale = &relu_slope;
-     prelu_scale_size = 1;
-   }
-
-   const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
-   int output_tile_x = 1, output_tile_y = 1;
-   int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
-   int input_tile_chunk_size = 1, output_tile_chunk_size = 1,
-       input_tile_batch_size = 1, output_tile_batch_size = 1;
-
-   // Tiling defaults to on whenever the input itself is tiled.
-   const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
-
-   if (tiling) {
-     // Turn on tiling
-     CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
-     computeOutputTiles(output_channels, output_tile_x, output_tile_y);
-     output_tiles = output_tile_x * output_tile_y;
-
-     output_tile_chunk_size = computeOutputTileChunkSize(
-         output_tile_x, output_tile_y, kernel_width, kernel_height);
-     output_tile_batch_size = std::max(
-         MaxOutputTileBatchSize,
-         (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
-     // Re-balance: spread the tiles evenly over the batches, then recompute
-     // the batch count from the rounded-up chunk size.
-     output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
-     output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
-
-     input_tile_chunk_size = computeInputTileChunkSize(
-         input_tile_x,
-         input_tile_y,
-         output_tile_chunk_size,
-         kernel_width,
-         kernel_height);
-     input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
-     // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
-     // input_tile_batch_size;
-   }
-   CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
-   CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
-   CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
-
-   int is_last = GetSingleArgument<int>("is_last", 0);
-
-   GLImageVector<T>* output = ImageAllocator<T>::newImage(
-       num_images,
-       output_width,
-       output_height,
-       output_channels,
-       output_tile_x,
-       output_tile_y,
-       is_last);
-
-   // TODO: figure out the adj business
-   GLConvolution::descriptor geometry{input_channels,
-                                      output_channels,
-                                      {kernel_width, kernel_height},
-                                      {input_width, input_height},
-                                      {output_width, output_height},
-                                      {input_tile_x, input_tile_y},
-                                      {output_tile_x, output_tile_y},
-                                      {pad_l(), pad_t()},
-                                      {stride_w(), stride_h()},
-                                      true};
-
-   // The filter is built once and reused on subsequent runs.
-   if (!conv) {
-     int input_batch_size = 1, output_batch_size = 1;
-     if (!tiling) {
-       computeBatchSizes(geometry, input_batch_size, output_batch_size);
-       input_batch_size =
-           GetSingleArgument<int>("input_batch_size", input_batch_size);
-       output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
-     }
-
-     LOG(INFO) << input_channels << ": " << input_height << " X "
-               << input_width << " => " << output_channels << ": "
-               << output_height << " X " << output_width
-               << " Kernel: " << kernel_width << "X" << kernel_height;
-
-     if (tiling) {
-       LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
-                 << " => " << output_tile_x << " X " << output_tile_y
-                 << ", Texture size: " << input_width * input_tile_x << " X "
-                 << input_height * input_tile_y << " => "
-                 << output_width * output_tile_x << " X "
-                 << output_height * output_tile_y
-                 << ", Input tile batch size: " << input_tile_batch_size;
-     } else {
-       LOG(INFO) << "input_batch_size = " << input_batch_size
-                 << ", output_batch_size = " << output_batch_size;
-     }
-
-     conv.reset(new GLConvolution(geometry,
-                                  filter.template data<float>(),
-                                  bias.template data<float>(),
-                                  prelu_scale,
-                                  prelu_scale_size,
-                                  input_batch_size,
-                                  output_batch_size,
-                                  input.tile_x() * input.tile_y(),
-                                  output->tile_x() * output->tile_y(),
-                                  input_tile_chunk_size,
-                                  output_tile_chunk_size,
-                                  input_tile_batch_size,
-                                  output_tile_batch_size,
-                                  tiling));
-   }
-
-   conv->convolution(input, *output);
-
-   Outputs()[0]->Reset(output);
-
-   return true;
- }
-
- private:
- std::unique_ptr<GLConvolution> conv;
-
- INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLConvTranspose, OpenGLConvTransposeOp<float16_t, false, false>);
-OPERATOR_SCHEMA(OpenGLConvTranspose).NumInputs(3).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLConvTransposePRelu, OpenGLConvTransposeOp<float16_t, true, false>);
-OPERATOR_SCHEMA(OpenGLConvTransposePRelu).NumInputs(4).NumOutputs(1);
-
-REGISTER_CPU_OPERATOR(OpenGLConvTransposeRelu, OpenGLConvTransposeOp<float16_t, false, true>);
-OPERATOR_SCHEMA(OpenGLConvTransposeRelu).NumInputs(3).NumOutputs(1);
-} // namespace caffe2
+++ /dev/null
-#pragma once
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "gl_tiling_utils.h"
-
-class GLConvolution : public GLFilter {
- public:
- static constexpr int MaxInputBatchSize = 8;
- static constexpr int MaxOutputBatchSize = 4;
-
- struct descriptor {
- int input_channels;
- int output_channels;
- point kernel_size;
- point input_tile_size;
- point output_tile_size;
- point input_tile_grid_size;
- point output_tile_grid_size;
- point input_padding;
- point input_stride;
- bool transposed;
- };
-
- const float* kernel;
- const float* bias;
- const float* prelu_scale;
-
- binding* inputData[MaxInputBatchSize];
- binding* previousData[MaxOutputBatchSize];
- binding* outputSize;
- binding* accumulate;
- binding* fusePRelu;
- binding* kernel_block[MaxInputBatchSize];
- binding* bias_block;
- binding* prelu_scale_block;
- binding* inputTileRange;
-
- const descriptor geometry;
- const int prelu_scale_size;
- const int input_batch_size;
- const int output_batch_size;
- const int input_tiles;
- const int output_tiles;
- const int input_tile_chunk_size;
- const int output_tile_chunk_size;
- const int input_tile_batch_size;
- const int output_tile_batch_size;
- const bool tiling;
-
- static const char* fragment_shader;
-
- GLConvolution(
- const descriptor& _geometry,
- const float* _kernel,
- const float* _bias,
- const float* _prelu_scale = nullptr,
- int _prelu_scale_size = 0,
- int _input_batch_size = 1,
- int _output_batch_size = 1,
- int _input_tiles = 1,
- int _output_tiles = 1,
- int _input_tile_chunk_size = 1,
- int _output_tile_chunk_size = 1,
- int _input_tile_batch_size = 1,
- int _output_tile_batch_size = 1,
- bool _tiling = false)
- : GLFilter(
- "GLConvolution",
- vertex_shader,
- fragment_shader,
- input_bindings(_input_batch_size, _output_batch_size),
- uniform_blocks_bindings(
- _input_batch_size,
- _output_batch_size,
- _output_tile_batch_size,
- _prelu_scale != nullptr),
- {/* no attributes */},
- {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
- {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
- {"INPUT_BATCH_SIZE", c10::to_string(_input_batch_size)},
- {"OUTPUT_BATCH_SIZE", c10::to_string(_output_batch_size)},
- {"INPUT_TILES", c10::to_string(_input_tiles)},
- {"OUTPUT_TILES", c10::to_string(_output_tiles)},
- {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
- {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
- {"OUTPUT_TILE_WIDTH",
- c10::to_string(_geometry.output_tile_size.x)},
- {"OUTPUT_TILE_HEIGHT",
- c10::to_string(_geometry.output_tile_size.y)},
- {"INPUT_TILE_X", c10::to_string(_geometry.input_tile_grid_size.x)},
- {"OUTPUT_TILE_X",
- c10::to_string(_geometry.output_tile_grid_size.x)},
- {"INPUT_TILE_CHUNK_SIZE", c10::to_string(_input_tile_chunk_size)},
- {"OUTPUT_TILE_CHUNK_SIZE",
- c10::to_string(_output_tile_chunk_size)},
- {"OUTPUT_TILE_BATCH_SIZE",
- c10::to_string(_output_tile_batch_size)},
- {"TILED_CONVOLUTION", c10::to_string(_tiling)},
- {"INPUT_PADDING_X",
- c10::to_string(
- _geometry.transposed
- ? _geometry.kernel_size.x - 1 - _geometry.input_padding.x
- : _geometry.input_padding.x)},
- {"INPUT_PADDING_Y",
- c10::to_string(
- _geometry.transposed
- ? _geometry.kernel_size.y - 1 - _geometry.input_padding.y
- : _geometry.input_padding.y)},
- {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
- {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
- {"TRANSPOSED_CONVOLUTION", c10::to_string(_geometry.transposed)},
- {"BOUNDS_CHECK_MODE",
- c10::to_string(bounds_check_mode(_tiling, _geometry))}}),
- kernel(_kernel),
- bias(_bias),
- prelu_scale(_prelu_scale),
- geometry(_geometry),
- prelu_scale_size(_prelu_scale_size),
- input_batch_size(_input_batch_size),
- output_batch_size(_output_batch_size),
- input_tiles(_input_tiles),
- output_tiles(_output_tiles),
- input_tile_chunk_size(_input_tile_chunk_size),
- output_tile_chunk_size(_output_tile_chunk_size),
- input_tile_batch_size(_input_tile_batch_size),
- output_tile_batch_size(_output_tile_batch_size),
- tiling(_tiling) {}
-
- ~GLConvolution() {}
-
- template <typename T>
- void convolution(
- const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images);
-
- private:
- /*
-  * Decides the BOUNDS_CHECK_MODE shader define for a convolution.
-  *
-  * Tiled convolutions always check bounds. Otherwise the check is skipped
-  * when the GL context reports GL_EXT_texture_border_clamp, or when the
-  * effective padding (kernel_size - 1 - padding for transposed
-  * convolutions) is zero in both directions.
-  *
-  * @retval 0 if bounds check can be skipped
-  * @retval non-zero if bounds check can not be skipped
-  */
- inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
-   if (tiling) {
-     return 1;
-   }
-
-   const int pad_x = geometry.transposed
-       ? geometry.kernel_size.x - 1 - geometry.input_padding.x
-       : geometry.input_padding.x;
-   const int pad_y = geometry.transposed
-       ? geometry.kernel_size.y - 1 - geometry.input_padding.y
-       : geometry.input_padding.y;
-
-   // The extension query stays first so it is evaluated exactly as before.
-   const bool can_skip =
-       GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
-       (pad_x == 0 && pad_y == 0);
-   return can_skip ? 0 : 1;
- }
-
- // Builds the uniform/texture binding list handed to the GLFilter
- // constructor: the four scalar bindings first, then one freshly allocated
- // "inputData[i]" binding per input batch entry and one "previousData[i]"
- // binding per output batch entry. The new bindings are also stored in the
- // inputData / previousData member arrays.
- const std::vector<binding*> input_bindings(
-     int input_batch_size,
-     int output_batch_size) {
-   std::vector<binding*> result({BINDING(outputSize),
-                                 BINDING(accumulate),
-                                 BINDING(fusePRelu),
-                                 BINDING(inputTileRange)});
-
-   for (int in_idx = 0; in_idx < input_batch_size; ++in_idx) {
-     inputData[in_idx] =
-         new binding{"inputData[" + c10::to_string(in_idx) + "]"};
-     result.push_back(inputData[in_idx]);
-   }
-
-   for (int out_idx = 0; out_idx < output_batch_size; ++out_idx) {
-     previousData[out_idx] =
-         new binding{"previousData[" + c10::to_string(out_idx) + "]"};
-     result.push_back(previousData[out_idx]);
-   }
-
-   return result;
- }
-
- // Builds the uniform-block binding list handed to the GLFilter constructor:
- // bias_block, then prelu_scale_block when PRelu is fused, then one
- // "Kernel_block[i]" binding for each of
- // max(input_batch_size, output_tile_batch_size) entries (also stored in the
- // kernel_block member array). output_batch_size is not used here.
- // NOTE(review): kernel_block holds MaxInputBatchSize entries and the count
- // is not range-checked in this function -- confirm callers bound it.
- const std::vector<binding*> uniform_blocks_bindings(
-     int input_batch_size,
-     int output_batch_size,
-     int output_tile_batch_size,
-     bool fuse_prelu) {
-   std::vector<binding*> result({BINDING(bias_block)});
-   if (fuse_prelu) {
-     result.push_back(BINDING(prelu_scale_block));
-   }
-
-   const int kernel_block_count =
-       std::max(input_batch_size, output_tile_batch_size);
-   for (int blk = 0; blk < kernel_block_count; ++blk) {
-     kernel_block[blk] =
-         new binding{"Kernel_block[" + c10::to_string(blk) + "]"};
-     result.push_back(kernel_block[blk]);
-   }
-
-   return result;
- }
-
- void pack_kernel_data_for_bached_conv(
- float16_t* data,
- size_t size,
- int input_channels,
- int output_channels,
- int is,
- int os,
- int ib);
-
- void pack_kernel_data_for_tiled_conv(
- float16_t* data, // destination
- size_t size,
- int input_channels,
- int output_channels,
- point input_tile_range,
- point output_tile_range);
-
- template <typename T>
- void run_batched_conv(
- const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images);
-
- template <typename T>
- void run_tiled_conv(
- const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images);
-};
+++ /dev/null
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-
-#include "../core/DataTransfer.h"
-#include "../core/GLContext.h"
-#include "../core/GLImage.h"
-#include "../core/GLPlainTexture.h"
-#include "../core/ImageAllocator.h"
-
-#include <algorithm>
-
-namespace caffe2 {
-template <class T>
-class CopyToOpenGLOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- CopyToOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {}
-
- bool RunOnDevice() override {
- // caffe2::Timer timer;
- const TensorCPU& X = Input(0);
- const int num_images = X.dim32(0);
- const int input_channels = X.dim32(1);
- const int input_width = X.dim32(3);
- const int input_height = X.dim32(2);
- const int input_size = input_width * input_height;
-
- // set up the OpenGL context
- GLContext::getGLContext()->set_context();
-
- const float* input = X.template data<float>();
-
- int tile_x = GetSingleArgument<int>("tile_x", 1);
- int tile_y = GetSingleArgument<int>("tile_y", 1);
-
- GLImageVector<T>* output_image = ImageAllocator<T>::newImage(num_images,
- input_width,
- input_height,
- input_channels,
- tile_x,
- tile_y,
-#if CAFFE2_IOS
- true
-#else
- false
-#endif
- );
-
- if (output_image->tile_x() > 1 || output_image->tile_y() > 1) {
- LOG(INFO) << "CopyToOpenGLOp tiling: " << output_image->tile_x() << ":"
- << output_image->tile_y();
- }
-
- Outputs()[0]->Reset(output_image);
-
- for (int i = 0; i < num_images; i++) {
- const auto textures = (*output_image)[i]->textures;
- for (int slice = 0; slice < textures.size(); slice++) {
- // timer.Start();
-
- textures[slice]->map_load([&](void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type) {
- for (int y = 0; y < tile_y; y++) {
- for (int x = 0; x < tile_x; x++) {
- const int tiles = slice * tile_x * tile_y + y * tile_x + x;
- const int slice_channels = std::min(4, input_channels - 4 * tiles);
- interleaveSlice(
- (float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
- &input[i * input_channels * input_size + 4 * tiles * input_size],
- input_width,
- input_height,
- stride, // texture stride
- slice_channels);
- }
- }
- });
- // LOG(INFO) << "Texture uploading takes " << timer.MilliSeconds() << " ms";
- }
- }
-
- return true;
- }
-};
-
-REGISTER_CPU_OPERATOR(CopyToOpenGL, CopyToOpenGLOp<float16_t>);
-OPERATOR_SCHEMA(CopyToOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-
-template <class T>
-class CopyFromOpenGLOp final : public Operator<CPUContext> {
- public:
- CopyFromOpenGLOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {}
-
- // Reads every texture of the input GLImageVector back into a CPU tensor
- // resized to (num_images, channels, height, width). Each tile of each
- // texture is passed to deInterleaveSlice, which writes up to four channels
- // per tile into the output. Always returns true.
- bool RunOnDevice() override {
-   caffe2::Timer timer;
-   const GLImageVector<T>& X = Inputs()[0]->template Get<GLImageVector<T>>();
-   const int num_images = X.size();
-   const int input_channels = X.channels();
-   const int input_width = X.width();
-   const int input_height = X.height();
-
-   TensorCPU* Y = Output(0);
-   Y->Resize(num_images, input_channels, input_height, input_width);
-   const int output_width = input_width;
-   const int output_height = input_height;
-   const int output_size = input_width * input_height;
-
-   float* output = Y->mutable_data<float>();
-
-   const int tile_x = X.tile_x();
-   const int tile_y = X.tile_y();
-   for (int i = 0; i < num_images; i++) {
-     for (int slice = 0; slice < X[i]->slices; slice++) {
-       timer.Start();
-       const GLTexture* texture = X[i]->textures[slice];
-
-       texture->map_read([&](const void* buffer,
-                             size_t width,
-                             size_t height,
-                             size_t stride,
-                             size_t channels,
-                             const GLTexture::Type& type) {
-         //#if CAFFE2_ANDROID && defined(__ARM_NEON__)
-         //        if (static_cast<AndroidGLContext*>(GLContext::getGLContext())->get_platform() ==
-         //        Mali) {
-         //          caffe2::Timer timer;
-         //          timer.Start();
-         //          float16_t* copy_buffer = (float16_t*)malloc(_capacity);
-         //          arm_memcpy(
-         //              (volatile unsigned char*)copy_buffer, (volatile unsigned char*)buffer,
-         //              _capacity);
-         //          deInterleaveSlice(
-         //              output + 4 * slice * output_size, copy_buffer, width, height, stride,
-         //              slice_channels);
-         //          free(copy_buffer);
-         //          LOG(INFO) << "memcpy takes " << timer.MilliSeconds() << " ms";
-         //        } else
-         //#endif
-         {
-           // The arguments are size_t, so the specifiers are %zu; the old
-           // %d mismatched the argument type.
-           gl_log(GL_VERBOSE,
-                  "calling deInterleaveSlice width: %zu, height: %zu, stride: %zu, channels: %zu\n",
-                  width,
-                  height,
-                  stride,
-                  channels);
-
-           for (int y = 0; y < tile_y; y++) {
-             for (int x = 0; x < tile_x; x++) {
-               // Linear tile index across slices; each tile carries 4 channels.
-               const int tiles = slice * tile_x * tile_y + y * tile_x + x;
-               // The last tile may carry fewer than four channels.
-               const int slice_channels = std::min(4, input_channels - 4 * tiles);
-               deInterleaveSlice(
-                   output + i * input_channels * output_size + 4 * tiles * output_size,
-                   (float16_t*)buffer + 4 * (y * input_height * stride + x * input_width),
-                   input_width,
-                   input_height,
-                   stride,
-                   slice_channels);
-             }
-           }
-         }
-       });
-     }
-   }
-   return true;
- }
-};
-
-REGISTER_CPU_OPERATOR(CopyFromOpenGL, CopyFromOpenGLOp<float16_t>);
-OPERATOR_SCHEMA(CopyFromOpenGL).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLReduce : public GLFilter {
- public:
- binding* inputSize;
- binding* outputSize;
- binding* tileSize;
- binding* inv_pixel_count;
- binding* epsilon;
- binding* inputData;
- binding* averageData;
-
- bool compute_inv_stdev;
- bool compute_norm;
-
- const std::vector<binding*> input_bindings(bool compute_norm_) {
- std::vector<binding*> bindings({BINDING(inputSize),
- BINDING(outputSize),
- BINDING(tileSize),
- BINDING(inv_pixel_count),
- BINDING(epsilon),
- BINDING(inputData)});
- if (compute_norm_) {
- bindings.push_back(BINDING(averageData));
- }
- return bindings;
- }
-
- GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
- : GLFilter(
- "GLReduce",
- vertex_shader,
- fragment_shader,
- input_bindings(compute_norm_),
- {/* no uniform_blocks_bindings */},
- {/* no attributes */},
- {{"COMPUTE_INV_STDEV", c10::to_string((int)compute_inv_stdev_)},
- {"COMPUTE_NORM", c10::to_string((int)compute_norm_)}}),
- compute_inv_stdev(compute_inv_stdev_),
- compute_norm(compute_norm_) {}
-
- template <typename T>
- void reduce(const GLImage<T>* input_image,
- const GLImage<T>* output_image,
- int tile_size_x,
- int tile_size_y,
- float inv_pixel_count_ = 1.0,
- float epsilon_ = 0.0);
-
- template <typename T>
- void norm(const GLImage<T>* input_image,
- const GLImage<T>* avg_image,
- const GLImage<T>* output_image,
- int tile_size_x,
- int tile_size_y,
- float inv_pixel_count_);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLReduce::fragment_shader = R"GLSL(#version 300 es
-
-#define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV)
-#define COMPUTE_NORM $(COMPUTE_NORM)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 inputSize;
-uniform ivec2 outputSize;
-uniform ivec2 tileSize;
-uniform float inv_pixel_count;
-uniform float epsilon;
-
-#if COMPUTE_NORM
-TEXTURE_INPUT(averageData);
-#endif
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize));
- ivec2 texelCoord = outputCoord * tileSize;
- ivec2 sumArea = min(tileSize, inputSize - texelCoord);
- highp vec4 sum = vec4(0.0);
-
-#if COMPUTE_NORM
- vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
-#endif
-
- for (int y = 0; y < sumArea.y; y++) {
- for (int x = 0; x < sumArea.x; x++) {
- ivec2 idx = texelCoord + ivec2(x, y);
- vec4 val = TEXTURE_LOAD(inputData, idx);
-#if COMPUTE_NORM
- val -= avg;
- sum += val * val;
-#else
- sum += val;
-#endif
- }
- }
-
-#if COMPUTE_INV_STDEV
- outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon)));
-#elif COMPUTE_NORM
- outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count));
-#else
- outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon));
-#endif
-}
-
-)GLSL";
-
-// Runs the reduce shader once per input slice, rendering into the output
-// texture with the same slice index. Uploads the input/output sizes, the
-// tile size, inv_pixel_count_ and epsilon_ as uniforms for each pass.
-template <typename T>
-void GLReduce::reduce(const GLImage<T>* input_image,
-                      const GLImage<T>* output_image,
-                      int tile_size_x,
-                      int tile_size_y,
-                      float inv_pixel_count_,
-                      float epsilon_) {
-  const int input_slices = input_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
-    run(input_attachments,
-        // One-element range selecting output texture `is`.
-        {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(inputSize->location, input_image->width, input_image->height);
-          glUniform2i(outputSize->location, output_image->width, output_image->height);
-          glUniform2i(tileSize->location, tile_size_x, tile_size_y);
-          glUniform1f(inv_pixel_count->location, inv_pixel_count_);
-          glUniform1f(epsilon->location, epsilon_);
-        },
-        output_image->width,
-        output_image->height);
-  }
-}
-
-// Runs the norm shader once per input slice, binding the input texture and
-// the matching average texture and rendering into the output texture with the
-// same slice index. Uploads the input/output sizes, the tile size and
-// inv_pixel_count_ as uniforms for each pass (epsilon is not set here).
-template <typename T>
-void GLReduce::norm(const GLImage<T>* input_image,
-                    const GLImage<T>* avg_image,
-                    const GLImage<T>* output_image,
-                    int tile_size_x,
-                    int tile_size_y,
-                    float inv_pixel_count_) {
-  const int input_slices = input_image->slices;
-
-  for (int is = 0; is < input_slices; is++) {
-    std::vector<texture_attachment> input_attachments(
-        {{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
-
-    run(input_attachments,
-        // One-element range selecting output texture `is`.
-        {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-        [&]() {
-          glUniform2i(inputSize->location, input_image->width, input_image->height);
-          glUniform2i(outputSize->location, output_image->width, output_image->height);
-          glUniform2i(tileSize->location, tile_size_x, tile_size_y);
-          glUniform1f(inv_pixel_count->location, inv_pixel_count_);
-        },
-        output_image->width,
-        output_image->height);
-  }
-}
-
-class GLScale : public GLFilter {
- public:
- binding* outputSize;
- binding* inputData;
- binding* averageData;
- binding* normData;
-
- binding* scale_factor;
- binding* bias_factor;
- binding* prelu_scale_factor;
-
- const int channels;
- const float* scale;
- const float* bias;
- const float* prelu_scale;
- const int prelu_size;
-
- const std::vector<binding*> input_bindings(bool fuse_prelu) {
- std::vector<binding*> bindings({BINDING(outputSize),
- BINDING(scale_factor),
- BINDING(bias_factor),
- BINDING(inputData),
- BINDING(averageData),
- BINDING(normData)});
- if (fuse_prelu) {
- bindings.push_back(prelu_scale_factor = new binding({"prelu_scale_factor"}));
- }
- return bindings;
- }
-
- GLScale(
- const int _channels,
- const float* _scale,
- const float* _bias,
- const float* _prelu_scale = nullptr,
- const int _prelu_size = 0)
- : GLFilter(
- "GLScale",
- vertex_shader,
- fragment_shader,
- input_bindings(_prelu_scale != nullptr),
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"FUSE_PRELU", c10::to_string(_prelu_scale != nullptr)}}),
- channels(_channels),
- scale(_scale),
- bias(_bias),
- prelu_scale(_prelu_scale),
- prelu_size(_prelu_size) {}
-
- template <typename T>
- void scale_and_shift(const GLImage<T>* input_image,
- const GLImage<T>* avg_image,
- const GLImage<T>* norm_image,
- const GLImage<T>* output_image);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLScale::fragment_shader = R"GLSL(#version 300 es
-
-#define FUSE_PRELU $(FUSE_PRELU)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-uniform ivec2 outputSize;
-uniform vec4 scale_factor;
-uniform vec4 bias_factor;
-
-#if FUSE_PRELU
-uniform vec4 prelu_scale_factor;
-#endif
-
-TEXTURE_INPUT(inputData);
-TEXTURE_INPUT(averageData);
-TEXTURE_INPUT(normData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
- vec4 val = TEXTURE_LOAD(inputData, texelCoord);
- vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
- vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0));
-
-#if FUSE_PRELU
- vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor;
- vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0))));
- outputData = TEXTURE_STORE(o);
-#else
- vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor;
- outputData = TEXTURE_STORE(o);
-#endif
-}
-
-)GLSL";
-
-template <typename T>
-void GLScale::scale_and_shift(const GLImage<T>* input_image,
- const GLImage<T>* avg_image,
- const GLImage<T>* norm_image,
- const GLImage<T>* output_image) {
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
- {avg_image->textures[is], averageData},
- {norm_image->textures[is], normData}});
-
- run(input_attachments,
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(outputSize->location, output_image->width, output_image->height);
- glUniform4f(scale_factor->location,
- scale[4 * is],
- channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
- channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
- channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
- glUniform4f(bias_factor->location,
- bias[4 * is],
- channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
- channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
- channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
- if (prelu_scale != nullptr) {
- glUniform4f(prelu_scale_factor->location,
- prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
- channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
- : prelu_scale[0],
- channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
- : prelu_scale[0],
- channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
- : prelu_scale[0]);
- }
- },
- output_image->width,
- output_image->height);
- }
-}
-
-namespace caffe2 {
-template <class T, bool FUSE_PRELU>
-class OpenGLInstanceNormPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws),
- epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
- order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
- CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
- OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- const int tile_size_x = 16;
- const int tile_size_y = 16;
- int avg_buf_width = input_width;
- int avg_buf_height = input_height;
-
- vector<GLImageVector<T>*> reduce_buf;
- while (reduce_buf.size() == 0 ||
- (avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
- avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
- avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
-
- reduce_buf.push_back(
- ImageAllocator<T>::newImage(1, avg_buf_width, avg_buf_height, output_channels));
- }
-
- GLImageVector<T>* avg = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
- GLImageVector<T>* inv_stdev = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
- const float* prelu_data = nullptr;
- int prelu_size = 0;
- if (FUSE_PRELU) {
- DCHECK_EQ(InputSize(), 4);
- const auto& prelu_scale = Input(PRELU);
- prelu_data = prelu_scale.template data<float>();
- prelu_size = prelu_scale.size();
- } else {
- DCHECK_EQ(InputSize(), 3);
- }
-
- const auto& scale = Input(SCALE);
- const auto& bias = Input(BIAS);
-
- if (!f_reduce) {
- f_reduce.reset(new GLReduce());
- f_norm.reset(new GLReduce(false, true));
- f_stdDev.reset(new GLReduce(true, false));
- f_scale.reset(new GLScale(input_channels,
- scale.template data<float>(),
- bias.template data<float>(),
- prelu_data,
- prelu_size));
- }
-
- for (int i = 0; i < num_images; i++) {
- for (int k = 0; k < reduce_buf.size() + 1; k++) {
- const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
- GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
-
- float norm = k < reduce_buf.size()
- ? 1.0 / (tile_size_x * tile_size_y)
- : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
- (float)(input_width * input_height);
- const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
- const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
- f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
- }
-
- for (int k = 0; k < reduce_buf.size() + 1; k++) {
- const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
- GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
-
- float norm = k < reduce_buf.size()
- ? 1.0 / (tile_size_x * tile_size_y)
- : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
- (float)(input_width * input_height);
-
- if (k == 0) {
- f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
- } else if (k < reduce_buf.size()) {
- f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
- } else {
- const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
- const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
- f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
- }
- }
-
- f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
- }
- Outputs()[OUTPUT]->Reset(output);
- if (OutputSize() > 1) {
- Outputs()[MEAN]->Reset(avg);
- Outputs()[INV_STDEV]->Reset(inv_stdev);
- } else {
- delete avg;
- delete inv_stdev;
- }
- for (auto&& rb : reduce_buf) {
- delete rb;
- }
-
- return true;
- }
-
- private:
- float epsilon_;
- StorageOrder order_;
- std::unique_ptr<GLReduce> f_reduce;
- std::unique_ptr<GLReduce> f_norm;
- std::unique_ptr<GLReduce> f_stdDev;
- std::unique_ptr<GLScale> f_scale;
-
- INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
- OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLInstanceNorm, OpenGLInstanceNormPReluOp<float16_t, false>);
-OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-REGISTER_CPU_OPERATOR(OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp<float16_t, true>);
-OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-
-class GLMul : public GLFilter {
- public:
- binding* outputSize;
- binding* inputData;
- binding* B;
-
- GLMul()
- : GLFilter("GLMul",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(outputSize), BINDING(inputData), BINDING(B)}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {/* no replacements */}) {}
-
- template <typename T>
- void mul(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images, float b);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLMul::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform vec4 B;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 A = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(A * B);
-}
-
-)GLSL";
-
-template <typename T>
-void GLMul::mul(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- float b) {
- for (int i = 0; i < input_images.size(); i++) {
- auto input_image = input_images[i];
- auto output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(outputSize->location, output_image->width, output_image->height);
- glUniform4f(B->location, b, b, b, b);
- },
- output_image->width,
- output_image->height);
- }
- }
-}
-
-namespace caffe2 {
-template <class T>
-class OpenGLMulOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLMulOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {
- OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
- "OpenGLMul only supports broadcast");
-
- OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
- "OpenGLMul does not support axis");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
- const auto& B = Input(1);
- CAFFE_ENFORCE_EQ(B.size(), 1); // only scalar is supported
-
- const int num_images = input.size();
- const auto output_height = input.height();
- const auto output_width = input.width();
- const int output_channels = input.channels();
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
-
- if (!_mult) {
- _mult.reset(new GLMul());
- }
-
- _mult->mul(input, *output, B.template data<float>()[0]);
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- std::unique_ptr<GLMul> _mult;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLMul, OpenGLMulOp<float16_t>);
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLNormPlanarYUV : public GLFilter {
- public:
- const float* mean;
- const float* std;
-
- binding* inputData;
- binding* outputSize;
- binding* mean_data;
- binding* std_data;
-
- GLNormPlanarYUV(const float* _mean, const float* _std)
- : GLFilter("GLNormPlanarYUV",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(inputData),
- BINDING(outputSize),
- BINDING(mean_data),
- BINDING(std_data)}), // input bindings
- {/* no uniform blocks */},
- {/* no attributes */},
- {}),
- mean(_mean),
- std(_std) {}
-
- template <typename T>
- void normalize(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLNormPlanarYUV::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform vec4 mean_data;
-uniform vec4 std_data;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE((value - mean_data) / std_data);
-}
-
-)GLSL";
-
-template <class T>
-void GLNormPlanarYUV::normalize(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images) {
- int num_images = input_images.size();
- for (int i = 0; i < num_images; i++) {
- GLImage<T>* input_image = input_images[i];
- GLImage<T>* output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
-
- std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
- run(input_attachments,
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(outputSize->location, output_image->width, output_image->height);
- glUniform4f(mean_data->location, mean[0], mean[1], mean[2], 0.0);
- glUniform4f(std_data->location, std[0], std[1], std[2], 1.0);
- },
- output_image->width,
- output_image->height);
- }
- }
-}
-
-namespace caffe2 {
-template <typename T>
-class GLNormPlanarYUVOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- GLNormPlanarYUVOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws),
- order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
- OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
-
- const auto& M = Input(1); // mean
- const auto& S = Input(2); // standard deviation
- CAFFE_ENFORCE(input_channels == M.dim(1));
- CAFFE_ENFORCE(input_channels == S.dim(1));
-
- if (!_normPlanarYUV) {
- _normPlanarYUV.reset(new GLNormPlanarYUV(M.template data<float>(), S.template data<float>()));
- }
-
- _normPlanarYUV->normalize(input, *output);
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- StorageOrder order_;
- std::unique_ptr<GLNormPlanarYUV> _normPlanarYUV;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLNormalizePlanarYUV, GLNormPlanarYUVOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLNormalizePlanarYUV).NumInputs(3).NumOutputs(1);
-
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLPRelu : public GLFilter {
- public:
- typedef enum { PRelu = 0, Relu = 1 } ReluType;
-
- const float* scale;
-
- binding* inputData;
- binding* scale_block;
-
- const int scale_size;
- const int channels;
- const int output_tile_x;
- const int output_tile_y;
- const int output_tile_width;
- const int output_tile_height;
-
- GLPRelu(
- const float* _scale,
- const int _scale_size,
- const int _channels,
- int _output_tile_x,
- int _output_tile_y,
- int _output_tile_width,
- int _output_tile_height)
- : GLFilter(
- "GLPRelu",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(inputData)}),
- std::vector<binding*>({BINDING(scale_block)}),
- {/* no attributes */},
- {{"USE_RELU", c10::to_string(PRelu)},
- {"OUTPUT_TILES", c10::to_string(_output_tile_x * _output_tile_y)},
- {"OUTPUT_TILE_X", c10::to_string(_output_tile_x)},
- {"OUTPUT_TILE_WIDTH", c10::to_string(_output_tile_width)},
- {"OUTPUT_TILE_HEIGHT", c10::to_string(_output_tile_height)},
- {"TILED_PRELU",
- c10::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
- scale(_scale),
- scale_size(_scale_size),
- channels(_channels),
- output_tile_x(_output_tile_x),
- output_tile_y(_output_tile_y),
- output_tile_width(_output_tile_width),
- output_tile_height(_output_tile_height) {}
-
- GLPRelu(const int _channels)
- : GLFilter(
- "GLRelu",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(inputData)}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"USE_RELU", c10::to_string(Relu)},
- {"OUTPUT_TILES", c10::to_string(1)},
- {"OUTPUT_TILE_X", c10::to_string(1)},
- {"OUTPUT_TILE_WIDTH", c10::to_string(1)},
- {"OUTPUT_TILE_HEIGHT", c10::to_string(1)},
- {"TILED_PRELU", c10::to_string(0)}}),
- scale(nullptr),
- scale_block(nullptr),
- scale_size(0),
- channels(_channels),
- output_tile_x(1),
- output_tile_y(1),
- output_tile_width(1),
- output_tile_height(1) {}
-
- template <typename T>
- void prelu(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- GLPRelu::ReluType reluType);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
-#define TILED_PRELU $(TILED_PRELU)
-#define USE_RELU $(USE_RELU)
-
-// tiling
-#define OUTPUT_TILES $(OUTPUT_TILES)
-#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
-#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
-
-// common
-precision mediump float;
-precision highp int;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-in highp vec2 v_texCoord;
-
-#if USE_RELU
-
-// Relu
-void main() {
- ivec2 inputSize = textureSize(inputData, 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(max(value, vec4(0.0)));
-}
-
-#else
-
-#if TILED_PRELU
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-layout (std140) uniform scale_block {
- highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
-};
-
-void main() {
- ivec2 inputSize = textureSize(inputData, 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-
- ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
- int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
-
- // outputData = value > 0 ? value : value * weight;
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
- value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
- outputData = TEXTURE_STORE(value);
-}
-#else
-layout (std140) uniform scale_block {
- highp uvec4 scale;
-};
-void main() {
- ivec2 inputSize = textureSize(inputData, 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
-
- // outputData = value > 0 ? value : value * weight;
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
- outputData = TEXTURE_STORE(value);
-}
-#endif // TILED_PRELU
-
-#endif // USE_RELU
-
-)GLSL";
-
-template <typename T>
-void GLPRelu::prelu(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- GLPRelu::ReluType reluType) {
- int num_images = input_images.size();
- for (int i = 0; i < num_images; i++) {
- GLImage<T>* input_image = input_images[i];
- GLImage<T>* output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- if (reluType == PRelu) {
- attach_uniform_buffer<float16_t>(scale_block, 0, [&](float16_t* data, size_t size) {
- int output_tiles = output_tile_x * output_tile_y;
- for (int j = 0, k = 4 * is * output_tiles;
- k < std::min(channels, 4 * (is + 1) * output_tiles);
- j++, k++) {
- data[j] = scale_size == channels ? scale[k] : scale[0];
- }
- });
- }
-
- std::vector<texture_attachment> input_attachments;
-
- input_attachments.push_back({input_image->textures[is], inputData});
-
- run(input_attachments,
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {},
- output_image->texture_width,
- output_image->texture_height);
- }
- }
-}
-
-namespace caffe2 {
-template <typename T, GLPRelu::ReluType reluType>
-class OpenGLPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLPReluOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws),
- order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
- OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
- const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
- if (input_tile_x > 1 || input_tile_y > 1) {
- CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
- }
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(num_images,
- output_width,
- output_height,
- output_channels,
- output_tile_x,
- output_tile_y,
- is_last);
-
- const auto* scale = reluType == GLPRelu::PRelu ? &Input(1) : nullptr;
-
- if (!_prelu) {
- if (reluType == GLPRelu::PRelu) {
- _prelu.reset(new GLPRelu(scale->template data<float>(),
- scale->size(),
- input_channels,
- output_tile_x,
- output_tile_y,
- output_width,
- output_height));
- } else {
- _prelu.reset(new GLPRelu(input_channels));
- }
- }
-
- _prelu->prelu(input, *output, reluType);
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- StorageOrder order_;
- std::unique_ptr<GLPRelu> _prelu;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLPRelu, OpenGLPReluOp<float16_t, GLPRelu::PRelu>);
-OPERATOR_SCHEMA(OpenGLPRelu)
- .NumInputs(2)
- .NumOutputs(1)
- .AllowInplace({{0, 0}})
- .IdenticalTypeAndShape();
-REGISTER_CPU_OPERATOR(OpenGLRelu, OpenGLPReluOp<float16_t, GLPRelu::Relu>);
-OPERATOR_SCHEMA(OpenGLRelu)
- .NumInputs(1)
- .NumOutputs(1)
- .AllowInplace({{0, 0}})
- .IdenticalTypeAndShape();
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/conv_pool_op_base.h"
-
-class GLPadImage : public GLFilter {
- public:
- binding* padSize;
- binding* inputSize;
- binding* outputSize;
- binding* inputData;
-
- GLPadImage()
- : GLFilter(
- "GLPadImage",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>(
- {BINDING(padSize), BINDING(inputSize), BINDING(outputSize), BINDING(inputData)}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {/* no replacements */}) {}
-
- template <typename T>
- void pad(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- const int pad_l,
- const int pad_t);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLPadImage::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 padSize;
-uniform ivec2 inputSize;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)) - padSize;
- texelCoord = max(texelCoord, -texelCoord);
- texelCoord = min(texelCoord, ivec2(2) * (inputSize - 1) - texelCoord);
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(value);
-}
-
-)GLSL";
-
-template <typename T>
-void GLPadImage::pad(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- const int pad_l,
- const int pad_t) {
- for (int i = 0; i < input_images.size(); i++) {
- auto input_image = input_images[i];
- auto output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(inputSize->location, input_image->width, input_image->height);
- glUniform2i(outputSize->location, output_image->width, output_image->height);
- glUniform2i(padSize->location, pad_l, pad_t);
- },
- output_image->width,
- output_image->height);
- }
- }
-}
-
-namespace caffe2 {
-
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
- Tensor<CPUContext> input, output;
- input.Resize(1, 1, H, W);
- op->SetOutputSize(input, &output, 1);
- CAFFE_ENFORCE_EQ(output.ndim(), 4);
- *OH = output.dim(2);
- *OW = output.dim(3);
-}
-
-template <class T>
-class OpenGLPadImageOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLPadImageOp(const OperatorDef& operator_def, Workspace* ws)
- : ConvPoolOpBase<CPUContext>(operator_def, ws),
- mode_(OperatorBase::GetSingleArgument<string>("mode", "")) {
- OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
- OPERATOR_NEEDS_FEATURE(mode_ == "reflect", "OpenGL only supports reflection");
-
- CAFFE_ENFORCE(legacy_pad_ == LegacyPadding::NOTSET,
- "Padding layer only supports explicit pad values.");
- CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
- "Pooling op does not support dilation right now.");
- CAFFE_ENFORCE(stride_h() == 1 && stride_w() == 1,
- "Pooling op does not support stride right now.");
- // Pad op does not use kernel sizes, so we set it to 1 for computing the
- // output size.
- kernel_.assign(pads_.size() / 2, 1);
- }
-
- bool RunOnDeviceWithOrderNCHW() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
-
- const int num_images = input.size();
- const int input_width = input.width();
- const int input_height = input.height();
- const int input_channels = input.channels();
- const int output_channels = input_channels;
-
- int output_height, output_width;
- computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
-
- if (!padImage_) {
- padImage_.reset(new GLPadImage());
- LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => "
- << output_channels << ": " << output_height << " X " << output_width;
- LOG(INFO) << "Padmode: " << mode_ << ", pad_l = " << pad_l() << ", pad_r = " << pad_r() << ", pad_t = " << pad_t()
- << ", pad_b = " << pad_b();
- }
-
- padImage_->pad(input, *output, pad_l(), pad_t());
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- std::string mode_;
- std::unique_ptr<GLPadImage> padImage_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLPadImage, OpenGLPadImageOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLPadImage).NumInputs(1).NumOutputs(1);
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/timer.h"
-#include "caffe2/operators/pool_op.h"
-
-class GLPool : public GLFilter {
- public:
- typedef enum { AveragePool, MaxPool } PoolType;
-
- struct point {
- int x;
- int y;
- };
-
- struct descriptor {
- int channels;
- point kernel_size;
- point input_padding;
- point input_stride;
- point input_tile_size;
- point output_tile_size;
- };
-
- binding* inputData;
- binding* kernelSize;
- binding* outputSize;
-
- const descriptor geometry;
-
- GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
- : GLFilter(
- "GLPool",
- vertex_shader,
- fragment_shader,
- {
- BINDING(inputData),
- BINDING(kernelSize),
- BINDING(outputSize),
- },
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
- {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
- {"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
- {"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
- {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
- {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
- {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
- {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
- {"OUTPUT_TILE_WIDTH",
- c10::to_string(_geometry.output_tile_size.x)},
- {"OUTPUT_TILE_HEIGHT",
- c10::to_string(_geometry.output_tile_size.y)},
- {"TILED_POOLING", c10::to_string(_tiling)},
- {"MAX_POOL", c10::to_string(poolType == MaxPool)},
- {"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
- geometry(_geometry) {}
- ~GLPool() {}
-
- void pool(const GLImageVector<float16_t>& input_images,
- const GLImageVector<float16_t>& output_images) {
- for (int i = 0; i < input_images.size(); i++) {
- auto input_image = input_images[i];
- auto output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- run({{input_image->textures[is], inputData}},
- {output_image->textures[is]},
- [&]() {
- glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
- glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
- },
- output_image->texture_width,
- output_image->texture_height);
- }
- }
- }
-
- private:
- /*
- * Computes BOUNDS_CHECK_MODE for the convolution parameters.
- *
- * @retval 0 if bounds check can be skipped
- * @retval non-zero if bounds check can not be skipped
- */
- inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
- if (tiling) {
- return 1;
- }
-
- if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
- (geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
- return 0;
- } else {
- return 1;
- }
- }
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-const char* GLPool::fragment_shader = R"GLSL(#version 300 es
-#define TILED_POOLING $(TILED_POOLING)
-#define MAX_POOL $(MAX_POOL)
-
-// tiling
-#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
-#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
-#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
-#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
-
-#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
-const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
-const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));
-
-uniform ivec2 kernelSize;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-#if BOUNDS_CHECK_MODE == 0
- #define IN_BOUNDS(p, p0, p1) (true)
-#else
- #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
-#endif
-
-// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
-const float MIN_FLOAT = -exp2(14.0);
-
-#if TILED_POOLING
-
-const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
-const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
-
-// tiled pooling
-#if MAX_POOL
-
-#define POOL { \
- pool = vec4(MIN_FLOAT); \
- for (int y = 0; y < kernelSize.y; y++) { \
- for (int x = 0; x < kernelSize.x; x++) { \
- ivec2 idx = tileCoord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
- vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
- pool = max(pool, data); \
- } \
- } \
- } \
-}
-
-#else
-
-#define POOL { \
- int count = 0; \
- for (int y = 0; y < kernelSize.y; y++) { \
- for (int x = 0; x < kernelSize.x; x++) { \
- ivec2 idx = tileCoord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
- vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
- pool += data;\
- count += 1; \
- } \
- } \
- } \
- pool = pool / float(count); \
-}
-
-#endif // MAX_POOL
-
-void main() {
- ivec2 inputSize = textureSize(inputData, 0);
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
-
- ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
- ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
- tileCoord = input_stride * tileCoord - input_padding;
-
- ivec2 inputTileOffset = tile * inputTileSize;
-
-#if MAX_POOL
- vec4 pool = vec4(0);
-#else
- highp vec4 pool = vec4(0);
-#endif
-
- POOL;
-
- outputData = TEXTURE_STORE(pool);
-}
-
-#else
-
-// no tiling
-#if MAX_POOL
-
-#define POOL { \
- pool = vec4(MIN_FLOAT); \
- for (int y = 0; y < kernelSize.y; y++) { \
- for (int x = 0; x < kernelSize.x; x++) { \
- ivec2 idx = texelCoord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputSize) { \
- vec4 data = TEXTURE_LOAD(inputData, idx); \
- pool = max(pool, data); \
- } \
- } \
- } \
-}
-
-#else
-
-#define POOL { \
- int count = 0; \
- for (int y = 0; y < kernelSize.y; y++) { \
- for (int x = 0; x < kernelSize.x; x++) { \
- ivec2 idx = texelCoord + ivec2(x, y); \
- if IN_BOUNDS(idx, ivec2(0), inputSize) { \
- vec4 data = TEXTURE_LOAD(inputData, idx); \
- pool += data; \
- count += 1; \
- } \
- } \
- } \
- pool = pool / float(count); \
-}
-
-#endif // MAX_POOL
-
-void main() {
- ivec2 inputSize = textureSize(inputData, 0);
- ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
-#if MAX_POOL
- vec4 pool = vec4(0);
-#else
- highp vec4 pool = vec4(0);
-#endif
-
- POOL;
-
- outputData = TEXTURE_STORE(pool);
-}
-#endif // TILED_POOLING
-
-)GLSL";
-
-namespace caffe2 {
-
-template <typename OPBase>
-static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
- Tensor<CPUContext> input, output;
- input.Resize(1, 1, H, W);
- op->SetOutputSize(input, &output, 1);
- CAFFE_ENFORCE_EQ(output.ndim(), 4);
- *OH = output.dim(2);
- *OW = output.dim(3);
-}
-
-template <typename T, GLPool::PoolType poolType>
-class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
- public:
- GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
- : ConvPoolOpBase<CPUContext>(operator_def, ws) {
- OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
- CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
- "Pooling op does not support dilation right now.");
- if (!global_pooling_) {
- CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
- pad_r() < kernel_w(),
- "Pad should be smaller than kernel.");
- }
- }
-
- bool RunOnDeviceWithOrderNCHW() override {
- const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- int output_height;
- int output_width;
- const int output_channels = input_channels;
-
- computeOutputHW(this, input_height, input_width, &output_height, &output_width);
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
- const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
- GLPool::descriptor geometry{input_channels,
- {kernel_w(), kernel_h()},
- {pad_l(), pad_t()},
- {stride_w(), stride_h()},
- {input_width, input_height},
- {output_height, output_width}};
-
- if (!glPool_) {
- LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
- << output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
- << " Tiling: " << input_tile_x << "X" << input_tile_y;
-
- glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
- }
-
- glPool_->pool(input, *output);
-
- OperatorBase::Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- std::unique_ptr<GLPool> glPool_;
-};
-
-namespace {
-REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
-REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
-OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
-OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
-}; // namespace
-}; // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLResizeNearest : public GLFilter {
- public:
- binding* inputData;
- binding* outputSize;
- binding* scale_reverse;
-
- GLResizeNearest()
- : GLFilter("GLResizeNearest",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(outputSize), BINDING(scale_reverse), BINDING(inputData)}),
- {/* no uniform blocks*/},
- {/* no attributes */},
- {/* replacements */}) {}
-
- template <typename T>
- void resize(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- float width_scale_rev,
- float height_scale_rev);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLResizeNearest::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-uniform highp vec2 scale_reverse;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- // it clamps to the edge by default
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize) * scale_reverse);
- vec4 value = TEXTURE_LOAD(inputData, texelCoord);
- outputData = TEXTURE_STORE(value);
-}
-)GLSL";
-
-template <typename T>
-void GLResizeNearest::resize(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images,
- float width_scale_rev,
- float height_scale_rev) {
- for (int i = 0; i < input_images.size(); i++) {
- auto input_image = input_images[i];
- auto output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
-
- run(input_attachments,
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
- glUniform2f(scale_reverse->location, width_scale_rev, height_scale_rev);
- },
- output_image->texture_width,
- output_image->texture_height);
- }
- }
-}
-
-namespace caffe2 {
-
-template <class T>
-class OpenGLResizeNearestOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLResizeNearestOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
- if (HasArgument("width_scale")) {
- width_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("width_scale", 1));
- }
- if (HasArgument("height_scale")) {
- height_scale_ = static_cast<float>(OperatorBase::GetSingleArgument<float>("height_scale", 1));
- }
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_width = input.width();
- const int input_height = input.height();
- const int input_channels = input.channels();
-
- const int output_width = input_width * width_scale_;
- const int output_height = input_height * height_scale_;
- const int output_channels = input_channels;
-
- const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
- const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
-
- if (!resizeNearest_) {
- resizeNearest_.reset(new GLResizeNearest());
- }
- resizeNearest_->resize(input, *output, 1.0 / width_scale_, 1.0 / height_scale_);
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- protected:
- float width_scale_;
- float height_scale_;
- std::unique_ptr<GLResizeNearest> resizeNearest_;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLResizeNearest, OpenGLResizeNearestOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLResizeNearest).NumInputs(1).NumOutputs(1);
-
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-typedef enum { Sigmoid, Tanh } OpType;
-
-class GLSigmoid : public GLFilter {
- public:
- binding* inputData;
- binding* outputSize;
-
- GLSigmoid(OpType opType)
- : GLFilter(
- "GLSigmoid",
- vertex_shader,
- fragment_shader,
- {BINDING(outputSize), BINDING(inputData)},
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"SIGMOID", c10::to_string(opType == Sigmoid)},
- {"TANH", c10::to_string(opType == Tanh)}}) {}
-
- template <typename T>
- void sigmoid(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLSigmoid::fragment_shader = R"GLSL(#version 300 es
-#define SIGMOID $(SIGMOID)
-#define TANH $(TANH)
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 value = TEXTURE_LOAD(inputData, ivec2(texelCoord));
-#if SIGMOID
- value = vec4(1.0) / (vec4(1.0) + exp(-value));
- outputData = TEXTURE_STORE(value);
-#elif TANH
- value = tanh(value);
- outputData = TEXTURE_STORE(value);
-#endif
-}
-
-)GLSL";
-
-template <typename T>
-void GLSigmoid::sigmoid(const GLImageVector<T>& input_images,
- const GLImageVector<T>& output_images) {
- for (int i = 0; i < input_images.size(); i++) {
- auto input_image = input_images[i];
- auto output_image = output_images[i];
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- run(std::vector<texture_attachment>({{input_image->textures[is], inputData}}),
- {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
- [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
- output_image->width,
- output_image->height);
- }
- }
-}
-
-namespace caffe2 {
-template <typename T, OpType opType>
-class OpenGLSigmoidOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {}
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
-
- if (!_sigmoid) {
- _sigmoid.reset(new GLSigmoid(opType));
- }
-
- _sigmoid->sigmoid(input, *output);
-
- Outputs()[0]->Reset(output);
-
- return true;
- }
-
- private:
- std::unique_ptr<GLSigmoid> _sigmoid;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLSigmoid, OpenGLSigmoidOp<float16_t, Sigmoid>);
-OPERATOR_SCHEMA(OpenGLSigmoid)
- .NumInputs(1)
- .NumOutputs(1)
- .AllowInplace({{0, 0}})
- .IdenticalTypeAndShape();
-
-REGISTER_CPU_OPERATOR(OpenGLTanh, OpenGLSigmoidOp<float16_t, Tanh>);
-OPERATOR_SCHEMA(OpenGLTanh)
- .NumInputs(1)
- .NumOutputs(1)
- .AllowInplace({{0, 0}})
- .IdenticalTypeAndShape();
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-
-class GLSoftmaxReduce : public GLFilter {
- public:
- binding* inputTileSize;
- binding* outputSize;
- binding* outputTileSize;
- binding* tileSize;
- binding* spatialTileSize;
- binding* inputTileRange;
- binding* inputData;
- binding* maxData;
- binding* sumData;
-
- const std::vector<binding*> input_bindings() {
- std::vector<binding*> bindings({BINDING(inputTileSize),
- BINDING(outputSize),
- BINDING(outputTileSize),
- BINDING(tileSize),
- BINDING(spatialTileSize),
- BINDING(inputTileRange),
- BINDING(inputData),
- BINDING(maxData),
- BINDING(sumData)});
- return bindings;
- }
-
- GLSoftmaxReduce(
- bool compute_sum_ = false,
- bool tiled = false,
- int input_tile_x = 1)
- : GLFilter(
- "GLSoftmaxReduce",
- vertex_shader,
- fragment_shader,
- input_bindings(),
- {/* no uniform_blocks_bindings */},
- {/* no attributes */},
- {{"COMPUTE_SUM", c10::to_string((int)compute_sum_)},
- {"INPUT_TILE_X", c10::to_string(input_tile_x)},
- {"TILED_SOFTMAX", c10::to_string(int(tiled))}}) {}
-
- template <typename T>
- void reduce(const GLImage<T>* input_image,
- const GLImage<T>* output_image,
- int tile_size_x,
- int tile_size_y);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLSoftmaxReduce::fragment_shader = R"GLSL(#version 300 es
-
-#define TILED_SOFTMAX $(TILED_SOFTMAX)
-#define INPUT_TILE_X $(INPUT_TILE_X)
-// Compute sum or max
-#define COMPUTE_SUM $(COMPUTE_SUM)
-
-precision highp float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 inputTileSize;
-uniform ivec2 outputSize;
-uniform ivec2 outputTileSize;
-uniform ivec2 spatialTileSize;
-uniform ivec2 tileSize;
-uniform ivec2 inputTileRange;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_OUTPUT(0, outputData);
-
-#if TILED_SOFTMAX
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
- ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
- ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize);
-
- vec4 result = vec4(0.0);
- for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) {
- int inTileX = tileIdx % INPUT_TILE_X;
- int inTileY = tileIdx / INPUT_TILE_X;
- ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
- for (int y = 0; y < sumArea.y; y++) {
- for (int x = 0; x < sumArea.x; x++) {
- ivec2 idx = tileCoord + ivec2(x, y);
- vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx);
- #if COMPUTE_SUM
- result += val;
- #else
- result = max(result, val);
- #endif
- }
- }
- }
-
- outputData = TEXTURE_STORE(result);
-}
-#else
-void main() {
- ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize));
- ivec2 texelCoord = outputCoord * spatialTileSize;
- ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord);
- vec4 result = vec4(0.0);
-
- for (int y = 0; y < sumArea.y; y++) {
- for (int x = 0; x < sumArea.x; x++) {
- ivec2 idx = texelCoord + ivec2(x, y);
- vec4 val = TEXTURE_LOAD(inputData, idx);
-#if COMPUTE_SUM
- result += val;
-#else
- result = max(result, val);
-#endif
- }
- }
-
- outputData = TEXTURE_STORE(result);
-}
-#endif
-)GLSL";
-
-template <typename T>
-void GLSoftmaxReduce::reduce(const GLImage<T>* input_image,
- const GLImage<T>* output_image,
- int tile_size_x,
- int tile_size_y) {
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
- run(input_attachments,
- {output_image->textures.begin() + is,
- output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(
- inputTileSize->location, input_image->width, input_image->height);
- glUniform2i(
- outputSize->location,
- output_image->texture_width,
- output_image->texture_height);
- glUniform2i(
- outputTileSize->location,
- output_image->width,
- output_image->height);
- glUniform2i(
- tileSize->location, input_image->tile_x, input_image->tile_y);
- glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
- glUniform2i(
- inputTileRange->location,
- 0,
- std::min(
- (input_image->channels + 3) / 4,
- input_image->tile_x * input_image->tile_y));
- },
- output_image->texture_width,
- output_image->texture_height);
- }
-}
-
-class GLSoftmaxScale : public GLFilter {
- public:
- binding* outputSize;
- binding* inputData;
- binding* maxData;
- binding* sumData;
-
- const std::vector<binding*> input_bindings() {
- std::vector<binding*> bindings(
- {BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
- return bindings;
- }
-
- GLSoftmaxScale(bool _compute_exp = false, bool tiled = false)
- : GLFilter(
- "GLSoftmaxScale",
- vertex_shader,
- fragment_shader,
- input_bindings(),
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"COMPUTE_EXP", c10::to_string((int)_compute_exp)},
- {"TILED_SOFTMAX", c10::to_string((int)tiled)}}) {}
-
- template <typename T>
- void scale(const GLImage<T>* input_image,
- const GLImage<T>* max_image,
- const GLImage<T>* sum_image,
- const GLImage<T>* output_image);
-
- static const char* fragment_shader;
-};
-
-template <typename T>
-void GLSoftmaxScale::scale(const GLImage<T>* input_image,
- const GLImage<T>* max_image,
- const GLImage<T>* sum_image,
- const GLImage<T>* output_image) {
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- for (int is = 0; is < input_slices; is++) {
- std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
- {max_image->textures[is], maxData},
- {sum_image->textures[is], sumData}});
- run(input_attachments,
- {output_image->textures.begin() + is,
- output_image->textures.begin() + is + 1},
- [&]() {
- glUniform2i(
- outputSize->location,
- output_image->texture_width,
- output_image->texture_height);
- },
- output_image->texture_width,
- output_image->texture_height);
- }
-}
-
-// MARK: GLSL
-
-const char* GLSoftmaxScale::fragment_shader = R"GLSL(#version 300 es
-
-#define COMPUTE_EXP $(COMPUTE_EXP)
-#define TILED_SOFTMAX $(TILED_SOFTMAX)
-
-precision highp float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData);
-TEXTURE_INPUT(maxData);
-TEXTURE_INPUT(sumData);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 val = TEXTURE_LOAD(inputData, texelCoord);
-#if COMPUTE_EXP
- vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0));
- #if TILED_SOFTMAX
- float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w);
- maxVal = vec4(singleMax, singleMax, singleMax, singleMax);
- outputData = TEXTURE_STORE(exp(val - maxVal));
- #else
- outputData = TEXTURE_STORE(exp(val - maxVal));
- #endif
-
-#else
- vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0));
- #if TILED_SOFTMAX
- float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w;
- sumVal = vec4(singleSum, singleSum, singleSum, singleSum);
- outputData = TEXTURE_STORE(val / sumVal);
- #else
- outputData = TEXTURE_STORE(val / sumVal);
- #endif
-#endif
-
-}
-)GLSL";
-
-#include "../core/ImageAllocator.h"
-#include "caffe2/core/operator.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-namespace caffe2 {
-template <class T>
-class OpenGLSoftmax final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLSoftmax(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws),
- order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
- OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
- const int num_images = input.size();
- const int input_channels = input.channels();
- const int input_width = input.width();
- const int input_height = input.height();
-
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
- // For tiling
- const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
- const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
- const bool tiled = input_tile_x > 1 || input_tile_y > 1;
- if (tiled) {
- CAFFE_ENFORCE_EQ(
- input.slices(), 1, "Input needs to be tiled in a single texture");
- }
-
- CAFFE_ENFORCE(
- tiled || input_channels == 1,
- "Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
-
- // for spatial dimension
- const int tile_size_x = 16;
- const int tile_size_y = 16;
-
- int max_buf_width = input_width;
- int max_buf_height = input_height;
- int max_buf_channels = input_channels;
- vector<GLImageVector<T>*> reduce_buf;
-
- while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
- max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
- max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
- if (tiled) {
- // since we are summing over all the channels within a channel tile
- max_buf_channels =
- (max_buf_channels + input_tile_x * input_tile_y - 1) /
- (input_tile_x + input_tile_y);
- }
- reduce_buf.push_back(ImageAllocator<T>::newImage(
- 1,
- max_buf_width,
- max_buf_height,
- max_buf_channels,
- output_tile_x,
- output_tile_y));
- }
-
- GLImageVector<T>* max = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
- GLImageVector<T>* sum = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
- GLImageVector<T>* after_exp = ImageAllocator<T>::newImage(
- num_images,
- output_width,
- output_height,
- output_channels,
- output_tile_x,
- output_tile_y);
- GLImageVector<T>* output_images = ImageAllocator<T>::newImage(
- num_images,
- output_width,
- output_height,
- output_channels,
- output_tile_x,
- output_tile_y,
- is_last);
-
- if (!f_max) {
- f_max.reset(new GLSoftmaxReduce(false, tiled, input_tile_x));
- f_exp.reset(new GLSoftmaxScale(true, tiled));
- f_sum.reset(new GLSoftmaxReduce(true, tiled, input_tile_x));
- f_scale.reset(new GLSoftmaxScale(false, tiled));
- }
-
- for (int i = 0; i < num_images; i++) {
- auto input_image = input[i];
- auto max_image = (*max)[i];
- auto sum_image = (*sum)[i];
- auto after_exp_image = (*after_exp)[i];
- auto output_image = (*output_images)[i];
- // Get Max
- for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
- const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
- GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
-
- const int running_tile_size_x =
- ir < reduce_buf.size() ? tile_size_x : in->width;
- const int running_tile_size_y =
- ir < reduce_buf.size() ? tile_size_y : in->height;
- f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
- }
- // scale vals by exp(x - max)
- f_exp->scale(input_image, max_image, sum_image, after_exp_image);
-
- // Get sum of the exp
- for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
- const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
- GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
- const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
- const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
- f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
- }
-
- // Scale(softmax)
- f_scale->scale(after_exp_image, max_image, sum_image, output_image);
- }
-
- Outputs()[OUTPUT]->Reset(output_images);
-
- delete sum;
- delete max;
- delete after_exp;
- for (auto&& rb : reduce_buf) {
- delete rb;
- }
- return true;
- }
-
- private:
- StorageOrder order_;
- std::unique_ptr<GLSoftmaxReduce> f_max;
- std::unique_ptr<GLSoftmaxScale> f_exp;
- std::unique_ptr<GLSoftmaxReduce> f_sum;
- std::unique_ptr<GLSoftmaxScale> f_scale;
-
- INPUT_TAGS(INPUT, FILTER, BIAS);
- OUTPUT_TAGS(OUTPUT);
-};
-
-REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
-OPERATOR_SCHEMA(OpenGLSoftmax)
- .NumInputs(1)
- .NumOutputs(1)
- .AllowInplace({{0, 0}})
- .IdenticalTypeAndShape();
-} // namespace caffe2
-#endif // CAFFE2_MOBILE
+++ /dev/null
-
-#include "../core/GLContext.h"
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/operator.h"
-
-enum InputFormat { BGRA = 0, RGBA = 1 };
-
-class GLStylizer : public GLFilter {
- binding* inputData;
- binding* outputSize;
- binding* mean;
- binding* noise_std;
- bool deprocess;
-
- public:
- GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
- : GLFilter(
- _deprocess ? "GLDeStylizer" : "GLStylizer",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(inputData),
- BINDING(mean),
- BINDING(noise_std),
- BINDING(outputSize)}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {{"DEPROCESS", c10::to_string(_deprocess)},
- {"RGBAINPUT", c10::to_string(input_format)}}),
- deprocess(_deprocess) {}
-
- template <typename T1, typename T2>
- void stylize(const GLImage<T1>* input_image,
- const GLImage<T2>* output_image,
- const float mean_values[3],
- float noise_std_value);
-
- static const char* fragment_shader;
-};
-
-// MARK: GLSL
-
-const char* GLStylizer::fragment_shader = R"GLSL(#version 300 es
-
-#define DEPROCESS $(DEPROCESS)
-#define RGBAINPUT $(RGBAINPUT)
-
-precision mediump float;
-precision mediump int;
-precision mediump sampler2D;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-uniform vec3 mean;
-uniform float noise_std;
-
-#if DEPROCESS
-TEXTURE_INPUT(inputData);
-layout(location = 0) out mediump vec4 outputData;
-#else
-uniform sampler2D inputData;
-TEXTURE_OUTPUT(0, outputData);
-#endif
-
-#if !DEPROCESS
-// http://byteblacksmith.com/improvements-to-the-canonical-one-liner-glsl-rand-for-opengl-es-2-0/
-
-highp float rand(vec2 co) {
- highp float a = 12.9898;
- highp float b = 78.233;
- highp float c = 43758.5453;
- highp float dt = dot(co.xy, vec2(a, b));
- highp float sn = mod(dt, 3.14);
- return fract(sin(sn) * c);
-}
-#endif
-
-// In AR Engine, input/output a RBGA texture; otherwise, BGRA tensor => texture
-#if RGBAINPUT
-void main() {
-#if DEPROCESS
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 val = TEXTURE_LOAD(inputData, texelCoord);
- outputData = vec4((val.rgb + mean) / 255.0, 1.0).bgra;
-#else
- outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).bgr - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
-#endif
-}
-#else
-void main() {
-#if DEPROCESS
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 val = TEXTURE_LOAD(inputData, texelCoord);
- outputData = vec4((val.rgb + mean) / 255.0, 1.0);
-#else
- outputData = TEXTURE_STORE(vec4(255.0 * texture(inputData, v_texCoord).rgb - mean + vec3(noise_std * rand(v_texCoord)), 0.0));
-#endif
-}
-#endif
-)GLSL";
-
-template <typename T1, typename T2>
-void GLStylizer::stylize(const GLImage<T1>* input_image,
- const GLImage<T2>* output_image,
- const float mean_values[3],
- float noise_std_value) {
- int input_slices = input_image->slices;
- int output_slices = output_image->slices;
-
- run(std::vector<texture_attachment>({{input_image->textures[0], inputData}}),
- {output_image->textures[0]},
- [&]() {
- glUniform2i(outputSize->location, output_image->width, output_image->height);
- glUniform3f(mean->location, mean_values[0], mean_values[1], mean_values[2]);
- if (!deprocess) {
- glUniform1f(noise_std->location, noise_std_value);
- }
- },
- output_image->width,
- output_image->height);
-}
-
-namespace caffe2 {
-// Stylizer pre-processing operator with a CPU tensor as input.
-//
-// Inputs:
-//   0 - uint8 tensor read as (N, H, W, C); N must be 1 and C must equal
-//       kInputChannels
-//   1 - float tensor with kOutputChannels mean values
-// Output:
-//   0 - GLImageVector<float16_t> with kOutputChannels channels
-// Arguments:
-//   "noise_std" (float, default 10.0) - forwarded to GLStylizer::stylize
-class OpenGLTensorToTextureStylizerPreprocessOp : public Operator<CPUContext>,
-                                                  ImageAllocator<uint8_t>,
-                                                  ImageAllocator<float16_t> {
- public:
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 4;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 3;
-
-  USE_OPERATOR_BASE_FUNCTIONS;
-
-  OpenGLTensorToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const auto& input = Input(0);
-    const auto& mean = Input(1);
-
-    CAFFE_ENFORCE(input.ndim() == 4);
-
-    const int num_images = input.dim32(0);
-    const int input_height = input.dim32(1);
-    const int input_width = input.dim32(2);
-    const int input_channels = input.dim32(3);
-
-    CAFFE_ENFORCE(num_images == 1); // N == 1
-    CAFFE_ENFORCE(input_channels == kInputChannels);
-    CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
-
-    // get the buffers from input tensors
-    const float* mean_buffer = mean.template data<float>();
-    const uint8_t* input_buffer = input.template data<uint8_t>();
-
-    // The argument does not change between images, so read it once
-    const float noise_std = GetSingleArgument<float>("noise_std", 10.0);
-
-    // set up the OpenGL context
-    GLContext::getGLContext()->set_context();
-
-    // Both image vectors are held by unique_ptr so that they are released if
-    // loadData() or stylize() throws before ownership is handed to the blob
-    std::unique_ptr<GLImageVector<float16_t>> output_images(
-        ImageAllocator<float16_t>::newImage(num_images,
-                                            input_width,
-                                            input_height,
-                                            kOutputChannels,
-#if CAFFE2_IOS
-                                            true
-#else
-                                            false
-#endif
-                                            ));
-    const int tile_x = 1, tile_y = 1;
-    std::unique_ptr<GLImageVector<uint8_t>> input_images(ImageAllocator<uint8_t>::newImage(
-        num_images, input_width, input_height, kInputChannels, tile_x, tile_y, false));
-
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = (*input_images)[i];
-      auto output_image = (*output_images)[i];
-      const GLTexture* inputTexture = input_image->textures[0];
-      inputTexture->loadData(input_buffer);
-
-      // The stylizer is created lazily on first use and reused afterwards
-      if (!glStylizer_) {
-        glStylizer_.reset(new GLStylizer());
-      }
-
-      glStylizer_->stylize(input_image, output_image, mean_buffer, noise_std);
-    }
-
-    // Release the temporary input textures before publishing the output
-    input_images.reset();
-    Outputs()[0]->Reset(output_images.release());
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-/**
- * Stylizer pre-processing operator whose input is already a GL texture.
- *
- * Input 0 is a blob holding a GLImageVector<uint8_t> with kInputChannels
- * channels; input 1 is a float tensor with kOutputChannels mean values.
- * Output 0 receives a newly allocated GLImageVector<float16_t> with
- * kOutputChannels channels. The optional float argument "noise_std"
- * (default 10.0) is forwarded to GLStylizer::stylize, and the inputFormat
- * template parameter is forwarded to the GLStylizer constructor.
- */
-template <InputFormat inputFormat>
-class OpenGLTextureToTextureStylizerPreprocessOp : public Operator<CPUContext>,
- ImageAllocator<uint8_t>,
- ImageAllocator<float16_t> {
- public:
- // Expect this many channels as input
- static constexpr int kInputChannels = 4;
-
- // Expect this many channels as output
- static constexpr int kOutputChannels = 3;
-
- USE_OPERATOR_BASE_FUNCTIONS;
-
- OpenGLTextureToTextureStylizerPreprocessOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {}
-
- bool RunOnDevice() {
- const GLImageVector<uint8_t>& input = Inputs()[0]->template Get<GLImageVector<uint8_t>>();
- const auto& mean = Input(1);
-
- const int num_images = input.size();
- const int input_height = input.height();
- const int input_width = input.width();
- const int input_channels = input.channels();
-
- CAFFE_ENFORCE_GT(num_images, 0);
- CAFFE_ENFORCE(input[0]->slices == 1); // the first image must consist of exactly one slice
- CAFFE_ENFORCE(input_channels == kInputChannels);
- CAFFE_ENFORCE(mean.size() == kOutputChannels); // Assume BGR or BGRA
-
- // get the buffers from input tensors
- const float* mean_buffer = mean.template data<float>();
-
- GLImageVector<float16_t>* output_images = ImageAllocator<float16_t>::newImage(
- num_images, input_width, input_height, kOutputChannels, false);
- // The stylizer is created lazily on the first run and reused afterwards
- if (!glStylizer_) {
- glStylizer_.reset(new GLStylizer(false, inputFormat));
- }
- for (int i = 0; i < num_images; i++) {
- auto input_image = input[i];
- auto output_image = (*output_images)[i];
- glStylizer_->stylize(
- input_image, output_image, mean_buffer, GetSingleArgument<float>("noise_std", 10.0));
- }
- Outputs()[0]->Reset(output_images); // NOTE(review): the blob presumably takes ownership here - confirm
-
- return true;
- }
-
- private:
- std::unique_ptr<GLStylizer> glStylizer_;
-};
-// Register the pre-process operators; each schema declares 2 inputs and 1 output
-REGISTER_CPU_OPERATOR(OpenGLTensorToTextureStylizerPreprocess,
- OpenGLTensorToTextureStylizerPreprocessOp);
-OPERATOR_SCHEMA(OpenGLTensorToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
-// The texture-to-texture variant is instantiated for the RGBA input format
-REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerPreprocess,
- OpenGLTextureToTextureStylizerPreprocessOp<RGBA>);
-OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1);
-
-// Stylizer de-processing operator that reads the result back to the CPU.
-//
-// Inputs:
-//   0 - GLImageVector<float16_t> with kInputChannels channels
-//   1 - float tensor with kInputChannels mean values
-// Output:
-//   0 - uint8 tensor resized to (num_images, height, width, kOutputChannels)
-class OpenGLTextureToTensorStylizerDeprocessOp : public Operator<CPUContext>,
-                                                 ImageAllocator<uint8_t> {
- public:
-  using Operator<CPUContext>::Operator;
-
-  // Expect this many channels as input
-  static constexpr int kInputChannels = 3;
-
-  // Expect this many channels as output
-  static constexpr int kOutputChannels = 4;
-
-  bool RunOnDevice() override {
-    const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
-    const auto& mean = Input(1);
-    auto* output = Output(0);
-
-    const int num_images = input.size(), channels = input.channels(), height = input.height(),
-              width = input.width();
-    // Assume BGR or BGRA
-    CAFFE_ENFORCE(mean.size() == kInputChannels);
-    CAFFE_ENFORCE(channels == kInputChannels);
-    // RGB
-    output->Resize(num_images, height, width, kOutputChannels);
-
-    const auto* mean_data = mean.template data<float>();
-    auto* output_buffer = output->template mutable_data<uint8_t>();
-
-    // Size of one image inside the output tensor. Each image is written at
-    // its own offset; writing every image at offset 0 would leave only the
-    // last one in the tensor when num_images > 1.
-    const size_t image_bytes = static_cast<size_t>(height) * width * kOutputChannels;
-
-    // Held by unique_ptr so the textures are released even if stylize() or
-    // map_read() throws
-    std::unique_ptr<GLImageVector<uint8_t>> output_images(
-        ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels, true));
-
-    if (!glStylizer_) {
-      glStylizer_.reset(new GLStylizer(true));
-    }
-
-    for (int i = 0; i < num_images; i++) {
-      auto input_image = input[i];
-      auto output_image = (*output_images)[i];
-      glStylizer_->stylize(input_image, output_image, mean_data, 0);
-
-      uint8_t* dst = output_buffer + i * image_bytes;
-      output_image->textures[0]->map_read([dst](const void* buffer,
-                                                size_t tex_width,
-                                                size_t tex_height,
-                                                size_t tex_stride,
-                                                size_t tex_channels,
-                                                const GLTexture::Type& /* type */) {
-        const uint8_t* src = static_cast<const uint8_t*>(buffer);
-        if (tex_width == tex_stride) {
-          // Rows are tightly packed: copy the whole image at once
-          memcpy(dst, src, tex_channels * tex_width * tex_height);
-        } else {
-          // Rows are padded to tex_stride pixels: copy row by row and drop
-          // the padding
-          const size_t row_bytes = tex_channels * tex_width;
-          const size_t src_pitch = tex_channels * tex_stride;
-          for (size_t y = 0; y < tex_height; y++) {
-            memcpy(dst + y * row_bytes, src + y * src_pitch, row_bytes);
-          }
-        }
-      });
-    }
-
-    return true;
-  }
-
- private:
-  std::unique_ptr<GLStylizer> glStylizer_;
-};
-/**
- * Stylizer de-processing operator whose output stays in a GL texture.
- *
- * Input 0 is a blob holding a GLImageVector<float16_t> with kInputChannels
- * channels; input 1 is a float tensor with kInputChannels mean values.
- * Output 0 receives a GLImageVector<uint8_t> with kOutputChannels channels.
- * The inputFormat template parameter is forwarded to the GLStylizer
- * constructor.
- */
-template <InputFormat inputFormat>
-class OpenGLTextureToTextureStylizerDeprocessOp : public Operator<CPUContext>,
- ImageAllocator<uint8_t> {
- public:
- using Operator<CPUContext>::Operator;
-
- // Expect this many channels as input
- static constexpr int kInputChannels = 3;
-
- // Expect this many channels as output
- static constexpr int kOutputChannels = 4;
-
- bool RunOnDevice() {
- const GLImageVector<float16_t>& input = Inputs()[0]->template Get<GLImageVector<float16_t>>();
- const auto& mean = Input(1);
-
- const int num_images = input.size(), channels = input.channels(), height = input.height(),
- width = input.width();
-
- CAFFE_ENFORCE(mean.size() == kInputChannels);
- CAFFE_ENFORCE(channels == kInputChannels);
-
- const auto* mean_data = mean.template data<float>();
-
- // Use foreignTextureAllocator inside GLContext
- // glDeleteTexture will not be called from inside caffe2 for this texture
- GLImageVector<uint8_t>* output_images;
- auto textureAllocator = GLContext::getGLContext()->getTextureAllocator();
- const int tile_x = 1, tile_y = 1;
- if (textureAllocator != nullptr) {
- output_images = ImageAllocator<uint8_t>::newImage(
- num_images, width, height, kOutputChannels, tile_x, tile_y, textureAllocator);
- } else {
- // fallback when textureAllocator is not set
- output_images = ImageAllocator<uint8_t>::newImage(num_images, width, height, kOutputChannels);
- }
- // The stylizer is created lazily on the first run and reused afterwards
- if (!glStylizer_) {
- glStylizer_.reset(new GLStylizer(true, inputFormat));
- }
-
- for (int i = 0; i < num_images; i++) {
- auto input_image = input[i];
- auto output_image = (*output_images)[i];
- glStylizer_->stylize(input_image, output_image, mean_data, 0);
- }
-
- Outputs()[0]->Reset(output_images); // NOTE(review): the blob presumably takes ownership here - confirm
-
- return true;
- }
-
- private:
- std::unique_ptr<GLStylizer> glStylizer_;
-};
-// Register the de-process operators; each schema declares 2 inputs and 1 output
-REGISTER_CPU_OPERATOR(OpenGLTextureToTensorStylizerDeprocess,
- OpenGLTextureToTensorStylizerDeprocessOp);
-OPERATOR_SCHEMA(OpenGLTextureToTensorStylizerDeprocess).NumInputs(2).NumOutputs(1);
-// The texture-to-texture variant is instantiated for the RGBA input format
-REGISTER_CPU_OPERATOR(OpenGLTextureToTextureStylizerDeprocess,
- OpenGLTextureToTextureStylizerDeprocessOp<RGBA>);
-OPERATOR_SCHEMA(OpenGLTextureToTextureStylizerDeprocess).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
+++ /dev/null
-
-#include "../core/GLFilter.h"
-#include "../core/GLImage.h"
-#include "../core/ImageAllocator.h"
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include <iostream>
-#include <vector>
-/**
- * GLFilter named "GLSub" that subtracts one image from another.
- *
- * The filter is constructed with three bindings (outputSize, inputData[0],
- * inputData[1]) and no uniform blocks, attributes or replacements.
- * NOTE(review): vertex_shader is not declared in this class; it is presumably
- * inherited from GLFilter - confirm.
- */
-class GLSub : public GLFilter {
- public:
- binding* inputData[2];
- binding* outputSize;
-
- GLSub()
- : GLFilter("GLSub",
- vertex_shader,
- fragment_shader,
- std::vector<binding*>({BINDING(outputSize), BINDING(inputData[0]), BINDING(inputData[1])}),
- {/* no uniform blocks */},
- {/* no attributes */},
- {/* no replacements */}) {}
-
- template <typename T>
- void sub(const GLImageVector<T>& input_image0,
- const GLImageVector<T>& input_image1,
- const GLImageVector<T>& output_image);
-
- static const char* fragment_shader; // GLSL source, defined after the class
-};
-
-// MARK: GLSL
-// Fragment shader of GLSub: loads one texel from each input at v_texCoord * outputSize and stores A - B
-const char* GLSub::fragment_shader = R"GLSL(#version 300 es
-
-precision mediump float;
-precision mediump int;
-
-in highp vec2 v_texCoord;
-
-uniform ivec2 outputSize;
-
-TEXTURE_INPUT(inputData[2]);
-TEXTURE_OUTPUT(0, outputData);
-
-void main() {
- ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
- vec4 A = TEXTURE_LOAD(inputData[0], texelCoord);
- vec4 B = TEXTURE_LOAD(inputData[1], texelCoord);
- vec4 value = A - B;
- outputData = TEXTURE_STORE(value);}
-
-)GLSL";
-
-// Runs the subtraction shader once per slice of every image:
-// output_images[i] = input_images0[i] - input_images1[i].
-//
-// The number of images is taken from input_images0 and the number of slices
-// from its images; the other two vectors are indexed with the same image and
-// slice indices, so the caller has to pass vectors with matching layout.
-template <typename T>
-void GLSub::sub(const GLImageVector<T>& input_images0,
-                const GLImageVector<T>& input_images1,
-                const GLImageVector<T>& output_images) {
-  const int num_images = input_images0.size();
-  for (int i = 0; i < num_images; i++) {
-    GLImage<T>* input_image0 = input_images0[i];
-    GLImage<T>* input_image1 = input_images1[i];
-    GLImage<T>* output_image = output_images[i];
-    const int input_slices = input_image0->slices;
-
-    for (int is = 0; is < input_slices; is++) {
-      // Bind slice `is` of both inputs to the two shader samplers
-      std::vector<texture_attachment> input_attachments;
-      input_attachments.push_back({input_image0->textures[is], inputData[0]});
-      input_attachments.push_back({input_image1->textures[is], inputData[1]});
-
-      // Render into exactly one output texture: slice `is` of the output
-      run(input_attachments,
-          {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
-          [&]() { glUniform2i(outputSize->location, output_image->width, output_image->height); },
-          output_image->width,
-          output_image->height);
-    }
-  }
-}
-/**
- * OpenGLSub operator: element-wise subtraction of two GLImageVector inputs.
- *
- * Construction is refused (OPERATOR_NEEDS_FEATURE) when the operator
- * definition carries a "broadcast" or an "axis" argument. The optional int
- * argument "is_last" (default 0) is forwarded to ImageAllocator::newImage.
- * Both inputs must agree in image count, channels, width and height.
- */
-namespace caffe2 {
-template <typename T>
-class OpenGLSubOp final : public Operator<CPUContext>, ImageAllocator<T> {
- public:
- OpenGLSubOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<CPUContext>(operator_def, ws) {
- OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("broadcast") == false, "OpenGLSub does not support broadcast");
-
- OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, "OpenGLSub does not support axis");
- }
-
- bool RunOnDevice() override {
- const GLImageVector<T>& input0 = Inputs()[0]->template Get<GLImageVector<T>>();
- const GLImageVector<T>& input1 = Inputs()[1]->template Get<GLImageVector<T>>();
-
- CAFFE_ENFORCE_EQ(input0.size(), input1.size());
-
- const int num_images = input0.size();
- const int input_channels = input0.channels();
- const int input_width = input0.width();
- const int input_height = input0.height();
- CAFFE_ENFORCE_EQ(input1.channels(), input_channels);
- CAFFE_ENFORCE_EQ(input1.width(), input_width);
- CAFFE_ENFORCE_EQ(input1.height(), input_height);
- // The output has exactly the shape of the inputs
- const int output_channels = input_channels;
- const int output_width = input_width;
- const int output_height = input_height;
-
- int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
-
- GLImageVector<T>* output = ImageAllocator<T>::newImage(
- num_images, output_width, output_height, output_channels, is_last);
- // The filter is created lazily on the first run and reused afterwards
- if (!_sub) {
- _sub.reset(new GLSub());
- }
-
- _sub->sub(input0, input1, *output);
-
- Outputs()[0]->Reset(output); // NOTE(review): the blob presumably takes ownership here - confirm
-
- return true;
- }
-
- private:
- std::unique_ptr<GLSub> _sub;
-};
-
-REGISTER_CPU_OPERATOR(OpenGLSub, OpenGLSubOp<float16_t>);
-OPERATOR_SCHEMA(OpenGLSub).NumInputs(2).NumOutputs(1);
-} // namespace caffe2
+++ /dev/null
-#pragma once
-#include <cmath>
-// Pair of integer x / y values
-struct point {
- int x;
- int y;
-};
-// NOTE(review): presumably describes a tiled texture layout (tiles per axis, size of one tile, tile count) - confirm against users
-struct tile_descriptor {
- point tile_dims;
- point tile_size;
- int tiles;
-};
-
-namespace caffe2 {
-// Splits N into two integer factors with r1 * r2 == N and r1 >= r2, choosing
-// r2 as the largest divisor of N that does not exceed sqrt(N). A perfect
-// square (including 0) yields r1 == r2.
-inline static void squareFactors(int N, int& r1, int& r2) {
-  int divisor = sqrt(N);
-
-  if (divisor * divisor != N) {
-    // Not a perfect square: walk down to the nearest divisor of N
-    while (N % divisor != 0) {
-      --divisor;
-    }
-    r1 = N / divisor;
-    r2 = divisor;
-    return;
-  }
-
-  r1 = divisor;
-  r2 = divisor;
-}
-
-// Computes the tile grid for output_channels channels: the channels are
-// grouped four at a time (rounded up) and the group count is factored with
-// squareFactors.
-inline static void computeOutputTiles(int output_channels, int& output_tile_x, int& output_tile_y) {
-  const int channel_groups = (output_channels + 3) / 4;
-  squareFactors(channel_groups, output_tile_x, output_tile_y);
-}
-} // namespace caffe2
+++ /dev/null
-
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/utils/math.h"
-
-#include "../core/GL.h"
-#include "../core/GLLogging.h"
-#include "../core/arm_neon_support.h"
-#include "../operators/gl_tiling_utils.h"
-#include "TestGLConvolution.h"
-
-#include <vector>
-
-// Creates blob `name` in `ws`, resizes its CPU tensor to `shape` and fills it
-// with float Gaussian noise (mean 0, standard deviation 10).
-void AddNoiseInput(const std::vector<int64_t>& shape,
-                   const std::string& name,
-                   caffe2::Workspace* ws) {
-  caffe2::CPUContext cpu_context;
-  auto* noise = ws->CreateBlob(name)->GetMutable<caffe2::TensorCPU>();
-  noise->Resize(shape);
-
-  float* values = noise->mutable_data<float>();
-  caffe2::math::RandGaussian<float, caffe2::CPUContext>(
-      noise->size(), 0.0f, 10.0f, values, &cpu_context);
-}
-
-// Benchmarks a single CPU operator of type `typ` (e.g. "Conv" or
-// "ConvTranspose") on random data and returns the average time per run in
-// milliseconds.
-//
-// The operator gets inputs "X" (1 x inputC x inH x inW), "W" and "B", zero
-// padding, `stride` in both directions and a kW x kH kernel. The engine is
-// "MOBILE" when `transposed` is set and "NNPACK" otherwise. One timed run is
-// used to size a warm-up phase (about 200 ms) and the measured phase (about
-// 1000 ms). When `ws` is null a workspace local to this call is used.
-double BenchOp(const std::string& typ,
-               int inputC,
-               int outputC,
-               int kW,
-               int kH,
-               int stride,
-               int inW,
-               int inH,
-               bool transposed,
-               caffe2::Workspace* ws = nullptr) {
-  caffe2::Workspace localWs;
-  if (!ws) {
-    ws = &localWs;
-  }
-
-  const char* engine = transposed ? "MOBILE" : "NNPACK";
-
-  caffe2::OperatorDef def1;
-  def1.set_name("test");
-  def1.set_type(typ);
-  def1.set_engine(engine);
-  def1.add_input("X");
-  def1.add_input("W");
-  def1.add_input("B");
-  def1.add_output("Y");
-
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_h", kH));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_w", kW));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_h", stride));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_w", stride));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_t", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
-  def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
-
-  AddNoiseInput(std::vector<int64_t>{1, inputC, inH, inW}, "X", ws);
-  // The weight layout swaps the two channel dimensions for the transposed op
-  if (transposed) {
-    AddNoiseInput(std::vector<int64_t>{inputC, outputC, kH, kW}, "W", ws);
-  } else {
-    AddNoiseInput(std::vector<int64_t>{outputC, inputC, kH, kW}, "W", ws);
-  }
-  AddNoiseInput(std::vector<int64_t>{outputC}, "B", ws);
-
-  std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
-
-  // Measure one iteration
-  caffe2::Timer timer;
-  timer.Start();
-
-  op1->Run();
-
-  float one_iteration = timer.MilliSeconds();
-
-  int target_iterations = std::max((int)(1000 / one_iteration), 1);
-  int warmup_iterations = std::max((int)(200 / one_iteration), 1);
-
-  // warm up
-  for (int i = 0; i < warmup_iterations; i++) {
-    op1->Run();
-  }
-
-  timer.Start();
-
-  int runs = target_iterations;
-  for (int i = 0; i < runs; i++) {
-    op1->Run();
-  }
-
-  // Read the timer exactly once so that the logged and the returned figure
-  // are the same measurement
-  const auto total_t = timer.MilliSeconds();
-  const double ms_per_iter = double(total_t) / runs;
-
-  gl_log(GL_LOG,
-         "%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
-         typ.c_str(),
-         inputC,
-         outputC,
-         inW,
-         inH,
-         kW,
-         kH,
-         engine,
-         ms_per_iter);
-  return ms_per_iter;
-}
-
-// Benchmarks the OpenGL convolution operator ("OpenGLConv", or
-// "OpenGLConvTranspose" when `transposed` is set) on random data and returns
-// the average time per run in milliseconds.
-//
-// The input is copied to the GPU once with "CopyToOpenGL"; only the
-// convolution operator itself is timed, with glFinish() closing every
-// measured section. Only kernel_height is passed to the operator (as
-// "kernel"). When `ws` is null a workspace local to this call is used.
-template <typename T>
-static double BenchGLConvolution(int input_channels,
-                                 int output_channels,
-                                 int kernel_width,
-                                 int kernel_height,
-                                 int input_width,
-                                 int input_height,
-                                 int input_padding,
-                                 int input_stride,
-                                 bool transposed,
-                                 caffe2::Workspace* ws = nullptr) {
-  int tile_x = 1, tile_y = 1;
-  caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
-
-  gl_log(GL_LOG, "Input Tiles Factors: %d, %d\n", tile_x, tile_y);
-
-  caffe2::Workspace fallback_ws;
-  if (ws == nullptr) {
-    ws = &fallback_ws;
-  }
-
-  // Random input, weights and bias. The weight layout swaps the two channel
-  // dimensions for the transposed operator.
-  AddNoiseInput(
-      std::vector<int64_t>{1, input_channels, input_height, input_width}, "X_cpu", ws);
-  const std::vector<int64_t> weight_shape = transposed
-      ? std::vector<int64_t>{input_channels, output_channels, kernel_height, kernel_width}
-      : std::vector<int64_t>{output_channels, input_channels, kernel_height, kernel_width};
-  AddNoiseInput(weight_shape, "W", ws);
-  AddNoiseInput(std::vector<int64_t>{output_channels}, "b", ws);
-
-  // Appends one integer argument to an operator definition
-  const auto add_int_arg = [](caffe2::OperatorDef& def, const char* name, int value) {
-    auto& arg = *(def.add_arg());
-    arg.set_name(name);
-    arg.set_i(value);
-  };
-
-  caffe2::NetDef netdef;
-
-  // Operator 0: upload the CPU tensor into tiled GL textures
-  {
-    auto& copy_op = *(netdef.add_op());
-    copy_op.set_type("CopyToOpenGL");
-    copy_op.add_input("X_cpu");
-    copy_op.add_output("X_gl");
-    add_int_arg(copy_op, "tile_x", tile_x);
-    add_int_arg(copy_op, "tile_y", tile_y);
-  }
-
-  // Operator 1: the convolution under test
-  {
-    auto& conv_op = *(netdef.add_op());
-    conv_op.set_type(transposed ? "OpenGLConvTranspose" : "OpenGLConv");
-    conv_op.add_input("X_gl");
-    conv_op.add_input("W");
-    conv_op.add_input("b");
-    {
-      auto& order_arg = *(conv_op.add_arg());
-      order_arg.set_name("order");
-      order_arg.set_s("NCHW");
-    }
-    add_int_arg(conv_op, "kernel", kernel_height);
-    add_int_arg(conv_op, "pad", input_padding);
-    add_int_arg(conv_op, "stride", input_stride);
-    add_int_arg(conv_op, "is_last", 1);
-    conv_op.add_output("Y_gl");
-  }
-
-  std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
-  for (auto& op_def : netdef.op()) {
-    ops.push_back(CreateOperator(op_def, ws));
-  }
-  caffe2::OperatorBase* const copy_to_gl = ops[0].get();
-  caffe2::OperatorBase* const conv = ops[1].get();
-
-  // Run the Copy Operator
-  copy_to_gl->Run();
-
-  // Make sure the tested operator is precompiled
-  conv->Run();
-  glFinish();
-
-  // Measure one iteration
-  caffe2::Timer timer;
-  timer.Start();
-
-  conv->Run();
-  glFinish();
-
-  float one_iteration = timer.MilliSeconds();
-
-  const int runs = std::max((int)(1000 / one_iteration), 1);
-  const int warmup_iterations = std::max((int)(200 / one_iteration), 1);
-
-  // warm up
-  for (int n = 0; n < warmup_iterations; n++) {
-    conv->Run();
-  }
-  glFinish();
-
-  timer.Start();
-
-  for (int n = 0; n < runs; n++) {
-    conv->Run();
-  }
-  glFinish();
-
-  const double gpuIterTime = double(timer.MilliSeconds()) / runs;
-
-  gl_log(GL_LOG,
-         "%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
-         transposed ? "ConvTranspose" : "Conv",
-         input_channels,
-         output_channels,
-         input_width,
-         input_height,
-         kernel_width,
-         kernel_height,
-         gpuIterTime);
-
-  return gpuIterTime;
-}
-
-// Benchmark sweep that compares the OpenGL (float16) convolution with the CPU
-// "Conv" operator for every combination of input size, channel count and
-// kernel size below, and prints GPU GFLOPS, CPU GFLOPS and the CPU/GPU time
-// ratio for each configuration. Output channels always equal input channels.
-void TestGLConvolution() {
-  caffe2::Workspace ws;
-  ws.GetThreadPool()->setMinWorkSize(0);
-
-  // Alternative sweeps that were used during development:
-  //   small inputs: sizes {14, 26, 52, 104}, channels {512, 256, 128, 64}
-  //   large inputs: sizes {208, 312, 416, 720, 1080}, channels {16, 4}
-  const std::vector<int> sizes({14, 26, 52, 104, 208});
-  const std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
-  const std::vector<int> kernels({3});
-
-  const bool transposed = false;
-  const int stride = 1;
-
-  for (const auto& space : sizes) {
-    for (const auto& input_channel : channels) {
-      const int output_channel = input_channel;
-      for (const auto& kernel : kernels) {
-        const double gpuIterTime = BenchGLConvolution<float16_t>(
-            input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
-        const double cpuIterTime = BenchOp(transposed ? "ConvTranspose" : "Conv",
-                                           input_channel,
-                                           output_channel,
-                                           kernel,
-                                           kernel,
-                                           stride,
-                                           space,
-                                           space,
-                                           transposed,
-                                           &ws);
-
-        // Output extent of an unpadded convolution. Both benchmarks run with
-        // zero padding, so this is the number of output positions per axis
-        // for any kernel size and stride (space - 2 for 3x3 at stride 1).
-        const int out_size = (space - kernel) / stride + 1;
-        // Two floating point operations (multiply and add) per kernel tap
-        const double flops = double(input_channel) * output_channel * kernel * kernel *
-            out_size * out_size * 2;
-        printf(
-            "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
-            "%.2f\tratio: "
-            "%.2f\n",
-            space,
-            space,
-            input_channel,
-            output_channel,
-            kernel,
-            kernel,
-            flops / gpuIterTime / 1E6,
-            flops / cpuIterTime / 1E6,
-            cpuIterTime / gpuIterTime);
-      }
-    }
-  }
-}
+++ /dev/null
-
-#pragma once
-// Runs the OpenGL vs. CPU convolution benchmark sweep and prints the GFLOPS of every configuration
-void TestGLConvolution();
+++ /dev/null
-
-#include "opengl_test.h"
-
-#include "../core/GLContext.h"
-#include "../core/GLImageAllocator.h"
-#include "../core/GLLogging.h"
-#include "../core/ImageAllocator.h"
-#include "../core/arm_neon_support.h"
-#include "../core/rewrite_net.h"
-#include "../operators/gl_tiling_utils.h"
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/core/timer.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/utils/math.h"
-#include "caffe2/utils/proto_utils.h"
-
-#ifdef CAFFE2_USE_MPSCNN
-#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
-#endif
-
-#define DEBUGGING false
-
-namespace caffe2 {
-
-// Absolute difference |t1 - t2|, computed in float.
-template <class T>
-float absolute_error(T t1, T t2) {
-  return std::abs((float)t1 - (float)t2);
-}
-
-// Absolute difference relative to the magnitude of the reference value t2.
-// Returns 1 (100%) when t2 is zero. Dividing by |t2| rather than t2 keeps the
-// result non-negative, so a negative reference value cannot slip under a
-// "relative_error <= threshold" check.
-template <class T>
-float relative_error(T t1, T t2) {
-  return t2 != 0 ? absolute_error(t1, t2) / std::abs((float)t2) : 1;
-}
-/**
- * Compares two tensors of equal element count element by element.
- *
- * With DEBUGGING enabled both tensors are only printed. Otherwise, and only
- * when t1 holds floats, an element counts as a mismatch when its absolute
- * error exceeds `error` AND its relative error exceeds 0.08. Every mismatch
- * is logged; the 11th mismatch throws via CAFFE_THROW.
- */
-// OpenGL: t1, CPU: t2
-void checkError1D(const TensorCPU& t1, const TensorCPU& t2, float error) {
- CAFFE_ENFORCE_EQ(t1.size(), t2.size());
-#if DEBUGGING
- gl_log(GL_LOG, "OpenGL output:\n");
- for (int i = 0; i < t1.size(); i++) {
- gl_log(GL_LOG, "%.5f\t", t1.template data<float>()[i]);
- }
- gl_log(GL_LOG, "\n");
- gl_log(GL_LOG, "CPU output:\n");
- for (int i = 0; i < t2.size(); i++) {
- gl_log(GL_LOG, "%.5f\t", t2.template data<float>()[i]);
- }
- gl_log(GL_LOG, "\n");
-
-#else
- int count = 0;
- if (t1.template IsType<float>()) {
- for (auto i = 0; i < t1.size(); ++i) {
- const float t1_i = t1.template data<float>()[i];
- const float t2_i = t2.template data<float>()[i];
-
- if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) {
- gl_log(GL_ERR,
- "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n",
- i,
- t1_i,
- t2_i,
- absolute_error(t1_i, t2_i),
- relative_error(t1_i, t2_i) * 100);
- if (count++ == 10) {
- CAFFE_THROW("--- Test Failed ---");
- }
- }
- }
- }
-#endif
-}
-/**
- * Compares two tensors with identical dims element by element.
- *
- * With DEBUGGING enabled both tensors are only printed. Otherwise float and
- * uint8_t tensors are checked: an element counts as a mismatch when its
- * absolute error exceeds `error` AND its relative error exceeds 0.08. The
- * first 10 mismatches are logged and the 11th throws via CAFFE_THROW. The
- * largest positive and largest negative signed difference (t1 - t2) are
- * tracked and logged together with the mismatch count at the end.
- */
-// OpenGL: t1, CPU: t2
-void checkError(const TensorCPU& t1, const TensorCPU& t2, float error) {
- CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-#if DEBUGGING
- gl_log(GL_LOG, "opengl_test output\n");
- gl_log(GL_LOG, "\nOpenGL output:\n");
- for (int i = 0; i < t1.size(); i++) {
- if (t1.ndim() > 2 && i % t1.dim(2) == 0) {
- gl_log(GL_LOG, "\n");
- }
- if (t1.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0) {
- gl_log(GL_LOG, "\n");
- }
- if (t1.template IsType<float>()) {
- const float t1_i = t1.template data<float>()[i];
- gl_log(GL_LOG, "%.3f\t", t1_i);
- } else if (t1.template IsType<uint8_t>()) {
- const uint8_t t1_i = t1.template data<uint8_t>()[i];
- gl_log(GL_LOG, "%.3d\t", (int)t1_i);
- }
- }
-
- gl_log(GL_LOG, "\nCPU output:\n");
- for (int i = 0; i < t2.size(); i++) {
- if (t2.ndim() > 2 && i % t2.dim(2) == 0)
- gl_log(GL_LOG, "\n");
- if (t2.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0)
- gl_log(GL_LOG, "\n");
- if (t2.template IsType<float>()) {
- const float t2_i = t2.template data<float>()[i];
- gl_log(GL_LOG, "%.3f\t", t2_i);
- } else if (t2.template IsType<uint8_t>()) {
- const uint8_t t2_i = t2.template data<uint8_t>()[i];
- gl_log(GL_LOG, "%.3d\t", (int)t2_i);
- }
- }
- gl_log(GL_LOG, "\n");
-#else
-
- int count = 0;
- float maxError = 0, minError = 0;
- if (t1.template IsType<float>()) {
- for (auto i = 0; i < t1.size(); ++i) {
- const float t1_i = t1.template data<float>()[i];
- const float t2_i = t2.template data<float>()[i];
- if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) {
- if (count < 10) {
- gl_log(GL_ERR,
- "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n",
- i,
- t1_i,
- t2_i,
- absolute_error(t1_i, t2_i),
- relative_error(t1_i, t2_i) * 100);
- } else {
- CAFFE_THROW("--- Test Failed ---");
- }
- count++;
- }
- float err = t1_i - t2_i;
- if (err > maxError) {
- maxError = err;
- } else if (err < minError) {
- minError = err;
- }
- }
- } else if (t1.template IsType<uint8_t>()) {
- for (auto i = 0; i < t1.size(); ++i) {
- const uint8_t t1_i = t1.template data<uint8_t>()[i];
- const uint8_t t2_i = t2.template data<uint8_t>()[i];
- if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) {
- if (count < 10) {
- gl_log(GL_ERR,
- "i: %d, GL: %d, CPU: %d, absolute error: %.2f, relative error: %.2f%%\n",
- i,
- t1_i,
- t2_i,
- absolute_error(t1_i, t2_i),
- relative_error(t1_i, t2_i) * 100);
- } else {
- CAFFE_THROW("--- Test Failed ---");
- }
- count++;
- }
- float err = t1_i - t2_i;
- if (err > maxError) {
- maxError = err;
- } else if (err < minError) {
- minError = err;
- }
- }
- }
- gl_log(GL_LOG,
- "#errors = %d in %d, maxError = %f, minError = %f\n",
- count,
- (int)t1.size(),
- maxError,
- minError);
-#endif
-}
-/**
- * Round-trip test for the copy operators: fills an N x C x H x W CPU tensor
- * with Gaussian noise (mean 0, std 1), runs CopyToOpenGL (with the given
- * tile_x / tile_y arguments) followed by CopyFromOpenGL, and compares the
- * result with the original tensor via checkError using tolerance `error`.
- */
-void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, int tile_y = 1) {
- LOG(INFO) << "OPENGLCopyFrom/To Test";
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-
- // Note: may overflow for half precision
- // float *data = t->mutable_data<float>();
- // for (int i = 0; i < t->size(); i++) {
- // data[i] = i;
- // }
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("X_gl");
- op.add_output("Y_cpu");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
- const auto& t2 = ws.GetBlob("X_cpu")->Get<TensorCPU>(); // CPU
- CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
- checkError(t1, t2, error);
-}
-// Operator kinds covered by testOpenGLConv; the values index the two name tables below
-typedef enum {
- AveragePool,
- MaxPool,
- Conv,
- ConvTranspose,
- ConvPRelu,
- ConvTransposePRelu,
- ConvRelu,
- ConvTransposeRelu
-} PoolOp;
-// OpenGL operator type name for each PoolOp value (same order as the enum)
-const char* glPoolOperationName[] = {"OpenGLAveragePool",
- "OpenGLMaxPool",
- "OpenGLConv",
- "OpenGLConvTranspose",
- "OpenGLConvPRelu",
- "OpenGLConvTransposePRelu",
- "OpenGLConvRelu",
- "OpenGLConvTransposeRelu"};
-// CPU operator type name for each PoolOp value; the PRelu / Relu variants map to plain Conv / ConvTranspose
-const char* cpuPoolOperationName[] = {"AveragePool",
- "MaxPool",
- "Conv",
- "ConvTranspose",
- "Conv",
- "ConvTranspose",
- "Conv",
- "ConvTranspose"};
-
-void testOpenGLConv(int N,
- int C,
- int H,
- int W,
- int K, // output_channels
- int kernel_h,
- int kernel_w,
- int pad,
- int stride,
- PoolOp poolOp,
- float error,
- bool random_input = true,
- int input_batch_size = 1,
- int output_batch_size = 1,
- int input_tile_x = 1,
- int input_tile_y = 1,
- bool tiling = false) {
- LOG(INFO) << "OpenGL Conv Test: "
- << "input C: " << C << ", output C: " << K << ", H: " << H << ", W: " << W
- << ", K: " << kernel_w << "x" << kernel_h << ", P: " << pad << ", S: " << stride
- << " Op: " << glPoolOperationName[poolOp];
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
-#if 0
- gl_log(GL_LOG, "Input tensor:");
- for (int i = 0; i < t->size(); i++) {
- const float t1_i = t->data<float>()[i];
- if (i % t->dim(3) == 0)
- gl_log(GL_LOG, "\n");
- if (i % (4 * t->dim(2) * t->dim(3)) == 0)
- gl_log(GL_LOG, "-------------------------------\n");
- gl_log(GL_LOG, "%.3f\t", t1_i);
- }
- gl_log(GL_LOG, "\n\n");
-#endif
- }
-
- if (poolOp != AveragePool && poolOp != MaxPool) {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
- if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
- t->Resize(K, C, kernel_h, kernel_w);
- } else {
- t->Resize(C, K, kernel_h, kernel_w);
- }
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- // Set the weights to all 1s
- // for (int i = 0; i < t->size(); i++) {
- // data[i] = 1;
- // }
-
- // Set the weights to 1s, 2s, 3s... for channel 0, 1, 2, 3...
- int j = 0;
- for (int i = 0; i < t->size(); i++) {
- if (i % (C * kernel_h * kernel_w) == 0) {
- j++;
- }
- data[i] = j;
- }
- }
-
-#if 0
- gl_log(GL_LOG, "Kernel (printing only the first line for each output channel):");
- for (int i = 0; i < t->size(); i++) {
- if (i == 0 || i % (t->dim(1) * t->dim(2) * t->dim(3)) == 0) {
- gl_log(GL_LOG, "\n");
- for (int j = 0; j < t->dim(3); j++) {
- const float t1_i = t->data<float>()[i + j];
- gl_log(GL_LOG, "%.3f\t", t1_i);
- }
- }
- }
- gl_log(GL_LOG, "\n");
-#endif
-
- // bias
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
- t->Resize(K);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- // Set bias to 1
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = i + 1;
- }
- }
-#if 0
- gl_log(GL_LOG, "Bias:\n");
- for (int i = 0; i < t->size(); i++) {
- const float t1_i = t->data<float>()[i];
- gl_log(GL_LOG, "%.3f\t", t1_i);
- }
- gl_log(GL_LOG, "\n");
-#endif
- }
- }
-
- if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
- t->Resize(K);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- // Set prelu scale to i + 1
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = -0.5;
- }
- }
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type(glPoolOperationName[poolOp]);
- op.add_input("X_gl");
- if (poolOp != AveragePool && poolOp != MaxPool) {
- op.add_input("W");
- op.add_input("b");
- }
- if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
- op.add_input("p");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("kernel");
- arg.set_i(kernel_h);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad");
- arg.set_i(pad);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("stride");
- arg.set_i(stride);
- }
- if (poolOp != AveragePool && poolOp != MaxPool) {
- if (tiling) {
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tiling");
- arg.set_i(1);
- }
- } else {
- {
- auto& arg = *(op.add_arg());
- arg.set_name("input_batch_size");
- arg.set_i(input_batch_size);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("output_batch_size");
- arg.set_i(output_batch_size);
- }
- }
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("is_last");
- arg.set_i(1);
- }
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type(cpuPoolOperationName[poolOp]);
-
- op.add_input("X_cpu");
- if (poolOp != AveragePool && poolOp != MaxPool) {
- op.add_input("W");
- op.add_input("b");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("kernel");
- arg.set_i(kernel_h);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad");
- arg.set_i(pad);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("stride");
- arg.set_i(stride);
- }
- op.add_output("Y_ref");
- }
- if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
- auto& op = *(netdef.add_op());
- op.set_type("PRelu");
- op.add_input("Y_ref");
- op.add_input("p");
- op.add_output("Y_ref");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- } else if (poolOp == ConvRelu || poolOp == ConvTransposeRelu) {
- auto& op = *(netdef.add_op());
- op.set_type("Relu");
- op.add_input("Y_ref");
- op.add_output("Y_ref");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- }
-
- ws.RunNetOnce(netdef);
- const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
- const auto& t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(t1, t2, error);
-}
-
-void testOpenGLPRelu(
- int N, int C, int H, int W, int prelu_size, int input_tile_x, int input_tile_y, float error) {
- LOG(INFO) << "OpenGL PRelu Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
- }
-
- // prelu scale
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
- t->Resize(prelu_size);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLPRelu");
- op.add_input("X_gl");
- op.add_input("p");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("PRelu");
- op.add_input("X_cpu");
- op.add_input("p");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile_y, float error) {
- LOG(INFO) << "OpenGL Relu Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLRelu");
- op.add_input("X_gl");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Relu");
- op.add_input("X_cpu");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile_x = 1, int input_tile_y = 1) {
- LOG(INFO) << "OpenGL Add Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
- t0->Resize(N, C, H, W);
- CPUContext ctx0;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-
- auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
- t1->Resize(N, C, H, W);
- CPUContext ctx1;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu0");
- op.add_output("X_gl0");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu1");
- op.add_output("X_gl1");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLAdd");
- op.add_input("X_gl0");
- op.add_input("X_gl1");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Add");
- op.add_input("X_cpu0");
- op.add_input("X_cpu1");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(t1, t2, error);
-}
-
-void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
- LOG(INFO) << "OpenGL Sub Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
-
- Workspace ws;
- {
- auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
- t0->Resize(N, C, H, W);
- CPUContext ctx0;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-
- auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
- t1->Resize(N, C, H, W);
- CPUContext ctx1;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu0");
- op.add_output("X_gl0");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu1");
- op.add_output("X_gl1");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLSub");
- op.add_input("X_gl0");
- op.add_input("X_gl1");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Sub");
- op.add_input("X_cpu0");
- op.add_input("X_cpu1");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
- checkError(t2, t1, error);
-}
-
-void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = false, float error = 0.1) {
- LOG(INFO) << "OpenGL Concat Test "
- << "H: " << H << ", W: " << W;
- Workspace ws;
- for (int i = 0; i < Cs.size(); i++) {
- auto* t =
- BlobGetMutableTensor(ws.CreateBlob("X_cpu" + c10::to_string(i)), CPU);
- t->Resize(N, Cs[i], H, W);
- CPUContext ctx0;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx0);
- }
-
- NetDef netdef;
- for (int i = 0; i < Cs.size(); i++) {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu" + c10::to_string(i));
- op.add_output("X_gl" + c10::to_string(i));
- if (tiling) {
- int tile_x = 1, tile_y = 1;
- computeOutputTiles(Cs[i], tile_x, tile_y);
- printf("Cs[i] = %d, tile_x = %d, tile_y = %d\n", Cs[i], tile_x, tile_y);
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(tile_y);
- }
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLConcat");
- for (int i = 0; i < Cs.size(); i++) {
- op.add_input("X_gl" + c10::to_string(i));
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- op.add_output("Y_gl");
- op.add_output("Y_gl_mask");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Concat");
- for (int i = 0; i < Cs.size(); i++) {
- op.add_input("X_cpu" + c10::to_string(i));
- }
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- op.add_output("Y_ref_mask");
- }
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL Sigmoid Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLSigmoid");
- op.add_input("X_gl");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Sigmoid");
- op.add_input("X_cpu");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLTanh(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL Tanh Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLTanh");
- op.add_input("X_gl");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Tanh");
- op.add_input("X_cpu");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLMul(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL Mul Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU);
- t->Resize(1);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLMul");
- op.add_input("X_gl");
- op.add_input("B");
- op.add_output("Y_gl");
-
- {
- auto& arg = *(op.add_arg());
- arg.set_name("broadcast");
- arg.set_i(1);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Mul");
- op.add_input("X_cpu");
- op.add_input("B");
-
- {
- auto& arg = *(op.add_arg());
- arg.set_name("broadcast");
- arg.set_i(1);
- }
-
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
- LOG(INFO) << "OpenGL Softmax Test "
- << "N: " << N << " D: " << D << " Tiled:" << tiled;
- Workspace ws;
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- {
- t->Resize(N, D);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(
- t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("Reshape");
- op.add_input("X_cpu");
- op.add_output("X_reshaped");
- op.add_output("old_shape");
- auto& arg = *(op.add_arg());
- arg.set_name("shape");
- if (tiled) {
- arg.add_ints(N);
- arg.add_ints(D);
- arg.add_ints(1);
- arg.add_ints(1);
- } else {
- arg.add_ints(N);
- arg.add_ints(1);
- arg.add_ints(D);
- arg.add_ints(1);
- }
- }
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_reshaped");
- op.add_output("X_gl");
- if (tiled) {
- int tile_x = 1, tile_y = 1;
- squareFactors((D + 3) / 4, tile_x, tile_y);
- auto& argx = *(op.add_arg());
- argx.set_name("tile_x");
- argx.set_i(tile_x);
- auto& argy = *(op.add_arg());
- argy.set_name("tile_y");
- argy.set_i(tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLSoftmax");
- op.add_input("X_gl");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu0");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Reshape");
- op.add_input("Y_cpu0");
- op.add_output("Y_cpu");
- op.add_output("old_shape");
- auto& arg = *(op.add_arg());
- arg.set_name("shape");
- arg.add_ints(N);
- arg.add_ints(D);
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("Softmax");
- op.add_input("X_cpu");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL InstanceNorm Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
- // for (auto i = 0; i < t->size(); ++i) {
- // t->mutable_data<float>()[i] = 0.001;
- // }
- }
-
- // scale
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
- t->Resize(C);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = (i + 1) / t->size();
- }
- }
- // bias
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
- t->Resize(C);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = 8 - 2 * i;
- }
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLInstanceNorm");
- op.add_input("X_gl");
- op.add_input("W");
- op.add_input("b");
- op.add_output("Y_gl");
- op.add_output("Mean_gl");
- op.add_output("InvStdev_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Mean_gl");
- op.add_output("Mean_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("InvStdev_gl");
- op.add_output("InvStdev_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("InstanceNorm");
- op.add_input("X_cpu");
- op.add_input("W");
- op.add_input("b");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- op.add_output("Mean_ref");
- op.add_output("InvStdev_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- LOG(INFO) << "Check mean";
- checkError1D(
- ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001);
- LOG(INFO) << "Check inv_stdev";
- checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(),
- ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(),
- 0.001);
- LOG(INFO) << "Check instance norm";
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL InstanceNormPRelu Test "
- << "C: " << C << ", H: " << H << ", W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- // Too noisy.
- math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx);
- // for (auto i = 0; i < t->size(); ++i) {
- // t->mutable_data<float>()[i] = 0.001;
- // }
- }
-
- // scale
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
- t->Resize(C);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = (i + 1) / t->size();
- }
- }
- // bias
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
- t->Resize(C);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = 8 - 2 * i;
- }
- }
- // prelu scale
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
- t->Resize(C);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLInstanceNormPRelu");
- op.add_input("X_gl");
- op.add_input("W");
- op.add_input("b");
- op.add_input("p");
- op.add_output("Y_gl");
- op.add_output("Mean_gl");
- op.add_output("InvStdev_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Mean_gl");
- op.add_output("Mean_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("InvStdev_gl");
- op.add_output("InvStdev_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("InstanceNorm");
- op.add_input("X_cpu");
- op.add_input("W");
- op.add_input("b");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- op.add_output("Mean_ref");
- op.add_output("InvStdev_ref");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("PRelu");
- op.add_input("Y_ref");
- op.add_input("p");
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU
-
- LOG(INFO) << "Check mean";
- checkError1D(
- ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001);
- LOG(INFO) << "Check inv_stdev";
- checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(),
- ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(),
- 0.001);
- LOG(INFO) << "Check instance norm";
- checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error);
-}
-
-void OpenGL_speedtest(int N,
- int C,
- int H,
- int W,
- int K,
- int kernel_h,
- int kernel_w,
- int pad,
- float error,
- bool random_input = true) {
- LOG(INFO) << "OpenGL Conv Speed Test "
- << " C: " << C << " H: " << H << " W: " << W;
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
- t->Resize(K, C, kernel_h, kernel_w);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
- t->Resize(K);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- NetDef netdef;
- netdef.set_name("Test net");
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLConv");
- op.add_input("X_gl");
- op.add_input("W");
- op.add_input("b");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("order");
- arg.set_s("NCHW");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("kernel");
- arg.set_i(kernel_h);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad");
- arg.set_i(pad);
- }
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- CAFFE_ENFORCE(ws.RunNetOnce(netdef));
- caffe2::NetBase* net = ws.CreateNet(netdef);
- CHECK_NOTNULL(net);
- CAFFE_ENFORCE(net->Run());
- net->TEST_Benchmark(1, 4, true);
-}
-
-void testOpenGLPadImage(
- int N, int C, int H, int W, int pad_l, int pad_r, int pad_t, int pad_b, float error) {
- LOG(INFO) << "OpenGLPadImage Test";
- {
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- // for (auto i = 0; i < t->size(); ++i) {
- // t->mutable_data<float>()[i] = i + 1;
- // }
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLPadImage");
- op.add_input("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_l");
- arg.set_i(pad_l);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_r");
- arg.set_i(pad_r);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_t");
- arg.set_i(pad_t);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_b");
- arg.set_i(pad_b);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("mode");
- arg.set_s("reflect");
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("is_last");
- arg.set_i(1);
- }
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("PadImage");
- op.add_input("X_cpu");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_l");
- arg.set_i(pad_l);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_r");
- arg.set_i(pad_r);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_t");
- arg.set_i(pad_t);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("pad_b");
- arg.set_i(pad_b);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("mode");
- arg.set_s("reflect");
- }
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
-
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
- checkError(t2, t1, error);
- }
-}
-
-void testOpenGLResize(int N,
- int C,
- int H,
- int W,
- int width_scale,
- int height_scale,
- float error,
- int input_tile_x = 1,
- int input_tile_y = 1) {
- LOG(INFO) << "OpenGLResize Test";
- {
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- }
-
- NetDef netdef;
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_x");
- arg.set_i(input_tile_x);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("tile_y");
- arg.set_i(input_tile_y);
- }
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLResizeNearest");
- op.add_input("X_gl");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("width_scale");
- arg.set_f(width_scale);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("height_scale");
- arg.set_f(height_scale);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("is_last");
- arg.set_i(1);
- }
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("ResizeNearest");
- op.add_input("X_cpu");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("width_scale");
- arg.set_f(width_scale);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("height_scale");
- arg.set_f(height_scale);
- }
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
-
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
- checkError(t2, t1, error);
- }
-}
-
-void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGL Preprocess Test";
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, H, W, C);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<uint8_t>()[i] = rand() % 255;
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
- t->Resize(3);
- CPUContext ctx;
- t->mutable_data<float>()[0] = 100;
- t->mutable_data<float>()[1] = 50;
- t->mutable_data<float>()[2] = 150;
- }
-
- NetDef netdef;
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLTensorToTextureStylizerPreprocess");
- op.add_input("X_cpu");
- op.add_input("mean");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("noise_std");
- arg.set_f(0.00001);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("noise_size");
- arg.set_i(512);
- }
-
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
- op.add_input("X_cpu");
- op.add_input("mean");
- {
- auto& arg = *(op.add_arg());
- arg.set_name("noise_std");
- arg.set_f(0.00001);
- }
- {
- auto& arg = *(op.add_arg());
- arg.set_name("noise_size");
- arg.set_i(512);
- }
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu
- checkError(t2, t1, error);
-}
-
-void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGLDeprocess Test";
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = rand() % 1000 - 500;
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
- t->Resize(3);
- CPUContext ctx;
- t->mutable_data<float>()[0] = 30;
- t->mutable_data<float>()[1] = 40;
- t->mutable_data<float>()[2] = 50;
- }
-
- NetDef netdef;
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLTextureToTensorStylizerDeprocess");
- op.add_input("X_gl");
- op.add_input("mean");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
- op.add_input("X_cpu");
- op.add_input("mean");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
- checkError(t2, t1, error);
-}
-
-void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
- LOG(INFO) << "OpenGLNormPlanarYUV Test";
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, 3, H, W);
- CPUContext ctx;
- for (auto i = 0; i < t->size(); ++i) {
- t->mutable_data<float>()[i] = rand() % 1000 - 500;
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
- t->Resize(1, 3);
- CPUContext ctx;
- t->mutable_data<float>()[0] = 30;
- t->mutable_data<float>()[1] = 40;
- t->mutable_data<float>()[2] = 50;
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU);
- t->Resize(1, 3);
- CPUContext ctx;
- t->mutable_data<float>()[0] = 6;
- t->mutable_data<float>()[1] = 7;
- t->mutable_data<float>()[2] = 8;
- }
-
- NetDef netdef;
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("OpenGLNormalizePlanarYUV");
- op.add_input("X_gl");
- op.add_input("mean");
- op.add_input("stdev");
- op.add_output("Y_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("Y_gl");
- op.add_output("Y_cpu");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("NormalizePlanarYUV");
- op.add_input("X_cpu");
- op.add_input("mean");
- op.add_input("stdev");
- op.add_output("Y_ref");
- }
-
- ws.RunNetOnce(netdef);
- const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
- const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
- checkError(t2, t1, error);
-}
-
-void OpenGL_copyops_speedtest(int N,
- int C,
- int H,
- int W,
- int K,
- int kernel_h,
- int kernel_w,
- int pad,
- float error,
- bool random_input = true) {
- LOG(INFO) << "OpenGL CopyOps Speed Test";
- Workspace ws;
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
- t->Resize(N, C, H, W);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
- t->Resize(K, C, kernel_h, kernel_w);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- {
- auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
- t->Resize(K);
- CPUContext ctx;
- if (random_input) {
- math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
- } else {
- float* data = t->mutable_data<float>();
- for (int i = 0; i < t->size(); i++) {
- data[i] = 1;
- }
- }
- }
-
- NetDef netdef;
- netdef.set_name("Test net");
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyToOpenGL");
- op.add_input("X_cpu");
- op.add_output("X_gl");
- }
-
- {
- auto& op = *(netdef.add_op());
- op.set_type("CopyFromOpenGL");
- op.add_input("X_gl");
- op.add_output("Y_cpu");
- }
-
- caffe2::NetBase* net = ws.CreateNet(netdef);
- CHECK_NOTNULL(net);
- net->TEST_Benchmark(1, 4, true);
-}
-
-static NetDef truncateAfter(NetDef def, size_t idx) {
- // idx = 0, net = 10 -> remove 9
- // idx = 0, net = 1 -> remove 0
- const auto toRemove = def.op_size() - idx - 1;
- for (auto i = 0; i < toRemove; ++i) {
- def.mutable_op()->RemoveLast();
- }
- CHECK_EQ(def.op_size(), idx + 1);
- return def;
-}
-
-void compareModelsForOpenGL(std::string name,
- const NetDef& initNet,
- NetDef predictNet,
- int width,
- int height,
- int channel,
- std::string input_type,
- std::string input_order) {
-
- if (name == "styleTransfer") {
- for (int i = 0; i < predictNet.mutable_op(0)->arg_size(); i++) {
- auto* arg = predictNet.mutable_op(0)->mutable_arg(i);
- if (arg->name() == "noise_std") {
- arg->set_f(0);
- }
- }
- }
-
- for (auto i = 0; i < predictNet.op_size(); ++i) {
- auto truncatedPredictNet = truncateAfter(predictNet, i);
-
- // Change the last blob to external_output(0) for the predict net
- auto output_blob = "_OUTPUT_BLOB__";
- truncatedPredictNet.set_external_output(0, output_blob);
- truncatedPredictNet.mutable_op(truncatedPredictNet.op_size() - 1)->set_output(0, output_blob);
-
- NetDef truncatedOpenGLPredictNet = rewritePredictNetForOpenGL(truncatedPredictNet);
-
- // LOG(INFO) << "truncatedPredictNet";
- // dumpDefForOpenGL(truncatedPredictNet);
- //
- LOG(INFO) << "truncatedOpenGLPredictNet";
- dumpDefForOpenGL(truncatedOpenGLPredictNet);
-
- CPUContext ctx;
- Workspace cws;
- cws.RunNetOnce(initNet);
-
- auto* t_cpu = BlobGetMutableTensor(
- cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU);
- if (name == "styleTransfer") {
- CAFFE_ENFORCE_EQ(input_order, "NHWC");
- CAFFE_ENFORCE_EQ(input_type, "uint8_t");
- t_cpu->Resize(1, height, width, channel);
- for (auto i = 0; i < t_cpu->size(); ++i) {
- t_cpu->mutable_data<uint8_t>()[i] = i % 255;
- }
- } else if (name == "segmentation") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_cpu->Resize(1, channel, height, width);
- float* input = t_cpu->mutable_data<float>();
- const int size = width * height;
- // Limit input range to YUV
- math::RandGaussian<float, CPUContext>(size, 0.5, 0.15, input, &ctx); // Y: 0 ~ 1
- math::RandGaussian<float, CPUContext>(size, 0, 0.12, input + size, &ctx); // U: -0.436 ~ 0.436
- math::RandGaussian<float, CPUContext>(
- size, 0, 0.2, input + 2 * size, &ctx); // V: -0.615 ~ 0.615
- } else if (name == "denoiser") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_cpu->Resize(1, channel, height, width);
- float* input = t_cpu->mutable_data<float>();
- const int spatial_size = width * height;
- math::RandGaussian<float, CPUContext>(spatial_size, 0, 0.33, input, &ctx); // R Channel
- math::RandGaussian<float, CPUContext>(
- spatial_size, 0, 0.33, input + spatial_size, &ctx); // G Channel
- math::RandGaussian<float, CPUContext>(
- spatial_size, 0, 0.33, input + 2 * spatial_size, &ctx); // B Channel
- // Clamp Range of input [-1, +1]
- for (auto i = 0; i < t_cpu->size(); ++i) {
- input[i] = input[i] > 1 ? 1 : input[i] < -1 ? -1 : input[i];
- }
- } else {
- CAFFE_THROW("CompareModels only works with style transfer and segmentation now");
- }
-
- Workspace mws;
- mws.RunNetOnce(initNet);
-
- auto* t_gl = BlobGetMutableTensor(
- mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU);
- if (name == "styleTransfer") {
- CAFFE_ENFORCE_EQ(input_order, "NHWC");
- CAFFE_ENFORCE_EQ(input_type, "uint8_t");
- t_gl->Resize(1, height, width, channel);
- uint8_t* input = t_gl->mutable_data<uint8_t>();
- memcpy(input, t_cpu->mutable_data<uint8_t>(), t_cpu->storage().capacity());
- } else if (name == "segmentation") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_gl->Resize(1, channel, height, width);
- float* input = t_gl->mutable_data<float>();
- memcpy(input, t_cpu->mutable_data<float>(), t_cpu->storage().capacity());
- } else if (name == "denoiser") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_gl->Resize(1, channel, height, width);
- float* input = t_gl->mutable_data<float>();
- memcpy(input, t_cpu->mutable_data<float>(), t_cpu->storage().capacity());
- }
-
- cws.RunNetOnce(truncatedPredictNet);
- mws.RunNetOnce(truncatedOpenGLPredictNet);
-
- const auto m_name =
- truncatedOpenGLPredictNet.op(truncatedOpenGLPredictNet.op_size() - 1).output(0);
- const auto c_name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
-
- LOG(INFO) << "Checking correspondence for name: " << m_name << ", idx: " << i;
- {
- const auto& mt = mws.GetBlob(m_name)->Get<TensorCPU>(); // GPU
- const auto& ct = cws.GetBlob(c_name)->Get<TensorCPU>(); // CPU
- if (name == "denoiser") {
- checkError(mt, ct, 0.02); // 1% of Scale
- LOG(INFO) << "Error Check Completed for Denoiser Layer: " << i;
- } else {
- checkError(mt, ct, 1);
- }
- }
- }
-}
-
-void compareBatchedToTiledModels(std::string name,
- const NetDef& initNet,
- NetDef predictNet,
- int width,
- int height,
- int channel,
- std::string input_type,
- std::string input_order) {
-
- if (name == "styleTransfer") {
- for (int i = 0; i < predictNet.mutable_op(0)->arg_size(); i++) {
- auto* arg = predictNet.mutable_op(0)->mutable_arg(i);
- if (arg->name() == "noise_std") {
- arg->set_f(0);
- }
- }
- }
-
- for (auto i = 19; i < predictNet.op_size(); ++i) {
- auto truncatedPredictNet = truncateAfter(predictNet, i);
-
- // Change the last blob to external_output(0) for the predict net
- auto output_blob = "_OUTPUT_BLOB__";
- truncatedPredictNet.set_external_output(0, output_blob);
- truncatedPredictNet.mutable_op(truncatedPredictNet.op_size() - 1)->set_output(0, output_blob);
-
- NetDef bachedNet = rewritePredictNetForOpenGL(truncatedPredictNet, false, false);
- NetDef tiledNet = rewritePredictNetForOpenGL(truncatedPredictNet, false, true);
-
- LOG(INFO) << "truncatedPredictNet";
- dumpDefForOpenGL(truncatedPredictNet);
-
- LOG(INFO) << "truncatedOpenGLPredictNet";
- dumpDefForOpenGL(bachedNet);
-
- CPUContext ctx;
-
- Workspace tws;
- tws.RunNetOnce(initNet);
-
- auto* t_batch =
- BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU);
- if (name == "styleTransfer") {
- CAFFE_ENFORCE_EQ(input_order, "NHWC");
- CAFFE_ENFORCE_EQ(input_type, "uint8_t");
- t_batch->Resize(1, height, width, channel);
- for (auto i = 0; i < t_batch->size(); ++i) {
- t_batch->mutable_data<uint8_t>()[i] = i % 255;
- }
- } else if (name == "segmentation") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_batch->Resize(1, channel, height, width);
- float* input = t_batch->mutable_data<float>();
- const int size = width * height;
- // Limit input range to YUV
- math::RandGaussian<float, CPUContext>(size, 0.5, 0.15, input, &ctx); // Y: 0 ~ 1
- math::RandGaussian<float, CPUContext>(size, 0, 0.12, input + size, &ctx); // U: -0.436 ~ 0.436
- math::RandGaussian<float, CPUContext>(
- size, 0, 0.2, input + 2 * size, &ctx); // V: -0.615 ~ 0.615
- } else {
- CAFFE_THROW("CompareModels only works with style transfer and segmentation now");
- }
-
- Workspace bws;
- bws.RunNetOnce(initNet);
-
- auto* t_tiling =
- BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU);
- if (name == "styleTransfer") {
- CAFFE_ENFORCE_EQ(input_order, "NHWC");
- CAFFE_ENFORCE_EQ(input_type, "uint8_t");
- t_tiling->Resize(1, height, width, channel);
- uint8_t* input = t_tiling->mutable_data<uint8_t>();
- memcpy(input, t_batch->mutable_data<uint8_t>(), t_batch->storage().capacity());
-
- } else if (name == "segmentation") {
- CAFFE_ENFORCE_EQ(input_order, "NCHW");
- CAFFE_ENFORCE_EQ(input_type, "float");
- t_tiling->Resize(1, channel, height, width);
- float* input = t_tiling->mutable_data<float>();
- memcpy(input, t_batch->mutable_data<float>(), t_batch->storage().capacity());
- }
-
- bws.RunNetOnce(bachedNet);
- tws.RunNetOnce(tiledNet);
-
- const auto batch_name = bachedNet.op(bachedNet.op_size() - 1).output(0);
- const auto tile_name = tiledNet.op(tiledNet.op_size() - 1).output(0);
-
- LOG(INFO) << "Checking correspondence for name: " << batch_name << ", idx: " << i;
- {
- const auto& bt = bws.GetBlob(batch_name)->Get<TensorCPU>(); // GPU
- const auto& tt = tws.GetBlob(tile_name)->Get<TensorCPU>(); // CPU
- checkError(bt, tt, 0.01);
- }
- }
-}
-
-int runModelBenchmarks(caffe2::NetDef& init_net,
- caffe2::NetDef& predict_net,
- int warm_up_runs,
- int main_runs,
- int channel,
- int height,
- int width,
- std::string input_type,
- std::string input_order,
- std::string engine, // "CPU", "OPENGL", or "MPSCNN"
- bool run_individual,
- bool use_texture_input,
- bool use_tiling,
- bool run_fusion) {
- std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
-
- // caffe2::dumpDefForOpenGL(init_net);
- caffe2::dumpDefForOpenGL(predict_net);
-
- CAFFE_ENFORCE(workspace->RunNetOnce(init_net));
- caffe2::NetDef net_def;
-
- // rewrite network
- if (engine == "CPU") {
- net_def.CopyFrom(predict_net);
- } else if (engine == "OPENGL") {
- if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input, use_tiling, run_fusion)) {
- CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run");
- return -1;
- }
- } else if (engine == "MPSCNN") {
-#ifdef CAFFE2_USE_MPSCNN
- if (!caffe2::tryConvertToMPSCNN(init_net, predict_net, &net_def)) {
- CAFFE_THROW("Failed to convert to MPSCNN. Benchmark failed to run");
- return -1;
- }
-#else
- CAFFE_THROW("MPSCNN not enabled. Benchmark failed to run");
- return -1;
-#endif
- } else {
- CAFFE_THROW("Unsupported engine. Benchmark failed to run");
- return -1;
- }
-
- if (!net_def.has_name()) {
- net_def.set_name("benchmark");
- }
- caffe2::NetBase* net = workspace->CreateNet(net_def);
-
- // create input blob
- if (engine == "CPU" || engine == "MPSCNN" || !use_texture_input) {
- caffe2::TensorCPU* b;
- if (!net_def.external_input_size()) {
- b = workspace->CreateBlob("data")->GetMutable<caffe2::TensorCPU>();
- } else {
- b = workspace->CreateBlob(net_def.external_input(0))->GetMutable<caffe2::TensorCPU>();
- }
-
- if (input_order == "NCHW") {
- b->Resize(std::vector<int32_t>(
- {1, static_cast<int>(channel), static_cast<int>(height), static_cast<int>(width)}));
- } else if (input_order == "NHWC") {
- b->Resize(std::vector<int32_t>(
- {1, static_cast<int>(height), static_cast<int>(width), static_cast<int>(channel)}));
- } else {
- CAFFE_THROW("Unknown input order: ", input_order);
- }
- if (input_type == "uint8_t") {
- b->mutable_data<uint8_t>();
- } else if (input_type == "float") {
- b->mutable_data<float>();
- } else {
- CAFFE_THROW("Unknown input type: ", input_type);
- }
- } else {
- const int tile_x = 1, tile_y = 1;
- Blob* blob = nullptr;
- if (!net_def.external_input_size()) {
- blob = workspace->CreateBlob("data");
- } else {
- blob = workspace->CreateBlob(net_def.external_input(0));
- }
- if (input_type == "float") {
- ImageAllocator<float16_t> allocator;
- GLImageVector<float16_t>* output_image = allocator.newImage(1,
- width,
- height,
- channel,
- tile_x,
- tile_y,
-#if CAFFE2_IOS
- true
-#else
- false
-#endif
- );
- blob->Reset(output_image);
- for (auto& texture : (*output_image)[0]->textures) {
- texture->map_load([&](void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type) {});
- }
- } else {
- ImageAllocator<uint8_t> allocator;
- GLImageVector<uint8_t>* output_image = allocator.newImage(1,
- width,
- height,
- channel,
- tile_x,
- tile_y,
-#if CAFFE2_IOS
- true
-#else
- false
-#endif
- );
- blob->Reset(output_image);
- for (auto& texture : (*output_image)[0]->textures) {
- texture->map_load([&](void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type) {});
- }
- }
- }
-
- // run benchmark
- if (engine == "CPU" || engine == "MPSCNN") {
- CHECK_NOTNULL(net);
- CAFFE_ENFORCE(net->Run());
- net->TEST_Benchmark(warm_up_runs, main_runs, run_individual);
- } else if (engine == "OPENGL") {
- CHECK_NOTNULL(net);
- CAFFE_ENFORCE(net->Run());
-
- for (int i = 0; i < warm_up_runs; i++) {
- net->Run();
- }
- glFinish();
-
- Timer timer;
- timer.Start();
- for (int i = 0; i < main_runs; i++) {
- net->Run();
- }
- if (use_texture_input) {
- glFinish();
- }
-
- double iter_time = (double)timer.MilliSeconds() / main_runs;
- LOG(INFO) << "Main run finished. Milliseconds per iter: " << iter_time
- << ". Iters per second: " << 1000.0 / iter_time;
-
- if (run_individual) {
- std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
-
- for (auto& op : net_def.op()) {
- ops.push_back(CreateOperator(op, workspace.get()));
- ops.back()->Run(); // warm up
- }
-
- for (int k = 0; k < ops.size(); k++) {
- timer.Start();
- for (int i = 0; i < main_runs; i++) {
- ops[k]->Run();
- }
- glFinish();
-
- LOG(INFO) << "Operator #" << k << " " << net_def.op(k).type() << ": "
- << (double)timer.MilliSeconds() / main_runs;
- }
- }
- }
-
- return 0;
-}
-
-template <typename T>
-void testGLTextureTypes() {
- gl_log(GL_LOG, "Executing %s...\n", __PRETTY_FUNCTION__);
-
- GLImageAllocator<T>* allocator = GLImageAllocator<T>::newGLImageAllocator();
-
- GLImageVector<T>* image = allocator->newImage(1, 10, 10, 4, 1, 1, true);
-
- const GLTexture* texture = (*image)[0]->textures[0];
-
- texture->map_load([&](void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type) {
- T* buffer_data = (T*)buffer;
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- for (int c = 0; c < channels; c++) {
- buffer_data[channels * (y * stride + x) + c] = x + y;
- }
- }
- }
- });
-
- texture->map_read([&](const void* buffer,
- size_t width,
- size_t height,
- size_t stride,
- size_t channels,
- const GLTexture::Type& type) {
- const T* buffer_data = (const T*)buffer;
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- gl_log(GL_LOG, "%d, ", (int)buffer_data[channels * (y * stride + x) + 0]);
- }
- gl_log(GL_LOG, "\n");
- }
- });
- delete image;
- delete allocator;
- gl_log(GL_LOG, "...done with %s\n", __PRETTY_FUNCTION__);
-}
-
-void testOpenGL() {
- {
- // Test a bunch of different tiled convolutions
- std::vector<int> channels({3, 4, 6, 8, 12, 16, 32, 64, 128, 256, 512});
-
- for (const auto& input_channels : channels) {
- int tile_x = 1, tile_y = 1;
- squareFactors((input_channels + 3) / 4, tile_x, tile_y);
-
- for (const auto& output_channels : channels) {
- for (int size = 5; size < 8; size *= 2) {
- testOpenGLConv(1,
- input_channels,
- size,
- size,
- output_channels,
- 3,
- 3,
- 0,
- 1,
- Conv,
- 0.1 * input_channels / 8,
- true,
- 1,
- 1,
- tile_x,
- tile_y,
- true);
- }
-
- for (int size = 5; size < 16; size *= 2) {
- testOpenGLConv(1,
- input_channels,
- size,
- size,
- output_channels,
- 3,
- 3,
- 0,
- 1,
- ConvTranspose,
- 0.1 * input_channels / 8,
- true,
- 1,
- 1,
- tile_x,
- tile_y,
- true);
- }
- }
- }
-
- // Test various paddings and strides with tiled convolution
- for (int kernel_size = 1; kernel_size <= 5; kernel_size++) {
- for (int pad = 0; pad < kernel_size; pad++) {
- for (int stride = 1; stride <= 8; stride++) {
- testOpenGLConv(1,
- 16,
- 100,
- 100,
- 16,
- kernel_size,
- kernel_size,
- pad,
- stride,
- Conv,
- 0.5,
- true,
- 1,
- 1,
- 2,
- 2,
- true);
- }
-
- for (int stride = 1; stride <= 8; stride++) {
- testOpenGLConv(1,
- 16,
- 100,
- 100,
- 16,
- kernel_size,
- kernel_size,
- pad,
- stride,
- ConvTranspose,
- 0.5,
- true,
- 1,
- 1,
- 2,
- 2,
- true);
- }
- }
- }
-
- // Test a bunch of batched convolutions
- for (int kernel_size = 1; kernel_size <= 8; kernel_size++) {
- for (int stride = 1; stride <= 8; stride++) {
- testOpenGLConv(1,
- 16,
- 10,
- 10,
- 16,
- kernel_size,
- kernel_size,
- 0,
- stride,
- ConvTranspose,
- 0.5 * (1 + kernel_size / 3.0),
- true,
- 1,
- 1);
- }
-
- for (int stride = 1; stride <= 8; stride++) {
- testOpenGLConv(1,
- 16,
- 10,
- 10,
- 16,
- kernel_size,
- kernel_size,
- 0,
- stride,
- Conv,
- 0.5 * (1 + kernel_size / 3.0),
- true,
- 1,
- 1);
- }
- }
- for (const auto& channel : channels) {
- int tile_x = 1, tile_y = 1;
- squareFactors((channel + 3) / 4, tile_x, tile_y);
- // clang-format off
- testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvPRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
- testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvTransposePRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
- testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
- testOpenGLConv(1, channel, 10, 10, channel, 3, 3, 0, 1, ConvTransposeRelu, 0.1 * channel / 8, true, 1, 1, tile_x, tile_y, true);
-
- testOpenGLPRelu(1, channel, 13, 4, channel, tile_x, tile_y, 0.1);
- testOpenGLRelu(1, channel, 4, 17, tile_x, tile_y, 0.1);
- testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, MaxPool, 0.01, true, 1, 1, tile_x, tile_y, true);
- testOpenGLConv(1, channel, 16, 16, channel, 3, 3, 0, 2, AveragePool, 0.01, true, 1, 1, tile_x, tile_y, true);
- testOpenGLAdd(1, channel, 14, 8, 0.1, tile_x, tile_y);
- testOpenGLResize(1, channel, 16, 16, 2, 2, 0.1, tile_x, tile_y);
- // clang-format on
- }
- }
-
- {
- testGLTextureTypes<uint8_t>();
- testGLTextureTypes<float16_t>();
-
- testOpenGLCopyOps(1, 4, 4, 4, 1e-2);
- testOpenGLCopyOps(1, 3, 4, 4, 1e-2);
- testOpenGLCopyOps(1, 2, 4, 4, 1e-2);
- testOpenGLCopyOps(1, 1, 4, 4, 1e-2);
- testOpenGLCopyOps(1, 4, 2, 2, 1e-2);
- testOpenGLCopyOps(1, 4, 4, 4, 1e-2);
- testOpenGLCopyOps(1, 4, 1, 1, 1e-2);
- testOpenGLCopyOps(1, 4, 8, 8, 1e-2);
- testOpenGLCopyOps(1, 6, 8, 3, 1e-2);
- testOpenGLCopyOps(1, 4, 1, 2, 1e-2);
- testOpenGLCopyOps(1, 8, 6, 1, 1e-2);
- testOpenGLCopyOps(1, 8, 13, 18, 1e-2);
- testOpenGLCopyOps(1, 16, 13, 18, 1e-2);
- testOpenGLCopyOps(1, 13, 128, 90, 1e-2);
- testOpenGLCopyOps(1, 16, 1280, 720, 1e-2);
-
- testOpenGLCopyOps(1, 16, 4, 4, 1e-2, 2, 2);
- testOpenGLCopyOps(1, 64, 16, 16, 1e-2, 2, 2);
- testOpenGLCopyOps(1, 48, 13, 17, 1e-2, 3, 2);
- testOpenGLCopyOps(1, 512, 1, 1, 1e-2, 4, 16);
- testOpenGLCopyOps(1, 256, 7, 7, 1e-2, 8, 8);
- testOpenGLCopyOps(1, 20, 13, 17, 1e-2, 5, 1);
-
- // Test pooling operators
- LOG(INFO) << "Test pooling operators";
- testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true);
- testOpenGLConv(1, 4, 5, 5, 4, 5, 5, 0, 1, AveragePool, 0.5, true);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, AveragePool, 0.01, true);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, AveragePool, 0.01, true);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, AveragePool, 0.01, true);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, MaxPool, 0.01, true);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, MaxPool, 0.01, true);
-
- // Test strided convolution
- LOG(INFO) << "Test strided convolution";
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, Conv, 0.5, true, 1, 1);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 3, Conv, 0.5, true, 1, 1);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 3, Conv, 0.5, true, 1, 1);
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 3, Conv, 0.5, true, 1, 1);
-
- // Test input batching
- LOG(INFO) << "Test input batching";
- testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
- testOpenGLConv(1, 8, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 2, 1);
- testOpenGLConv(1, 12, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 3, 1);
- testOpenGLConv(1, 16, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 4, 1);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input
- testOpenGLConv(1, 8, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 2, 1); // use random input
- testOpenGLConv(1, 12, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 3, 1); // use random input
- testOpenGLConv(1, 16, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 4, 1); // use random input
- testOpenGLConv(1, 32, 10, 10, 4, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
-
- // Test output batching
- LOG(INFO) << "Test output batching";
- testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
- testOpenGLConv(1, 4, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 1, 2);
- testOpenGLConv(1, 4, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 1, 3);
- testOpenGLConv(1, 4, 5, 5, 16, 3, 3, 0, 1, Conv, 0.5, false, 1, 4);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input
- testOpenGLConv(1, 4, 10, 10, 8, 3, 3, 0, 1, Conv, 1.5, true, 1, 2); // use random input
- testOpenGLConv(1, 4, 10, 10, 12, 3, 3, 0, 1, Conv, 0.5, true, 1, 3); // use random input
- testOpenGLConv(1, 4, 10, 10, 16, 3, 3, 0, 1, Conv, 0.5, true, 1, 4); // use random input
-
- // Test both
- LOG(INFO) << "Test both input and output batching";
- testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1);
- testOpenGLConv(1, 8, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 2, 2);
- testOpenGLConv(1, 12, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 3, 3);
-
- testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input
- testOpenGLConv(1, 8, 10, 10, 8, 3, 3, 0, 1, Conv, 1, true, 2, 2); // use random input
- testOpenGLConv(1, 12, 10, 10, 12, 3, 3, 0, 1, Conv, 2, true, 3, 3); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
- // Test different combination of batching
- LOG(INFO) << "Test mixed input and output batching sizes";
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 2);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 2);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 4);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 4);
-
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 2);
-
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2); // use random input
-
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1);
- testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2);
-
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 2); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 2); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input
- testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 4); // use random input
-
- // Test input/output channels
- for (int i = 0; i < 4; i++) {
- testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
- testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input
- }
-
- // Test large input size
- LOG(INFO) << "Test large input size";
- testOpenGLConv(1, 4, 1280, 720, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
- // Test non standard input size
- testOpenGLConv(1, 16, 125, 73, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input
- testOpenGLConv(1, 16, 127, 71, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input
-
- // Test for different kernel size
- LOG(INFO) << "Test kernel sizes 4 to 6";
- for (int w = 4; w < 7; w++) {
- testOpenGLConv(1, 4, 128, 72, 4, w, w, 0, 1, Conv, 4 * (w / 3.0) * (w / 3.0), true, 1, 1);
- }
-
- // Test for random failures
- for (int i = 0; i < 10; i++) {
- testOpenGLConv(1, 6, 111, 111, 3, 3, 3, 0, 2, ConvTranspose, 0.5, true, 2, 1);
- testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2);
- }
-
- LOG(INFO) << "Test OpenGL ConvPRelu";
- testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1);
- testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvPRelu, 1, true, 1, 1);
- testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvPRelu, 2, true, 2, 2);
- testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4);
- testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvPRelu, 4, true, 3, 1);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 1, 1);
-
- LOG(INFO) << "Test OpenGL ConvTransposePRelu";
- testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1);
- testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposePRelu, 1, true, 1, 1);
- testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 2, 2);
- testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4);
- testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 3, 1);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 1, 1);
-
- LOG(INFO) << "Test OpenGL ConvRelu";
- testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvRelu, 2, true, 1, 1);
- testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvRelu, 1, true, 1, 1);
- testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvRelu, 2, true, 2, 2);
- testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4);
- testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvRelu, 4, true, 3, 1);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 1, 1);
-
- LOG(INFO) << "Test OpenGL ConvTransposeRelu";
- testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 1, 1);
- testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposeRelu, 1, true, 1, 1);
- testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 2, 2);
- testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4);
- testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 3, 1);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4);
- testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 1, 1);
-
- LOG(INFO) << "Test OpenGL PRelu";
- testOpenGLPRelu(1, 4, 16, 16, 4, 1, 1, 0.1);
- testOpenGLPRelu(1, 16, 16, 16, 1, 1, 1, 0.1);
- testOpenGLPRelu(1, 12, 16, 16, 1, 1, 1, 0.1);
- testOpenGLPRelu(1, 6, 640, 360, 6, 1, 1, 0.1);
-
- LOG(INFO) << "Test OpenGL Relu";
- testOpenGLRelu(1, 4, 16, 16, 1, 1, 0.1);
- testOpenGLRelu(1, 16, 16, 16, 1, 1, 0.1);
- testOpenGLRelu(1, 6, 640, 360, 1, 1, 0.1);
-
- LOG(INFO) << "Test OpenGL Add";
- testOpenGLAdd(1, 16, 640, 360, 0.1);
- testOpenGLAdd(1, 12, 640, 360, 0.1);
-
- LOG(INFO) << "Test OpenGL Sub";
- testOpenGLSub(1, 16, 640, 360, 0.1);
- testOpenGLSub(1, 12, 640, 360, 0.1);
-
- LOG(INFO) << "Test OpenGL Sigmoid";
- testOpenGLSigmoid(1, 4, 16, 16, 0.1);
- testOpenGLSigmoid(1, 12, 64, 48, 0.1);
- testOpenGLSigmoid(1, 6, 640, 360, 0.1);
-
- LOG(INFO) << "Test OpenGL Tanh";
- testOpenGLTanh(1, 4, 16, 16, 0.1);
- testOpenGLTanh(1, 12, 64, 48, 0.1);
- testOpenGLTanh(1, 6, 640, 360, 0.1);
-
- LOG(INFO) << "Test OpenGL Mul";
- testOpenGLMul(1, 4, 16, 16, 0.1);
- testOpenGLMul(1, 12, 64, 48, 0.1);
- testOpenGLMul(1, 6, 640, 360, 0.1);
-
- LOG(INFO) << "Test OpenGL Concat";
- testOpenGLConcat(1, std::vector<int>{4, 4}, 16, 16);
- testOpenGLConcat(1, std::vector<int>{4, 4, 4}, 16, 16);
- testOpenGLConcat(1, std::vector<int>{4, 4, 4, 4}, 16, 16);
- testOpenGLConcat(1, std::vector<int>{8, 4, 12}, 16, 16);
- testOpenGLConcat(1, std::vector<int>{12, 16, 8}, 16, 16);
- testOpenGLConcat(1, std::vector<int>{60, 24, 36}, 16, 16);
-
- testOpenGLConcat(1, std::vector<int>{12, 16, 8}, 16, 16, true);
- testOpenGLConcat(1, std::vector<int>{60, 24, 36}, 16, 16, true);
-
- LOG(INFO) << "Test OpenGL Softmax";
- testOpenGLSoftmax(1, 100, 0.1);
- testOpenGLSoftmax(1, 500, 0.1);
- testOpenGLSoftmax(1, 1000, 0.1);
- testOpenGLSoftmax(1, 5000, 0.1);
-
- LOG(INFO) << "Test OpenGL InstanceNorm";
- testOpenGLInstanceNorm(1, 4, 16, 16, 0.2);
- testOpenGLInstanceNorm(1, 4, 20, 20, 0.2);
- testOpenGLInstanceNorm(1, 4, 128, 128, 0.2);
- testOpenGLInstanceNorm(1, 12, 120, 140, 0.3);
- testOpenGLInstanceNorm(1, 3, 120, 140, 0.2);
- testOpenGLInstanceNorm(1, 4, 192, 192, 0.2);
-
- testOpenGLInstanceNorm(1, 4, 258, 198, 0.2);
- testOpenGLInstanceNorm(1, 8, 338, 198, 0.2);
- testOpenGLInstanceNorm(1, 12, 334, 194, 0.2);
- testOpenGLInstanceNorm(1, 16, 324, 184, 0.2);
- testOpenGLInstanceNorm(1, 6, 640, 360, 0.2);
-
- LOG(INFO) << "Test OpenGL InstanceNormPRelu";
- testOpenGLInstanceNormPRelu(1, 4, 16, 16, 0.2);
- testOpenGLInstanceNormPRelu(1, 4, 20, 20, 0.2);
- testOpenGLInstanceNormPRelu(1, 4, 128, 128, 0.2);
- testOpenGLInstanceNormPRelu(1, 12, 120, 140, 0.3);
- testOpenGLInstanceNormPRelu(1, 3, 120, 140, 0.2);
- testOpenGLInstanceNormPRelu(1, 4, 192, 192, 0.2);
-
- testOpenGLInstanceNormPRelu(1, 4, 258, 198, 0.2);
- testOpenGLInstanceNormPRelu(1, 8, 338, 198, 0.2);
- testOpenGLInstanceNormPRelu(1, 12, 334, 194, 0.2);
- testOpenGLInstanceNormPRelu(1, 16, 324, 184, 0.2);
- testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2);
-
- LOG(INFO) << "Test OpenGL ResizeNearest";
- testOpenGLResize(1, 4, 16, 16, 1, 1, 0.1);
- testOpenGLResize(1, 4, 16, 16, 2, 2, 0.1);
- testOpenGLResize(1, 4, 16, 16, 3, 3, 0.1);
- testOpenGLResize(1, 4, 16, 16, 4, 4, 0.1);
- testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
- testOpenGLResize(1, 16, 25, 25, 3, 3, 0.1);
- testOpenGLResize(1, 12, 25, 25, 3, 3, 0.1);
- testOpenGLResize(1, 4, 720, 1280, 3, 3, 0.1);
-
- // debug style transfer
- // conv
- testOpenGLConv(1, 3, 82, 82, 8, 9, 9, 0, 1, Conv, 4, true, 1, 1);
- testOpenGLConv(1, 8, 74, 74, 8, 3, 3, 0, 1, Conv, 4, true, 1, 1);
- testOpenGLConv(1, 8, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1);
- testOpenGLConv(1, 12, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1);
-
- // convtranspose
- testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2);
- testOpenGLConv(1, 6, 112, 112, 3, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 1);
-
- LOG(INFO) << "Test OpenGL PadImage";
- testOpenGLPadImage(1, 3, 11, 11, 0, 1, 0, 1, 0.001);
- testOpenGLPadImage(1, 3, 50, 80, 0, 1, 0, 1, 0.001);
- testOpenGLPadImage(1, 12, 50, 80, 10, 9, 10, 9, 0.001);
-
- LOG(INFO) << "Test OpenGL Preprocess";
- testOpenGLPreprocess(1, 4, 8, 8, 0.20);
- testOpenGLPreprocess(1, 4, 1280, 720, 0.20);
-
- LOG(INFO) << "Test OpenGL Deprocess";
- testOpenGLDeprocess(1, 3, 8, 8, 0.01);
- testOpenGLDeprocess(1, 3, 1280, 720, 0.01);
-
- LOG(INFO) << "Test OpenGL NormalizePlanarYUV";
- testOpenGLNormPlanarYUV(1, 3, 8, 8, 0.01);
- testOpenGLNormPlanarYUV(1, 3, 192, 192, 0.01);
-
- // for (int i = 0; i < 4; i += 1) {
- // LOG(INFO) << "C: " << 4 << ", H: " << 1280 + i << ", W: " << 720 + i;
- // OpenGL_copyops_speedtest(1, 4, 1280, 720 + i, 4, 3, 3, 0, 0.5);
- // }
-
- // for (int i = 0; i < 1; i += 1) {
- // LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i;
- // OpenGL_copyops_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5);
- // }
- //
- // for (int i = 0; i < 9; i += 1) {
- // LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i;
- // OpenGL_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5);
- // }
-
- // Multi-Batch Tests
- LOG(INFO) << "Test OpenGL Multi-batch Support";
- testOpenGLCopyOps(2, 4, 4, 4, 1e-2);
- testOpenGLCopyOps(3, 4, 4, 4, 1e-2);
- testOpenGLCopyOps(5, 4, 4, 4, 1e-2);
- testOpenGLConv(2, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true);
- testOpenGLConv(2, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true);
- testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(5, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(7, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(11, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(12, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(21, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(50, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1);
- testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, ConvTranspose, 0.5, true, 1, 1);
- testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1);
- testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1);
-
- testOpenGLPRelu(3, 4, 16, 16, 4, 1, 1, 0.1);
- testOpenGLPRelu(5, 4, 16, 16, 4, 1, 1, 0.1);
-
- testOpenGLRelu(3, 4, 16, 16, 1, 1, 0.1);
- testOpenGLRelu(7, 4, 16, 16, 1, 1, 0.1);
-
- testOpenGLAdd(3, 16, 640, 360, 0.1);
- testOpenGLAdd(9, 16, 640, 360, 0.1);
-
- testOpenGLSigmoid(3, 4, 16, 16, 0.1);
- testOpenGLSigmoid(11, 4, 16, 16, 0.1);
-
- testOpenGLInstanceNorm(3, 4, 16, 16, 0.2);
- testOpenGLInstanceNorm(13, 4, 16, 16, 0.2);
-
- testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2);
- testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2);
-
- testOpenGLResize(3, 4, 16, 16, 1, 1, 0.1);
- testOpenGLResize(16, 4, 16, 16, 1, 1, 0.1);
-
- testOpenGLPadImage(3, 3, 4, 4, 0, 1, 0, 1, 0.01);
- testOpenGLPadImage(23, 3, 4, 4, 0, 1, 0, 1, 0.01);
-
- testOpenGLSoftmax(3, 1000, 0.1);
- testOpenGLSoftmax(27, 100, 0.1);
-
- testOpenGLNormPlanarYUV(4, 3, 192, 192, 0.01);
-
- // Test Tiling
- testOpenGLSoftmax(3, 1000, 0.1, true);
- testOpenGLSoftmax(9, 523, 0.1, true);
- testOpenGLSoftmax(27, 100, 0.1, true);
- }
-
- LOG(INFO) << "End of OpenGL tests";
-}
-} // namespace caffe2
+++ /dev/null
-
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-void testOpenGL();
-void compareModelsForOpenGL(std::string name,
- const NetDef& initNet,
- NetDef predictNet,
- int width,
- int height,
- int channel,
- std::string input_type,
- std::string input_order);
-
-void compareBatchedToTiledModels(std::string name,
- const NetDef& initNet,
- NetDef predictNet,
- int width,
- int height,
- int channel,
- std::string input_type,
- std::string input_order);
-
-int runModelBenchmarks(caffe2::NetDef& init_net,
- caffe2::NetDef& predict_net,
- int warm_up_runs,
- int main_runs,
- int channel,
- int height,
- int width,
- std::string input_type,
- std::string input_order,
- std::string engine,
- bool run_individual = false,
- bool use_texture_input = false,
- bool use_tiling = false,
- bool run_fusion = true);
-} // namespace caffe2
endif()
endif()
-if (USE_MOBILE_OPENGL)
- if (ANDROID)
- list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
- elseif (IOS)
- message(STATUS "TODO item for adding ios opengl dependency")
- else()
- message(WARNING "mobile opengl is only used in android or ios builds.")
- caffe2_update_option(USE_MOBILE_OPENGL OFF)
- endif()
-endif()
-
# ---[ ARM Compute Library: check compatibility.
if (USE_ACL)
if (NOT ANDROID)
message(WARNING "ARM Compute Library is only supported for Android builds.")
caffe2_update_option(USE_ACL OFF)
else()
+ list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
# 32-bit ARM (armv7, armv7-a, armv7l, etc)
set(ACL_ARCH "armv7a")
message(STATUS " USE_METAL : ${USE_METAL}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
message(STATUS " USE_MKLDNN : ${CAFFE2_USE_MKLDNN}")
- message(STATUS " USE_MOBILE_OPENGL : ${USE_MOBILE_OPENGL}")
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
CMAKE_ARGS+=("-DANDROID_ABI=armeabi-v7a with NEON")
CMAKE_ARGS+=("-DANDROID_NATIVE_API_LEVEL=21")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=rtti exceptions")
-# TODO: As the toolchain file doesn't support NEON-FP16 extension,
-# we disable USE_MOBILE_OPENGL for now, it will be re-enabled in the future.
-CMAKE_ARGS+=("-DUSE_MOBILE_OPENGL=OFF")
# User-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)