Initial check-in of Acxxel (StreamExecutor renamed)

author Jason Henline <jhen@google.com>

Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)

committer Jason Henline <jhen@google.com>

Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)
author Jason Henline <jhen@google.com>
Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)
committer Jason Henline <jhen@google.com>
Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)
diff --git a/parallel-libs/acxxel/.clang-format b/parallel-libs/acxxel/.clang-format

new file mode 100644 (file)

index 0000000..9b3aa8b
--- /dev/null
+++ b/parallel-libs/acxxel/.clang-format
@@ -0,0 +1 @@
+BasedOnStyle: LLVM
diff --git a/parallel-libs/acxxel/.clang-tidy b/parallel-libs/acxxel/.clang-tidy

new file mode 100644 (file)

index 0000000..f0651c1
--- /dev/null
+++ b/parallel-libs/acxxel/.clang-tidy
@@ -0,0 +1,17 @@
+Checks: '-*,clang-diagnostic-*,llvm-*,-llvm-header-guard,misc-*,-misc-unused-parameters,readability-identifier-naming'
+CheckOptions:
+  - key:             readability-identifier-naming.ClassCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.EnumCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.FunctionCase
+    value:           camelBack
+  - key:             readability-identifier-naming.MemberCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.ParameterCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.UnionCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.VariableCase
+    value:           CamelCase
+
diff --git a/parallel-libs/acxxel/CMakeLists.txt b/parallel-libs/acxxel/CMakeLists.txt

new file mode 100644 (file)

index 0000000..a0c017e
--- /dev/null
+++ b/parallel-libs/acxxel/CMakeLists.txt
@@ -0,0 +1,75 @@
+cmake_minimum_required(VERSION 3.1)
+
+option(ACXXEL_ENABLE_UNIT_TESTS "enable acxxel unit tests" ON)
+option(ACXXEL_ENABLE_EXAMPLES "enable acxxel examples" OFF)
+option(ACXXEL_ENABLE_DOXYGEN "enable Doxygen for acxxel" OFF)
+option(ACXXEL_ENABLE_CUDA "enable CUDA for acxxel" ON)
+option(ACXXEL_ENABLE_OPENCL "enable OpenCL for acxxel" ON)
+
+project(acxxel)
+
+if(ACXXEL_ENABLE_CUDA)
+find_package(CUDA REQUIRED)
+include_directories(${CUDA_INCLUDE_DIRS})
+find_library(CUDA_DRIVER_LIBRARY cuda)
+if(NOT CUDA_DRIVER_LIBRARY)
+    message(FATAL_ERROR "could not find libcuda, is the CUDA driver installed on your system?")
+endif(NOT CUDA_DRIVER_LIBRARY)
+set(ACXXEL_CUDA_SOURCES cuda_acxxel.cpp)
+set(ACXXEL_CUDA_LIBRARIES ${CUDA_DRIVER_LIBRARY} ${CUDA_LIBRARIES})
+endif(ACXXEL_ENABLE_CUDA)
+
+if(ACXXEL_ENABLE_OPENCL)
+find_package(OpenCL REQUIRED)
+include_directories(${OpenCL_INCLUDE_DIRS})
+set(ACXXEL_OPENCL_SOURCES opencl_acxxel.cpp)
+set(ACXXEL_OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
+endif()
+
+configure_file(config.h.in config.h)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+# Insist on C++ 11 features.
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Add warning flags.
+set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")
+
+add_library(
+    acxxel
+    acxxel.cpp
+    ${ACXXEL_CUDA_SOURCES}
+    ${ACXXEL_OPENCL_SOURCES})
+target_link_libraries(
+    acxxel
+    ${ACXXEL_CUDA_LIBRARIES}
+    ${ACXXEL_OPENCL_LIBRARIES})
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if(ACXXEL_ENABLE_EXAMPLES)
+    add_subdirectory(examples)
+endif()
+
+if(ACXXEL_ENABLE_UNIT_TESTS)
+    enable_testing()
+    find_package(GTest REQUIRED)
+    include_directories(${GTEST_INCLUDE_DIRS})
+    find_package(Threads)
+    add_subdirectory(tests)
+endif()
+
+if(ACXXEL_ENABLE_DOXYGEN)
+    find_package(Doxygen REQUIRED)
+    configure_file(Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+    add_custom_target(
+        acxxel-doc
+        ${DOXYGEN_EXECUTABLE}
+        ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+        WORKING_DIRECTORY
+        ${CMAKE_CURRENT_BINARY_DIR}
+        COMMENT
+        "Generating acxxel API documentation with Doxygen"
+        VERBATIM)
+endif()
diff --git a/parallel-libs/acxxel/Doxyfile.in b/parallel-libs/acxxel/Doxyfile.in

new file mode 100644 (file)

index 0000000..7c057c4
--- /dev/null
+++ b/parallel-libs/acxxel/Doxyfile.in
@@ -0,0 +1,2303 @@
+# Doxyfile 1.8.6
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "Acxxel"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = Yes
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C.
+#
+# Note For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */examples/* */tools/* */unittests/* */.*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           = @CMAKE_CURRENT_SOURCE_DIR@/examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = YES
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 1
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefor more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = @CMAKE_CURRENT_SOURCE_DIR@/customdoxygen.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler ( hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated (
+# YES) or that it should be included in the master .chm file ( NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated (
+# YES) or a normal table of contents ( NO) in the .chm file.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavours of web server based searching depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools. See
+# the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
+# replace them by respectively the title of the page, the current date and time,
+# only the current date, the version number of doxygen, the project name (see
+# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the LATEX_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify a XML schema, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify a XML DTD, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES the includes files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all refrences to function-like macros that are alone on a line, have an
+# all uppercase name, and do not end with a semicolon. Such function macros are
+# typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have an unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
+# class index. If set to NO only the inherited external classes will be listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font n the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/parallel-libs/acxxel/LICENSE.TXT b/parallel-libs/acxxel/LICENSE.TXT

new file mode 100644 (file)

index 0000000..b452ca2
--- /dev/null
+++ b/parallel-libs/acxxel/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2016 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/parallel-libs/acxxel/acxxel.cpp b/parallel-libs/acxxel/acxxel.cpp

new file mode 100644 (file)

index 0000000..fa15c95
--- /dev/null
+++ b/parallel-libs/acxxel/acxxel.cpp
@@ -0,0 +1,112 @@
+//===--- acxxel.cpp - Implementation details for the Acxxel API -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+#include "config.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+
+namespace acxxel {
+
+namespace cuda {
+Expected<Platform *> getPlatform();
+} // namespace cuda
+
+namespace opencl {
+Expected<Platform *> getPlatform();
+} // namespace opencl
+
+void logWarning(const std::string &Message) {
+  std::cerr << "WARNING: " << Message << "\n";
+}
+
+Expected<Platform *> getCUDAPlatform() {
+#ifdef ACXXEL_ENABLE_CUDA
+  return cuda::getPlatform();
+#else
+  return Status("library was build without CUDA support");
+#endif
+}
+
+Expected<Platform *> getOpenCLPlatform() {
+#ifdef ACXXEL_ENABLE_OPENCL
+  return opencl::getPlatform();
+#else
+  return Status("library was build without OpenCL support");
+#endif
+}
+
+Stream::Stream(Stream &&) noexcept = default;
+Stream &Stream::operator=(Stream &&) noexcept = default;
+
+Status Stream::sync() {
+  return takeStatusOr(ThePlatform->streamSync(TheHandle.get()));
+}
+
+Status Stream::waitOnEvent(Event &Event) {
+  return takeStatusOr(ThePlatform->streamWaitOnEvent(
+      TheHandle.get(), ThePlatform->getEventHandle(Event)));
+}
+
+Stream &
+Stream::addCallback(std::function<void(Stream &, const Status &)> Callback) {
+  setStatus(ThePlatform->addStreamCallback(*this, std::move(Callback)));
+  return *this;
+}
+
+Stream &Stream::asyncKernelLaunch(const Kernel &TheKernel,
+                                  KernelLaunchDimensions LaunchDimensions,
+                                  Span<void *> Arguments,
+                                  Span<size_t> ArgumentSizes,
+                                  size_t SharedMemoryBytes) {
+  setStatus(ThePlatform->rawEnqueueKernelLaunch(
+      TheHandle.get(), TheKernel.TheHandle.get(), LaunchDimensions, Arguments,
+      ArgumentSizes, SharedMemoryBytes));
+  return *this;
+}
+
+Stream &Stream::enqueueEvent(Event &E) {
+  setStatus(ThePlatform->enqueueEvent(ThePlatform->getEventHandle(E),
+                                      TheHandle.get()));
+  return *this;
+}
+
+Event::Event(Event &&) noexcept = default;
+Event &Event::operator=(Event &&) noexcept = default;
+
+bool Event::isDone() { return ThePlatform->eventIsDone(TheHandle.get()); }
+
+Status Event::sync() { return ThePlatform->eventSync(TheHandle.get()); }
+
+Expected<float> Event::getSecondsSince(const Event &Previous) {
+  Expected<float> MaybeSeconds = ThePlatform->getSecondsBetweenEvents(
+      Previous.TheHandle.get(), TheHandle.get());
+  if (MaybeSeconds.isError())
+    MaybeSeconds.getError();
+  return MaybeSeconds;
+}
+
+Expected<Kernel> Program::createKernel(const std::string &Name) {
+  Expected<void *> MaybeKernelHandle =
+      ThePlatform->rawCreateKernel(TheHandle.get(), Name);
+  if (MaybeKernelHandle.isError())
+    return MaybeKernelHandle.getError();
+  return Kernel(ThePlatform, MaybeKernelHandle.getValue(),
+                ThePlatform->getKernelHandleDestructor());
+}
+
+Program::Program(Program &&) noexcept = default;
+Program &Program::operator=(Program &&That) noexcept = default;
+
+Kernel::Kernel(Kernel &&) noexcept = default;
+Kernel &Kernel::operator=(Kernel &&That) noexcept = default;
+
+} // namespace acxxel
diff --git a/parallel-libs/acxxel/acxxel.h b/parallel-libs/acxxel/acxxel.h

new file mode 100644 (file)

index 0000000..8cfc5c5
--- /dev/null
+++ b/parallel-libs/acxxel/acxxel.h
@@ -0,0 +1,1400 @@
+//===--- acxxel.h - The Acxxel API ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/// \mainpage Welcome to Acxxel
+///
+/// \section Introduction
+///
+/// \b Acxxel is a library providing a modern C++ interface for managing
+/// accelerator devices such as GPUs. Acxxel handles operations such as
+/// allocating device memory, copying data to and from device memory, creating
+/// and managing device events, and creating and managing device streams.
+///
+/// \subsection ExampleUsage Example Usage
+///
+/// Below is some example code to show you the basics of Acxxel.
+///
+/// \snippet examples/simple_example.cu Example simple saxpy
+///
+/// The above code could be compiled with either `clang` or `nvcc`. Compare this
+/// with the standard CUDA runtime library code to perform these same
+/// operations:
+///
+/// \snippet examples/simple_example.cu Example CUDA simple saxpy
+///
+/// Notice that the CUDA runtime calls are not type safe. For example, if you
+/// change the type of the inputs from `float` to `double`, you have to remember
+/// to change the size calculation. If you forget, you will get garbage output
+/// data. In the Acxxel example, you would instead get a helpful compile-time
+/// error that wouldn't let you forget to change the types inside the function.
+///
+/// The Acxxel example also automatically uses the right sizes for memory
+/// copies, so you don't have to worry about computing the sizes yourself.
+///
+/// The CUDA runtime interface makes it easy to get the source and destination
+/// mixed up in a call to `cudaMemcpy`. If you pass the pointers in the wrong
+/// order or pass the wrong enum value for the direction parameter, you won't
+/// find out until runtime (if you remembered to check the error return value of
+/// `cudaMemcpy`). In Acxxel there is no verbose direction enum because the name
+/// of the function says which way the copy goes, and mixing up the order of
+/// source and destination is a compile-time error.
+///
+/// The CUDA runtime interface makes you clean up your device memory by calling
+/// `cudaFree` for each call to `cudaMalloc`. In Acxxel, you don't have to worry
+/// about that because the memory cleans itself up when it goes out of scope.
+///
+/// \subsection AcxxelFeatures Acxxel Features
+///
+/// Acxxel provides many nice features compared to the C-like interfaces, such
+/// as the CUDA runtime API, which are normally used for the host code in
+/// applications using accelerators.
+///
+/// \subsubsection TypeSafety Type safety
+///
+/// Most errors involving mixing up types, sources and destinations, or host and
+/// device memory result in helpful compile-time errors.
+///
+/// \subsubsection NoCopySizes No need to specify sizes for memory copies
+///
+/// When the arguments to copy functions such as acxxel::Platform::copyHToD know
+/// their sizes (e.g std::array, std::vector, and C-style arrays), there is no
+/// need to specify the amount of memory to copy; Acxxel will just copy the
+/// whole thing. Of course the copy functions also have overloads that accept an
+/// element count for those times when you don't want to copy everything.
+///
+/// \subsubsection MemoryCleanup Automatic memory cleanup
+///
+/// Device memory allocated with acxxel::Platform::mallocD is automatically
+/// freed when it goes out of scope.
+///
+/// \subsubsection NiceErrorHandling Error handling
+///
+/// Operations that would normally return values return acxxel::Expected obects
+/// in Acxxel. These `Expected` objects contain either a value or an error
+/// message explaining why the value is not present. This reminds the user to
+/// check for errors, but also allows them to opt-out easily be calling the
+/// acxxel::Expected::getValue or acxxel::Expected::takeValue methods. The
+/// `getValue` method returns a reference to the value, leaving the `Expected`
+/// instance as the value owner, whereas the `takeValue` method moves the value
+/// out of the `Expected` object and transfers ownership to the caller.
+///
+/// \subsubsection PlatformIndependence Platform independence
+///
+/// Acxxel code works not only with CUDA, but also with any other platform that
+/// can support its interface. For example, Acxxel supports OpenCL. The
+/// acxxel::getCUDAPlatform and acxxel::getOpenCLPlatform functions are provided
+/// to allow easy access to the built-in CUDA and OpenCL platforms. Other
+/// platforms can be created by implementing the acxxel::Platform interface, and
+/// instances of those classes can be created directly.
+///
+/// \subsubsection CUDAInterop Seamless interoperation with CUDA
+///
+/// Acxxel functions as a modern replacement for the standard CUDA runtime
+/// library and interoperates seamlessly with kernel calls.
+
+#ifndef ACXXEL_ACXXEL_H
+#define ACXXEL_ACXXEL_H
+
+#include "span.h"
+#include "status.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+#if defined(__clang__) || defined(__GNUC__)
+#define ACXXEL_WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else
+#define ACXXEL_WARN_UNUSED_RESULT
+#endif
+
+/// This type is declared here to provide smooth interoperability with the CUDA
+/// triple-chevron kernel launch syntax.
+///
+/// A acxxel::Stream instance will be implicitly convertible to a CUstream_st*,
+/// which is the type expected for the stream argument in the triple-chevron
+/// CUDA kernel launch. This means that a acxxel::Stream can be passed without
+/// explicit casting as the fourth argument to a triple-chevron CUDA kernel
+/// launch.
+struct CUstream_st; // NOLINT
+
+namespace acxxel {
+
+class Event;
+class Platform;
+class Stream;
+
+template <typename T> class DeviceMemory;
+
+template <typename T> class DeviceMemorySpan;
+
+template <typename T> class AsyncHostMemory;
+
+template <typename T> class AsyncHostMemorySpan;
+
+template <typename T> class OwnedAsyncHostMemory;
+
+/// Function type used to destroy opaque handles given out by the platform.
+using HandleDestructor = void (*)(void *);
+
+/// Functor type for enqueuing host callbacks on a stream.
+using StreamCallback = std::function<void(Stream &, const Status &)>;
+
+struct KernelLaunchDimensions {
+  // Intentionally implicit
+  KernelLaunchDimensions(unsigned int BlockX = 1, unsigned int BlockY = 1,
+                         unsigned int BlockZ = 1, unsigned int GridX = 1,
+                         unsigned int GridY = 1, unsigned int GridZ = 1)
+      : BlockX(BlockX), BlockY(BlockY), BlockZ(BlockZ), GridX(GridX),
+        GridY(GridY), GridZ(GridZ) {}
+
+  unsigned int BlockX;
+  unsigned int BlockY;
+  unsigned int BlockZ;
+  unsigned int GridX;
+  unsigned int GridY;
+  unsigned int GridZ;
+};
+
+/// Logs a warning message.
+void logWarning(const std::string &Message);
+
+/// Gets a pointer to the standard CUDA platform.
+Expected<Platform *> getCUDAPlatform();
+
+/// Gets a pointer to the standard OpenCL platform.
+Expected<Platform *> getOpenCLPlatform();
+
+/// A function that can be executed on the device.
+///
+/// A Kernel is created from a Program by calling Program::createKernel, and a
+/// kernel is enqueued into a Stream by calling Stream::asyncKernelLaunch.
+class Kernel {
+public:
+  Kernel(const Kernel &) = delete;
+  Kernel &operator=(const Kernel &) = delete;
+  Kernel(Kernel &&) noexcept;
+  Kernel &operator=(Kernel &&That) noexcept;
+  ~Kernel() = default;
+
+private:
+  // Only a Program can make a kernel.
+  friend class Program;
+  Kernel(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
+      : ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}
+
+  // Let stream get raw handle for kernel launches.
+  friend class Stream;
+
+  Platform *ThePlatform;
+  std::unique_ptr<void, HandleDestructor> TheHandle;
+};
+
+/// A program loaded on a device.
+///
+/// A program can be created by calling Platform::createProgramFromSource, and a
+/// Kernel can be created from a program by running Program::createKernel.
+///
+/// A program can contain any number of kernels, and a program only needs to be
+/// loaded once in order to use all its kernels.
+class Program {
+public:
+  Program(const Program &) = delete;
+  Program &operator=(const Program &) = delete;
+  Program(Program &&) noexcept;
+  Program &operator=(Program &&That) noexcept;
+  ~Program() = default;
+
+  Expected<Kernel> createKernel(const std::string &Name);
+
+private:
+  // Only a platform can make a program.
+  friend class Platform;
+  Program(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
+      : ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}
+
+  Platform *ThePlatform;
+  std::unique_ptr<void, HandleDestructor> TheHandle;
+};
+
+/// A stream of computation.
+///
+/// All operations enqueued on a Stream are serialized, but operations enqueued
+/// on different Streams may run concurrently.
+///
+/// Each Platform has a notion of the currently active device on a particular
+/// thread (see Platform::getActiveDeviceForThread and
+/// Platform::setActiveDeviceForThread). Each Stream is associated with a
+/// specific, fixed device, set to the current thread's active device when the
+/// Stream is created. Whenver a thread enqueues commands onto a Stream, its
+/// active device must match the Stream's device.
+class Stream {
+public:
+  Stream(const Stream &) = delete;
+  Stream &operator=(const Stream &) = delete;
+  Stream(Stream &&) noexcept;
+  Stream &operator=(Stream &&) noexcept;
+  ~Stream() = default;
+
+  /// Gets the index of the device on which this Stream operates.
+  int getDeviceIndex() { return TheDeviceIndex; }
+
+  /// Blocks the host until the Stream is done executing all previously enqueued
+  /// work.
+  ///
+  /// Returns a Status for any errors emitted by the asynchronous work on the
+  /// Stream, or by any error in the synchronization process itself. Clears the
+  /// Status state of the stream.
+  Status sync() ACXXEL_WARN_UNUSED_RESULT;
+
+  /// Makes all future work submitted to this stream wait until the event
+  /// reports completion.
+  ///
+  /// This is useful because the event argument may be recorded on a different
+  /// stream, so this method allows for synchronization between streams without
+  /// synchronizing all streams.
+  ///
+  /// Returns a Status for any errors emitted by the asynchronous work on the
+  /// Stream, or by any error in the synchronization process itself. Clears the
+  /// Status state of the stream.
+  Status waitOnEvent(Event &Event) ACXXEL_WARN_UNUSED_RESULT;
+
+  /// Adds a host callback function to the stream.
+  ///
+  /// The callback will be called on the host after all previously enqueued work
+  /// on the stream is complete, and no work enqueued after the callback will
+  /// begin until after the callback has finished.
+  Stream &addCallback(std::function<void(Stream &, const Status &)> Callback);
+
+  /// \name Asynchronous device memory copies.
+  ///
+  /// These functions enqueue asynchronous memory copy operations into the
+  /// stream. Only async host memory is allowed for host arguments to these
+  /// functions. Async host memory can be created from normal host memory by
+  /// registering it with Platform::registerHostMem. AsyncHostMemory can also be
+  /// allocated directly by calling Platform::newAsyncHostMem.
+  ///
+  /// For all these functions, DeviceSrcTy must be convertible to
+  /// DeviceMemorySpan<const T>, DeviceDstTy must be convertible to
+  /// DeviceMemorySpan<T>, HostSrcTy must be convertible to
+  /// AsyncHostMemorySpan<const T> and HostDstTy must be convertible to
+  /// AsyncHostMemorySpan<T>. Additionally, the T types must match for the
+  /// destination and source.
+  /// \{
+
+  /// Copies from device memory to device memory.
+  template <typename DeviceSrcTy, typename DeviceDstTy>
+  Stream &asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst);
+
+  /// Copies from device memory to device memory with a given element count.
+  template <typename DeviceSrcTy, typename DeviceDstTy>
+  Stream &asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
+                        ptrdiff_t ElementCount);
+
+  /// Copies from device memory to host memory.
+  template <typename DeviceSrcTy, typename HostDstTy>
+  Stream &asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst);
+
+  /// Copies from device memory to host memory with a given element count.
+  template <typename DeviceSrcTy, typename HostDstTy>
+  Stream &asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
+                        ptrdiff_t ElementCount);
+
+  /// Copies from host memory to device memory.
+  template <typename HostSrcTy, typename DeviceDstTy>
+  Stream &asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst);
+
+  /// Copies from host memory to device memory with a given element count.
+  template <typename HostSrcTy, typename DeviceDstTy>
+  Stream &asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
+                        ptrdiff_t ElementCount);
+
+  /// \}
+
+  /// \name Stream-synchronous device memory copies
+  ///
+  /// These functions block the host until the copy and all previously-enqueued
+  /// work on the stream has completed.
+  ///
+  /// For all these functions, DeviceSrcTy must be convertible to
+  /// DeviceMemorySpan<const T>, DeviceDstTy must be convertible to
+  /// DeviceMemorySpan<T>, HostSrcTy must be convertible to Span<const T> and
+  /// HostDstTy must be convertible to Span<T>. Additionally, the T types must
+  /// match for the destination and source.
+  /// \{
+
+  template <typename DeviceSrcTy, typename DeviceDstTy>
+  Stream &syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst);
+
+  template <typename DeviceSrcTy, typename DeviceDstTy>
+  Stream &syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
+                       ptrdiff_t ElementCount);
+
+  template <typename DeviceSrcTy, typename HostDstTy>
+  Stream &syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst);
+
+  template <typename DeviceSrcTy, typename HostDstTy>
+  Stream &syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
+                       ptrdiff_t ElementCount);
+
+  template <typename HostSrcTy, typename DeviceDstTy>
+  Stream &syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst);
+
+  template <typename HostSrcTy, typename DeviceDstTy>
+  Stream &syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
+                       ptrdiff_t ElementCount);
+
+  /// \}
+
+  /// Enqueues an operation in the stream to set the bytes of a given device
+  /// memory region to a given value.
+  ///
+  /// DeviceDstTy must be convertible to DeviceMemorySpan<T> for non-const T.
+  template <typename DeviceDstTy>
+  Stream &asyncMemsetD(DeviceDstTy &&DeviceDst, char ByteValue);
+
+  /// Enqueues a kernel launch operation on this stream.
+  Stream &asyncKernelLaunch(const Kernel &TheKernel,
+                            KernelLaunchDimensions LaunchDimensions,
+                            Span<void *> Arguments, Span<size_t> ArgumentSizes,
+                            size_t SharedMemoryBytes = 0);
+
+  /// Enqueues an event in the stream.
+  Stream &enqueueEvent(Event &E);
+
+  // Allows implicit conversion to (CUstream_st *). This makes triple-chevron
+  // kernel calls look nicer because you can just pass a acxxel::Stream
+  // directly.
+  operator CUstream_st *() {
+    return static_cast<CUstream_st *>(TheHandle.get());
+  }
+
+  /// Gets the current status for the Stream and clears the Stream's status.
+  Status takeStatus() ACXXEL_WARN_UNUSED_RESULT {
+    Status OldStatus = TheStatus;
+    TheStatus = Status();
+    return OldStatus;
+  }
+
+private:
+  // Only a platform can make a stream.
+  friend class Platform;
+  Stream(Platform *APlatform, int DeviceIndex, void *AHandle,
+         HandleDestructor Destructor)
+      : ThePlatform(APlatform), TheDeviceIndex(DeviceIndex),
+        TheHandle(AHandle, Destructor) {}
+
+  const Status &setStatus(const Status &S) {
+    if (S.isError() && !TheStatus.isError()) {
+      TheStatus = S;
+    }
+    return S;
+  }
+
+  Status takeStatusOr(const Status &S) {
+    if (TheStatus.isError()) {
+      Status OldStatus = TheStatus;
+      TheStatus = Status();
+      return OldStatus;
+    }
+    return S;
+  }
+
+  // The platform that created the stream.
+  Platform *ThePlatform;
+
+  // The index of the device on which the stream operates.
+  int TheDeviceIndex;
+
+  // A handle to the platform-specific handle implementation.
+  std::unique_ptr<void, HandleDestructor> TheHandle;
+  Status TheStatus;
+};
+
+/// A user-created event on a device.
+///
+/// This is useful for setting synchronization points in a Stream. The host can
+/// synchronize with a Stream without using events, but that requires all the
+/// work in the Stream to be finished in order for the host to be notified.
+/// Events provide more flexibility by allowing the host to be notified when a
+/// single Event in the Stream is finished, rather than all the work in the
+/// Stream.
+class Event {
+public:
+  Event(const Event &) = delete;
+  Event &operator=(const Event &) = delete;
+  Event(Event &&) noexcept;
+  Event &operator=(Event &&That) noexcept;
+  ~Event() = default;
+
+  /// Checks to see if the event is done running.
+  bool isDone();
+
+  /// Blocks the host until the event is done.
+  Status sync();
+
+  /// Gets the time elapsed between the previous event's execution and this
+  /// event's execution.
+  Expected<float> getSecondsSince(const Event &Previous);
+
+private:
+  // Only a platform can make an event.
+  friend class Platform;
+  Event(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
+      : ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}
+
+  Platform *ThePlatform;
+  std::unique_ptr<void, HandleDestructor> TheHandle;
+};
+
+/// An accelerator platform.
+///
+/// This is the base class for all platforms such as CUDA and OpenCL. It
+/// contains many virtual methods that must be overridden by each platform
+/// implementation.
+///
+/// It also has some template wrapper functions that take care of type checking
+/// and then forward their arguments on to raw virtual functions that are
+/// implemented by each specific platform.
+class Platform {
+public:
+  virtual ~Platform(){};
+
+  /// Gets the number of devices for this platform in this system.
+  virtual Expected<int> getDeviceCount() = 0;
+
+  /// Sets the active device for this platform in this thread.
+  virtual Status setActiveDeviceForThread(int DeviceIndex) = 0;
+
+  /// Gets the currently active device for this platform in this thread.
+  virtual int getActiveDeviceForThread() = 0;
+
+  /// Creates a stream for the platform.
+  ///
+  /// The created Stream is associated with the active device for this thread.
+  virtual Expected<Stream> createStream() = 0;
+
+  /// Creates an event for the platform.
+  ///
+  /// The created Event is associated with the active device for this thread.
+  virtual Expected<Event> createEvent() = 0;
+
+  /// Allocates owned device memory.
+  ///
+  /// \warning This function only allocates space in device memory, it does not
+  /// call the constructor of T.
+  template <typename T>
+  Expected<DeviceMemory<T>> mallocD(ptrdiff_t ElementCount) {
+    Expected<void *> MaybePointer = rawMallocD(ElementCount * sizeof(T));
+    if (MaybePointer.isError())
+      return MaybePointer.getError();
+    return DeviceMemory<T>(this, MaybePointer.getValue(), ElementCount,
+                           this->getDeviceMemoryHandleDestructor());
+  }
+
+  /// Creates a DeviceMemorySpan for a device symbol.
+  ///
+  /// This function is present to support __device__ variables in CUDA. Given a
+  /// pointer to a __device__ variable, this function returns a DeviceMemorySpan
+  /// referencing the device memory that stores that __device__ variable.
+  template <typename ElementType>
+  Expected<DeviceMemorySpan<ElementType>> getSymbolMemory(ElementType *Symbol) {
+    Expected<void *> MaybeAddress = rawGetDeviceSymbolAddress(Symbol);
+    if (MaybeAddress.isError())
+      return MaybeAddress.getError();
+    ElementType *Address = static_cast<ElementType *>(MaybeAddress.getValue());
+    Expected<ptrdiff_t> MaybeSize = rawGetDeviceSymbolSize(Symbol);
+    if (MaybeSize.isError())
+      return MaybeSize.getError();
+    ptrdiff_t Size = MaybeSize.getValue();
+    return DeviceMemorySpan<ElementType>(this, Address,
+                                         Size / sizeof(ElementType), 0);
+  }
+
+  /// \name Host memory registration functions.
+  /// \{
+
+  template <typename T>
+  Expected<AsyncHostMemory<const T>> registerHostMem(Span<const T> Memory) {
+    Status S = rawRegisterHostMem(Memory.data(), Memory.size() * sizeof(T));
+    if (S.isError())
+      return S;
+    return AsyncHostMemory<const T>(
+        Memory.data(), Memory.size(),
+        this->getUnregisterHostMemoryHandleDestructor());
+  }
+
+  template <typename T>
+  Expected<AsyncHostMemory<T>> registerHostMem(Span<T> Memory) {
+    Status S = rawRegisterHostMem(Memory.data(), Memory.size() * sizeof(T));
+    if (S.isError())
+      return S;
+    return AsyncHostMemory<T>(Memory.data(), Memory.size(),
+                              this->getUnregisterHostMemoryHandleDestructor());
+  }
+
+  template <typename T, size_t N>
+  Expected<AsyncHostMemory<T>> registerHostMem(T (&Array)[N]) {
+    Span<T> Span(Array);
+    Status S = rawRegisterHostMem(Span.data(), Span.size() * sizeof(T));
+    if (S.isError())
+      return S;
+    return AsyncHostMemory<T>(Span.data(), Span.size(),
+                              this->getUnregisterHostMemoryHandleDestructor());
+  }
+
+  /// Registers memory stored in a container with a data() member function and
+  /// which can be converted to a Span<T*>.
+  template <typename Container>
+  auto registerHostMem(Container &Cont) -> Expected<AsyncHostMemory<
+      typename std::remove_reference<decltype(*Cont.data())>::type>> {
+    using ValueType =
+        typename std::remove_reference<decltype(*Cont.data())>::type;
+    Span<ValueType> Span(Cont);
+    Status S = rawRegisterHostMem(Span.data(), Span.size() * sizeof(ValueType));
+    if (S.isError())
+      return S;
+    return AsyncHostMemory<ValueType>(
+        Span.data(), Span.size(),
+        this->getUnregisterHostMemoryHandleDestructor());
+  }
+
+  /// Allocates an owned, registered array of objects on the host.
+  ///
+  /// Default constructs each element in the resulting array.
+  template <typename T>
+  Expected<OwnedAsyncHostMemory<T>> newAsyncHostMem(ptrdiff_t ElementCount) {
+    Expected<void *> MaybeMemory =
+        rawMallocRegisteredH(ElementCount * sizeof(T));
+    if (MaybeMemory.isError())
+      return MaybeMemory.getError();
+    T *Memory = static_cast<T *>(MaybeMemory.getValue());
+    for (ptrdiff_t I = 0; I < ElementCount; ++I)
+      new (Memory + I) T;
+    return OwnedAsyncHostMemory<T>(Memory, ElementCount,
+                                   this->getFreeHostMemoryHandleDestructor());
+  }
+
+  /// \}
+
+  virtual Expected<Program>
+  createProgramFromSource(Span<const char> Source) = 0;
+
+protected:
+  friend class Stream;
+  friend class Event;
+  friend class Program;
+  template <typename T> friend class DeviceMemorySpan;
+
+  void *getStreamHandle(Stream &Stream) { return Stream.TheHandle.get(); }
+  void *getEventHandle(Event &Event) { return Event.TheHandle.get(); }
+
+  // Pass along access to Stream constructor to subclasses.
+  Stream constructStream(Platform *APlatform, void *AHandle,
+                         HandleDestructor Destructor) {
+    return Stream(APlatform, getActiveDeviceForThread(), AHandle, Destructor);
+  }
+
+  // Pass along access to Event constructor to subclasses.
+  Event constructEvent(Platform *APlatform, void *AHandle,
+                       HandleDestructor Destructor) {
+    return Event(APlatform, AHandle, Destructor);
+  }
+
+  // Pass along access to Program constructor to subclasses.
+  Program constructProgram(Platform *APlatform, void *AHandle,
+                           HandleDestructor Destructor) {
+    return Program(APlatform, AHandle, Destructor);
+  }
+
+  virtual Status streamSync(void *Stream) = 0;
+  virtual Status streamWaitOnEvent(void *Stream, void *Event) = 0;
+
+  virtual Status enqueueEvent(void *Event, void *Stream) = 0;
+  virtual bool eventIsDone(void *Event) = 0;
+  virtual Status eventSync(void *Event) = 0;
+  virtual Expected<float> getSecondsBetweenEvents(void *StartEvent,
+                                                  void *EndEvent) = 0;
+
+  virtual Expected<void *> rawMallocD(ptrdiff_t ByteCount) = 0;
+  virtual HandleDestructor getDeviceMemoryHandleDestructor() = 0;
+  virtual void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
+                                          size_t ByteOffset) = 0;
+  virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) = 0;
+
+  virtual Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol) = 0;
+  virtual Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol) = 0;
+
+  virtual Status rawCopyDToD(const void *DeviceSrc,
+                             ptrdiff_t DeviceSrcByteOffset, void *DeviceDst,
+                             ptrdiff_t DeviceDstByteOffset,
+                             ptrdiff_t ByteCount) = 0;
+  virtual Status rawCopyDToH(const void *DeviceSrc,
+                             ptrdiff_t DeviceSrcByteOffset, void *HostDst,
+                             ptrdiff_t ByteCount) = 0;
+  virtual Status rawCopyHToD(const void *HostSrc, void *DeviceDst,
+                             ptrdiff_t DeviceDstByteOffset,
+                             ptrdiff_t ByteCount) = 0;
+
+  virtual Status rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                            ptrdiff_t ByteCount, char ByteValue) = 0;
+
+  virtual Status rawRegisterHostMem(const void *Memory,
+                                    ptrdiff_t ByteCount) = 0;
+  virtual HandleDestructor getUnregisterHostMemoryHandleDestructor() = 0;
+
+  virtual Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) = 0;
+  virtual HandleDestructor getFreeHostMemoryHandleDestructor() = 0;
+
+  virtual Status asyncCopyDToD(const void *DeviceSrc,
+                               ptrdiff_t DeviceSrcByteOffset, void *DeviceDst,
+                               ptrdiff_t DeviceDstByteOffset,
+                               ptrdiff_t ByteCount, void *Stream) = 0;
+  virtual Status asyncCopyDToH(const void *DeviceSrc,
+                               ptrdiff_t DeviceSrcByteOffset, void *HostDst,
+                               ptrdiff_t ByteCount, void *Stream) = 0;
+  virtual Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
+                               ptrdiff_t DeviceDstByteOffset,
+                               ptrdiff_t ByteCount, void *Stream) = 0;
+
+  virtual Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                              ptrdiff_t ByteCount, char ByteValue,
+                              void *Stream) = 0;
+
+  virtual Status addStreamCallback(Stream &Stream, StreamCallback Callback) = 0;
+
+  virtual Expected<void *> rawCreateKernel(void *Program,
+                                           const std::string &Name) = 0;
+  virtual HandleDestructor getKernelHandleDestructor() = 0;
+
+  virtual Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
+                                        KernelLaunchDimensions LaunchDimensions,
+                                        Span<void *> Arguments,
+                                        Span<size_t> ArgumentSizes,
+                                        size_t SharedMemoryBytes) = 0;
+};
+
+// Implementation of templated Stream functions.
+
+template <typename DeviceSrcTy, typename DeviceDstTy>
+Stream &Stream::asyncCopyDToD(DeviceSrcTy &&DeviceSrc,
+                              DeviceDstTy &&DeviceDst) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  static_assert(std::is_same<SrcElementTy, DstElementTy>::value,
+                "asyncCopyDToD cannot copy between arrays of different types");
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (DeviceSrcSpan.size() != DeviceDstSpan.size()) {
+    setStatus(Status("asyncCopyDToD source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(DeviceDstSpan.size())));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyDToD(
+      DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+      DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(),
+      DeviceSrcSpan.byte_size(), TheHandle.get()));
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename DeviceDstTy>
+Stream &Stream::asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
+                              ptrdiff_t ElementCount) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  static_assert(std::is_same<SrcElementTy, DstElementTy>::value,
+                "asyncCopyDToD cannot copy between arrays of different types");
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (DeviceSrcSpan.size() < ElementCount) {
+    setStatus(Status("asyncCopyDToD source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (DeviceDstSpan.size() < ElementCount) {
+    setStatus(Status("asyncCopyDToD destination element count " +
+                     std::to_string(DeviceDst.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyDToD(
+      DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+      DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(),
+      ElementCount * sizeof(SrcElementTy), TheHandle.get()));
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename HostDstTy>
+Stream &Stream::asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  AsyncHostMemorySpan<SrcElementTy> HostDstSpan(HostDst);
+  if (DeviceSrcSpan.size() != HostDstSpan.size()) {
+    setStatus(Status("asyncCopyDToH source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(HostDstSpan.size())));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyDToH(
+      DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+      HostDstSpan.data(), DeviceSrcSpan.byte_size(), TheHandle.get()));
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename HostDstTy>
+Stream &Stream::asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
+                              ptrdiff_t ElementCount) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  AsyncHostMemorySpan<SrcElementTy> HostDstSpan(HostDst);
+  if (DeviceSrcSpan.size() < ElementCount) {
+    setStatus(Status("asyncCopyDToH source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (HostDstSpan.size() < ElementCount) {
+    setStatus(Status("asyncCopyDToH destination element count " +
+                     std::to_string(HostDstSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyDToH(
+      DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+      HostDstSpan.data(), ElementCount * sizeof(SrcElementTy),
+      TheHandle.get()));
+  return *this;
+}
+
+template <typename HostSrcTy, typename DeviceDstTy>
+Stream &Stream::asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst) {
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  AsyncHostMemorySpan<const DstElementTy> HostSrcSpan(HostSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (HostSrcSpan.size() != DeviceDstSpan.size()) {
+    setStatus(Status("asyncCopyHToD source element count " +
+                     std::to_string(HostSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(DeviceDstSpan.size())));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyHToD(
+      HostSrcSpan.data(), DeviceDstSpan.baseHandle(),
+      DeviceDstSpan.byte_offset(), HostSrcSpan.byte_size(), TheHandle.get()));
+  return *this;
+}
+
+template <typename HostSrcTy, typename DeviceDstTy>
+Stream &Stream::asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
+                              ptrdiff_t ElementCount) {
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  AsyncHostMemorySpan<const DstElementTy> HostSrcSpan(HostSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (HostSrcSpan.size() < ElementCount) {
+    setStatus(Status("copyHToD source element count " +
+                     std::to_string(HostSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (DeviceDstSpan.size() < ElementCount) {
+    setStatus(Status("copyHToD destination element count " +
+                     std::to_string(DeviceDstSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  setStatus(ThePlatform->asyncCopyHToD(
+      HostSrcSpan.data(), DeviceDstSpan.baseHandle(),
+      DeviceDstSpan.byte_offset(), ElementCount * sizeof(DstElementTy),
+      TheHandle.get()));
+  return *this;
+}
+
+template <typename DeviceDstTy>
+Stream &Stream::asyncMemsetD(DeviceDstTy &&DeviceDst, char ByteValue) {
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  setStatus(ThePlatform->asyncMemsetD(
+      DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(),
+      DeviceDstSpan.byte_size(), ByteValue, TheHandle.get()));
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename DeviceDstTy>
+Stream &Stream::syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  static_assert(std::is_same<SrcElementTy, DstElementTy>::value,
+                "copyDToD cannot copy between arrays of different types");
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (DeviceSrcSpan.size() != DeviceDstSpan.size()) {
+    setStatus(Status("copyDToD source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(DeviceDstSpan.size())));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyDToD(
+                    DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+                    DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(),
+                    DeviceSrcSpan.byte_size(), TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename DeviceDstTy>
+Stream &Stream::syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
+                             ptrdiff_t ElementCount) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  static_assert(std::is_same<SrcElementTy, DstElementTy>::value,
+                "copyDToD cannot copy between arrays of different types");
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (DeviceSrcSpan.size() < ElementCount) {
+    setStatus(Status("copyDToD source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (DeviceDstSpan.size() < ElementCount) {
+    setStatus(Status("copyDToD destination element count " +
+                     std::to_string(DeviceDst.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyDToD(
+                    DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+                    DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(),
+                    ElementCount * sizeof(SrcElementTy), TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename HostDstTy>
+Stream &Stream::syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  Span<SrcElementTy> HostDstSpan(HostDst);
+  if (DeviceSrcSpan.size() != HostDstSpan.size()) {
+    setStatus(Status("copyDToH source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(HostDstSpan.size())));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyDToH(
+                    DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+                    HostDstSpan.data(), DeviceSrcSpan.byte_size(),
+                    TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+template <typename DeviceSrcTy, typename HostDstTy>
+Stream &Stream::syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
+                             ptrdiff_t ElementCount) {
+  using SrcElementTy =
+      typename std::remove_reference<DeviceSrcTy>::type::value_type;
+  DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc);
+  Span<SrcElementTy> HostDstSpan(HostDst);
+  if (DeviceSrcSpan.size() < ElementCount) {
+    setStatus(Status("copyDToH source element count " +
+                     std::to_string(DeviceSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (HostDstSpan.size() < ElementCount) {
+    setStatus(Status("copyDToH destination element count " +
+                     std::to_string(HostDstSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyDToH(
+                    DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(),
+                    HostDstSpan.data(), ElementCount * sizeof(SrcElementTy),
+                    TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+template <typename HostSrcTy, typename DeviceDstTy>
+Stream &Stream::syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst) {
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  Span<const DstElementTy> HostSrcSpan(HostSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (HostSrcSpan.size() != DeviceDstSpan.size()) {
+    setStatus(Status("copyHToD source element count " +
+                     std::to_string(HostSrcSpan.size()) +
+                     " does not equal destination element count " +
+                     std::to_string(DeviceDstSpan.size())));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyHToD(
+                    HostSrcSpan.data(), DeviceDstSpan.baseHandle(),
+                    DeviceDstSpan.byte_offset(), DeviceDstSpan.byte_size(),
+                    TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+template <typename HostSrcTy, typename DeviceDstTy>
+Stream &Stream::syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
+                             ptrdiff_t ElementCount) {
+  using DstElementTy =
+      typename std::remove_reference<DeviceDstTy>::type::value_type;
+  Span<const DstElementTy> HostSrcSpan(HostSrc);
+  DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst);
+  if (HostSrcSpan.size() < ElementCount) {
+    setStatus(Status("copyHToD source element count " +
+                     std::to_string(HostSrcSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (DeviceDstSpan.size() < ElementCount) {
+    setStatus(Status("copyHToD destination element count " +
+                     std::to_string(DeviceDstSpan.size()) +
+                     " is less than requested element count " +
+                     std::to_string(ElementCount)));
+    return *this;
+  }
+  if (setStatus(ThePlatform->asyncCopyHToD(
+                    HostSrcSpan.data(), DeviceDstSpan.baseHandle(),
+                    DeviceDstSpan.byte_offset(),
+                    ElementCount * sizeof(DstElementTy), TheHandle.get()))
+          .isError()) {
+    return *this;
+  }
+  setStatus(sync());
+  return *this;
+}
+
+/// Owned device memory.
+///
+/// Device memory that frees itself when it goes out of scope.
+template <typename ElementType> class DeviceMemory {
+public:
+  using element_type = ElementType;
+  using index_type = std::ptrdiff_t;
+  using value_type = typename std::remove_const<element_type>::type;
+
+  DeviceMemory(const DeviceMemory &) = delete;
+  DeviceMemory &operator=(const DeviceMemory &) = delete;
+  DeviceMemory(DeviceMemory &&) noexcept;
+  DeviceMemory &operator=(DeviceMemory &&) noexcept;
+  ~DeviceMemory() = default;
+
+  /// Gets the raw base handle for the underlying platform implementation.
+  void *handle() const { return ThePointer.get(); }
+
+  index_type length() const { return TheSize; }
+  index_type size() const { return TheSize; }
+  index_type byte_size() const { // NOLINT
+    return TheSize * sizeof(element_type);
+  }
+  bool empty() const { return TheSize == 0; }
+
+  // These conversion operators are useful for making triple-chevron kernel
+  // launches more concise.
+  operator element_type *() {
+    return static_cast<element_type *>(ThePointer.get());
+  }
+  operator const element_type *() const { return ThePointer.get(); }
+
+  /// Converts a const object to a DeviceMemorySpan of const elements.
+  DeviceMemorySpan<const element_type> asSpan() const {
+    return DeviceMemorySpan<const element_type>(
+        ThePlatform, static_cast<const element_type *>(ThePointer.get()),
+        TheSize, 0);
+  }
+
+  /// Converts an object to a DeviceMemorySpan.
+  DeviceMemorySpan<element_type> asSpan() {
+    return DeviceMemorySpan<element_type>(
+        ThePlatform, static_cast<element_type *>(ThePointer.get()), TheSize, 0);
+  }
+
+private:
+  friend class Platform;
+  template <typename T> friend class DeviceMemorySpan;
+
+  DeviceMemory(Platform *ThePlatform, void *Pointer, index_type ElementCount,
+               HandleDestructor Destructor)
+      : ThePlatform(ThePlatform), ThePointer(Pointer, Destructor),
+        TheSize(ElementCount) {}
+
+  Platform *ThePlatform;
+  std::unique_ptr<void, HandleDestructor> ThePointer;
+  ptrdiff_t TheSize;
+};
+
+template <typename T>
+DeviceMemory<T>::DeviceMemory(DeviceMemory &&) noexcept = default;
+template <typename T>
+DeviceMemory<T> &DeviceMemory<T>::operator=(DeviceMemory &&) noexcept = default;
+
+/// View into device memory.
+///
+/// Like a Span, but for device memory rather than host memory.
+template <typename ElementType> class DeviceMemorySpan {
+public:
+  /// \name constants and types
+  /// \{
+  using element_type = ElementType;
+  using index_type = std::ptrdiff_t;
+  using pointer = element_type *;
+  using reference = element_type &;
+  using iterator = element_type *;
+  using const_iterator = const element_type *;
+  using value_type = typename std::remove_const<element_type>::type;
+  /// \}
+
+  DeviceMemorySpan()
+      : ThePlatform(nullptr), TheHandle(nullptr), TheSize(0), TheOffset(0),
+        TheSpanHandle(nullptr) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  DeviceMemorySpan(DeviceMemorySpan<OtherElementType> &ASpan)
+      : ThePlatform(ASpan.ThePlatform),
+        TheHandle(static_cast<pointer>(ASpan.baseHandle())),
+        TheSize(ASpan.size()), TheOffset(ASpan.offset()),
+        TheSpanHandle(nullptr) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  DeviceMemorySpan(DeviceMemorySpan<OtherElementType> &&ASpan)
+      : ThePlatform(ASpan.ThePlatform),
+        TheHandle(static_cast<pointer>(ASpan.baseHandle())),
+        TheSize(ASpan.size()), TheOffset(ASpan.offset()),
+        TheSpanHandle(nullptr) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  DeviceMemorySpan(DeviceMemory<OtherElementType> &Memory)
+      : ThePlatform(Memory.ThePlatform),
+        TheHandle(static_cast<value_type *>(Memory.handle())),
+        TheSize(Memory.size()), TheOffset(0), TheSpanHandle(nullptr) {}
+
+  ~DeviceMemorySpan() {
+    if (TheSpanHandle) {
+      ThePlatform->rawDestroyDeviceMemorySpanHandle(
+          const_cast<value_type *>(TheSpanHandle));
+    }
+  }
+
+  /// \name observers
+  /// \{
+  index_type length() const { return TheSize; }
+  index_type size() const { return TheSize; }
+  index_type byte_size() const { // NOLINT
+    return TheSize * sizeof(element_type);
+  }
+  index_type offset() const { return TheOffset; }
+  index_type byte_offset() const { // NOLINT
+    return TheOffset * sizeof(element_type);
+  }
+  bool empty() const { return TheSize == 0; }
+  /// \}
+
+  void *baseHandle() const {
+    return static_cast<void *>(const_cast<value_type *>(TheHandle));
+  }
+
+  /// Casts to a host memory pointer.
+  ///
+  /// This is only guaranteed to make sense for the CUDA platform, where device
+  /// pointers can be stored and manipulated much like host pointers. This makes
+  /// it easy to do triple-chevron kernel launches in CUDA because
+  /// DeviceMemorySpan values can be passed to parameters expecting regular
+  /// pointers.
+  ///
+  /// If the CUDA platform is using unified memory, it may also be possible to
+  /// dereference this pointer on the host.
+  ///
+  /// For platforms other than CUDA, this may return a garbage pointer.
+  operator element_type *() const {
+    if (!TheSpanHandle)
+      TheSpanHandle = ThePlatform->getDeviceMemorySpanHandle(
+          TheHandle, TheSize * sizeof(element_type),
+          TheOffset * sizeof(element_type));
+    return TheSpanHandle;
+  }
+
+  DeviceMemorySpan<element_type> first(index_type Count) const {
+    bool Valid = Count >= 0 && Count <= TheSize;
+    if (!Valid)
+      std::terminate();
+    return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count,
+                                          TheOffset);
+  }
+
+  DeviceMemorySpan<element_type> last(index_type Count) const {
+    bool Valid = Count >= 0 && Count <= TheSize;
+    if (!Valid)
+      std::terminate();
+    return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count,
+                                          TheOffset + TheSize - Count);
+  }
+
+  DeviceMemorySpan<element_type>
+  subspan(index_type Offset, index_type Count = dynamic_extent) const {
+    bool Valid =
+        (Offset == 0 || (Offset > 0 && Offset <= TheSize)) &&
+        (Count == dynamic_extent || (Count >= 0 && Offset + Count <= TheSize));
+    if (!Valid)
+      std::terminate();
+    return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count,
+                                          TheOffset + Offset);
+  }
+
+private:
+  template <typename T> friend class DeviceMemory;
+  template <typename T> friend class DeviceMemorySpan;
+  friend class Platform;
+
+  DeviceMemorySpan(Platform *ThePlatform, pointer AHandle, index_type Size,
+                   index_type Offset)
+      : ThePlatform(ThePlatform), TheHandle(AHandle), TheSize(Size),
+        TheOffset(Offset), TheSpanHandle(nullptr) {}
+
+  Platform *ThePlatform;
+  pointer TheHandle;
+  index_type TheSize;
+  index_type TheOffset;
+  pointer TheSpanHandle;
+};
+
+/// Asynchronous host memory.
+///
+/// This memory is pinned or otherwise registered in the host memory space to
+/// allow for asynchronous copies between it and device memory.
+///
+/// This memory unpins/unregisters itself when it goes out of scope, but does
+/// not free itself.
+template <typename ElementType> class AsyncHostMemory {
+public:
+  using value_type = ElementType;
+  using remove_const_type = typename std::remove_const<ElementType>::type;
+
+  AsyncHostMemory(const AsyncHostMemory &) = delete;
+  AsyncHostMemory &operator=(const AsyncHostMemory &) = delete;
+  AsyncHostMemory(AsyncHostMemory &&) noexcept;
+  AsyncHostMemory &operator=(AsyncHostMemory &&) noexcept;
+  ~AsyncHostMemory() = default;
+
+  template <typename OtherElementType>
+  AsyncHostMemory(AsyncHostMemory<OtherElementType> &&Other)
+      : ThePointer(std::move(Other.ThePointer)),
+        TheElementCount(Other.TheElementCount) {
+    static_assert(
+        std::is_assignable<ElementType *, OtherElementType *>::value,
+        "cannot assign OtherElementType pointer to ElementType pointer type");
+  }
+
+  ElementType *data() const {
+    return const_cast<ElementType *>(
+        static_cast<remove_const_type *>(ThePointer.get()));
+  }
+  ptrdiff_t size() const { return TheElementCount; }
+
+private:
+  template <typename U> friend class AsyncHostMemory;
+  friend class Platform;
+  AsyncHostMemory(ElementType *Pointer, ptrdiff_t ElementCount,
+                  HandleDestructor Destructor)
+      : ThePointer(
+            static_cast<void *>(const_cast<remove_const_type *>(Pointer)),
+            Destructor),
+        TheElementCount(ElementCount) {}
+
+  std::unique_ptr<void, HandleDestructor> ThePointer;
+  ptrdiff_t TheElementCount;
+};
+
+template <typename T>
+AsyncHostMemory<T>::AsyncHostMemory(AsyncHostMemory &&) noexcept = default;
+template <typename T>
+AsyncHostMemory<T> &AsyncHostMemory<T>::
+operator=(AsyncHostMemory &&) noexcept = default;
+
+/// Owned registered host memory.
+///
+/// Like AsyncHostMemory, but this memory also frees itself in addition to
+/// unpinning/unregistering itself when it goes out of scope.
+template <typename ElementType> class OwnedAsyncHostMemory {
+public:
+  using remove_const_type = typename std::remove_const<ElementType>::type;
+
+  OwnedAsyncHostMemory(const OwnedAsyncHostMemory &) = delete;
+  OwnedAsyncHostMemory &operator=(const OwnedAsyncHostMemory &) = delete;
+  OwnedAsyncHostMemory(OwnedAsyncHostMemory &&) noexcept;
+  OwnedAsyncHostMemory &operator=(OwnedAsyncHostMemory &&) noexcept;
+
+  ~OwnedAsyncHostMemory() {
+    if (ThePointer.get()) {
+      // We use placement new to construct these objects, so we have to call the
+      // destructors explicitly.
+      for (ptrdiff_t I = 0; I < TheElementCount; ++I)
+        static_cast<ElementType *>(ThePointer.get())[I].~ElementType();
+    }
+  }
+
+  ElementType *get() const {
+    return const_cast<ElementType *>(
+        static_cast<remove_const_type *>(ThePointer.get()));
+  }
+
+  ElementType &operator[](ptrdiff_t I) const {
+    assert(I >= 0 && I < TheElementCount);
+    return get()[I];
+  }
+
+private:
+  template <typename T> friend class AsyncHostMemorySpan;
+
+  friend class Platform;
+
+  OwnedAsyncHostMemory(void *Memory, ptrdiff_t ElementCount,
+                       HandleDestructor Destructor)
+      : ThePointer(Memory, Destructor), TheElementCount(ElementCount) {}
+
+  std::unique_ptr<void, HandleDestructor> ThePointer;
+  ptrdiff_t TheElementCount;
+};
+
+template <typename T>
+OwnedAsyncHostMemory<T>::OwnedAsyncHostMemory(
+    OwnedAsyncHostMemory &&) noexcept = default;
+template <typename T>
+OwnedAsyncHostMemory<T> &OwnedAsyncHostMemory<T>::
+operator=(OwnedAsyncHostMemory &&) noexcept = default;
+
+/// View into registered host memory.
+///
+/// Like Span but for registered host memory.
+template <typename ElementType> class AsyncHostMemorySpan {
+public:
+  /// \name constants and types
+  /// \{
+  using element_type = ElementType;
+  using index_type = std::ptrdiff_t;
+  using pointer = element_type *;
+  using reference = element_type &;
+  using iterator = element_type *;
+  using const_iterator = const element_type *;
+  using value_type = typename std::remove_const<element_type>::type;
+  /// \}
+
+  AsyncHostMemorySpan() : TheSpan() {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  AsyncHostMemorySpan(AsyncHostMemory<OtherElementType> &Memory)
+      : TheSpan(Memory.data(), Memory.size()) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  AsyncHostMemorySpan(OwnedAsyncHostMemory<OtherElementType> &Owned)
+      : TheSpan(Owned.get(), Owned.TheElementCount) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  AsyncHostMemorySpan(AsyncHostMemorySpan<OtherElementType> &ASpan)
+      : TheSpan(ASpan) {}
+
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  AsyncHostMemorySpan(AsyncHostMemorySpan<OtherElementType> &&Span)
+      : TheSpan(Span) {}
+
+  /// \name observers
+  /// \{
+  index_type length() const { return TheSpan.length(); }
+  index_type size() const { return TheSpan.size(); }
+  index_type byte_size() const { // NOLINT
+    return TheSpan.size() * sizeof(element_type);
+  }
+  bool empty() const { return TheSpan.empty(); }
+  /// \}
+
+  pointer data() const noexcept { return TheSpan.data(); }
+  operator element_type *() const { return TheSpan.data(); }
+
+  AsyncHostMemorySpan<element_type> first(index_type Count) const {
+    return AsyncHostMemorySpan<element_type>(TheSpan.first(Count));
+  }
+
+  AsyncHostMemorySpan<element_type> last(index_type Count) const {
+    return AsyncHostMemorySpan<element_type>(TheSpan.last(Count));
+  }
+
+  AsyncHostMemorySpan<element_type>
+  subspan(index_type Offset, index_type Count = dynamic_extent) const {
+    return AsyncHostMemorySpan<element_type>(TheSpan.subspan(Offset, Count));
+  }
+
+private:
+  template <typename T> friend class AsyncHostMemory;
+
+  explicit AsyncHostMemorySpan(Span<ElementType> ArraySpan)
+      : TheSpan(ArraySpan) {}
+
+  Span<ElementType> TheSpan;
+};
+
+} // namespace acxxel
+
+#endif // ACXXEL_ACXXEL_H
diff --git a/parallel-libs/acxxel/config.h.in b/parallel-libs/acxxel/config.h.in

new file mode 100644 (file)

index 0000000..64d9099
--- /dev/null
+++ b/parallel-libs/acxxel/config.h.in
@@ -0,0 +1,16 @@
+//===--- config.h - Macros generated during configuration -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file declares macros that are generated during the configuration stage
+/// of the build.
+///
+//===----------------------------------------------------------------------===//
+
+#cmakedefine ACXXEL_ENABLE_CUDA
+#cmakedefine ACXXEL_ENABLE_OPENCL
diff --git a/parallel-libs/acxxel/cuda_acxxel.cpp b/parallel-libs/acxxel/cuda_acxxel.cpp

new file mode 100644 (file)

index 0000000..9052b8d
--- /dev/null
+++ b/parallel-libs/acxxel/cuda_acxxel.cpp
@@ -0,0 +1,548 @@
+//===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file defines the standard CUDA implementation of the Acxxel API.
+///
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+
+#include <array>
+#include <cassert>
+#include <sstream>
+#include <vector>
+
+namespace acxxel {
+
+namespace {
+
+/// Index of active device for this thread.
+thread_local int ActiveDeviceIndex = 0;
+
+static std::string getCUErrorMessage(CUresult Result) {
+  if (!Result)
+    return "success";
+  const char *ErrorName = "UNKNOWN_ERROR_NAME";
+  const char *ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION";
+  cuGetErrorName(Result, &ErrorName);
+  cuGetErrorString(Result, &ErrorDescription);
+  std::ostringstream OutStream;
+  OutStream << "CUDA driver error: code = " << Result
+            << ", name = " << ErrorName
+            << ", description = " << ErrorDescription;
+  return OutStream.str();
+}
+
+static Status getCUError(CUresult Result, const std::string &Message) {
+  if (!Result)
+    return Status();
+  std::ostringstream OutStream;
+  OutStream << getCUErrorMessage(Result) << ", message = " << Message;
+  return Status(OutStream.str());
+}
+
+static std::string getCUDAErrorMessage(cudaError_t E) {
+  if (!E)
+    return "success";
+  std::ostringstream OutStream;
+  OutStream << "CUDA runtime error: code = " << E
+            << ", name = " << cudaGetErrorName(E)
+            << ", description = " << cudaGetErrorString(E);
+  return OutStream.str();
+}
+
+static Status getCUDAError(cudaError_t E, const std::string &Message) {
+  if (!E)
+    return Status();
+  std::ostringstream OutStream;
+  OutStream << getCUDAErrorMessage(E) << ", message = " << Message;
+  return Status(OutStream.str());
+}
+
+static void logCUWarning(CUresult Result, const std::string &Message) {
+  if (Result) {
+    std::ostringstream OutStream;
+    OutStream << Message << ": " << getCUErrorMessage(Result);
+    logWarning(OutStream.str());
+  }
+}
+
+/// A CUDA Platform implementation.
+class CUDAPlatform : public Platform {
+public:
+  ~CUDAPlatform() override = default;
+
+  static Expected<CUDAPlatform> create();
+
+  Expected<int> getDeviceCount() override;
+
+  Status setActiveDeviceForThread(int DeviceIndex) override;
+
+  int getActiveDeviceForThread() override;
+
+  Expected<Stream> createStream() override;
+
+  Status streamSync(void *Stream) override;
+
+  Status streamWaitOnEvent(void *Stream, void *Event) override;
+
+  Expected<Event> createEvent() override;
+
+protected:
+  Expected<void *> rawMallocD(ptrdiff_t ByteCount) override;
+  HandleDestructor getDeviceMemoryHandleDestructor() override;
+  void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
+                                  size_t ByteOffset) override;
+  virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
+
+  Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol) override;
+  Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol) override;
+
+  Status rawCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                     void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
+                     ptrdiff_t ByteCount) override;
+  Status rawCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                     void *HostDst, ptrdiff_t ByteCount) override;
+  Status rawCopyHToD(const void *HostSrc, void *DeviceDst,
+                     ptrdiff_t DeviceDstByteOffset,
+                     ptrdiff_t ByteCount) override;
+
+  Status rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, ptrdiff_t ByteCount,
+                    char ByteValue) override;
+
+  Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
+  HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
+
+  Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
+  HandleDestructor getFreeHostMemoryHandleDestructor() override;
+
+  Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                       void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
+                       ptrdiff_t ByteCount, void *Stream) override;
+  Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                       void *HostDst, ptrdiff_t ByteCount,
+                       void *Stream) override;
+  Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
+                       ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
+                       void *Stream) override;
+
+  Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                      ptrdiff_t ByteCount, char ByteValue,
+                      void *Stream) override;
+
+  Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
+
+  Expected<Program> createProgramFromSource(Span<const char> Source) override;
+
+  Status enqueueEvent(void *Event, void *Stream) override;
+  bool eventIsDone(void *Event) override;
+  Status eventSync(void *Event) override;
+  Expected<float> getSecondsBetweenEvents(void *StartEvent,
+                                          void *EndEvent) override;
+
+  Expected<void *> rawCreateKernel(void *Program,
+                                   const std::string &Name) override;
+  HandleDestructor getKernelHandleDestructor() override;
+
+  Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
+                                KernelLaunchDimensions LaunchDimensions,
+                                Span<void *> Arguments,
+                                Span<size_t> ArgumentSizes,
+                                size_t SharedMemoryBytes) override;
+
+private:
+  explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
+      : TheContexts(Contexts) {}
+
+  // Vector of contexts for each device.
+  std::vector<CUcontext> TheContexts;
+};
+
+Expected<CUDAPlatform> CUDAPlatform::create() {
+  std::vector<CUcontext> Contexts;
+  if (CUresult Result = cuInit(0))
+    return getCUError(Result, "cuInit");
+
+  int DeviceCount = 0;
+  if (CUresult Result = cuDeviceGetCount(&DeviceCount))
+    return getCUError(Result, "cuDeviceGetCount");
+
+  for (int I = 0; I < DeviceCount; ++I) {
+    CUdevice Device;
+    if (CUresult Result = cuDeviceGet(&Device, I))
+      return getCUError(Result, "cuDeviceGet");
+    CUcontext Context;
+    if (CUresult Result = cuDevicePrimaryCtxRetain(&Context, Device))
+      return getCUError(Result, "cuDevicePrimaryCtxRetain");
+    if (CUresult Result = cuCtxSetCurrent(Context))
+      return getCUError(Result, "cuCtxSetCurrent");
+    Contexts.emplace_back(Context);
+  }
+
+  return CUDAPlatform(Contexts);
+}
+
+Status CUDAPlatform::setActiveDeviceForThread(int DeviceIndex) {
+  if (static_cast<size_t>(DeviceIndex) >= TheContexts.size())
+    return Status("invalid device index for SetActiveDevice: " +
+                  std::to_string(DeviceIndex));
+  ActiveDeviceIndex = DeviceIndex;
+  return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
+                    "setActiveDeviceForThread cuCtxSetCurrent");
+}
+
+int CUDAPlatform::getActiveDeviceForThread() { return ActiveDeviceIndex; }
+
+Expected<int> CUDAPlatform::getDeviceCount() {
+  int Count = 0;
+  if (CUresult Result = cuDeviceGetCount(&Count))
+    return getCUError(Result, "cuDeviceGetCount");
+  return Count;
+}
+
+static void cudaDestroyStream(void *H) {
+  logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)),
+               "cuStreamDestroy");
+}
+
+Expected<Stream> CUDAPlatform::createStream() {
+  unsigned int Flags = CU_STREAM_DEFAULT;
+  CUstream Handle;
+  if (CUresult Result = cuStreamCreate(&Handle, Flags))
+    return getCUError(Result, "cuStreamCreate");
+  return constructStream(this, Handle, cudaDestroyStream);
+}
+
+Status CUDAPlatform::streamSync(void *Stream) {
+  return getCUError(cuStreamSynchronize(static_cast<CUstream_st *>(Stream)),
+                    "cuStreamSynchronize");
+}
+
+Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) {
+  // CUDA docs says flags must be 0.
+  unsigned int Flags = 0u;
+  return getCUError(cuStreamWaitEvent(static_cast<CUstream_st *>(Stream),
+                                      static_cast<CUevent_st *>(Event), Flags),
+                    "cuStreamWaitEvent");
+}
+
+static void cudaDestroyEvent(void *H) {
+  logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
+}
+
+Expected<Event> CUDAPlatform::createEvent() {
+  unsigned int Flags = CU_EVENT_DEFAULT;
+  CUevent Handle;
+  if (CUresult Result = cuEventCreate(&Handle, Flags))
+    return getCUError(Result, "cuEventCreate");
+  return constructEvent(this, Handle, cudaDestroyEvent);
+}
+
+Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
+  return getCUError(cuEventRecord(static_cast<CUevent_st *>(Event),
+                                  static_cast<CUstream_st *>(Stream)),
+                    "cuEventRecord");
+}
+
+bool CUDAPlatform::eventIsDone(void *Event) {
+  return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY;
+}
+
+Status CUDAPlatform::eventSync(void *Event) {
+  return getCUError(cuEventSynchronize(static_cast<CUevent_st *>(Event)),
+                    "cuEventSynchronize");
+}
+
+Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
+                                                      void *EndEvent) {
+  float Milliseconds;
+  if (CUresult Result = cuEventElapsedTime(
+          &Milliseconds, static_cast<CUevent_st *>(StartEvent),
+          static_cast<CUevent_st *>(EndEvent)))
+    return getCUError(Result, "cuEventElapsedTime");
+  return Milliseconds * 1e-6;
+}
+
+Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount) {
+  if (!ByteCount)
+    return nullptr;
+  CUdeviceptr Pointer;
+  if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
+    return getCUError(Result, "cuMemAlloc");
+  return reinterpret_cast<void *>(Pointer);
+}
+
+static void cudaDestroyDeviceMemory(void *H) {
+  logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree");
+}
+
+HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() {
+  return cudaDestroyDeviceMemory;
+}
+
+void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t,
+                                              size_t ByteOffset) {
+  return static_cast<char *>(BaseHandle) + ByteOffset;
+}
+
+void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
+  // Do nothing for this platform.
+}
+
+Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol) {
+  void *Address;
+  if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol))
+    return getCUDAError(Status, "cudaGetSymbolAddress");
+  return Address;
+}
+
+Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol) {
+  size_t Size;
+  if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol))
+    return getCUDAError(Status, "cudaGetSymbolSize");
+  return Size;
+}
+
+static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
+  return static_cast<const void *>(static_cast<const char *>(Ptr) + ByteOffset);
+}
+
+static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
+  return static_cast<void *>(static_cast<char *>(Ptr) + ByteOffset);
+}
+
+Status CUDAPlatform::rawCopyDToD(const void *DeviceSrc,
+                                 ptrdiff_t DeviceSrcByteOffset, void *DeviceDst,
+                                 ptrdiff_t DeviceDstByteOffset,
+                                 ptrdiff_t ByteCount) {
+  return getCUError(cuMemcpyDtoD(reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
+                                     DeviceDst, DeviceDstByteOffset)),
+                                 reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
+                                     DeviceSrc, DeviceSrcByteOffset)),
+                                 ByteCount),
+                    "cuMemcpyDtoD");
+}
+
+Status CUDAPlatform::rawCopyDToH(const void *DeviceSrc,
+                                 ptrdiff_t DeviceSrcByteOffset, void *HostDst,
+                                 ptrdiff_t ByteCount) {
+  return getCUError(
+      cuMemcpyDtoH(HostDst, reinterpret_cast<CUdeviceptr>(
+                                offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
+                   ByteCount),
+      "cuMemcpyDtoH");
+}
+
+Status CUDAPlatform::rawCopyHToD(const void *HostSrc, void *DeviceDst,
+                                 ptrdiff_t DeviceDstByteOffset,
+                                 ptrdiff_t ByteCount) {
+  return getCUError(cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
+                                     DeviceDst, DeviceDstByteOffset)),
+                                 HostSrc, ByteCount),
+                    "cuMemcpyHtoD");
+}
+
+Status CUDAPlatform::rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                                ptrdiff_t ByteCount, char ByteValue) {
+  return getCUError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(
+                                   offsetVoidPtr(DeviceDst, ByteOffset)),
+                               ByteValue, ByteCount),
+                    "cuMemsetD8");
+}
+
+Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
+                                        ptrdiff_t ByteCount) {
+  unsigned int Flags = 0;
+  return getCUError(
+      cuMemHostRegister(const_cast<void *>(Memory), ByteCount, Flags),
+      "cuMemHostRegiser");
+}
+
+static void cudaUnregisterHostMemoryHandleDestructor(void *H) {
+  logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister");
+}
+
+HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() {
+  return cudaUnregisterHostMemoryHandleDestructor;
+}
+
+Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
+  unsigned int Flags = 0;
+  void *Memory;
+  if (CUresult Result = cuMemHostAlloc(&Memory, ByteCount, Flags))
+    return getCUError(Result, "cuMemHostAlloc");
+  return Memory;
+}
+
+static void cudaFreeHostMemoryHandleDestructor(void *H) {
+  logCUWarning(cuMemFreeHost(H), "cuMemFreeHost");
+}
+
+HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() {
+  return cudaFreeHostMemoryHandleDestructor;
+}
+
+Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc,
+                                   ptrdiff_t DeviceSrcByteOffset,
+                                   void *DeviceDst,
+                                   ptrdiff_t DeviceDstByteOffset,
+                                   ptrdiff_t ByteCount, void *Stream) {
+  return getCUError(
+      cuMemcpyDtoDAsync(reinterpret_cast<CUdeviceptr>(
+                            offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
+                        reinterpret_cast<CUdeviceptr>(
+                            offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
+                        ByteCount, static_cast<CUstream_st *>(Stream)),
+      "cuMemcpyDtoDAsync");
+}
+
+Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc,
+                                   ptrdiff_t DeviceSrcByteOffset, void *HostDst,
+                                   ptrdiff_t ByteCount, void *Stream) {
+  return getCUError(
+      cuMemcpyDtoHAsync(HostDst, reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
+                                     DeviceSrc, DeviceSrcByteOffset)),
+                        ByteCount, static_cast<CUstream_st *>(Stream)),
+      "cuMemcpyDtoHAsync");
+}
+
+Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
+                                   ptrdiff_t DeviceDstByteOffset,
+                                   ptrdiff_t ByteCount, void *Stream) {
+  return getCUError(
+      cuMemcpyHtoDAsync(reinterpret_cast<CUdeviceptr>(
+                            offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
+                        HostSrc, ByteCount, static_cast<CUstream_st *>(Stream)),
+      "cuMemcpyHtoDAsync");
+}
+
+Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                                  ptrdiff_t ByteCount, char ByteValue,
+                                  void *Stream) {
+  return getCUError(
+      cuMemsetD8Async(
+          reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset)),
+          ByteValue, ByteCount, static_cast<CUstream_st *>(Stream)),
+      "cuMemsetD8Async");
+}
+
+struct StreamCallbackUserData {
+  StreamCallbackUserData(Stream &Stream, StreamCallback Function)
+      : TheStream(Stream), TheFunction(std::move(Function)) {}
+
+  Stream &TheStream;
+  StreamCallback TheFunction;
+};
+
+static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status,
+                                         void *UserData) {
+  std::unique_ptr<StreamCallbackUserData> Data(
+      static_cast<StreamCallbackUserData *>(UserData));
+  Stream &TheStream = Data->TheStream;
+  assert(static_cast<CUstream_st *>(TheStream) == HStream);
+  Data->TheFunction(TheStream,
+                    getCUError(Status, "stream callback error state"));
+}
+
+Status CUDAPlatform::addStreamCallback(Stream &Stream,
+                                       StreamCallback Callback) {
+  // CUDA docs say flags must always be 0 here.
+  unsigned int Flags = 0u;
+  std::unique_ptr<StreamCallbackUserData> UserData(
+      new StreamCallbackUserData(Stream, std::move(Callback)));
+  return getCUError(cuStreamAddCallback(Stream, cuStreamCallbackShim,
+                                        UserData.release(), Flags),
+                    "cuStreamAddCallback");
+}
+
+static void cudaDestroyProgram(void *H) {
+  logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
+}
+
+Expected<Program>
+CUDAPlatform::createProgramFromSource(Span<const char> Source) {
+  CUmodule Module;
+  constexpr int LogBufferSizeBytes = 1024;
+  char InfoLogBuffer[LogBufferSizeBytes];
+  char ErrorLogBuffer[LogBufferSizeBytes];
+  constexpr size_t OptionsCount = 4;
+  std::array<CUjit_option, OptionsCount> OptionNames = {
+      {CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+       CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}};
+  std::array<void *, OptionsCount> OptionValues = {
+      {InfoLogBuffer, const_cast<int *>(&LogBufferSizeBytes), ErrorLogBuffer,
+       const_cast<int *>(&LogBufferSizeBytes)}};
+  if (CUresult Result =
+          cuModuleLoadDataEx(&Module, Source.data(), OptionsCount,
+                             OptionNames.data(), OptionValues.data())) {
+    InfoLogBuffer[LogBufferSizeBytes - 1] = '\0';
+    ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0';
+    std::ostringstream OutStream;
+    OutStream << "Error creating program from source: "
+              << getCUErrorMessage(Result)
+              << "\nINFO MESSAGES\n================\n"
+              << InfoLogBuffer << "\nERROR MESSAGES\n==================\n"
+              << ErrorLogBuffer;
+    return Status(OutStream.str());
+  }
+  return constructProgram(this, Module, cudaDestroyProgram);
+}
+
+Expected<void *> CUDAPlatform::rawCreateKernel(void *Program,
+                                               const std::string &Name) {
+  CUmodule Module = static_cast<CUmodule>(Program);
+  CUfunction Kernel;
+  if (CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str()))
+    return getCUError(Result, "cuModuleGetFunction");
+  return Kernel;
+}
+
+static void cudaDestroyKernel(void *) {
+  // Do nothing.
+}
+
+HandleDestructor CUDAPlatform::getKernelHandleDestructor() {
+  return cudaDestroyKernel;
+}
+
+Status CUDAPlatform::rawEnqueueKernelLaunch(
+    void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
+    Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) {
+  return getCUError(
+      cuLaunchKernel(static_cast<CUfunction>(Kernel), LaunchDimensions.GridX,
+                     LaunchDimensions.GridY, LaunchDimensions.GridZ,
+                     LaunchDimensions.BlockX, LaunchDimensions.BlockY,
+                     LaunchDimensions.BlockZ, SharedMemoryBytes,
+                     static_cast<CUstream>(Stream), Arguments.data(), nullptr),
+      "cuLaunchKernel");
+}
+
+} // namespace
+
+namespace cuda {
+
+/// Gets the CUDAPlatform instance and returns it as an unowned pointer to a
+/// Platform.
+Expected<Platform *> getPlatform() {
+  static auto MaybePlatform = []() -> Expected<CUDAPlatform *> {
+    Expected<CUDAPlatform> CreationResult = CUDAPlatform::create();
+    if (CreationResult.isError())
+      return CreationResult.getError();
+    else
+      return new CUDAPlatform(CreationResult.takeValue());
+  }();
+  return MaybePlatform;
+}
+
+} // namespace cuda
+
+} // namespace acxxel
diff --git a/parallel-libs/acxxel/customdoxygen.css b/parallel-libs/acxxel/customdoxygen.css

new file mode 100644 (file)

index 0000000..a40ac95
--- /dev/null
+++ b/parallel-libs/acxxel/customdoxygen.css
@@ -0,0 +1,20 @@
+body {
+  background-color: #e0e0eb;
+}
+
+div.header {
+  margin-left: auto;
+  margin-right: auto;
+  max-width: 60em;
+  padding-left: 2em;
+  padding-right: 2em;
+}
+
+div.contents {
+  margin-left: auto;
+  margin-right: auto;
+  max-width: 60em;
+  background-color: white;
+  padding: 2em;
+  border-radius: 1em;
+}
diff --git a/parallel-libs/acxxel/examples/CMakeLists.txt b/parallel-libs/acxxel/examples/CMakeLists.txt

new file mode 100644 (file)

index 0000000..585e80a
--- /dev/null
+++ b/parallel-libs/acxxel/examples/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CUDA_HOST_COMPILER gcc)
+set(CUDA_NVCC_FLAGS -std=c++11)
+
+if(ACXXEL_ENABLE_CUDA)
+cuda_add_executable(simple_example simple_example.cu)
+target_link_libraries(simple_example acxxel)
+endif()
+
+if(ACXXEL_ENABLE_OPENCL)
+add_executable(opencl_example opencl_example.cpp)
+target_link_libraries(opencl_example acxxel ${OpenCL_LIBRARIES})
+endif()
diff --git a/parallel-libs/acxxel/examples/opencl_example.cpp b/parallel-libs/acxxel/examples/opencl_example.cpp

new file mode 100644 (file)

index 0000000..713daac
--- /dev/null
+++ b/parallel-libs/acxxel/examples/opencl_example.cpp
@@ -0,0 +1,70 @@
+//===--- opencl_example.cpp - Example of using Acxxel with OpenCL ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file is an example of using OpenCL with Acxxel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+
+#include <array>
+#include <cstdio>
+#include <cstring>
+
+static const char *SaxpyKernelSource = R"(
+__kernel void saxpyKernel(float A, __global float *X, __global float *Y, int N) {
+  int I = get_global_id(0);
+  if (I < N)
+    X[I] = A * X[I] + Y[I];
+}
+)";
+
+template <size_t N>
+void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
+  acxxel::Platform *OpenCL = acxxel::getOpenCLPlatform().getValue();
+  acxxel::Stream Stream = OpenCL->createStream().takeValue();
+  auto DeviceX = OpenCL->mallocD<float>(N).takeValue();
+  auto DeviceY = OpenCL->mallocD<float>(N).takeValue();
+  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);
+  acxxel::Program Program =
+      OpenCL
+          ->createProgramFromSource(acxxel::Span<const char>(
+              SaxpyKernelSource, std::strlen(SaxpyKernelSource)))
+          .takeValue();
+  acxxel::Kernel Kernel = Program.createKernel("saxpyKernel").takeValue();
+  float *RawX = static_cast<float *>(DeviceX);
+  float *RawY = static_cast<float *>(DeviceY);
+  int IntLength = N;
+  void *Arguments[] = {&A, &RawX, &RawY, &IntLength};
+  size_t ArgumentSizes[] = {sizeof(float), sizeof(float *), sizeof(float *),
+                            sizeof(int)};
+  acxxel::Status Status =
+      Stream.asyncKernelLaunch(Kernel, N, Arguments, ArgumentSizes)
+          .syncCopyDToH(DeviceX, X)
+          .sync();
+  if (Status.isError()) {
+    std::fprintf(stderr, "Error during saxpy: %s\n",
+                 Status.getMessage().c_str());
+    std::exit(EXIT_FAILURE);
+  }
+}
+
+int main() {
+  float A = 2.f;
+  std::array<float, 3> X = {0.f, 1.f, 2.f};
+  std::array<float, 3> Y = {3.f, 4.f, 5.f};
+  std::array<float, 3> Expected = {3.f, 6.f, 9.f};
+  saxpy(A, X, Y);
+  for (int I = 0; I < 3; ++I)
+    if (X[I] != Expected[I]) {
+      std::fprintf(stderr, "Mismatch at position %d, %f != %f\n", I, X[I],
+                   Expected[I]);
+      std::exit(EXIT_FAILURE);
+    }
+}
diff --git a/parallel-libs/acxxel/examples/simple_example.cu b/parallel-libs/acxxel/examples/simple_example.cu

new file mode 100644 (file)

index 0000000..588a033
--- /dev/null
+++ b/parallel-libs/acxxel/examples/simple_example.cu
@@ -0,0 +1,110 @@
+//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file is a simple example of using Acxxel.
+///
+//===----------------------------------------------------------------------===//
+
+/// [Example simple saxpy]
+#include "acxxel.h"
+
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+
+// A standard CUDA kernel.
+__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
+  int I = (blockDim.x * blockIdx.x) + threadIdx.x;
+  if (I < N)
+    X[I] = A * X[I] + Y[I];
+}
+
+// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
+template <size_t N>
+void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
+  // Get the CUDA platform and make a CUDA stream.
+  acxxel::Platform *CUDA = acxxel::getCUDAPlatform().getValue();
+  acxxel::Stream Stream = CUDA->createStream().takeValue();
+
+  // Allocate space for device arrays.
+  auto DeviceX = CUDA->mallocD<float>(N).takeValue();
+  auto DeviceY = CUDA->mallocD<float>(N).takeValue();
+
+  // Copy X and Y out to the device.
+  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);
+
+  // Launch the kernel using triple-chevron notation.
+  saxpyKernel<<<1, N, 0, Stream>>>(A, DeviceX, DeviceY, N);
+
+  // Copy the results back to the host.
+  acxxel::Status Status = Stream.syncCopyDToH(DeviceX, X).takeStatus();
+
+  // Check for any errors.
+  if (Status.isError()) {
+    std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
+                 Status.getMessage().c_str());
+    std::exit(EXIT_FAILURE);
+  }
+}
+/// [Example simple saxpy]
+
+/// [Example CUDA simple saxpy]
+template <size_t N>
+void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
+  // This size is needed all over the place, so give it a name.
+  constexpr size_t Size = N * sizeof(float);
+
+  // Allocate space for device arrays.
+  float *DeviceX;
+  float *DeviceY;
+  cudaMalloc(&DeviceX, Size);
+  cudaMalloc(&DeviceY, Size);
+
+  // Copy X and Y out to the device.
+  cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice);
+  cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice);
+
+  // Launch the kernel using triple-chevron notation.
+  saxpyKernel<<<1, N>>>(A, DeviceX, DeviceY, N);
+
+  // Copy the results back to the host.
+  cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost);
+
+  // Free resources.
+  cudaFree(DeviceX);
+  cudaFree(DeviceY);
+
+  // Check for any errors.
+  cudaError_t Error = cudaGetLastError();
+  if (Error) {
+    std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
+                 cudaGetErrorString(Error));
+    std::exit(EXIT_FAILURE);
+  }
+}
+/// [Example CUDA simple saxpy]
+
+template <typename F> void testSaxpy(F &&SaxpyFunction) {
+  float A = 2.f;
+  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
+  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
+  std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};
+  SaxpyFunction(A, X, Y);
+  for (int I = 0; I < 3; ++I)
+    if (X[I] != Expected[I]) {
+      std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", I, X[I],
+                   Expected[I]);
+      std::exit(EXIT_FAILURE);
+    }
+}
+
+int main() {
+  testSaxpy(saxpy<3>);
+  testSaxpy(cudaSaxpy<3>);
+}
diff --git a/parallel-libs/acxxel/opencl_acxxel.cpp b/parallel-libs/acxxel/opencl_acxxel.cpp

new file mode 100644 (file)

index 0000000..2ca74ed
--- /dev/null
+++ b/parallel-libs/acxxel/opencl_acxxel.cpp
@@ -0,0 +1,620 @@
+//===--- opencl_acxxel.cpp - OpenCL implementation of the Acxxel API ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file defines the standard OpenCL implementation of the Acxxel API.
+///
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+
+#include "CL/cl.h"
+
+#include <mutex>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+namespace acxxel {
+
+namespace {
+
+/// An ID containing the platform ID and the device ID within the platform.
+struct FullDeviceID {
+  cl_platform_id PlatformID;
+  cl_device_id DeviceID;
+
+  FullDeviceID(cl_platform_id PlatformID, cl_device_id DeviceID)
+      : PlatformID(PlatformID), DeviceID(DeviceID) {}
+};
+
+thread_local int ActiveDeviceIndex = 0;
+
+static std::string getOpenCLErrorMessage(cl_int Result) {
+  if (!Result)
+    return "success";
+  std::ostringstream OutStream;
+  OutStream << "OpenCL error: code = " << Result;
+  return OutStream.str();
+}
+
+static Status getOpenCLError(cl_int Result, const std::string &Message) {
+  if (!Result)
+    return Status();
+  std::ostringstream OutStream;
+  OutStream << getOpenCLErrorMessage(Result) << ", message = " << Message;
+  return Status(OutStream.str());
+}
+
+static void logOpenCLWarning(cl_int Result, const std::string &Message) {
+  if (Result) {
+    std::ostringstream OutStream;
+    OutStream << Message << ": " << getOpenCLErrorMessage(Result);
+    logWarning(OutStream.str());
+  }
+}
+
+class OpenCLPlatform : public Platform {
+public:
+  ~OpenCLPlatform() override = default;
+
+  static Expected<OpenCLPlatform> create();
+
+  Expected<int> getDeviceCount() override;
+
+  Status setActiveDeviceForThread(int DeviceIndex) override;
+
+  int getActiveDeviceForThread() override;
+
+  Expected<Stream> createStream() override;
+
+  Expected<Event> createEvent() override;
+
+  Expected<Program> createProgramFromSource(Span<const char> Source) override;
+
+protected:
+  Status streamSync(void *Stream) override;
+
+  Status streamWaitOnEvent(void *Stream, void *Event) override;
+
+  Expected<void *> rawMallocD(ptrdiff_t ByteCount) override;
+  HandleDestructor getDeviceMemoryHandleDestructor() override;
+  void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
+                                  size_t ByteOffset) override;
+  void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
+
+  Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol) override;
+  Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol) override;
+
+  Status rawCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                     void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
+                     ptrdiff_t ByteCount) override;
+  Status rawCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                     void *HostDst, ptrdiff_t ByteCount) override;
+  Status rawCopyHToD(const void *HostSrc, void *DeviceDst,
+                     ptrdiff_t DeviceDstByteOffset,
+                     ptrdiff_t ByteCount) override;
+
+  Status rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, ptrdiff_t ByteCount,
+                    char ByteValue) override;
+
+  Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
+  HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
+
+  Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
+  HandleDestructor getFreeHostMemoryHandleDestructor() override;
+
+  Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                       void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
+                       ptrdiff_t ByteCount, void *Stream) override;
+  Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
+                       void *HostDst, ptrdiff_t ByteCount,
+                       void *Stream) override;
+  Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
+                       ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
+                       void *Stream) override;
+
+  Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                      ptrdiff_t ByteCount, char ByteValue,
+                      void *Stream) override;
+
+  Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
+
+  Status enqueueEvent(void *Event, void *Stream) override;
+  bool eventIsDone(void *Event) override;
+  Status eventSync(void *Event) override;
+  Expected<float> getSecondsBetweenEvents(void *StartEvent,
+                                          void *EndEvent) override;
+
+  Expected<void *> rawCreateKernel(void *Program,
+                                   const std::string &Name) override;
+  HandleDestructor getKernelHandleDestructor() override;
+
+  Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
+                                KernelLaunchDimensions LaunchDimensions,
+                                Span<void *> Arguments,
+                                Span<size_t> ArgumentSizes,
+                                size_t SharedMemoryBytes) override;
+
+private:
+  OpenCLPlatform(std::vector<FullDeviceID> &&FullDeviceIDs,
+                 std::vector<cl_context> &&Contexts,
+                 std::vector<cl_command_queue> &&CommandQueues)
+      : FullDeviceIDs(std::move(FullDeviceIDs)), Contexts(std::move(Contexts)),
+        CommandQueues(std::move(CommandQueues)) {}
+
+  std::vector<FullDeviceID> FullDeviceIDs;
+  std::vector<cl_context> Contexts;
+  std::vector<cl_command_queue> CommandQueues;
+};
+
+Expected<OpenCLPlatform> OpenCLPlatform::create() {
+  constexpr cl_uint MaxNumEntries = 100;
+  cl_platform_id Platforms[MaxNumEntries];
+  cl_uint NumPlatforms;
+  if (cl_int Result = clGetPlatformIDs(MaxNumEntries, Platforms, &NumPlatforms))
+    return getOpenCLError(Result, "clGetPlatformIDs");
+
+  std::vector<FullDeviceID> FullDeviceIDs;
+  for (cl_uint PlatformIndex = 0; PlatformIndex < NumPlatforms;
+       ++PlatformIndex) {
+    cl_uint NumDevices;
+    cl_device_id Devices[MaxNumEntries];
+    if (cl_int Result =
+            clGetDeviceIDs(Platforms[PlatformIndex], CL_DEVICE_TYPE_ALL,
+                           MaxNumEntries, Devices, &NumDevices))
+      return getOpenCLError(Result, "clGetDeviceIDs");
+    for (cl_uint DeviceIndex = 0; DeviceIndex < NumDevices; ++DeviceIndex)
+      FullDeviceIDs.emplace_back(Platforms[PlatformIndex],
+                                 Devices[DeviceIndex]);
+  }
+
+  if (FullDeviceIDs.empty())
+    return Status("No OpenCL device available on this system.");
+
+  std::vector<cl_context> Contexts(FullDeviceIDs.size());
+  std::vector<cl_command_queue> CommandQueues(FullDeviceIDs.size());
+  for (size_t I = 0; I < FullDeviceIDs.size(); ++I) {
+    cl_int CreateContextResult;
+    Contexts[I] = clCreateContext(nullptr, 1, &FullDeviceIDs[I].DeviceID,
+                                  nullptr, nullptr, &CreateContextResult);
+    if (CreateContextResult)
+      return getOpenCLError(CreateContextResult, "clCreateContext");
+
+    cl_int CreateCommandQueueResult;
+    CommandQueues[I] = clCreateCommandQueue(
+        Contexts[I], FullDeviceIDs[I].DeviceID, CL_QUEUE_PROFILING_ENABLE,
+        &CreateCommandQueueResult);
+    if (CreateCommandQueueResult)
+      return getOpenCLError(CreateCommandQueueResult, "clCreateCommandQueue");
+  }
+
+  return OpenCLPlatform(std::move(FullDeviceIDs), std::move(Contexts),
+                        std::move(CommandQueues));
+}
+
+Expected<int> OpenCLPlatform::getDeviceCount() { return FullDeviceIDs.size(); }
+
+Status OpenCLPlatform::setActiveDeviceForThread(int DeviceIndex) {
+  if (static_cast<size_t>(DeviceIndex) >= FullDeviceIDs.size())
+    return Status("Could not set active device index to " +
+                  std::to_string(DeviceIndex) + " because there are only " +
+                  std::to_string(FullDeviceIDs.size()) +
+                  " devices in the system");
+  ActiveDeviceIndex = DeviceIndex;
+  return Status();
+}
+
+int OpenCLPlatform::getActiveDeviceForThread() { return ActiveDeviceIndex; }
+
+static void openCLDestroyStream(void *H) {
+  logOpenCLWarning(clReleaseCommandQueue(static_cast<cl_command_queue>(H)),
+                   "clReleaseCommandQueue");
+}
+
+Expected<Stream> OpenCLPlatform::createStream() {
+  cl_int Result;
+  cl_command_queue Queue = clCreateCommandQueue(
+      Contexts[ActiveDeviceIndex], FullDeviceIDs[ActiveDeviceIndex].DeviceID,
+      CL_QUEUE_PROFILING_ENABLE, &Result);
+  if (Result)
+    return getOpenCLError(Result, "clCreateCommandQueue");
+  return constructStream(this, Queue, openCLDestroyStream);
+}
+
+static void openCLEventDestroy(void *H) {
+  cl_event *CLEvent = static_cast<cl_event *>(H);
+  logOpenCLWarning(clReleaseEvent(*CLEvent), "clReleaseEvent");
+  delete CLEvent;
+}
+
+Status OpenCLPlatform::streamSync(void *Stream) {
+  return getOpenCLError(clFinish(static_cast<cl_command_queue>(Stream)),
+                        "clFinish");
+}
+
+Status OpenCLPlatform::streamWaitOnEvent(void *Stream, void *Event) {
+  cl_event *CLEvent = static_cast<cl_event *>(Event);
+  return getOpenCLError(
+      clEnqueueBarrierWithWaitList(static_cast<cl_command_queue>(Stream), 1,
+                                   CLEvent, nullptr),
+      "clEnqueueMarkerWithWaitList");
+}
+
+Expected<Event> OpenCLPlatform::createEvent() {
+  cl_int Result;
+  cl_event Event = clCreateUserEvent(Contexts[ActiveDeviceIndex], &Result);
+  if (Result)
+    return getOpenCLError(Result, "clCreateUserEvent");
+  if (cl_int Result = clSetUserEventStatus(Event, CL_COMPLETE))
+    return getOpenCLError(Result, "clSetUserEventStatus");
+  return constructEvent(this, new cl_event(Event), openCLEventDestroy);
+}
+
+static void openCLDestroyProgram(void *H) {
+  logOpenCLWarning(clReleaseProgram(static_cast<cl_program>(H)),
+                   "clReleaseProgram");
+}
+
+Expected<Program>
+OpenCLPlatform::createProgramFromSource(Span<const char> Source) {
+  cl_int Error;
+  const char *CSource = Source.data();
+  size_t SourceSize = Source.size();
+  cl_program Program = clCreateProgramWithSource(Contexts[ActiveDeviceIndex], 1,
+                                                 &CSource, &SourceSize, &Error);
+  if (Error)
+    return getOpenCLError(Error, "clCreateProgramWithSource");
+  cl_device_id DeviceID = FullDeviceIDs[ActiveDeviceIndex].DeviceID;
+  if (cl_int Error =
+          clBuildProgram(Program, 1, &DeviceID, nullptr, nullptr, nullptr))
+    return getOpenCLError(Error, "clBuildProgram");
+  return constructProgram(this, Program, openCLDestroyProgram);
+}
+
+Expected<void *> OpenCLPlatform::rawMallocD(ptrdiff_t ByteCount) {
+  cl_int Result;
+  cl_mem Memory = clCreateBuffer(Contexts[ActiveDeviceIndex], CL_MEM_READ_WRITE,
+                                 ByteCount, nullptr, &Result);
+  if (Result)
+    return getOpenCLError(Result, "clCreateBuffer");
+  return reinterpret_cast<void *>(Memory);
+}
+
+static void openCLDestroyDeviceMemory(void *H) {
+  logOpenCLWarning(clReleaseMemObject(static_cast<cl_mem>(H)),
+                   "clReleaseMemObject");
+}
+
+HandleDestructor OpenCLPlatform::getDeviceMemoryHandleDestructor() {
+  return openCLDestroyDeviceMemory;
+}
+
+void *OpenCLPlatform::getDeviceMemorySpanHandle(void *BaseHandle,
+                                                size_t ByteSize,
+                                                size_t ByteOffset) {
+  cl_int Error;
+  cl_buffer_region Region;
+  Region.origin = ByteOffset;
+  Region.size = ByteSize;
+  cl_mem SubBuffer =
+      clCreateSubBuffer(static_cast<cl_mem>(BaseHandle), 0,
+                        CL_BUFFER_CREATE_TYPE_REGION, &Region, &Error);
+  logOpenCLWarning(Error, "clCreateSubBuffer");
+  if (Error)
+    return nullptr;
+  return SubBuffer;
+}
+
+void OpenCLPlatform::rawDestroyDeviceMemorySpanHandle(void *Handle) {
+  openCLDestroyDeviceMemory(Handle);
+}
+
+Expected<void *>
+OpenCLPlatform::rawGetDeviceSymbolAddress(const void * /*Symbol*/) {
+  // This doesn't seem to have any equivalent in OpenCL.
+  return Status("not implemented");
+}
+
+Expected<ptrdiff_t>
+OpenCLPlatform::rawGetDeviceSymbolSize(const void * /*Symbol*/) {
+  // This doesn't seem to have any equivalent in OpenCL.
+  return Status("not implemented");
+}
+
+Status OpenCLPlatform::rawCopyDToD(const void *DeviceSrc,
+                                   ptrdiff_t DeviceSrcByteOffset,
+                                   void *DeviceDst,
+                                   ptrdiff_t DeviceDstByteOffset,
+                                   ptrdiff_t ByteCount) {
+  cl_event DoneEvent;
+  if (cl_int Result = clEnqueueCopyBuffer(
+          CommandQueues[ActiveDeviceIndex],
+          static_cast<cl_mem>(const_cast<void *>(DeviceSrc)),
+          static_cast<cl_mem>(DeviceDst), DeviceSrcByteOffset,
+          DeviceDstByteOffset, ByteCount, 0, nullptr, &DoneEvent))
+    return getOpenCLError(Result, "clEnqueueCopyBuffer");
+  return getOpenCLError(clWaitForEvents(1, &DoneEvent), "clWaitForEvents");
+}
+
+Status OpenCLPlatform::rawCopyDToH(const void *DeviceSrc,
+                                   ptrdiff_t DeviceSrcByteOffset, void *HostDst,
+                                   ptrdiff_t ByteCount) {
+  cl_event DoneEvent;
+  if (cl_int Result = clEnqueueReadBuffer(
+          CommandQueues[ActiveDeviceIndex],
+          static_cast<cl_mem>(const_cast<void *>(DeviceSrc)), CL_TRUE,
+          DeviceSrcByteOffset, ByteCount, HostDst, 0, nullptr, &DoneEvent))
+    return getOpenCLError(Result, "clEnqueueReadBuffer");
+  return getOpenCLError(clWaitForEvents(1, &DoneEvent), "clWaitForEvents");
+}
+
+Status OpenCLPlatform::rawCopyHToD(const void *HostSrc, void *DeviceDst,
+                                   ptrdiff_t DeviceDstByteOffset,
+                                   ptrdiff_t ByteCount) {
+  cl_event DoneEvent;
+  if (cl_int Result = clEnqueueWriteBuffer(
+          CommandQueues[ActiveDeviceIndex], static_cast<cl_mem>(DeviceDst),
+          CL_TRUE, DeviceDstByteOffset, ByteCount, HostSrc, 0, nullptr,
+          &DoneEvent))
+    return getOpenCLError(Result, "clEnqueueWriteBuffer");
+  return getOpenCLError(clWaitForEvents(1, &DoneEvent), "clWaitForEvents");
+}
+
+Status OpenCLPlatform::rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                                  ptrdiff_t ByteCount, char ByteValue) {
+  cl_event DoneEvent;
+  if (cl_int Result = clEnqueueFillBuffer(
+          CommandQueues[ActiveDeviceIndex], static_cast<cl_mem>(DeviceDst),
+          &ByteValue, 1, ByteOffset, ByteCount, 0, nullptr, &DoneEvent))
+    return getOpenCLError(Result, "clEnqueueFillBuffer");
+  return getOpenCLError(clWaitForEvents(1, &DoneEvent), "clWaitForEvents");
+}
+
+static void noOpHandleDestructor(void *) {}
+
+Status OpenCLPlatform::rawRegisterHostMem(const void * /*Memory*/,
+                                          ptrdiff_t /*ByteCount*/) {
+  // TODO(jhen): Do we want to do something to pin the memory here?
+  return Status();
+}
+
+HandleDestructor OpenCLPlatform::getUnregisterHostMemoryHandleDestructor() {
+  // TODO(jhen): Do we want to unpin the memory here?
+  return noOpHandleDestructor;
+}
+
+Expected<void *> OpenCLPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
+  // TODO(jhen): Do we want to do something to pin the memory here?
+  return std::malloc(ByteCount);
+}
+
+static void freeMemoryHandleDestructor(void *Memory) {
+  // TODO(jhen): Do we want to unpin the memory here?
+  std::free(Memory);
+}
+
+HandleDestructor OpenCLPlatform::getFreeHostMemoryHandleDestructor() {
+  return freeMemoryHandleDestructor;
+}
+
+Status OpenCLPlatform::asyncCopyDToD(const void *DeviceSrc,
+                                     ptrdiff_t DeviceSrcByteOffset,
+                                     void *DeviceDst,
+                                     ptrdiff_t DeviceDstByteOffset,
+                                     ptrdiff_t ByteCount, void *Stream) {
+  return getOpenCLError(
+      clEnqueueCopyBuffer(static_cast<cl_command_queue>(Stream),
+                          static_cast<cl_mem>(const_cast<void *>(DeviceSrc)),
+                          static_cast<cl_mem>(DeviceDst), DeviceSrcByteOffset,
+                          DeviceDstByteOffset, ByteCount, 0, nullptr, nullptr),
+      "clEnqueueCopyBuffer");
+}
+
+Status OpenCLPlatform::asyncCopyDToH(const void *DeviceSrc,
+                                     ptrdiff_t DeviceSrcByteOffset,
+                                     void *HostDst, ptrdiff_t ByteCount,
+                                     void *Stream) {
+  return getOpenCLError(
+      clEnqueueReadBuffer(static_cast<cl_command_queue>(Stream),
+                          static_cast<cl_mem>(const_cast<void *>(DeviceSrc)),
+                          CL_TRUE, DeviceSrcByteOffset, ByteCount, HostDst, 0,
+                          nullptr, nullptr),
+      "clEnqueueReadBuffer");
+}
+
+Status OpenCLPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
+                                     ptrdiff_t DeviceDstByteOffset,
+                                     ptrdiff_t ByteCount, void *Stream) {
+  return getOpenCLError(
+      clEnqueueWriteBuffer(static_cast<cl_command_queue>(Stream),
+                           static_cast<cl_mem>(DeviceDst), CL_TRUE,
+                           DeviceDstByteOffset, ByteCount, HostSrc, 0, nullptr,
+                           nullptr),
+      "clEnqueueWriteBuffer");
+}
+
+Status OpenCLPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
+                                    ptrdiff_t ByteCount, char ByteValue,
+                                    void *Stream) {
+  return getOpenCLError(
+      clEnqueueFillBuffer(static_cast<cl_command_queue>(Stream),
+                          static_cast<cl_mem>(DeviceDst), &ByteValue, 1,
+                          ByteOffset, ByteCount, 0, nullptr, nullptr),
+      "clEnqueueFillBuffer");
+}
+
+struct StreamCallbackUserData {
+  StreamCallbackUserData(Stream &TheStream, StreamCallback Function,
+                         cl_event EndEvent)
+      : TheStream(TheStream), TheFunction(std::move(Function)),
+        EndEvent(EndEvent) {}
+
+  Stream &TheStream;
+  StreamCallback TheFunction;
+  cl_event EndEvent;
+};
+
+// A function with the right signature to pass to clSetEventCallback.
+void CL_CALLBACK openCLStreamCallbackShim(cl_event,
+                                          cl_int EventCommandExecStatus,
+                                          void *UserData) {
+  std::unique_ptr<StreamCallbackUserData> Data(
+      static_cast<StreamCallbackUserData *>(UserData));
+  Data->TheFunction(
+      Data->TheStream,
+      getOpenCLError(EventCommandExecStatus, "stream callback error state"));
+  if (cl_int Result = clSetUserEventStatus(Data->EndEvent, CL_COMPLETE))
+    logOpenCLWarning(Result, "clSetUserEventStatus");
+  if (cl_int Result = clReleaseEvent(Data->EndEvent))
+    logOpenCLWarning(Result, "clReleaseEvent");
+}
+
+Status OpenCLPlatform::addStreamCallback(Stream &TheStream,
+                                         StreamCallback Callback) {
+  cl_int Result;
+  cl_event StartEvent = clCreateUserEvent(Contexts[ActiveDeviceIndex], &Result);
+  if (Result)
+    return getOpenCLError(Result, "clCreateUserEvent");
+  cl_event EndEvent = clCreateUserEvent(Contexts[ActiveDeviceIndex], &Result);
+  if (Result)
+    return getOpenCLError(Result, "clCreateUserEvent");
+  cl_event StartBarrierEvent;
+  if (cl_int Result = clEnqueueBarrierWithWaitList(
+          static_cast<cl_command_queue>(getStreamHandle(TheStream)), 1,
+          &StartEvent, &StartBarrierEvent))
+    return getOpenCLError(Result, "clEnqueueBarrierWithWaitList");
+
+  if (cl_int Result = clEnqueueBarrierWithWaitList(
+          static_cast<cl_command_queue>(getStreamHandle(TheStream)), 1,
+          &EndEvent, nullptr))
+    return getOpenCLError(Result, "clEnqueueBarrierWithWaitList");
+
+  std::unique_ptr<StreamCallbackUserData> UserData(
+      new StreamCallbackUserData(TheStream, std::move(Callback), EndEvent));
+  if (cl_int Result =
+          clSetEventCallback(StartBarrierEvent, CL_RUNNING,
+                             openCLStreamCallbackShim, UserData.release()))
+    return getOpenCLError(Result, "clSetEventCallback");
+
+  if (cl_int Result = clSetUserEventStatus(StartEvent, CL_COMPLETE))
+    return getOpenCLError(Result, "clSetUserEventStatus");
+
+  if (cl_int Result = clReleaseEvent(StartBarrierEvent))
+    return getOpenCLError(Result, "clReleaseEvent");
+
+  return getOpenCLError(clReleaseEvent(StartEvent), "clReleaseEvent");
+}
+
+Status OpenCLPlatform::enqueueEvent(void *Event, void *Stream) {
+  cl_event *CLEvent = static_cast<cl_event *>(Event);
+  cl_event OldEvent = *CLEvent;
+  cl_event NewEvent;
+  if (cl_int Result = clEnqueueMarkerWithWaitList(
+          static_cast<cl_command_queue>(Stream), 0, nullptr, &NewEvent))
+    return getOpenCLError(Result, "clEnqueueMarkerWithWaitList");
+  *CLEvent = NewEvent;
+  return getOpenCLError(clReleaseEvent(OldEvent), "clReleaseEvent");
+}
+
+bool OpenCLPlatform::eventIsDone(void *Event) {
+  cl_event *CLEvent = static_cast<cl_event *>(Event);
+  cl_int EventStatus;
+  logOpenCLWarning(clGetEventInfo(*CLEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                  sizeof(EventStatus), &EventStatus, nullptr),
+                   "clGetEventInfo");
+  return EventStatus == CL_COMPLETE || EventStatus < 0;
+}
+
+Status OpenCLPlatform::eventSync(void *Event) {
+  cl_event *CLEvent = static_cast<cl_event *>(Event);
+  return getOpenCLError(clWaitForEvents(1, CLEvent), "clWaitForEvents");
+}
+
+Expected<float> OpenCLPlatform::getSecondsBetweenEvents(void *StartEvent,
+                                                        void *EndEvent) {
+  cl_event *CLStartEvent = static_cast<cl_event *>(StartEvent);
+  cl_event *CLEndEvent = static_cast<cl_event *>(EndEvent);
+
+  cl_profiling_info ParamName = CL_PROFILING_COMMAND_END;
+  cl_ulong StartNanoseconds;
+  cl_ulong EndNanoseconds;
+  if (cl_int Result =
+          clGetEventProfilingInfo(*CLStartEvent, ParamName, sizeof(cl_ulong),
+                                  &StartNanoseconds, nullptr))
+    return getOpenCLError(Result, "clGetEventProfilingInfo");
+  if (cl_int Result = clGetEventProfilingInfo(
+          *CLEndEvent, ParamName, sizeof(cl_ulong), &EndNanoseconds, nullptr))
+    return getOpenCLError(Result, "clGetEventProfilingInfo");
+  return (EndNanoseconds - StartNanoseconds) * 1e-12;
+}
+
+Expected<void *> OpenCLPlatform::rawCreateKernel(void *Program,
+                                                 const std::string &Name) {
+
+  cl_int Error;
+  cl_kernel Kernel =
+      clCreateKernel(static_cast<cl_program>(Program), Name.c_str(), &Error);
+  if (Error)
+    return getOpenCLError(Error, "clCreateKernel");
+  return Kernel;
+}
+
+static void openCLDestroyKernel(void *H) {
+  logOpenCLWarning(clReleaseKernel(static_cast<cl_kernel>(H)),
+                   "clReleaseKernel");
+}
+
+HandleDestructor OpenCLPlatform::getKernelHandleDestructor() {
+  return openCLDestroyKernel;
+}
+
+Status OpenCLPlatform::rawEnqueueKernelLaunch(
+    void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
+    Span<void *> Arguments, Span<size_t> ArgumentSizes,
+    size_t SharedMemoryBytes) {
+  if (SharedMemoryBytes != 0)
+    return Status("OpenCL kernel launches only accept zero for the shared "
+                  "memory byte size");
+  cl_kernel TheKernel = static_cast<cl_kernel>(Kernel);
+  for (int I = 0; I < Arguments.size(); ++I)
+    if (cl_int Error =
+            clSetKernelArg(TheKernel, I, ArgumentSizes[I], Arguments[I]))
+      return getOpenCLError(Error, "clSetKernelArg");
+  size_t LocalWorkSize[] = {LaunchDimensions.BlockX, LaunchDimensions.BlockY,
+                            LaunchDimensions.BlockZ};
+  size_t GlobalWorkSize[] = {LaunchDimensions.BlockX * LaunchDimensions.GridX,
+                             LaunchDimensions.BlockY * LaunchDimensions.GridY,
+                             LaunchDimensions.BlockZ * LaunchDimensions.GridZ};
+  return getOpenCLError(
+      clEnqueueNDRangeKernel(static_cast<cl_command_queue>(Stream), TheKernel,
+                             3, nullptr, GlobalWorkSize, LocalWorkSize, 0,
+                             nullptr, nullptr),
+      "clEnqueueNDRangeKernel");
+}
+
+} // namespace
+
+namespace opencl {
+
+/// Gets an OpenCLPlatform instance and returns it as an unowned pointer to a
+/// Platform.
+Expected<Platform *> getPlatform() {
+  static auto MaybePlatform = []() -> Expected<OpenCLPlatform *> {
+    Expected<OpenCLPlatform> CreationResult = OpenCLPlatform::create();
+    if (CreationResult.isError())
+      return CreationResult.getError();
+    else
+      return new OpenCLPlatform(CreationResult.takeValue());
+  }();
+  return MaybePlatform;
+}
+
+} // namespace opencl
+
+} // namespace acxxel
diff --git a/parallel-libs/acxxel/span.h b/parallel-libs/acxxel/span.h

new file mode 100644 (file)

index 0000000..a1df624
--- /dev/null
+++ b/parallel-libs/acxxel/span.h
@@ -0,0 +1,222 @@
+//===--- span- The span class -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ACXXEL_SPAN_H
+#define ACXXEL_SPAN_H
+
+#include <array>
+#include <cstddef>
+#include <exception>
+#include <iterator>
+#include <type_traits>
+
+namespace acxxel {
+
+/// Value used to indicate slicing to the end of the span.
+static constexpr std::ptrdiff_t dynamic_extent = -1; // NOLINT
+
+class SpanBase {};
+
+/// Implementation of the proposed C++17 std::span class.
+///
+/// Based on the paper:
+/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0122r1.pdf
+template <typename ElementType> class Span : public SpanBase {
+public:
+  /// \name constants and types
+  /// \{
+
+  using element_type = ElementType;
+  using index_type = std::ptrdiff_t;
+  using pointer = element_type *;
+  using reference = element_type &;
+  using iterator = element_type *;
+  using const_iterator = const element_type *;
+  using value_type = typename std::remove_const<element_type>::type;
+
+  /// \}
+
+  /// \name constructors, copy, assignment, and destructor.
+  /// \{
+
+  /// Constructs an empty span with null pointer data.
+  Span() : Data(nullptr), Size(0) {}
+
+  /// Constructs an empty span with null pointer data.
+  // Intentionally implicit.
+  Span(std::nullptr_t) : Data(nullptr), Size(0) {}
+
+  /// Constructs a span from a pointer and element count.
+  Span(pointer Ptr, index_type Count) : Data(Ptr), Size(Count) {
+    if (Count < 0 || (!Ptr && Count))
+      std::terminate();
+  }
+
+  /// Constructs a span from a pointer to the fist element in the range and a
+  /// pointer to one past the last element in the range.
+  Span(pointer FirstElem, pointer LastElem)
+      : Data(FirstElem), Size(std::distance(FirstElem, LastElem)) {
+    if (Size < 0)
+      std::terminate();
+  }
+
+  /// Constructs a span from an array.
+  // Intentionally implicit.
+  template <typename T, size_t N> Span(T (&Arr)[N]) : Data(Arr), Size(N) {}
+
+  /// Constructs a span from a std::array.
+  // Intentionally implicit.
+  template <size_t N>
+  Span(const std::array<typename std::remove_const<element_type>::type, N> &Arr)
+      : Data(Arr.data()), Size(N) {}
+
+  /// Constructs a span from a container such as a std::vector.
+  // TODO(jhen): Put in a check to make sure this constructor does not
+  // participate in overload resolution unless Container meets the following
+  // requirements:
+  //  * Container is a contiguous container and a sequence container.
+  // Intentionally implicit.
+  template <typename Container>
+  Span(Container &Cont,
+       typename std::enable_if<
+           std::is_same<
+               typename std::remove_const<typename Container::value_type>::type,
+               typename std::remove_const<element_type>::type>::value &&
+           !std::is_array<Container>::value &&
+           !std::is_base_of<SpanBase, Container>::value &&
+           std::is_convertible<decltype(&Cont[0]), pointer>::value>::type * =
+           nullptr)
+      : Data(Cont.data()), Size(Cont.size()) {}
+
+  /// Avoids creating spans from expiring temporary objects.
+  // TODO(jhen): Put in a check to make sure this constructor does not
+  // participate in overload resolution unless Container meets the following
+  // requirements:
+  //  * Container is a contiguous container and a sequence container.
+  template <typename Container>
+  Span(Container &&Cont,
+       typename std::enable_if<
+           std::is_same<
+               typename std::remove_const<typename Container::value_type>::type,
+               typename std::remove_const<element_type>::type>::value &&
+           !std::is_array<Container>::value &&
+           !std::is_base_of<SpanBase, Container>::value &&
+           std::is_convertible<decltype(&Cont[0]), pointer>::value>::type * =
+           nullptr) = delete;
+
+  Span(const Span &) noexcept = default;
+  Span(Span &&) noexcept;
+
+  /// Constructs a span from copying a span of another type that can be
+  /// implicitly converted to the type stored by the constructed span.
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  Span(const Span<OtherElementType> &Other)
+      : Data(Other.Data), Size(Other.Size) {}
+
+  /// Constructs a span from moving a span of another type that can be
+  /// implicitly converted to the type stored by the constructed span.
+  // Intentionally implicit.
+  template <typename OtherElementType>
+  Span(Span<OtherElementType> &&Other) : Data(Other.Data), Size(Other.Size) {}
+
+  ~Span() = default;
+
+  Span &operator=(const Span &) noexcept = default;
+  Span &operator=(Span &&) noexcept;
+
+  /// \}
+
+  /// \name subviews
+  /// \{
+
+  /// Creates a span out of the first Count elements of this span.
+  Span<element_type> first(index_type Count) const {
+    bool Valid = Count >= 0 && Count <= size();
+    if (!Valid)
+      std::terminate();
+    return Span<element_type>(data(), Count);
+  }
+
+  /// Creates a span out of the last Count elements of this span.
+  Span<element_type> last(index_type Count) const {
+    bool Valid = Count >= 0 && Count <= size();
+    if (!Valid)
+      std::terminate();
+    return Span<element_type>(Count == 0 ? data() : data() + (size() - Count),
+                              Count);
+  }
+
+  /// Creates a span out of the Count elements of this span beginning at Offset.
+  ///
+  /// If no arguments is provided for Count, the new span will extend to the end
+  /// of the current span.
+  Span<element_type> subspan(index_type Offset,
+                             index_type Count = dynamic_extent) const {
+    bool Valid =
+        (Offset == 0 || (Offset > 0 && Offset <= size())) &&
+        (Count == dynamic_extent || (Count >= 0 && Offset + Count <= size()));
+    if (!Valid)
+      std::terminate();
+    return Span<element_type>(
+        data() + Offset, Count == dynamic_extent ? size() - Offset : Count);
+  }
+
+  /// \}
+
+  /// \name observers
+  /// \{
+
+  index_type length() const { return Size; }
+  index_type size() const { return Size; }
+  bool empty() const { return size() == 0; }
+
+  /// \}
+
+  /// \name element access
+  /// \{
+
+  reference operator[](index_type Idx) const {
+    bool Valid = Idx >= 0 && Idx < size();
+    if (!Valid)
+      std::terminate();
+    return Data[Idx];
+  }
+
+  reference operator()(index_type Idx) const { return operator[](Idx); }
+
+  pointer data() const noexcept { return Data; }
+
+  /// \}
+
+  /// \name iterator support
+  /// \{
+
+  iterator begin() const noexcept { return Data; }
+  iterator end() const noexcept { return Data + Size; }
+  const_iterator cbegin() const noexcept { return Data; }
+  const_iterator cend() const noexcept { return Data + Size; }
+
+  /// \}
+
+private:
+  template <typename OtherElementType> friend class Span;
+
+  pointer Data;
+  index_type Size;
+};
+
+template <typename ElementType>
+Span<ElementType>::Span(Span &&) noexcept = default;
+template <typename ElementType>
+Span<ElementType> &Span<ElementType>::operator=(Span &&) noexcept = default;
+
+} // namespace acxxel
+
+#endif // ACXXEL_SPAN_H
diff --git a/parallel-libs/acxxel/status.h b/parallel-libs/acxxel/status.h

new file mode 100644 (file)

index 0000000..ce86c70
--- /dev/null
+++ b/parallel-libs/acxxel/status.h
@@ -0,0 +1,236 @@
+//===--- status.h - Status and Expected classes -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ACXXEL_STATUS_H
+#define ACXXEL_STATUS_H
+
+#include <cassert>
+#include <string>
+
+// The clang compiler supports annotating class declarations with the
+// warn_unused_result attribute, and this has the meaning that whenever that
+// type is returned from a function, the function is marked as
+// warn_unused_result.
+//
+// The gcc compiler does not support warn_unused_result for classes, only for
+// functions, so we only use this feature with clang.
+#ifdef __clang__
+#define ACXXEL_WARN_UNUSED_RESULT_TYPE __attribute__((warn_unused_result))
+#else
+#define ACXXEL_WARN_UNUSED_RESULT_TYPE
+#endif
+
+namespace acxxel {
+
+/// Status type.
+///
+/// May represent failure with a string error message, or may indicate success.
+class ACXXEL_WARN_UNUSED_RESULT_TYPE Status {
+public:
+  /// Creates a Status representing success.
+  Status() : HasMessage(false) {}
+
+  /// Creates a Status representing failure with the given error message.
+  explicit Status(const std::string &Message)
+      : HasMessage(true), Message(Message) {}
+
+  Status(const Status &) = default;
+
+  Status &operator=(const Status &) = default;
+
+  Status(Status &&) noexcept = default;
+
+  // Cannot use default because the move assignment operator for std::string is
+  // not marked noexcept.
+  Status &operator=(Status &&That) noexcept {
+    HasMessage = That.HasMessage;
+    Message = std::move(That.Message);
+    return *this;
+  }
+
+  ~Status() = default;
+
+  /// Returns true if this Status represents failure. Otherwise, returns false.
+  bool isError() const { return HasMessage; }
+
+  /// Returns true if this Status represents success. Otherwise, returns false.
+  operator bool() const { return !HasMessage; }
+
+  /// Gets a reference to the error message for this Status.
+  ///
+  /// Should only be called if isError() returns true.
+  const std::string &getMessage() const { return Message; }
+
+private:
+  bool HasMessage;
+  std::string Message;
+};
+
+class ExpectedBase {
+protected:
+  enum class State {
+    SUCCESS,
+    FAILURE,
+    MOVED,
+  };
+};
+
+/// Either a value of type T or a Status representing failure.
+template <typename T> class Expected : public ExpectedBase {
+public:
+  /// Creates an Expected representing failure with the given Error status.
+  // Intentionally implicit.
+  Expected(Status AnError)
+      : TheState(State::FAILURE), TheError(std::move(AnError)) {
+    assert(AnError.isError() && "constructing an error Expected value from a "
+                                "success status is not allowed");
+  }
+
+  /// Creates an Expected representing success with the given value.
+  // Intentionally implicit.
+  Expected(T Value) : TheState(State::SUCCESS), TheValue(std::move(Value)) {}
+
+  Expected(const Expected &That) : TheState(That.TheState) {
+    switch (TheState) {
+    case State::SUCCESS:
+      new (&TheValue) T(That.TheValue);
+      break;
+    case State::FAILURE:
+      new (&TheError) Status(That.TheError);
+      break;
+    case State::MOVED:
+      // Nothing to do in this case.
+      break;
+    }
+  }
+
+  Expected &operator=(Expected That) {
+    TheState = That.TheState;
+    switch (TheState) {
+    case State::SUCCESS:
+      new (&TheValue) T(std::move(That.TheValue));
+      break;
+    case State::FAILURE:
+      new (&TheError) Status(std::move(That.TheError));
+      break;
+    case State::MOVED:
+      // Nothing to do in this case.
+      break;
+    }
+    return *this;
+  }
+
+  Expected(Expected &&That) noexcept : TheState(That.TheState) {
+    switch (TheState) {
+    case State::SUCCESS:
+      new (&TheValue) T(std::move(That.TheValue));
+      break;
+    case State::FAILURE:
+      new (&TheError) Status(std::move(That.TheError));
+      break;
+    case State::MOVED:
+      // Nothing to do in this case.
+      break;
+    }
+    That.TheState = State::MOVED;
+  }
+
+  template <typename U>
+  Expected(const Expected<U> &That) : TheState(That.TheState) {
+    switch (TheState) {
+    case State::SUCCESS:
+      new (&TheValue) T(That.TheValue);
+      break;
+    case State::FAILURE:
+      new (&TheError) Status(That.TheError);
+      break;
+    case State::MOVED:
+      // Nothing to do in this case.
+      break;
+    }
+  }
+
+  template <typename U> Expected(Expected<U> &&That) : TheState(That.TheState) {
+    switch (TheState) {
+    case State::SUCCESS:
+      new (&TheValue) T(std::move(That.TheValue));
+      break;
+    case State::FAILURE:
+      new (&TheError) Status(std::move(That.TheError));
+      break;
+    case State::MOVED:
+      // Nothing to do in this case.
+      break;
+    }
+  }
+
+  ~Expected() {
+    switch (TheState) {
+    case State::SUCCESS:
+      TheValue.~T();
+      break;
+    case State::FAILURE:
+      TheError.~Status();
+      break;
+    case State::MOVED:
+      // Nothing to do for this case.
+      break;
+    }
+  }
+
+  /// Returns true if this instance represents failure.
+  bool isError() const { return TheState != State::SUCCESS; }
+
+  /// Gets a reference to the Status object.
+  ///
+  /// Should only be called if isError() returns true.
+  const Status &getError() const {
+    assert(isError());
+    return TheError;
+  }
+
+  /// Gets a const reference to the value object.
+  ///
+  /// Should only be called if isError() returns false.
+  const T &getValue() const {
+    assert(!isError());
+    return TheValue;
+  }
+
+  /// Gets a reference to the value object.
+  ///
+  /// Should only be called if isError() returns false.
+  T &getValue() {
+    assert(!isError());
+    return TheValue;
+  }
+
+  /// Takes the value from this object by moving it to the return value.
+  ///
+  /// Should only be called if isError() returns false.
+  T takeValue() {
+    assert(!isError());
+    TheState = State::MOVED;
+    return std::move(TheValue);
+  }
+
+private:
+  template <typename U> friend class Expected;
+
+  State TheState;
+
+  union {
+    T TheValue;
+    Status TheError;
+  };
+};
+
+} // namespace acxxel
+
+#endif // ACXXEL_STATUS_H
diff --git a/parallel-libs/acxxel/tests/CMakeLists.txt b/parallel-libs/acxxel/tests/CMakeLists.txt

new file mode 100644 (file)

index 0000000..17c2df4
--- /dev/null
+++ b/parallel-libs/acxxel/tests/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_executable(acxxel_test acxxel_test.cpp)
+target_link_libraries(
+    acxxel_test
+    acxxel
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(AcxxelTest acxxel_test)
+
+add_executable(span_test span_test.cpp)
+target_link_libraries(
+    span_test
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(SpanTest span_test)
+
+add_executable(status_test status_test.cpp)
+target_link_libraries(
+    status_test
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(StatusTest status_test)
+
+if(ACXXEL_ENABLE_OPENCL)
+add_executable(opencl_test opencl_test.cpp)
+target_link_libraries(
+    opencl_test
+    acxxel
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(OpenCLTest opencl_test)
+endif()
diff --git a/parallel-libs/acxxel/tests/acxxel_test.cpp b/parallel-libs/acxxel/tests/acxxel_test.cpp

new file mode 100644 (file)

index 0000000..f0bc89c
--- /dev/null
+++ b/parallel-libs/acxxel/tests/acxxel_test.cpp
@@ -0,0 +1,412 @@
+//===--- acxxel_test.cpp - Tests for the Acxxel API -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+#include "config.h"
+#include "gtest/gtest.h"
+
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+namespace {
+
+template <typename T, size_t N> constexpr size_t size(T (&)[N]) { return N; }
+
+using PlatformGetter = acxxel::Expected<acxxel::Platform *> (*)();
+class AcxxelTest : public ::testing::TestWithParam<PlatformGetter> {};
+
+TEST_P(AcxxelTest, GetDeviceCount) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  int DeviceCount = Platform->getDeviceCount().getValue();
+  EXPECT_GE(DeviceCount, 0);
+}
+
+// Tests all the methods of a DeviceMemorySpan that was created from the asSpan
+// method of a DeviceMemory object.
+//
+// The length is the number of elements in the span. The ElementByteSize is the
+// number of bytes per element in the span.
+//
+// It is assumed that the input span has 10 or more elements.
+template <typename SpanType>
+void testFullDeviceMemorySpan(SpanType &&Span, ptrdiff_t Length,
+                              ptrdiff_t ElementByteSize) {
+  EXPECT_GE(Length, 10);
+  EXPECT_GT(ElementByteSize, 0);
+
+  // Full span
+  EXPECT_EQ(Length, Span.length());
+  EXPECT_EQ(Length, Span.size());
+  EXPECT_EQ(Length * ElementByteSize, Span.byte_size());
+  EXPECT_EQ(0, Span.offset());
+  EXPECT_EQ(0, Span.byte_offset());
+  EXPECT_FALSE(Span.empty());
+
+  // Sub-span with first method.
+  auto First2 = Span.first(2);
+  EXPECT_EQ(2, First2.length());
+  EXPECT_EQ(2, First2.size());
+  EXPECT_EQ(2 * ElementByteSize, First2.byte_size());
+  EXPECT_EQ(0, First2.offset());
+  EXPECT_EQ(0, First2.byte_offset());
+  EXPECT_FALSE(First2.empty());
+
+  auto First0 = Span.first(0);
+  EXPECT_EQ(0, First0.length());
+  EXPECT_EQ(0, First0.size());
+  EXPECT_EQ(0, First0.byte_size());
+  EXPECT_EQ(0, First0.offset());
+  EXPECT_EQ(0, First0.byte_offset());
+  EXPECT_TRUE(First0.empty());
+
+  // Sub-span with last method.
+  auto Last2 = Span.last(2);
+  EXPECT_EQ(2, Last2.length());
+  EXPECT_EQ(2, Last2.size());
+  EXPECT_EQ(2 * ElementByteSize, Last2.byte_size());
+  EXPECT_EQ(Length - 2, Last2.offset());
+  EXPECT_EQ((Length - 2) * ElementByteSize, Last2.byte_offset());
+  EXPECT_FALSE(Last2.empty());
+
+  auto Last0 = Span.last(0);
+  EXPECT_EQ(0, Last0.length());
+  EXPECT_EQ(0, Last0.size());
+  EXPECT_EQ(0, Last0.byte_size());
+  EXPECT_EQ(Length, Last0.offset());
+  EXPECT_EQ(Length * ElementByteSize, Last0.byte_offset());
+  EXPECT_TRUE(Last0.empty());
+
+  // Sub-span with subspan method.
+  auto Middle2 = Span.subspan(4, 2);
+  EXPECT_EQ(2, Middle2.length());
+  EXPECT_EQ(2, Middle2.size());
+  EXPECT_EQ(2 * ElementByteSize, Middle2.byte_size());
+  EXPECT_EQ(4, Middle2.offset());
+  EXPECT_EQ(4 * ElementByteSize, Middle2.byte_offset());
+  EXPECT_FALSE(Middle2.empty());
+
+  auto Middle0 = Span.subspan(4, 0);
+  EXPECT_EQ(0, Middle0.length());
+  EXPECT_EQ(0, Middle0.size());
+  EXPECT_EQ(0, Middle0.byte_size());
+  EXPECT_EQ(4, Middle0.offset());
+  EXPECT_EQ(4 * ElementByteSize, Middle0.byte_offset());
+  EXPECT_TRUE(Middle0.empty());
+
+  auto Subspan2AtStart = Span.subspan(0, 2);
+  EXPECT_EQ(2, Subspan2AtStart.length());
+  EXPECT_EQ(2, Subspan2AtStart.size());
+  EXPECT_EQ(2 * ElementByteSize, Subspan2AtStart.byte_size());
+  EXPECT_EQ(0, Subspan2AtStart.offset());
+  EXPECT_EQ(0, Subspan2AtStart.byte_offset());
+  EXPECT_FALSE(Subspan2AtStart.empty());
+
+  auto Subspan2AtEnd = Span.subspan(Length - 2, 2);
+  EXPECT_EQ(2, Subspan2AtEnd.length());
+  EXPECT_EQ(2, Subspan2AtEnd.size());
+  EXPECT_EQ(2 * ElementByteSize, Subspan2AtEnd.byte_size());
+  EXPECT_EQ(Length - 2, Subspan2AtEnd.offset());
+  EXPECT_EQ((Length - 2) * ElementByteSize, Subspan2AtEnd.byte_offset());
+  EXPECT_FALSE(Subspan2AtEnd.empty());
+
+  auto Subspan0AtStart = Span.subspan(0, 0);
+  EXPECT_EQ(0, Subspan0AtStart.length());
+  EXPECT_EQ(0, Subspan0AtStart.size());
+  EXPECT_EQ(0, Subspan0AtStart.byte_size());
+  EXPECT_EQ(0, Subspan0AtStart.offset());
+  EXPECT_EQ(0, Subspan0AtStart.byte_offset());
+  EXPECT_TRUE(Subspan0AtStart.empty());
+
+  auto Subspan0AtEnd = Span.subspan(Length, 0);
+  EXPECT_EQ(0, Subspan0AtEnd.length());
+  EXPECT_EQ(0, Subspan0AtEnd.size());
+  EXPECT_EQ(0, Subspan0AtEnd.byte_size());
+  EXPECT_EQ(Length, Subspan0AtEnd.offset());
+  EXPECT_EQ(Length * ElementByteSize, Subspan0AtEnd.byte_offset());
+  EXPECT_TRUE(Subspan0AtEnd.empty());
+}
+
+TEST_P(AcxxelTest, DeviceMemory) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Expected<acxxel::DeviceMemory<int>> MaybeMemory =
+      Platform->mallocD<int>(10);
+  EXPECT_FALSE(MaybeMemory.isError());
+
+  // ref
+  acxxel::DeviceMemory<int> &MemoryRef = MaybeMemory.getValue();
+  EXPECT_EQ(10, MemoryRef.length());
+  EXPECT_EQ(10, MemoryRef.size());
+  EXPECT_EQ(10 * sizeof(int), static_cast<size_t>(MemoryRef.byte_size()));
+  EXPECT_FALSE(MemoryRef.empty());
+
+  // mutable span
+  acxxel::DeviceMemorySpan<int> MutableSpan = MemoryRef.asSpan();
+  testFullDeviceMemorySpan(MutableSpan, 10, sizeof(int));
+
+  // const ref
+  const acxxel::DeviceMemory<int> &ConstMemoryRef = MaybeMemory.getValue();
+  EXPECT_EQ(10, ConstMemoryRef.length());
+  EXPECT_EQ(10, ConstMemoryRef.size());
+  EXPECT_EQ(10 * sizeof(int), static_cast<size_t>(ConstMemoryRef.byte_size()));
+  EXPECT_FALSE(ConstMemoryRef.empty());
+
+  // immutable span
+  acxxel::DeviceMemorySpan<const int> ImmutableSpan = ConstMemoryRef.asSpan();
+  testFullDeviceMemorySpan(ImmutableSpan, 10, sizeof(int));
+}
+
+TEST_P(AcxxelTest, CopyHostAndDevice) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  int A[] = {0, 1, 2};
+  std::array<int, size(A)> B;
+  acxxel::DeviceMemory<int> X = Platform->mallocD<int>(size(A)).takeValue();
+  Stream.syncCopyHToD(A, X);
+  Stream.syncCopyDToH(X, B);
+  for (size_t I = 0; I < size(A); ++I)
+    EXPECT_EQ(A[I], B[I]);
+  EXPECT_FALSE(Stream.takeStatus().isError());
+}
+
+TEST_P(AcxxelTest, CopyDToD) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  int A[] = {0, 1, 2};
+  std::array<int, size(A)> B;
+  acxxel::DeviceMemory<int> X = Platform->mallocD<int>(size(A)).takeValue();
+  acxxel::DeviceMemory<int> Y = Platform->mallocD<int>(size(A)).takeValue();
+  Stream.syncCopyHToD(A, X);
+  Stream.syncCopyDToD(X, Y);
+  Stream.syncCopyDToH(Y, B);
+  for (size_t I = 0; I < size(A); ++I)
+    EXPECT_EQ(A[I], B[I]);
+  EXPECT_FALSE(Stream.takeStatus().isError());
+}
+
+TEST_P(AcxxelTest, AsyncCopyHostAndDevice) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  int A[] = {0, 1, 2};
+  std::array<int, size(A)> B;
+  acxxel::DeviceMemory<int> X = Platform->mallocD<int>(size(A)).takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  acxxel::AsyncHostMemory<int> AsyncA =
+      Platform->registerHostMem(A).takeValue();
+  acxxel::AsyncHostMemory<int> AsyncB =
+      Platform->registerHostMem(B).takeValue();
+  EXPECT_FALSE(Stream.asyncCopyHToD(AsyncA, X).takeStatus().isError());
+  EXPECT_FALSE(Stream.asyncCopyDToH(X, AsyncB).takeStatus().isError());
+  EXPECT_FALSE(Stream.sync().isError());
+  for (size_t I = 0; I < size(A); ++I)
+    EXPECT_EQ(A[I], B[I]);
+}
+
+TEST_P(AcxxelTest, AsyncMemsetD) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  constexpr size_t ArrayLength = 10;
+  std::array<uint32_t, ArrayLength> Host;
+  acxxel::DeviceMemory<uint32_t> X =
+      Platform->mallocD<uint32_t>(ArrayLength).takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  acxxel::AsyncHostMemory<uint32_t> AsyncHost =
+      Platform->registerHostMem(Host).takeValue();
+  EXPECT_FALSE(Stream.asyncMemsetD(X, 0x12).takeStatus().isError());
+  EXPECT_FALSE(Stream.asyncCopyDToH(X, AsyncHost).takeStatus().isError());
+  EXPECT_FALSE(Stream.sync().isError());
+  for (size_t I = 0; I < ArrayLength; ++I)
+    EXPECT_EQ(0x12121212u, Host[I]);
+}
+
+TEST_P(AcxxelTest, RegisterHostMem) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  auto Data = std::unique_ptr<int[]>(new int[3]);
+  acxxel::Expected<acxxel::AsyncHostMemory<const int>> MaybeAsyncHostMemory =
+      Platform->registerHostMem<int>({Data.get(), 3});
+  EXPECT_FALSE(MaybeAsyncHostMemory.isError())
+      << MaybeAsyncHostMemory.getError().getMessage();
+  acxxel::AsyncHostMemory<const int> AsyncHostMemory =
+      MaybeAsyncHostMemory.takeValue();
+  EXPECT_EQ(Data.get(), AsyncHostMemory.data());
+  EXPECT_EQ(3, AsyncHostMemory.size());
+}
+
+struct RefCounter {
+  static int Count;
+
+  RefCounter() { ++Count; }
+  ~RefCounter() { --Count; }
+  RefCounter(const RefCounter &) = delete;
+  RefCounter &operator=(const RefCounter &) = delete;
+};
+
+int RefCounter::Count;
+
+TEST_P(AcxxelTest, OwnedAsyncHost) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  RefCounter::Count = 0;
+  {
+    acxxel::OwnedAsyncHostMemory<RefCounter> A =
+        Platform->newAsyncHostMem<RefCounter>(3).takeValue();
+    EXPECT_EQ(3, RefCounter::Count);
+  }
+  EXPECT_EQ(0, RefCounter::Count);
+}
+
+TEST_P(AcxxelTest, OwnedAsyncCopyHostAndDevice) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  size_t Length = 3;
+  acxxel::OwnedAsyncHostMemory<int> A =
+      Platform->newAsyncHostMem<int>(Length).takeValue();
+  for (size_t I = 0; I < Length; ++I)
+    A[I] = I;
+  acxxel::OwnedAsyncHostMemory<int> B =
+      Platform->newAsyncHostMem<int>(Length).takeValue();
+  acxxel::DeviceMemory<int> X = Platform->mallocD<int>(Length).takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  EXPECT_FALSE(Stream.asyncCopyHToD(A, X).takeStatus().isError());
+  EXPECT_FALSE(Stream.asyncCopyDToH(X, B).takeStatus().isError());
+  EXPECT_FALSE(Stream.sync().isError());
+  for (size_t I = 0; I < Length; ++I)
+    EXPECT_EQ(A[I], B[I]);
+}
+
+TEST_P(AcxxelTest, AsyncCopyDToD) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  int A[] = {0, 1, 2};
+  std::array<int, size(A)> B;
+  acxxel::DeviceMemory<int> X = Platform->mallocD<int>(size(A)).takeValue();
+  acxxel::DeviceMemory<int> Y = Platform->mallocD<int>(size(A)).takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  acxxel::AsyncHostMemory<int> AsyncA =
+      Platform->registerHostMem(A).takeValue();
+  acxxel::AsyncHostMemory<int> AsyncB =
+      Platform->registerHostMem(B).takeValue();
+  EXPECT_FALSE(Stream.asyncCopyHToD(AsyncA, X).takeStatus().isError());
+  EXPECT_FALSE(Stream.asyncCopyDToD(X, Y).takeStatus().isError());
+  EXPECT_FALSE(Stream.asyncCopyDToH(Y, AsyncB).takeStatus().isError());
+  EXPECT_FALSE(Stream.sync().isError());
+  for (size_t I = 0; I < size(A); ++I)
+    EXPECT_EQ(A[I], B[I]);
+}
+
+TEST_P(AcxxelTest, Stream) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  EXPECT_FALSE(Stream.sync().isError());
+}
+
+TEST_P(AcxxelTest, Event) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Event Event = Platform->createEvent().takeValue();
+  EXPECT_TRUE(Event.isDone());
+  EXPECT_FALSE(Event.sync().isError());
+}
+
+TEST_P(AcxxelTest, RecordEventsInAStream) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  acxxel::Event Start = Platform->createEvent().takeValue();
+  acxxel::Event End = Platform->createEvent().takeValue();
+  EXPECT_FALSE(Stream.enqueueEvent(Start).takeStatus().isError());
+  EXPECT_FALSE(Start.sync().isError());
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  EXPECT_FALSE(Stream.enqueueEvent(End).takeStatus().isError());
+  EXPECT_FALSE(End.sync().isError());
+  EXPECT_GT(End.getSecondsSince(Start).takeValue(), 0);
+}
+
+TEST_P(AcxxelTest, StreamCallback) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  int Value = 0;
+  acxxel::Stream Stream = Platform->createStream().takeValue();
+  EXPECT_FALSE(
+      Stream
+          .addCallback([&Value](acxxel::Stream &, const acxxel::Status &) {
+            Value = 42;
+          })
+          .takeStatus()
+          .isError());
+  EXPECT_FALSE(Stream.sync().isError());
+  EXPECT_EQ(42, Value);
+}
+
+TEST_P(AcxxelTest, WaitForEventsInAStream) {
+  acxxel::Platform *Platform = GetParam()().takeValue();
+  acxxel::Stream Stream0 = Platform->createStream().takeValue();
+  acxxel::Stream Stream1 = Platform->createStream().takeValue();
+  acxxel::Event Event0 = Platform->createEvent().takeValue();
+  acxxel::Event Event1 = Platform->createEvent().takeValue();
+
+  // Thread loops on Stream0 until someone sets the GoFlag, then set the
+  // MarkerFlag.
+
+  std::mutex Mutex;
+  std::condition_variable ConditionVar;
+  bool GoFlag = false;
+  bool MarkerFlag = false;
+
+  EXPECT_FALSE(Stream0
+                   .addCallback([&Mutex, &ConditionVar, &GoFlag, &MarkerFlag](
+                       acxxel::Stream &, const acxxel::Status &) {
+                     std::unique_lock<std::mutex> Lock(Mutex);
+                     ConditionVar.wait(Lock,
+                                       [&GoFlag] { return GoFlag == true; });
+                     MarkerFlag = true;
+                   })
+                   .takeStatus()
+                   .isError());
+
+  // Event0 can only occur after GoFlag and MarkerFlag are set.
+  EXPECT_FALSE(Stream0.enqueueEvent(Event0).takeStatus().isError());
+
+  // Use waitOnEvent to make a callback on Stream1 wait for an event on Stream0.
+  EXPECT_FALSE(Stream1.waitOnEvent(Event0).isError());
+  EXPECT_FALSE(Stream1.enqueueEvent(Event1).takeStatus().isError());
+  EXPECT_FALSE(Stream1
+                   .addCallback([&Mutex, &MarkerFlag](acxxel::Stream &,
+                                                      const acxxel::Status &) {
+                     std::unique_lock<std::mutex> Lock(Mutex);
+                     // This makes sure that this callback runs after the
+                     // callback on Stream0.
+                     EXPECT_TRUE(MarkerFlag);
+                   })
+                   .takeStatus()
+                   .isError());
+
+  // Allow the callback on Stream0 to set MarkerFlag and finish.
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    GoFlag = true;
+  }
+  ConditionVar.notify_one();
+
+  // Make sure the events have finished and that Event1 did not happen before
+  // Event0.
+  EXPECT_FALSE(Event0.sync().isError());
+  EXPECT_FALSE(Event1.sync().isError());
+  EXPECT_FALSE(Stream1.sync().isError());
+}
+
+#if defined(ACXXEL_ENABLE_CUDA) || defined(ACXXEL_ENABLE_OPENCL)
+INSTANTIATE_TEST_CASE_P(BothPlatformTest, AcxxelTest,
+                        ::testing::Values(
+#ifdef ACXXEL_ENABLE_CUDA
+                            acxxel::getCUDAPlatform
+#ifdef ACXXEL_ENABLE_OPENCL
+                            ,
+#endif
+#endif
+#ifdef ACXXEL_ENABLE_OPENCL
+                            acxxel::getOpenCLPlatform
+#endif
+                            ));
+#endif
+
+} // namespace
diff --git a/parallel-libs/acxxel/tests/opencl_test.cpp b/parallel-libs/acxxel/tests/opencl_test.cpp

new file mode 100644 (file)

index 0000000..509761f
--- /dev/null
+++ b/parallel-libs/acxxel/tests/opencl_test.cpp
@@ -0,0 +1,62 @@
+//===--- opencl_test.cpp - Tests for OpenCL and the Acxxel API ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "acxxel.h"
+#include "gtest/gtest.h"
+
+#include <array>
+#include <cstring>
+
+namespace {
+
+static const char *SaxpyKernelSource = R"(
+__kernel void saxpyKernel(float A, __global float *X, __global float *Y, int N) {
+  int I = get_global_id(0);
+  if (I < N)
+    X[I] = A * X[I] + Y[I];
+}
+)";
+
+TEST(OpenCL, Saxpy) {
+  constexpr size_t Length = 3;
+
+  float A = 2.f;
+  std::array<float, Length> X = {{0.f, 1.f, 2.f}};
+  std::array<float, Length> Y = {{3.f, 4.f, 5.f}};
+  std::array<float, Length> Expected = {{3.f, 6.f, 9.f}};
+
+  acxxel::Platform *OpenCL = acxxel::getOpenCLPlatform().getValue();
+  acxxel::Stream Stream = OpenCL->createStream().takeValue();
+  auto DeviceX = OpenCL->mallocD<float>(Length).takeValue();
+  auto DeviceY = OpenCL->mallocD<float>(Length).takeValue();
+  Stream.syncCopyHToD(X, DeviceX);
+  Stream.syncCopyHToD(Y, DeviceY);
+  acxxel::Program Program =
+      OpenCL
+          ->createProgramFromSource(acxxel::Span<const char>(
+              SaxpyKernelSource, std::strlen(SaxpyKernelSource)))
+          .takeValue();
+  acxxel::Kernel Kernel = Program.createKernel("saxpyKernel").takeValue();
+  float *RawX = static_cast<float *>(DeviceX);
+  float *RawY = static_cast<float *>(DeviceY);
+  int IntLength = Length;
+  void *Arguments[] = {&A, &RawX, &RawY, &IntLength};
+  size_t ArgumentSizes[] = {sizeof(float), sizeof(float *), sizeof(float *),
+                            sizeof(int)};
+  EXPECT_FALSE(
+      Stream.asyncKernelLaunch(Kernel, Length, Arguments, ArgumentSizes)
+          .takeStatus()
+          .isError());
+  Stream.syncCopyDToH(DeviceX, X);
+  EXPECT_FALSE(Stream.sync().isError());
+
+  EXPECT_EQ(X, Expected);
+}
+
+} // namespace
diff --git a/parallel-libs/acxxel/tests/span_test.cpp b/parallel-libs/acxxel/tests/span_test.cpp

new file mode 100644 (file)

index 0000000..eee5aef
--- /dev/null
+++ b/parallel-libs/acxxel/tests/span_test.cpp
@@ -0,0 +1,293 @@
+//===--- span_test.cpp - Tests for the span class -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "span.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <array>
+#include <vector>
+
+namespace {
+
+template <typename T, size_t N> size_t arraySize(T (&)[N]) { return N; }
+
+TEST(Span, NullConstruction) {
+  acxxel::Span<int> Span0;
+  EXPECT_EQ(nullptr, Span0.data());
+  EXPECT_EQ(0, Span0.size());
+
+  acxxel::Span<int> Span1(nullptr);
+  EXPECT_EQ(nullptr, Span1.data());
+  EXPECT_EQ(0, Span1.size());
+}
+
+TEST(Span, PtrSizeConstruction) {
+  int ZeroSize = 0;
+  acxxel::Span<int> Span0(nullptr, ZeroSize);
+  EXPECT_EQ(Span0.data(), nullptr);
+  EXPECT_EQ(Span0.size(), 0);
+
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span1(Values, arraySize(Values));
+  EXPECT_EQ(Span1.data(), Values);
+  EXPECT_EQ(static_cast<size_t>(Span1.size()), arraySize(Values));
+
+  acxxel::Span<int> Span2(Values, ZeroSize);
+  EXPECT_EQ(Span2.data(), Values);
+  EXPECT_EQ(Span2.size(), 0);
+}
+
+TEST(Span, PtrSizeConstruction_NegativeCount) {
+  int Values[] = {0, 1, 2};
+  EXPECT_DEATH(acxxel::Span<int> Span0(Values, -1), "terminate");
+}
+
+TEST(Span, PtrSizeConstruction_NullptrNonzeroSize) {
+  EXPECT_DEATH(acxxel::Span<int> Span0(nullptr, 1), "terminate");
+}
+
+TEST(Span, FirstLastConstruction) {
+  int Values[] = {0, 1, 2};
+
+  acxxel::Span<int> Span0(Values, Values);
+  EXPECT_EQ(Span0.data(), Values);
+  EXPECT_EQ(Span0.size(), 0);
+
+  acxxel::Span<int> Span(Values, Values + 2);
+  EXPECT_EQ(Span.data(), Values);
+  EXPECT_EQ(Span.size(), 2);
+}
+
+TEST(Span, FirstLastConstruction_LastBeforeFirst) {
+  int Values[] = {0, 1, 2};
+  EXPECT_DEATH(acxxel::Span<int> Span(Values + 2, Values), "terminate");
+}
+
+TEST(Span, ArrayConstruction) {
+  int Array[] = {0, 1, 2};
+  acxxel::Span<int> Span(Array);
+  EXPECT_EQ(Span.data(), Array);
+  EXPECT_EQ(Span.size(), 3);
+}
+
+TEST(Span, StdArrayConstruction) {
+  std::array<int, 3> Array{0, 1, 2};
+  acxxel::Span<int> Span(Array);
+  EXPECT_EQ(Span.data(), Array.data());
+  EXPECT_EQ(static_cast<size_t>(Span.size()), Array.size());
+
+  std::array<const int, 3> ConstArray{0, 1, 2};
+  acxxel::Span<const int> ConstSpan(ConstArray);
+  EXPECT_EQ(ConstSpan.data(), ConstArray.data());
+  EXPECT_EQ(static_cast<size_t>(ConstSpan.size()), ConstArray.size());
+}
+
+TEST(Span, ContainerConstruction) {
+  std::vector<int> Vector = {0, 1, 2};
+  acxxel::Span<int> Span(Vector);
+  EXPECT_EQ(Span.data(), &Vector[0]);
+  EXPECT_EQ(static_cast<size_t>(Span.size()), Vector.size());
+}
+
+TEST(Span, CopyConstruction) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span0(Values);
+  acxxel::Span<int> Span1(Span0);
+  EXPECT_EQ(Span1.data(), Values);
+  EXPECT_EQ(static_cast<size_t>(Span1.size()), arraySize(Values));
+}
+
+TEST(Span, CopyAssignment) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span0(Values);
+  acxxel::Span<int> Span1;
+  Span1 = Span0;
+  EXPECT_EQ(Span1.data(), Values);
+  EXPECT_EQ(static_cast<size_t>(Span1.size()), arraySize(Values));
+}
+
+TEST(Span, CopyConstFromNonConst) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span0(Values);
+  acxxel::Span<const int> Span1(Span0);
+  EXPECT_EQ(Span1.data(), Values);
+  EXPECT_EQ(static_cast<size_t>(Span1.size()), arraySize(Values));
+}
+
+TEST(Span, FirstMethod) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+  acxxel::Span<int> Span0 = Span.first(0);
+  acxxel::Span<int> Span1 = Span.first(1);
+  acxxel::Span<int> Span2 = Span.first(2);
+  acxxel::Span<int> Span3 = Span.first(3);
+
+  EXPECT_EQ(Span0.data(), Values);
+  EXPECT_EQ(Span1.data(), Values);
+  EXPECT_EQ(Span2.data(), Values);
+  EXPECT_EQ(Span3.data(), Values);
+
+  EXPECT_TRUE(Span0.empty());
+
+  EXPECT_THAT(Span1, ::testing::ElementsAre(0));
+  EXPECT_THAT(Span2, ::testing::ElementsAre(0, 1));
+  EXPECT_THAT(Span3, ::testing::ElementsAre(0, 1, 2));
+}
+
+TEST(Span, FirstMethod_IllegalArguments) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+
+  EXPECT_DEATH(Span.first(-1), "terminate");
+  EXPECT_DEATH(Span.first(4), "terminate");
+}
+
+TEST(Span, LastMethod) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+  acxxel::Span<int> Span0 = Span.last(0);
+  acxxel::Span<int> Span1 = Span.last(1);
+  acxxel::Span<int> Span2 = Span.last(2);
+  acxxel::Span<int> Span3 = Span.last(3);
+
+  EXPECT_EQ(Span0.data(), Values);
+  EXPECT_EQ(Span1.data(), Values + 2);
+  EXPECT_EQ(Span2.data(), Values + 1);
+  EXPECT_EQ(Span3.data(), Values);
+
+  EXPECT_TRUE(Span0.empty());
+
+  EXPECT_THAT(Span1, ::testing::ElementsAre(2));
+  EXPECT_THAT(Span2, ::testing::ElementsAre(1, 2));
+  EXPECT_THAT(Span3, ::testing::ElementsAre(0, 1, 2));
+}
+
+TEST(Span, LastMethod_IllegalArguments) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+
+  EXPECT_DEATH(Span.last(-1), "terminate");
+  EXPECT_DEATH(Span.last(4), "terminate");
+}
+
+TEST(Span, SubspanMethod) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+
+  acxxel::Span<int> Span0 = Span.subspan(0);
+  acxxel::Span<int> Span0e = Span.subspan(0, acxxel::dynamic_extent);
+  acxxel::Span<int> Span00 = Span.subspan(0, 0);
+  acxxel::Span<int> Span01 = Span.subspan(0, 1);
+  acxxel::Span<int> Span02 = Span.subspan(0, 2);
+  acxxel::Span<int> Span03 = Span.subspan(0, 3);
+
+  acxxel::Span<int> Span1 = Span.subspan(1);
+  acxxel::Span<int> Span1e = Span.subspan(1, acxxel::dynamic_extent);
+  acxxel::Span<int> Span10 = Span.subspan(1, 0);
+  acxxel::Span<int> Span11 = Span.subspan(1, 1);
+  acxxel::Span<int> Span12 = Span.subspan(1, 2);
+
+  acxxel::Span<int> Span2 = Span.subspan(2);
+  acxxel::Span<int> Span2e = Span.subspan(2, acxxel::dynamic_extent);
+  acxxel::Span<int> Span20 = Span.subspan(2, 0);
+  acxxel::Span<int> Span21 = Span.subspan(2, 1);
+
+  acxxel::Span<int> Span3 = Span.subspan(3);
+  acxxel::Span<int> Span3e = Span.subspan(3, acxxel::dynamic_extent);
+  acxxel::Span<int> Span30 = Span.subspan(3, 0);
+
+  EXPECT_EQ(Span0.data(), Values);
+  EXPECT_EQ(Span0e.data(), Values);
+  EXPECT_EQ(Span00.data(), Values);
+  EXPECT_EQ(Span01.data(), Values);
+  EXPECT_EQ(Span02.data(), Values);
+  EXPECT_EQ(Span03.data(), Values);
+
+  EXPECT_EQ(Span1.data(), Values + 1);
+  EXPECT_EQ(Span1e.data(), Values + 1);
+  EXPECT_EQ(Span10.data(), Values + 1);
+  EXPECT_EQ(Span11.data(), Values + 1);
+  EXPECT_EQ(Span12.data(), Values + 1);
+
+  EXPECT_EQ(Span2.data(), Values + 2);
+  EXPECT_EQ(Span2e.data(), Values + 2);
+  EXPECT_EQ(Span20.data(), Values + 2);
+  EXPECT_EQ(Span21.data(), Values + 2);
+
+  EXPECT_EQ(Span3.data(), Values + 3);
+  EXPECT_EQ(Span3e.data(), Values + 3);
+  EXPECT_EQ(Span30.data(), Values + 3);
+
+  EXPECT_TRUE(Span00.empty());
+  EXPECT_TRUE(Span10.empty());
+  EXPECT_TRUE(Span20.empty());
+  EXPECT_TRUE(Span30.empty());
+
+  EXPECT_THAT(Span0, ::testing::ElementsAre(0, 1, 2));
+  EXPECT_THAT(Span0e, ::testing::ElementsAre(0, 1, 2));
+  EXPECT_THAT(Span01, ::testing::ElementsAre(0));
+  EXPECT_THAT(Span02, ::testing::ElementsAre(0, 1));
+  EXPECT_THAT(Span03, ::testing::ElementsAre(0, 1, 2));
+
+  EXPECT_THAT(Span1, ::testing::ElementsAre(1, 2));
+  EXPECT_THAT(Span1e, ::testing::ElementsAre(1, 2));
+  EXPECT_THAT(Span11, ::testing::ElementsAre(1));
+  EXPECT_THAT(Span12, ::testing::ElementsAre(1, 2));
+
+  EXPECT_THAT(Span2, ::testing::ElementsAre(2));
+  EXPECT_THAT(Span2e, ::testing::ElementsAre(2));
+  EXPECT_THAT(Span21, ::testing::ElementsAre(2));
+
+  EXPECT_TRUE(Span3.empty());
+  EXPECT_TRUE(Span3e.empty());
+}
+
+TEST(Span, SubspanMethod_IllegalArguments) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+  EXPECT_DEATH(Span.subspan(-1, 0), "terminate");
+  EXPECT_DEATH(Span.subspan(0, -2), "terminate");
+  EXPECT_DEATH(Span.subspan(0, 4), "terminate");
+  EXPECT_DEATH(Span.subspan(1, 3), "terminate");
+  EXPECT_DEATH(Span.subspan(2, 2), "terminate");
+  EXPECT_DEATH(Span.subspan(3, 1), "terminate");
+  EXPECT_DEATH(Span.subspan(4, 0), "terminate");
+}
+
+TEST(Span, ElementAccess) {
+  int Values[] = {0, 1, 2};
+  acxxel::Span<int> Span(Values);
+
+  EXPECT_EQ(&Span[0], Values);
+  EXPECT_EQ(&Span[1], Values + 1);
+  EXPECT_EQ(&Span[2], Values + 2);
+  EXPECT_EQ(&Span(0), Values);
+  EXPECT_EQ(&Span(1), Values + 1);
+  EXPECT_EQ(&Span(2), Values + 2);
+
+  Span[0] = 5;
+  EXPECT_EQ(Values[0], 5);
+
+  Span(0) = 0;
+  EXPECT_EQ(Values[0], 0);
+
+  const int ConstValues[] = {0, 1, 2};
+  acxxel::Span<const int> ConstSpan(ConstValues);
+
+  EXPECT_EQ(&ConstSpan[0], ConstValues);
+  EXPECT_EQ(&ConstSpan[1], ConstValues + 1);
+  EXPECT_EQ(&ConstSpan[2], ConstValues + 2);
+  EXPECT_EQ(&ConstSpan(0), ConstValues);
+  EXPECT_EQ(&ConstSpan(1), ConstValues + 1);
+  EXPECT_EQ(&ConstSpan(2), ConstValues + 2);
+}
+
+} // namespace
diff --git a/parallel-libs/acxxel/tests/status_test.cpp b/parallel-libs/acxxel/tests/status_test.cpp

new file mode 100644 (file)

index 0000000..ef618d0
--- /dev/null
+++ b/parallel-libs/acxxel/tests/status_test.cpp
@@ -0,0 +1,56 @@
+//===--- status_test.cpp - Tests for the Status and Expected classes ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "status.h"
+
+#include "gtest/gtest.h"
+
+#include <memory>
+
+namespace {
+
+struct RefCounter {
+  static int Count;
+
+  RefCounter() { ++Count; }
+  ~RefCounter() { --Count; }
+  RefCounter(const RefCounter &) = delete;
+  RefCounter &operator=(const RefCounter &) = delete;
+};
+
+int RefCounter::Count;
+
+TEST(Expected, RefCounter) {
+  RefCounter::Count = 0;
+  using uptr = std::unique_ptr<RefCounter>;
+
+  acxxel::Expected<uptr> E0(uptr(new RefCounter));
+  EXPECT_FALSE(E0.isError());
+  EXPECT_EQ(1, RefCounter::Count);
+
+  acxxel::Expected<uptr> E1(std::move(E0));
+  EXPECT_FALSE(E1.isError());
+  EXPECT_EQ(1, RefCounter::Count);
+
+  acxxel::Expected<uptr> E2(acxxel::Status("nothing in here yet"));
+  EXPECT_TRUE(E2.isError());
+  EXPECT_EQ(1, RefCounter::Count);
+  E2 = std::move(E1);
+  EXPECT_FALSE(E2.isError());
+  EXPECT_EQ(1, RefCounter::Count);
+
+  EXPECT_EQ(1, E2.getValue()->Count);
+  EXPECT_FALSE(E2.isError());
+  EXPECT_EQ(1, RefCounter::Count);
+
+  EXPECT_EQ(1, E2.takeValue()->Count);
+  EXPECT_EQ(0, RefCounter::Count);
+}
+
+} // namespace
author	Jason Henline <jhen@google.com>
	Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)
committer	Jason Henline <jhen@google.com>
	Tue, 25 Oct 2016 20:18:56 +0000 (20:18 +0000)
parallel-libs/acxxel/.clang-format	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/.clang-tidy	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/Doxyfile.in	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/LICENSE.TXT	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/acxxel.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/acxxel.h	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/config.h.in	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/cuda_acxxel.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/customdoxygen.css	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/examples/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/examples/opencl_example.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/examples/simple_example.cu	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/opencl_acxxel.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/span.h	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/status.h	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/tests/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/tests/acxxel_test.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/tests/opencl_test.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/tests/span_test.cpp	[new file with mode: 0644]	patch \| blob
parallel-libs/acxxel/tests/status_test.cpp	[new file with mode: 0644]	patch \| blob