Merge changes from GitHub.
author     Yifei Feng <yifeif@google.com>
           Fri, 25 May 2018 02:12:26 +0000 (19:12 -0700)
committer  TensorFlower Gardener <gardener@tensorflow.org>
           Fri, 25 May 2018 02:15:01 +0000 (19:15 -0700)
Revert #18413. Too many internal test failures due to the name scope change it introduced.
Revert #18192. re2::StringPiece cannot be used internally, and an alternative is needed for the set call. Will pull and clean this up in a separate change.

PiperOrigin-RevId: 197991247

266 files changed:
CONTRIBUTING.md
README.md
RELEASE.md
SECURITY.md
configure.py
tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
tensorflow/compiler/xla/README.md
tensorflow/compiler/xla/service/conditional_simplifier.cc
tensorflow/compiler/xla/service/copy_insertion.cc
tensorflow/compiler/xla/service/cpu/ir_function.h
tensorflow/compiler/xla/service/cpu/shape_partition.h
tensorflow/compiler/xla/service/despecializer.h
tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
tensorflow/compiler/xla/service/hlo_evaluator.cc
tensorflow/compiler/xla/service/interpreter/README.md
tensorflow/compiler/xla/service/layout_assignment.h
tensorflow/compiler/xla/service/reduce_precision_insertion.cc
tensorflow/compiler/xla/service/source_map_util.h
tensorflow/compiler/xla/shape_util.h
tensorflow/compiler/xla/tests/dot_operation_test.cc
tensorflow/compiler/xla/tests/tuple_test.cc
tensorflow/compiler/xla/xlalogo.png [new file with mode: 0644]
tensorflow/contrib/autograph/impl/config.py
tensorflow/contrib/autograph/operators/control_flow.py
tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
tensorflow/contrib/cmake/CMakeLists.txt
tensorflow/contrib/cmake/external/zlib.cmake
tensorflow/contrib/cmake/tf_tests.cmake
tensorflow/contrib/data/python/kernel_tests/BUILD
tensorflow/contrib/data/python/kernel_tests/resample_test.py
tensorflow/contrib/data/python/ops/BUILD
tensorflow/contrib/data/python/ops/resampling.py
tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
tensorflow/contrib/eager/README.md
tensorflow/contrib/ffmpeg/ffmpeg_lib.h
tensorflow/contrib/framework/python/ops/critical_section_ops.py
tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
tensorflow/contrib/gan/python/estimator/python/head_impl.py
tensorflow/contrib/gan/python/estimator/python/head_test.py
tensorflow/contrib/gan/python/features/python/conditioning_utils.py
tensorflow/contrib/graph_editor/transform.py
tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
tensorflow/contrib/image/__init__.py
tensorflow/contrib/kfac/examples/convnet.py
tensorflow/contrib/kfac/python/ops/optimizer.py
tensorflow/contrib/kfac/python/ops/placement.py
tensorflow/contrib/layers/python/layers/layers.py
tensorflow/contrib/layers/python/layers/layers_test.py
tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
tensorflow/contrib/lite/BUILD
tensorflow/contrib/lite/Makefile
tensorflow/contrib/lite/examples/minimal/minimal.cc [new file with mode: 0644]
tensorflow/contrib/lite/g3doc/rpi.md
tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
tensorflow/contrib/lite/schema/schema.fbs
tensorflow/contrib/lite/schema/schema_v0.fbs
tensorflow/contrib/lite/schema/schema_v1.fbs
tensorflow/contrib/lite/schema/schema_v2.fbs
tensorflow/contrib/lite/schema/schema_v3.fbs
tensorflow/contrib/lite/testing/generate_examples.py
tensorflow/contrib/lite/testing/tflite_driver.cc
tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
tensorflow/contrib/lite/toco/tflite/operator.h
tensorflow/contrib/lite/toco/toco_flags.proto
tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
tensorflow/contrib/signal/python/ops/window_ops.py
tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
tensorflow/contrib/slim/python/slim/learning.py
tensorflow/contrib/tensorboard/db/summary_db_writer.cc
tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
tensorflow/contrib/tensorrt/BUILD
tensorflow/contrib/tensorrt/convert/convert_graph.cc
tensorflow/contrib/tensorrt/convert/convert_graph.h
tensorflow/contrib/tensorrt/convert/convert_nodes.cc
tensorflow/contrib/tensorrt/convert/convert_nodes.h
tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py [new file with mode: 0644]
tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
tensorflow/contrib/tensorrt/log/trt_logger.h
tensorflow/contrib/tensorrt/plugin/trt_plugin.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/resources/trt_allocator.cc [new file with mode: 0644]
tensorflow/contrib/tensorrt/resources/trt_allocator.h [new file with mode: 0644]
tensorflow/contrib/tensorrt/resources/trt_resources.h
tensorflow/contrib/tensorrt/segment/segment.cc
tensorflow/contrib/tensorrt/segment/segment.h
tensorflow/contrib/tensorrt/segment/segment_test.cc
tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
tensorflow/contrib/tensorrt/test/test_tftrt.py
tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
tensorflow/contrib/tpu/python/tpu/tpu_context.py
tensorflow/contrib/verbs/README.md
tensorflow/core/BUILD
tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt [new file with mode: 0644]
tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt [new file with mode: 0644]
tensorflow/core/common_runtime/broadcaster.cc
tensorflow/core/common_runtime/buf_rendezvous.h
tensorflow/core/common_runtime/ring_reducer.cc
tensorflow/core/common_runtime/scoped_allocator_mgr.cc
tensorflow/core/debug/debug_io_utils.cc
tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
tensorflow/core/example/example.proto
tensorflow/core/example/example_parser_configuration.proto
tensorflow/core/example/feature.proto
tensorflow/core/framework/allocation_description.proto
tensorflow/core/framework/api_def.proto
tensorflow/core/framework/attr_value.proto
tensorflow/core/framework/cost_graph.proto
tensorflow/core/framework/device_attributes.proto
tensorflow/core/framework/function.proto
tensorflow/core/framework/graph.proto
tensorflow/core/framework/graph_transfer_info.proto
tensorflow/core/framework/iterator.proto
tensorflow/core/framework/kernel_def.proto
tensorflow/core/framework/log_memory.proto
tensorflow/core/framework/node_def.proto
tensorflow/core/framework/op_def.proto
tensorflow/core/framework/op_gen_lib.h
tensorflow/core/framework/op_kernel.h
tensorflow/core/framework/reader_base.proto
tensorflow/core/framework/remote_fused_graph_execute_info.proto
tensorflow/core/framework/resource_handle.proto
tensorflow/core/framework/step_stats.proto
tensorflow/core/framework/summary.proto
tensorflow/core/framework/tensor.proto
tensorflow/core/framework/tensor_description.proto
tensorflow/core/framework/tensor_shape.proto
tensorflow/core/framework/tensor_slice.proto
tensorflow/core/framework/types.proto
tensorflow/core/framework/variable.proto
tensorflow/core/framework/versions.proto
tensorflow/core/graph/mkl_layout_pass_test.cc
tensorflow/core/graph/while_context.h
tensorflow/core/grappler/costs/graph_properties.cc
tensorflow/core/grappler/costs/virtual_scheduler.h
tensorflow/core/grappler/optimizers/layout_optimizer.cc
tensorflow/core/kernels/BUILD
tensorflow/core/kernels/batch_matmul_op_impl.h
tensorflow/core/kernels/batch_matmul_op_real.cc
tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
tensorflow/core/kernels/conv_grad_ops_3d.cc
tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
tensorflow/core/kernels/nth_element_op.cc
tensorflow/core/kernels/regex_full_match_op.cc [new file with mode: 0644]
tensorflow/core/kernels/roll_op.cc
tensorflow/core/kernels/segment_reduction_ops.cc
tensorflow/core/kernels/segment_reduction_ops.h
tensorflow/core/lib/core/error_codes.proto
tensorflow/core/ops/image_ops.cc
tensorflow/core/ops/image_ops_test.cc
tensorflow/core/ops/math_ops.cc
tensorflow/core/ops/nn_ops.cc
tensorflow/core/ops/random_ops.cc
tensorflow/core/ops/string_ops.cc
tensorflow/core/platform/cloud/gcs_file_system.cc
tensorflow/core/platform/cloud/gcs_throttle.h
tensorflow/core/profiler/g3doc/command_line.md
tensorflow/core/protobuf/cluster.proto
tensorflow/core/protobuf/config.proto
tensorflow/core/protobuf/control_flow.proto
tensorflow/core/protobuf/critical_section.proto
tensorflow/core/protobuf/debug.proto
tensorflow/core/protobuf/device_properties.proto
tensorflow/core/protobuf/master.proto
tensorflow/core/protobuf/master_service.proto
tensorflow/core/protobuf/meta_graph.proto
tensorflow/core/protobuf/named_tensor.proto
tensorflow/core/protobuf/queue_runner.proto
tensorflow/core/protobuf/rewriter_config.proto
tensorflow/core/protobuf/saved_model.proto
tensorflow/core/protobuf/saver.proto
tensorflow/core/protobuf/tensor_bundle.proto
tensorflow/core/protobuf/tensorflow_server.proto
tensorflow/core/protobuf/worker.proto
tensorflow/core/protobuf/worker_service.proto
tensorflow/core/public/version.h
tensorflow/core/util/cuda_device_functions.h
tensorflow/core/util/mkl_util.h
tensorflow/core/util/tensor_format.h
tensorflow/docs_src/api_guides/python/reading_data.md
tensorflow/docs_src/community/benchmarks.md
tensorflow/docs_src/community/swift.md
tensorflow/docs_src/deploy/s3.md
tensorflow/docs_src/extend/adding_an_op.md
tensorflow/docs_src/extend/architecture.md
tensorflow/docs_src/install/install_c.md
tensorflow/docs_src/install/install_go.md
tensorflow/docs_src/install/install_java.md
tensorflow/docs_src/install/install_linux.md
tensorflow/docs_src/install/install_mac.md
tensorflow/docs_src/install/install_sources.md
tensorflow/docs_src/mobile/mobile_intro.md
tensorflow/docs_src/mobile/tflite/index.md
tensorflow/docs_src/programmers_guide/faq.md
tensorflow/docs_src/programmers_guide/tensors.md
tensorflow/docs_src/programmers_guide/variables.md
tensorflow/docs_src/tutorials/layers.md
tensorflow/examples/learn/text_classification_cnn.py
tensorflow/go/op/wrappers.go
tensorflow/python/data/util/nest.py
tensorflow/python/debug/cli/curses_ui.py
tensorflow/python/estimator/estimator.py
tensorflow/python/estimator/inputs/queues/feeding_functions.py
tensorflow/python/estimator/keras.py
tensorflow/python/estimator/training.py
tensorflow/python/feature_column/feature_column.py
tensorflow/python/framework/fast_tensor_util.pyx
tensorflow/python/framework/ops.py
tensorflow/python/framework/tensor_util.py
tensorflow/python/framework/test_util.py
tensorflow/python/keras/utils/__init__.py
tensorflow/python/kernel_tests/BUILD
tensorflow/python/kernel_tests/conv1d_test.py
tensorflow/python/kernel_tests/conv3d_transpose_test.py
tensorflow/python/kernel_tests/distributions/util_test.py
tensorflow/python/kernel_tests/manip_ops_test.py
tensorflow/python/kernel_tests/regex_full_match_op_test.py [new file with mode: 0644]
tensorflow/python/kernel_tests/segment_reduction_ops_test.py
tensorflow/python/layers/base.py
tensorflow/python/layers/base_test.py
tensorflow/python/ops/math_ops.py
tensorflow/python/ops/string_ops.py
tensorflow/python/profiler/model_analyzer_test.py
tensorflow/python/saved_model/builder_impl.py
tensorflow/python/training/distribute.py
tensorflow/python/training/saver.py
tensorflow/python/util/tf_inspect.py
tensorflow/python/util/util.cc
tensorflow/python/util/util.h
tensorflow/stream_executor/blas.h
tensorflow/stream_executor/cuda/cuda_blas.cc
tensorflow/stream_executor/cuda/cuda_blas.h
tensorflow/stream_executor/stream.cc
tensorflow/stream_executor/stream.h
tensorflow/tensorflow.bzl
tensorflow/tools/api/generator/BUILD
tensorflow/tools/api/golden/tensorflow.pbtxt
tensorflow/tools/api/golden/tensorflow.strings.pbtxt [new file with mode: 0644]
tensorflow/tools/ci_build/install/install_pip_packages.sh
tensorflow/tools/docker/Dockerfile.devel
tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
tensorflow/tools/docker/Dockerfile.devel-gpu
tensorflow/tools/graph_transforms/README.md
tensorflow/tools/pip_package/build_pip_package.sh
tensorflow/tools/pip_package/setup.py
third_party/examples/eager/spinn/README.md
third_party/gpus/cuda_configure.bzl
third_party/mkl/BUILD

index 3dad41a..8669c25 100644 (file)
@@ -1,5 +1,16 @@
 # Contributing guidelines
 
+## Pull Request Checklist
+
+Before sending your pull requests, make sure you have followed this checklist.
+
+- Read [contributing guidelines](CONTRIBUTING.md).
+- Read [Code of Conduct](CODE_OF_CONDUCT.md).
+- Ensure you have signed the [Contributor License Agreement (CLA)](https://cla.developers.google.com/).
+- Check that your changes are consistent with the [guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#general-guidelines-and-philosophy-for-contribution).
+- Ensure your changes are consistent with the [Coding Style](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#c-coding-style).
+- Run the [Unit Tests](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#running-unit-tests).
+
 ## How to become a contributor and submit your own code
 
 ### Contributor License Agreements
index e1a50c8..6fb4486 100644 (file)
--- a/README.md
+++ b/README.md
@@ -5,9 +5,9 @@
 -----------------
 
 
-| **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
-|-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| **`Documentation`** |
+|-----------------|
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
@@ -40,15 +40,6 @@ environment to install the nightly TensorFlow build. We support CPU and GPU
 packages on Linux, Mac, and Windows.
 
 
-**Individual whl files**
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))
-* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
-([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
-
 #### *Try your first TensorFlow program*
 ```shell
 $ python
@@ -82,6 +73,30 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
 
+
+## Continuous build status
+
+### Official Builds
+
+| Build Type      | Status | Artifacts |
+| ---             | ---    | ---       |
+| **Linux CPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Linux GPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Linux XLA**   | TBA | TBA |
+| **MacOS**       | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows CPU** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows GPU** | [![Status](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/badge/icon)](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Android**     | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) [build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) |
+
+
+### Community Supported Builds
+
+| Build Type      | Status | Artifacts |
+| ---             | ---    | ---       |
+| **IBM s390x**       | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA |
+| **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA |
+
+
 ## For more information
 
 * [TensorFlow Website](https://www.tensorflow.org)
index 2717c75..84d9d52 100644 (file)
@@ -6,7 +6,7 @@
 * Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
 * Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
 * `tf.contrib.bayesflow` is moving out to its own repo.
-* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication<sup>[1](#rpc-issue)</sup>.
 
 ## Bug Fixes and Other Changes
 * `tf.data`:
   * Fix non-uniformity of orthogonal matrices.
   * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
 
+<a name="rpc-issue"><sup>1</sup></a> The cancellation logic of the RPC op contains a concurrency error. A fix has been submitted to master and will be part of the next release.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
 
 4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
 
-
 # Release 1.7.0
 
 ## Major Features And Improvements
@@ -235,7 +236,7 @@ Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田
   * Add `complex64` support to XLA compiler.
   * `bfloat` support is now added to XLA infrastructure.
   * Make `ClusterSpec` propagation work with XLA devices.
-  * Use a determinisitic executor to generate XLA graph.
+  * Use a deterministic executor to generate XLA graph.
 * `tf.contrib`:
   * `tf.contrib.distributions`:
     * Add `tf.contrib.distributions.Autoregressive`.
index a5ce3a6..01886b6 100644 (file)
@@ -173,7 +173,7 @@ the progress being made towards a fix and announcement.
 In addition, please include the following information along with your report:
 
 * Your name and affiliation (if any).
-* A description the technical details of the vulnerabilities. It is very
+* A description of the technical details of the vulnerabilities. It is very
   important to let us know how we can reproduce your findings.
 * An explanation of who can exploit this vulnerability, and what they gain when
   doing so -- write an attack scenario. This will help us evaluate your report
index 3a7f7b3..b6c3254 100644 (file)
@@ -1222,6 +1222,9 @@ def set_tf_cuda_compute_capabilities(environ_cp):
         ask_cuda_compute_capabilities, default_cuda_compute_capabilities)
     # Check whether all capabilities from the input is valid
     all_valid = True
+    # Remove all whitespace characters that users may have inserted by
+    # accident, since stray whitespace would cause the validation below to fail.
+    tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split())
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
       if not m:
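
The hunk above strips user-inserted whitespace before validating each comma-separated compute capability. A minimal standalone sketch of the same sanitization pattern (the function name is illustrative, and the regex here is slightly stricter than configure.py's check):

```python
import re

def parse_compute_capabilities(raw):
    # Drop any whitespace the user may have typed, e.g. "3.5, 5.2" -> "3.5,5.2",
    # so that each comma-separated entry matches the version pattern below.
    cleaned = ''.join(raw.split())
    for cap in cleaned.split(','):
        if not re.match(r'[0-9]+\.[0-9]+$', cap):
            raise ValueError('Invalid compute capability: %r' % cap)
    return cleaned.split(',')

print(parse_compute_capabilities('3.5, 5.2'))  # ['3.5', '5.2']
```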
index f06deba..6d1e332 100644 (file)
@@ -240,7 +240,7 @@ class Encapsulator {
   // Once edges between compiled and outside_compilation clusters have been
   // replaced by send/recv ops, some dependencies may no longer be apparent.
   // A clustering pass finds all the dependencies between HC nodes that are only
-  // present as a result of edges between nodes in outside_compilaton clusters.
+  // present as a result of edges between nodes in outside_compilation clusters.
   // Suppose there is a path from outside_compilation cluster C in subgraph S
   // to outside_compilation cluster D in subgraph T. If S != T then a control
   // edge is added from the call node for S to the call node for T, which
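
The comment is cut off by the diff context, but the dependency rule it states is clear. A rough illustration in Python (the helpers `subgraph_of`, `has_path`, and `add_control_edge` are hypothetical, not the pass's actual C++ API):

```python
def add_cross_subgraph_control_edges(clusters, subgraph_of, has_path,
                                     add_control_edge):
    """If outside_compilation cluster C (in subgraph S) reaches cluster D
    (in subgraph T) and S != T, S's call node must run before T's."""
    for c in clusters:
        for d in clusters:
            s, t = subgraph_of[c], subgraph_of[d]
            if s != t and has_path(c, d):
                add_control_edge(s, t)
```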
index c93c39e..39f8caa 100644 (file)
@@ -1 +1,7 @@
-This is the home of XLA.
+<p align="center">
+  <img width="200" src="xlalogo.png"/>
+</p>
+
+XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
+algebra that optimizes TensorFlow computations. See the
+[documentation](https://www.tensorflow.org/performance/xla/) for more details.
index e560abc..e9ec796 100644 (file)
@@ -35,7 +35,7 @@ namespace xla {
 
 // Tries to replace a conditional with a call operation of the corresponding
 // computation. If the given conditional has a constant predicate, tries to
-// replace it with a call to its true/false computation as appropirate and then
+// replace it with a call to its true/false computation as appropriate and then
 // inline that computation.
 //
 // Returns true if it made a change to the graph.
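
A hedged sketch of the rewrite rule described above, in Python rather than HLO terms (the names are illustrative; the real pass operates on HloInstructions):

```python
def simplify_conditional(pred, true_fn, false_fn):
    # If the predicate is a compile-time constant, replace the conditional
    # with a direct call to the chosen computation, which can then be inlined.
    if isinstance(pred, bool):
        chosen = true_fn if pred else false_fn
        return chosen()          # the "call" that replaces the conditional
    return None                  # non-constant predicate: leave it unchanged
```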
index dce2014..33d8338 100644 (file)
@@ -64,7 +64,7 @@ struct SpecialCaseCopyPolicy {
   // output tuple.
   bool copy_root_replicated_buffers = false;
   // If true, insert a copy if a buffer coming from a constant or a parameter
-  // is found wihtin the output tuple.
+  // is found within the output tuple.
   bool copy_parameters_and_constants = false;
 };
 
index 557aa4a..2e55181 100644 (file)
@@ -33,8 +33,8 @@ namespace cpu {
 // emitters for function and function argument access.
 // The llvm::Function is created with the standard function signature
 // used in the XLA CPU backend (see ir_function.cc for argument details).
-// In addtion IrFunction saves the callers IR insert point during contruction,
-// and restores it after desctruction.
+// In addition, IrFunction saves the caller's IR insert point during construction,
+// and restores it after destruction.
 //
 // Example usage:
 //
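
The C++ usage example is elided by the diff context, but the save/restore behavior described above is a classic RAII pattern. A rough Python analogue (the `insert_point` attribute on `builder` is hypothetical):

```python
import contextlib

@contextlib.contextmanager
def scoped_insert_point(builder):
    saved = builder.insert_point      # save the caller's IR insert point
    try:
        yield builder                 # emit code for the new function here
    finally:
        builder.insert_point = saved  # restore it when the scope ends
```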
index 33d02b7..db2cda2 100644 (file)
@@ -38,7 +38,7 @@ namespace cpu {
 //
 //     [0, 1), [1, 2), [2, 3), [3, 4), [4, 5) [5, 8)
 //
-//   Note that the last partition has residule because the dimension size is
+//   Note that the last partition has residual because the dimension size is
 //   not a multiple of the partition count.
 //
 //
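
A small sketch reproducing the example in the comment above: a dimension of size 8 split into 6 partitions, where the last partition absorbs the residual. This assumes equal-width base partitions, which matches the example but is not necessarily ShapePartition's exact algorithm:

```python
def partition_dim(size, count):
    base = size // count                       # minimum width per partition
    parts = [(i * base, (i + 1) * base) for i in range(count - 1)]
    parts.append(((count - 1) * base, size))   # last partition takes the residual
    return parts

print(partition_dim(8, 6))  # [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 8)]
```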
index af48f4a..cc1695b 100644 (file)
@@ -25,7 +25,7 @@ namespace xla {
 
 // Creates an HloPassPipeline containing multiple HloPasses that can
 // despecialize an optimized HloModule. This is useful to run an HloModule
-// optimized for one specfic platform on a different platform (undoing platform
+// optimized for one specific platform on a different platform (undoing platform
 // specific passes) with matching numerics for comparison.
 //
 // Current despecialization passes are Defuser, ImplicitBroadcastRemover,
index a1d4dca..b41eaa3 100644 (file)
@@ -38,7 +38,7 @@ namespace gpu {
 //
 // Examples of things that are not unnested computations:
 //
-//  - The reducer of a kReduce HLO.  This is emited using IrEmitterNested.
+//  - The reducer of a kReduce HLO.  This is emitted using IrEmitterNested.
 //  - The body of a fusion node.  IrEmitterUnnested emits the relevant code
 //    within a kernel function using FusedIrEmitter.  (FusedIrEmitter is not
 //    really an IrEmitter, but is more an "IR generator generator".)
index 2beac32..fa59a5f 100644 (file)
@@ -135,6 +135,7 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
 
 }  // namespace
 
+
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
   typed_visitors_[PRED] = MakeUnique<HloEvaluatorTypedVisitor<bool>>(this);
index 4c19a1b..0b21b25 100644 (file)
@@ -5,7 +5,7 @@ evaluating the result of the HLO graph directly with HloEvaluator, without
 lowering it further (to LLVM IR for example) before execution as other backends
 (CPU and GPU for example) do.
 
-Its key componenets are:
+Its key components are:
 
 *   [`InterpreterCompiler`] despite the inherited naming of "compiler", all
     `InterpreterCompiler` really does is the following:
index 8b4e079..c287cca 100644 (file)
@@ -282,8 +282,8 @@ class LayoutAssignment : public HloPassInterface {
   // the case that no particular layout is requested.
   //
   // channel_constraints is both an input and output. Any sends or recvs that
-  // are present in channel_constraints will be layed out as constrained. Any
-  // unconstrained sends or recvs will be layed out as locally optimal and their
+  // are present in channel_constraints will be laid out as constrained. Any
+  // unconstrained sends or recvs will be laid out as locally optimal and their
   // layout will be added as a constraint to channel_constraints.
   //
   // If channel_constraints is nullptr, no kSend or kRecvs must be contained
index e2c07e3..688ccef 100644 (file)
@@ -75,7 +75,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_after(
     return false;
   }
 
-  // Check that we haven't already inserted an equivalant reduce-precision
+  // Check that we haven't already inserted an equivalent reduce-precision
   // operation after this instruction.  (The zero-user case occurs when this is
   // the root instruction.)
   if (instruction->user_count() > 0) {
index a776d74..18e2651 100644 (file)
@@ -23,7 +23,7 @@ limitations under the License.
 namespace xla {
 namespace source_map_util {
 
-// Creates an INVALID_ARUGMENT status with the given format string.
+// Creates an INVALID_ARGUMENT status with the given format string.
 //
 // Also, attempts to extract the OpMetadata for parameter_number on executable
 // and append it to the status message for source mapping to user code.
index 73e0148..6f57658 100644 (file)
@@ -234,7 +234,7 @@ class ShapeUtil {
   }
 
   // Returns the higher-precision element type if a and b are both floating
-  // point types; otherwise, checks that that they have the same element type
+  // point types; otherwise, checks that they have the same element type
   // and returns it.
   static PrimitiveType HigherPrecisionElementType(const Shape& a,
                                                   const Shape& b) {
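
The rule in the comment above, restated as a hedged Python sketch over NumPy dtypes (the real implementation works on XLA `PrimitiveType`s, not NumPy types):

```python
import numpy as np

def higher_precision_element_type(a, b):
    floats = (np.float16, np.float32, np.float64)
    if a in floats and b in floats:
        # Both floating point: return the wider of the two types.
        return max(a, b, key=lambda t: np.dtype(t).itemsize)
    if a != b:
        raise ValueError('non-float element types must match')
    return a

assert higher_precision_element_type(np.float16, np.float32) is np.float32
```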
index efa5aed..0fd846c 100644 (file)
@@ -61,7 +61,7 @@ using TypesF16F32F64CF64 = ::testing::Types<Eigen::half, float>;
 #endif
 
 // Check that we can safely pass an input tuple's elements to a dot operation.
-TEST_F(DotOperationTest, DotOfInputTupleElem) {
+XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) {
   XlaBuilder builder(TestName());
 
   XlaOp param;
@@ -798,7 +798,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64,
       this->error_spec_);
 }
 
-TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
       {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
   std::unique_ptr<Array2D<float>> constant_rhs_array(
@@ -826,7 +826,7 @@ TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
       {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
   std::unique_ptr<Array2D<float>> constant_rhs_array(
@@ -855,7 +855,7 @@ TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
            DotOfGatherOptimizationWithConstRHSReverseMM)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -886,7 +886,7 @@ TEST_F(DotOperationTest,
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
            DotOfGatherOptimizationWithConstLHSReverseMM)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -917,7 +917,7 @@ TEST_F(DotOperationTest,
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(
            DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSRows)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -953,7 +953,7 @@ TEST_F(DotOperationTest,
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(
            DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSRows)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -989,7 +989,7 @@ TEST_F(DotOperationTest,
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(
            DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSCols)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
@@ -1017,7 +1017,7 @@ TEST_F(DotOperationTest,
 }
 
 // TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
        DISABLED_ON_CPU(DISABLED_ON_GPU(
            DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSCols)))) {
   std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
index 0984438..4118923 100644 (file)
@@ -514,7 +514,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
 class TupleHloTest : public HloTestBase {};
 
 // Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
   const char* testcase = R"(
     HloModule m
 
diff --git a/tensorflow/compiler/xla/xlalogo.png b/tensorflow/compiler/xla/xlalogo.png
new file mode 100644 (file)
index 0000000..7a0a295
Binary files /dev/null and b/tensorflow/compiler/xla/xlalogo.png differ
index 2600088..878bb7e 100644 (file)
@@ -33,7 +33,7 @@ DEFAULT_UNCOMPILED_MODULES = set((
     (utils.__name__,),
 
     # All of tensorflow's subpackages. Unlike the root tf module, they don't
-    # have well-known names. Not refering to the module directly to avoid
+    # have well-known names. Not referring to the module directly to avoid
     # circular imports.
     (
         utils.__name__[:-len('.contrib.autograph.utils')],),
index 9f72028..671c9cc 100644 (file)
@@ -174,7 +174,7 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
     Tuple containing the final state.
   """
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
-  # That could be somethins as simple as a collection of dispatch rules, with
+  # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
   if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
     return _tf_while_stmt(test, body, init_state, opts)
index c492ef1..5dd2e0c 100644 (file)
@@ -371,7 +371,7 @@ class GradientBoostedDecisionTreeModel(object):
     Returns:
       a dictionary of prediction results -
         ENSEMBLE_STAMP, PREDICTION, PARTITION_IDS,
-        NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPED.
+        NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPTED.
     """
     ensemble_stats = training_ops.tree_ensemble_stats(ensemble_handle,
                                                       ensemble_stamp)
index 44e39f7..0708d6b 100644 (file)
@@ -172,19 +172,20 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
   endif()
 endif()
 
+include(CheckCXXCompilerFlag)
+
+# OpenMP Support
+CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
+if (GCC_OPENMP_SUPPORT)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+endif()
+CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
+if (MSVC_OPENMP_SUPPORT)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+endif()
+
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
-  include(CheckCXXCompilerFlag)
-  if (tensorflow_ENABLE_MKL_SUPPORT)
-    add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
-    if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT)
-      add_definitions(-DINTEL_MKL_ML)
-    endif()
-  endif()
-  CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT)
-  if (COMPILER_OPT_OPENMP_SUPPORT)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-  endif()
   if (WIN32)
     CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
     if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
@@ -323,10 +324,13 @@ if(HAIKU)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
 endif()
 
+# MKL Support
 if (tensorflow_ENABLE_MKL_SUPPORT)
+  add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
   if (WIN32)
     find_path(MKL_HOME_PLATFORM mkl
       PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
       PATH_SUFFIXES windows)
     set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
     set(MKL_LINK_DIRS
@@ -345,6 +349,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
     # Fix me: complete the path on linux
     find_path(MKL_HOME_PLATFORM mkl
       HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
       PATH_SUFFIXES linux)
     set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
     set(MKL_LINK_DIRS) # incomplete
@@ -357,6 +362,8 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
     list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
     list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
     include_directories(${mkldnn_INCLUDE_DIRS})
+  else (tensorflow_ENABLE_MKLDNN_SUPPORT)
+    add_definitions(-DINTEL_MKL_ML)
   endif()
 endif (tensorflow_ENABLE_MKL_SUPPORT)
 
index 116d423..8942f3e 100644 (file)
@@ -31,7 +31,8 @@ else (systemlib_ZLIB)
   set(ZLIB_URL https://github.com/madler/zlib)
   set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
   set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
-  set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+  # Match zlib version in tensorflow/workspace.bzl
+  set(ZLIB_TAG v1.2.11)
 
   if(WIN32)
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
index 92f2ab6..5942ff3 100644 (file)
@@ -267,6 +267,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
+      # Flaky on Windows cpu with py36 (b/73556968)
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/sparse_reshape_op_test.py"
       # Windows file management related issues.
       "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"
       # training tests
index d269b5b..c483a43 100644 (file)
@@ -355,11 +355,15 @@ py_test(
     deps = [
         "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
index 5f47dcb..bdc003a 100644 (file)
@@ -18,6 +18,9 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import time
+from absl.testing import parameterized
 
 from tensorflow.contrib.data.python.ops import resampling
 from tensorflow.python.data.ops import dataset_ops
@@ -30,52 +33,98 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class ResampleTest(test.TestCase):
+def _time_resampling(
+    test_obj, data_np, target_dist, init_dist, num_to_sample):
+  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
 
-  def testInitialKnownDistribution(self):
-    self._testDistribution(initial_known=True)
+  # Reshape distribution via rejection sampling.
+  dataset = dataset.apply(
+      resampling.rejection_resample(
+          class_func=lambda x: x,
+          target_dist=target_dist,
+          initial_dist=init_dist,
+          seed=142))
 
-  def testInitialNotKnownDistribution(self):
-    self._testDistribution(initial_known=False)
+  get_next = dataset.make_one_shot_iterator().get_next()
 
-  def _testDistribution(self, initial_known):
+  with test_obj.test_session() as sess:
+    start_time = time.time()
+    for _ in xrange(num_to_sample):
+      sess.run(get_next)
+    end_time = time.time()
+
+  return end_time - start_time
+
+
+class ResampleTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("InitialDistributionKnown", True),
+      ("InitialDistributionUnknown", False))
+  def testDistribution(self, initial_known):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
     initial_dist = [0.2] * 5 if initial_known else None
-    iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
-        200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
-            resampling.rejection_resample(
-                target_dist=target_dist,
-                initial_dist=initial_dist,
-                class_func=lambda c, _: c,
-                seed=27)).make_one_shot_iterator())
-    get_next = iterator.get_next()
+    classes = math_ops.to_int64(classes)  # needed for Windows build.
+    dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
+        200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
+
+    get_next = dataset.apply(
+        resampling.rejection_resample(
+            target_dist=target_dist,
+            initial_dist=initial_dist,
+            class_func=lambda c, _: c,
+            seed=27)).make_one_shot_iterator().get_next()
 
     with self.test_session() as sess:
       returned = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          returned.append(sess.run(get_next))
+      while len(returned) < 4000:
+        returned.append(sess.run(get_next))
 
     returned_classes, returned_classes_and_data = zip(*returned)
     _, returned_data = zip(*returned_classes_and_data)
     self.assertAllEqual([compat.as_bytes(str(c))
                          for c in returned_classes], returned_data)
     total_returned = len(returned_classes)
-    # Subsampling rejects a large percentage of the initial data in
-    # this case.
-    self.assertGreater(total_returned, 20000 * 0.2)
     class_counts = np.array([
         len([True for v in returned_classes if v == c])
         for c in range(5)])
     returned_dist = class_counts / total_returned
     self.assertAllClose(target_dist, returned_dist, atol=1e-2)
 
+  @parameterized.named_parameters(
+      ("OnlyInitial", True),
+      ("NotInitial", False))
+  def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
+    init_dist = [0.5, 0.5]
+    target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test that this works.
+    num_samples = 100
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(data_np)
+
+    # Reshape distribution.
+    dataset = dataset.apply(
+        resampling.rejection_resample(
+            class_func=lambda x: x,
+            target_dist=target_dist,
+            initial_dist=init_dist))
+
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.test_session() as sess:
+      returned = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          returned.append(sess.run(get_next))
+
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
     num_classes = len(init_dist)
-    # We don't need many samples to test a dirac-delta target distribution
+    # We don't need many samples to test a dirac-delta target distribution.
     num_samples = 100
     data_np = np.random.choice(num_classes, num_samples, p=init_dist)
 
@@ -109,5 +158,23 @@ class ResampleTest(test.TestCase):
 
     self.assertAllClose(target_dist, bincount, atol=1e-2)
 
+
+class ResampleDatasetBenchmark(test.Benchmark):
+
+  def benchmarkResamplePerformance(self):
+    init_dist = [0.25, 0.25, 0.25, 0.25]
+    target_dist = [0.0, 0.0, 0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test a dirac-delta target distribution
+    num_samples = 1000
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    resample_time = _time_resampling(
+        self, data_np, target_dist, init_dist, num_to_sample=1000)
+
+    self.report_benchmark(
+        iters=1000, wall_time=resample_time, name="benchmark_resample")
+
+
 if __name__ == "__main__":
   test.main()
index 144460f..eceecfd 100644 (file)
@@ -214,6 +214,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":batching",
+        ":interleave_ops",
         ":scan_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -223,6 +224,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )
 
index a182ddd..bad6edd 100644 (file)
@@ -20,10 +20,12 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import logging_ops
@@ -50,79 +52,182 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
     A `Dataset` transformation function, which can be passed to
     @{tf.data.Dataset.apply}.
   """
-
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    dist_estimation_batch_size = 32
     target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
     class_values_ds = dataset.map(class_func)
+
+    # Get initial distribution.
     if initial_dist is not None:
       initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
-      acceptance_dist = _calculate_acceptance_probs(initial_dist_t,
-                                                    target_dist_t)
+      acceptance_dist, prob_of_original = (
+          _calculate_acceptance_probs_with_mixing(initial_dist_t,
+                                                  target_dist_t))
       initial_dist_ds = dataset_ops.Dataset.from_tensors(
           initial_dist_t).repeat()
       acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
           acceptance_dist).repeat()
+      prob_of_original_ds = dataset_ops.Dataset.from_tensors(
+          prob_of_original).repeat()
+    else:
+      initial_dist_ds = _estimate_initial_dist_ds(
+          target_dist_t, class_values_ds)
+      acceptance_and_original_prob_ds = initial_dist_ds.map(
+          lambda initial: _calculate_acceptance_probs_with_mixing(
+              initial, target_dist_t))
+      acceptance_dist_ds = acceptance_and_original_prob_ds.map(
+          lambda accept_prob, _: accept_prob)
+      prob_of_original_ds = acceptance_and_original_prob_ds.map(
+          lambda _, prob_original: prob_original)
+    filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
+                             class_values_ds, seed)
+    # Prefetch filtered dataset for speed.
+    filtered_ds = filtered_ds.prefetch(3)
+
+    prob_original_static = _get_prob_original_static(
+        initial_dist_t, target_dist_t) if initial_dist is not None else None
+    if prob_original_static == 1:
+      return dataset_ops.Dataset.zip((class_values_ds, dataset))
+    elif prob_original_static == 0:
+      return filtered_ds
     else:
-      num_classes = (target_dist_t.shape[0].value or
-                     array_ops.shape(target_dist_t)[0])
-      smoothing_constant = 10
-      initial_examples_per_class_seen = array_ops.fill(
-          [num_classes], np.int64(smoothing_constant))
-
-      def update_estimate_and_tile(num_examples_per_class_seen, c):
-        updated_examples_per_class_seen, dist = _estimate_data_distribution(
-            c, num_examples_per_class_seen)
-        tiled_dist = array_ops.tile(
-            array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
-        return updated_examples_per_class_seen, tiled_dist
-
-      initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
-                         .apply(scan_ops.scan(initial_examples_per_class_seen,
-                                              update_estimate_and_tile))
-                         .apply(batching.unbatch()))
-      acceptance_dist_ds = initial_dist_ds.map(
-          lambda initial: _calculate_acceptance_probs(initial, target_dist_t))
-
-    def maybe_warn_on_large_rejection(accept_dist, initial_dist):
-      proportion_rejected = math_ops.reduce_sum(
-          (1 - accept_dist) * initial_dist)
-      return control_flow_ops.cond(
-          math_ops.less(proportion_rejected, .5),
-          lambda: accept_dist,
-          lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
-              accept_dist, [proportion_rejected, initial_dist, accept_dist],
-              message="Proportion of examples rejected by sampler is high: ",
-              summarize=100,
-              first_n=10))
-
-    acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
-                                                   initial_dist_ds))
-                          .map(maybe_warn_on_large_rejection))
-
-    def _gather_and_copy(class_val, acceptance_prob, data):
-      return (class_val, array_ops.gather(acceptance_prob, class_val), data)
-    current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
-        (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
-    filtered_ds = (
-        current_probabilities_and_class_and_data_ds
-        .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
-    return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+      return interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
+          weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
+          seed=seed)
 
   return _apply_fn
 
 
-def _calculate_acceptance_probs(initial_probs, target_probs):
-  """Calculate the per-class acceptance rates.
+def _get_prob_original_static(initial_dist_t, target_dist_t):
+  """Returns the static probability of sampling from the original.
+
+  `tensor_util.constant_value(prob_of_original)` returns `None` if it encounters
+  an Op for which it isn't defined. We have some custom logic to avoid this.
+
+  Args:
+    initial_dist_t: A tensor of the initial distribution.
+    target_dist_t: A tensor of the target distribution.
+
+  Returns:
+    The probability of sampling from the original distribution as a constant,
+    if it is a constant, or `None`.
+  """
+  init_static = tensor_util.constant_value(initial_dist_t)
+  target_static = tensor_util.constant_value(target_dist_t)
+
+  if init_static is None or target_static is None:
+    return None
+  else:
+    return np.min(target_static / init_static)
+
+
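When both distributions are static constants, the mixing probability above reduces to a single NumPy expression; a small worked example (the distributions are illustrative):

    import numpy as np

    initial = np.array([0.5, 0.3, 0.2])  # observed class proportions
    target = np.array([1/3, 1/3, 1/3])   # desired class proportions
    m = np.min(target / initial)         # what _get_prob_original_static returns
    # m ~= 0.667: two thirds of draws can come straight from the original.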
+def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
+               seed):
+  """Filters a dataset based on per-class acceptance probabilities.
 
   Args:
-    initial_probs: The class probabilities of the data.
-    target_probs: The desired class proportion in minibatches.
+    dataset: The dataset to be filtered.
+    acceptance_dist_ds: A dataset of acceptance probabilities.
+    initial_dist_ds: A dataset of the initial probability distribution, given or
+        estimated.
+    class_values_ds: A dataset of the corresponding classes.
+    seed: (Optional.) Python integer seed for the resampler.
+
   Returns:
-    A list of the per-class acceptance probabilities.
+    A dataset of (class value, data) after filtering.
+  """
+  def maybe_warn_on_large_rejection(accept_dist, initial_dist):
+    proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
+    return control_flow_ops.cond(
+        math_ops.less(proportion_rejected, .5),
+        lambda: accept_dist,
+        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
+            accept_dist, [proportion_rejected, initial_dist, accept_dist],
+            message="Proportion of examples rejected by sampler is high: ",
+            summarize=100,
+            first_n=10))
+
+  acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
+                                                 initial_dist_ds))
+                        .map(maybe_warn_on_large_rejection))
+
+  def _gather_and_copy(class_val, acceptance_prob, data):
+    return class_val, array_ops.gather(acceptance_prob, class_val), data
+
+  current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
+      (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
+  filtered_ds = (
+      current_probabilities_and_class_and_data_ds
+      .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
+  return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+
+
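A NumPy stand-in for the filter predicate above (the acceptance values are illustrative): each (class, data) element survives with probability `acceptance[class]`, which is what the `random_uniform < p` comparison implements per element.

    import numpy as np

    acceptance = np.array([0.0, 0.444, 1.0])  # per-class acceptance probs
    classes = np.array([0, 1, 2, 1])          # class value of each element
    keep = np.random.uniform(size=classes.shape) < acceptance[classes]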
+def _estimate_initial_dist_ds(
+    target_dist_t, class_values_ds, dist_estimation_batch_size=32,
+    smoothing_constant=10):
+  num_classes = (target_dist_t.shape[0].value or
+                 array_ops.shape(target_dist_t)[0])
+  initial_examples_per_class_seen = array_ops.fill(
+      [num_classes], np.int64(smoothing_constant))
+
+  def update_estimate_and_tile(num_examples_per_class_seen, c):
+    updated_examples_per_class_seen, dist = _estimate_data_distribution(
+        c, num_examples_per_class_seen)
+    tiled_dist = array_ops.tile(
+        array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
+    return updated_examples_per_class_seen, tiled_dist
 
-  This method is based on solving the following analysis:
+  initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
+                     .apply(scan_ops.scan(initial_examples_per_class_seen,
+                                          update_estimate_and_tile))
+                     .apply(batching.unbatch()))
+
+  return initial_dist_ds
+
+
+def _get_target_to_initial_ratio(initial_probs, target_probs):
+  # Add tiny to initial_probs to avoid divide by zero.
+  denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
+  return target_probs / denom
+
+
+def _estimate_data_distribution(c, num_examples_per_class_seen):
+  """Estimate data distribution as labels are seen.
+
+  Args:
+    c: The class labels.  Type `int32`, shape `[batch_size]`.
+    num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
+      containing counts.
+
+  Returns:
+    num_examples_per_class_seen: Updated counts.  Type `int64`, shape
+      `[num_classes]`.
+    dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
+  """
+  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  # Update the class-count based on what labels are seen in batch.
+  num_examples_per_class_seen = math_ops.add(
+      num_examples_per_class_seen, math_ops.reduce_sum(
+          array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
+  init_prob_estimate = math_ops.truediv(
+      num_examples_per_class_seen,
+      math_ops.reduce_sum(num_examples_per_class_seen))
+  dist = math_ops.cast(init_prob_estimate, dtypes.float32)
+  return num_examples_per_class_seen, dist
+
+
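The count update in `_estimate_data_distribution` is a one-hot sum; together with the smoothing fill from `_estimate_initial_dist_ds`, it behaves like this NumPy sketch (the batch contents are illustrative):

    import numpy as np

    counts = np.full(3, 10, dtype=np.int64)    # smoothing_constant = 10
    batch = np.array([0, 0, 2])                # labels seen in one batch
    counts += np.bincount(batch, minlength=3)  # one_hot(...).sum(0) -> [12, 10, 11]
    dist = (counts / counts.sum()).astype(np.float32)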
+def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs):
+  """Calculates the acceptance probabilities and mixing ratio.
+
+  In this case, we assume that we can *either* sample from the original data
+  distribution with probability `m`, or sample from a reshaped distribution
+  that comes from rejection sampling on the original distribution. This
+  rejection sampling is done on a per-class basis, with `a_i` representing the
+  probability of accepting data from class `i`.
+
+  This method is based on solving the following analysis for the reshaped
+  distribution:
 
   Let F be the probability of a rejection (on any example).
   Let p_i be the proportion of examples in the data in class i (init_probs)
@@ -151,39 +256,39 @@ def _calculate_acceptance_probs(initial_probs, target_probs):
   0 <= t_i <= 1, sum_i(t_i) = 1
   ```
 
-
   A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
-  """
-  # Add tiny to initial_probs to avoid divide by zero.
-  denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
-  ratio_l = target_probs / denom
 
-  # Calculate list of acceptance probabilities.
-  max_ratio = math_ops.reduce_max(ratio_l)
-  return ratio_l / max_ratio
+  If we try to minimize the amount of data rejected, we get the following:
 
+  M_max = max_i [ t_i / p_i ]
+  M_min = min_i [ t_i / p_i ]
 
-def _estimate_data_distribution(c, num_examples_per_class_seen):
-  """Estimate data distribution as labels are seen.
+  The desired probability of accepting data if it comes from class `i`:
+
+  a_i = (t_i/p_i - m) / (M_max - m)
+
+  The desired probability of pulling a data element from the original dataset,
+  rather than the filtered one:
+
+  m = M_min
 
   Args:
-    c: The class labels.  Type `int32`, shape `[batch_size]`.
-    num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
-      containing counts.
+    initial_probs: A Tensor of the initial probability distribution, given or
+      estimated.
+    target_probs: A Tensor of the target probability distribution.
 
   Returns:
-    num_examples_per_lass_seen: Updated counts.  Type `int64`, shape
-      `[num_classes]`.
-    dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
+    A tuple of (a 1D Tensor with the per-class acceptance probabilities, the
+    desired probability of pulling from the original distribution).
   """
-  num_classes = num_examples_per_class_seen.get_shape()[0].value
-  # Update the class-count based on what labels are seen in batch.
-  num_examples_per_class_seen = math_ops.add(
-      num_examples_per_class_seen, math_ops.reduce_sum(
-          array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
-  init_prob_estimate = math_ops.truediv(
-      num_examples_per_class_seen,
-      math_ops.reduce_sum(num_examples_per_class_seen))
-  dist = math_ops.cast(init_prob_estimate, dtypes.float32)
-  return num_examples_per_class_seen, dist
+  ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs)
+  max_ratio = math_ops.reduce_max(ratio_l)
+  min_ratio = math_ops.reduce_min(ratio_l)
+
+  # Target prob to sample from original distribution.
+  m = min_ratio
+
+  # TODO(joelshor): Simplify fraction, if possible.
+  a_i = (ratio_l - m) / (max_ratio - m)
+  return a_i, m
\ No newline at end of file
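Plugging numbers into the formulas above (the distributions are illustrative):

    import numpy as np

    p = np.array([0.5, 0.3, 0.2])        # initial class proportions
    t = np.array([1/3, 1/3, 1/3])        # target class proportions
    ratio = t / p                        # [0.667, 1.111, 1.667]
    m = ratio.min()                      # prob of the original dataset: ~0.667
    a = (ratio - m) / (ratio.max() - m)  # acceptance probs: [0.0, 0.444, 1.0]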
index ecdb896..268c8d0 100644 (file)
@@ -53,7 +53,7 @@ class CholeskyOuterProduct(bijector.Bijector):
   its spectrum), and that the product of two positive-diagonal lower-triangular
   matrices is another positive-diagonal lower-triangular matrix.
 
-  A simple inductive argument (proceding one column of L_3 at a time) shows
+  A simple inductive argument (proceeding one column of L_3 at a time) shows
   that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
   diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
 
index 762685d..4384431 100644 (file)
@@ -1,6 +1,6 @@
 # Eager Execution
 
-Eager execution provides an imperative interface to TensorFlow (similiar to
+Eager execution provides an imperative interface to TensorFlow (similar to
 [NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
 operations execute immediately; you do not execute a pre-constructed graph with
 [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
index a8d5a0d..bf2aa75 100644 (file)
@@ -53,7 +53,7 @@ Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
                        int32 samples_per_second, int32 channel_count,
                        const std::vector<float>& samples, string* output_data);
 
-// Reads an video file using ffmpeg adn converts it into a RGB24 in uint8
+// Reads a video file using ffmpeg and converts it into RGB24 in uint8
 // [frames, height, width, 3]. The w, h, and frames are obtained from ffmpeg.
 Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
                      uint32* width, uint32* height, uint32* frames);
index bd764ed..72835c3 100644 (file)
@@ -202,7 +202,7 @@ class CriticalSection(object):
         or lazy way that may cause a deadlock.
       ValueError: If `exclusive_resource_access` is not provided (is `True`) and
         another `CriticalSection` has an execution requesting the same
-        resources as in `*args`, `**kwargs`, and any additionaly captured
+        resources as in `*args`, `**kwargs`, and any additionally captured
         inputs in `fn`.  Note, even if `exclusive_resource_access` is `True`,
         if another execution in another `CriticalSection` was created without
         `exclusive_resource_access=True`, a `ValueError` will be raised.
index e3fc6bf..4092b32 100644 (file)
@@ -112,6 +112,7 @@ class GANEstimator(estimator.Estimator):
                generator_optimizer=None,
                discriminator_optimizer=None,
                get_hooks_fn=None,
+               get_eval_metric_ops_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
                config=None):
@@ -146,6 +147,9 @@ class GANEstimator(estimator.Estimator):
         list of hooks. These hooks are run on the generator and discriminator
         train ops, and can be used to implement the GAN training scheme.
         Defaults to `train.get_sequential_train_hooks()`.
+      get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+        dict of metric results keyed by name. The output of this function is
+        passed into `tf.estimator.EstimatorSpec` during evaluation.
       add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
@@ -160,7 +164,8 @@ class GANEstimator(estimator.Estimator):
               else discriminator_optimizer)
       gan_head = head_lib.gan_head(
           generator_loss_fn, discriminator_loss_fn, gopt, dopt,
-          use_loss_summaries, get_hooks_fn=get_hooks_fn)
+          use_loss_summaries, get_hooks_fn=get_hooks_fn,
+          get_eval_metric_ops_fn=get_eval_metric_ops_fn)
       return _gan_model_fn(
           features, labels, mode, generator_fn, discriminator_fn, gan_head,
           add_summaries)
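A sketch of how the new argument is wired up from user code, mirroring the test that follows (`my_generator_fn` and `my_discriminator_fn` are placeholders):

    def get_metrics(gan_model):
      return {
          'mse_custom_metric': tf.metrics.mean_squared_error(
              gan_model.real_data, gan_model.generated_data)
      }

    est = tf.contrib.gan.estimator.GANEstimator(
        generator_fn=my_generator_fn,          # placeholder
        discriminator_fn=my_discriminator_fn,  # placeholder
        generator_loss_fn=tf.contrib.gan.losses.wasserstein_generator_loss,
        discriminator_loss_fn=tf.contrib.gan.losses.wasserstein_discriminator_loss,
        generator_optimizer=tf.train.GradientDescentOptimizer(1.0),
        discriminator_optimizer=tf.train.GradientDescentOptimizer(1.0),
        get_eval_metric_ops_fn=get_metrics)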
index 387a62b..9554825 100644 (file)
@@ -38,6 +38,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -194,6 +195,12 @@ class GANEstimatorIntegrationTest(test.TestCase):
       lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9)
       return training.GradientDescentOptimizer(lr)
 
+    def get_metrics(gan_model):
+      return {
+          'mse_custom_metric': metrics_lib.mean_squared_error(
+              gan_model.real_data, gan_model.generated_data)
+      }
+
     gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
     dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
     est = estimator.GANEstimator(
@@ -203,6 +210,7 @@ class GANEstimatorIntegrationTest(test.TestCase):
         discriminator_loss_fn=losses.wasserstein_discriminator_loss,
         generator_optimizer=gopt,
         discriminator_optimizer=dopt,
+        get_eval_metric_ops_fn=get_metrics,
         model_dir=self._model_dir)
 
     # TRAIN
@@ -213,6 +221,9 @@ class GANEstimatorIntegrationTest(test.TestCase):
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
     self.assertIn('loss', six.iterkeys(scores))
+    self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
+                     scores['loss'])
+    self.assertIn('mse_custom_metric', six.iterkeys(scores))
 
     # PREDICT
     predictions = np.array([x for x in est.predict(predict_input_fn)])
index a21358c..ff903a7 100644 (file)
@@ -25,17 +25,21 @@ from tensorflow.contrib.gan.python import train as tfgan_train
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.canned import head
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import metrics as metrics_lib
 
 __all__ = [
     'GANHead',
     'gan_head',
 ]
 
+def _summary_key(head_name, val):
+  return '%s/%s' % (val, head_name) if head_name else val
+
 
 def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
              discriminator_optimizer, use_loss_summaries=True,
              get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
-             name=None):
+             get_eval_metric_ops_fn=None, name=None):
   """Creates a `GANHead`.
 
   Args:
@@ -47,9 +51,12 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
     discriminator_optimizer: Same as `generator_optimizer`, but for the
       discriminator updates.
     use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
-        If `None`, uses defaults.
-    get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks.
+      If `None`, uses defaults.
+    get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+      list of hooks.
+    get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+      dict of metric results keyed by name. The output of this function is
+      passed into `tf.estimator.EstimatorSpec` during evaluation.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`.
 
@@ -62,6 +69,7 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
                  discriminator_optimizer=discriminator_optimizer,
                  use_loss_summaries=use_loss_summaries,
                  get_hooks_fn=get_hooks_fn,
+                 get_eval_metric_ops_fn=get_eval_metric_ops_fn,
                  name=name)
 
 
@@ -72,6 +80,7 @@ class GANHead(head._Head):  # pylint: disable=protected-access
                generator_optimizer, discriminator_optimizer,
                use_loss_summaries=True,
                get_hooks_fn=None,
+               get_eval_metric_ops_fn=None,
                name=None):
     """`Head` for GAN training.
 
@@ -85,8 +94,11 @@ class GANHead(head._Head):  # pylint: disable=protected-access
         discriminator updates.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
-      get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks. Defaults to `train.get_sequential_train_hooks()`
+      get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+        list of hooks. Defaults to `train.get_sequential_train_hooks()`
+      get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+        dict of metric results keyed by name. The output of this function is
+        passed into `tf.estimator.EstimatorSpec` during evaluation.
       name: name of the head. If provided, summary and metrics keys will be
         suffixed by `"/" + name`.
     """
@@ -104,6 +116,8 @@ class GANHead(head._Head):  # pylint: disable=protected-access
     self._generator_optimizer = generator_optimizer
     self._discriminator_optimizer = discriminator_optimizer
     self._get_hooks_fn = get_hooks_fn
+    self._get_eval_metric_ops_fn = get_eval_metric_ops_fn
+    self._name = name
 
   @property
   def name(self):
@@ -173,13 +187,26 @@ class GANHead(head._Head):  # pylint: disable=protected-access
         gan_loss = self.create_loss(
             features=None, mode=mode, logits=gan_model, labels=None)
         scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+        with ops.name_scope(None, 'metrics',
+                            [gan_loss.generator_loss,
+                             gan_loss.discriminator_loss]):
+          eval_metric_ops = {
+              _summary_key(self._name, 'generator_loss'):
+                  metrics_lib.mean(gan_loss.generator_loss),
+              _summary_key(self._name, 'discriminator_loss'):
+                  metrics_lib.mean(gan_loss.discriminator_loss)
+          }
+          if self._get_eval_metric_ops_fn is not None:
+            custom_eval_metric_ops = self._get_eval_metric_ops_fn(gan_model)
+            if not isinstance(custom_eval_metric_ops, dict):
+              raise TypeError('get_eval_metric_ops_fn must return a dict, '
+                              'received: {}'.format(custom_eval_metric_ops))
+            eval_metric_ops.update(custom_eval_metric_ops)
         return model_fn_lib.EstimatorSpec(
             mode=model_fn_lib.ModeKeys.EVAL,
             predictions=gan_model.generated_data,
             loss=scalar_loss,
-            # TODO(joelshor): Add metrics. If head name provided, append it to
-            # metric keys.
-            eval_metric_ops={})
+            eval_metric_ops=eval_metric_ops)
       elif mode == model_fn_lib.ModeKeys.TRAIN:
         if train_op_fn is None:
           raise ValueError('train_op_fn can not be None.')
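Given the `_summary_key` helper defined earlier in this file, the loss metrics land under namespaced keys when a head `name` is set (a behavioral sketch from its one-line definition):

    _summary_key(None, 'generator_loss')   # -> 'generator_loss'
    _summary_key('gan', 'generator_loss')  # -> 'generator_loss/gan'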
index 8168f00..6587f1f 100644 (file)
@@ -62,9 +62,14 @@ class GANHeadTest(test.TestCase):
         generator_loss_fn=dummy_loss,
         discriminator_loss_fn=dummy_loss,
         generator_optimizer=training.GradientDescentOptimizer(1.0),
-        discriminator_optimizer=training.GradientDescentOptimizer(1.0))
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        get_eval_metric_ops_fn=self.get_metrics)
     self.assertTrue(isinstance(self.gan_head, head.GANHead))
 
+  def get_metrics(self, gan_model):
+    self.assertTrue(isinstance(gan_model, tfgan_tuples.GANModel))
+    return {}
+
   def _test_modes_helper(self, mode):
     self.gan_head.create_estimator_spec(
         features=None,
index df71187..a9b8faa 100644 (file)
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Miscellanous utilities for TFGAN code and examples."""
+"""Miscellaneous utilities for TFGAN code and examples."""
 
 from __future__ import absolute_import
 from __future__ import division
index a320a3f..592d37b 100644 (file)
@@ -677,7 +677,7 @@ def copy_with_input_replacements(sgv, replacement_ts,
 
 
 def _add_control_flow_ops(ops, control_ios):
-  """Complete `ops` so that the tranformed graph is valid.
+  """Complete `ops` so that the transformed graph is valid.
 
   Partially copying a graph can lead to a malformed graph. For instance,
   copying half of a while construct is likely to result in an invalid graph.
index 6028195..66939fb 100644 (file)
@@ -115,7 +115,7 @@ static void CheckOpsSupport(const GraphDef& graph_def,
       HexagonOpsDefinitions::getInstance();
   LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
   LOG(INFO) << "dump_all_nodes = " << dump_all_nodes
-            << ", dump_shape_and_tpye = " << dump_shape_and_type;
+            << ", dump_shape_and_type = " << dump_shape_and_type;
 
   std::unordered_set<string> unsupported_ops;
   bool all_supported = true;
index 8f406ac..f230d93 100755 (executable)
@@ -17,7 +17,7 @@
 ### API
 
 This module provides functions for image manipulation; currently, chrominance
-transformas (including changing saturation and hue) in YIQ space and
+transforms (including changing saturation and hue) in YIQ space and
 projective transforms (including rotation) are supported.
 
 ## Image Transformation `Ops`
index b261f41..d6b1a61 100644 (file)
@@ -325,7 +325,7 @@ def distributed_grads_only_and_ops_chief_worker(
 
   All workers perform gradient computation. Chief worker applies gradient after
   averaging the gradients obtained from all the workers. All workers block
-  execution untill the update is applied. Chief worker runs covariance and
+  execution until the update is applied. Chief worker runs covariance and
   inverse update ops. Covariance and inverse matrices are placed on parameter
   servers in a round robin manner. For further details on synchronous
   distributed optimization check `tf.train.SyncReplicasOptimizer`.
index 45a760c..b7f63d8 100644 (file)
@@ -66,7 +66,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           the local approximation with the Fisher information matrix, and to
           regularize the update direction by making it closer to the gradient.
           If damping is adapted during training then this value is used for
-          initializing damping varaible.
+          initializing damping variable.
           (Higher damping means the update looks more like a standard gradient
           update - see Tikhonov regularization.)
       layer_collection: The layer collection object, which holds the fisher
@@ -114,7 +114,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     self._estimation_mode = estimation_mode
     self._colocate_gradients_with_ops = colocate_gradients_with_ops
 
-    # The below paramaters are required only if damping needs to be adapated.
+    # The parameters below are required only if damping needs to be adapted.
     # These parameters can be set by calling
     # set_damping_adaptation_params() explicitly.
     self._damping_adaptation_decay = 0.95
@@ -195,7 +195,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       min_damping: `float`(Optional), Minimum value the damping parameter
         can take. Default value 1e-5.
       damping_adaptation_decay: `float`(Optional), The `damping` parameter is
-        multipled by the `damping_adaptation_decay` every
+        multiplied by the `damping_adaptation_decay` every
         `damping_adaptation_interval` number of iterations. Default value 0.99.
       damping_adaptation_interval: `int`(Optional), Number of steps in between
         updating the `damping` parameter. Default value 5.
index 8a20ebe..c445432 100644 (file)
@@ -51,7 +51,7 @@ class RoundRobinPlacementMixin(object):
     self._inv_devices = inv_devices
 
   def make_vars_and_create_op_thunks(self, scope=None):
-    """Make vars and create op thunks w/ a round-robin device placement strat.
+    """Make vars and create op thunks w/ a round-robin device placement start.
 
     For each factor, all of that factor's cov variables and their associated
     update ops will be placed on a particular device.  A new device is chosen
index f708da6..b7194ae 100644 (file)
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality. When set, the
+      corresponding convolution is used (e.g. 2 for Conv 2D, 3 for Conv 3D).
+      When left as None, the convolution dimensionality is inferred from the
+      input rank (i.e. Conv ND, with N = input_rank - 2), as sketched below.
 
   Returns:
     A tensor representing the output of the operation.
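A sketch of the rank check this adds, mirroring the new test further down (shapes are illustrative):

    import tensorflow as tf
    from tensorflow.contrib import layers

    images = tf.random_uniform((5, 7, 9, 3))   # rank-4 NHWC batch
    net = layers.convolution2d(images, 32, 3)  # ok: conv_dims=2, rank 4
    # layers.convolution3d(images, 32, 3) would raise:
    #   ValueError: Convolution expects input with rank 5, got 4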
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1411,7 +1543,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
index 997f910..b01fd5d 100644 (file)
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
index c7cdb41..f8106d1 100644 (file)
@@ -343,7 +343,8 @@ def get_temp_export_dir(timestamped_export_dir):
   """
   (dirname, basename) = os.path.split(timestamped_export_dir)
   temp_export_dir = os.path.join(
-      compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
+      compat.as_bytes(dirname),
+      compat.as_bytes('temp-{}'.format(compat.as_text(basename))))
   return temp_export_dir
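The added `compat.as_text` matters under Python 3, where formatting a bytes basename directly embeds its repr (a sketch; the timestamp value is illustrative):

    'temp-{}'.format(b'1526000000')
    # -> "temp-b'1526000000'"  (wrong directory name)
    'temp-{}'.format(compat.as_text(b'1526000000'))
    # -> 'temp-1526000000'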
 
 
index 01c76b7..55b984f 100644 (file)
@@ -6,8 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
 
-exports_files(["LICENSE"])
-
 exports_files(glob([
     "testdata/*.bin",
     "testdata/*.pb",
index 1053cce..cc8a803 100644 (file)
@@ -1,4 +1,3 @@
-
 # Find where we're running from, so we can store generated files here.
 ifeq ($(origin MAKEFILE_DIR), undefined)
        MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
@@ -69,12 +68,12 @@ LIB_NAME := libtensorflow-lite.a
 LIB_PATH := $(LIBDIR)$(LIB_NAME)
 
 # A small example program that shows how to link against the library.
-BENCHMARK_PATH := $(BINDIR)benchmark_model
+MINIMAL_PATH := $(BINDIR)minimal
 
-BENCHMARK_SRCS := \
-tensorflow/contrib/lite/tools/benchmark_model.cc
-BENCHMARK_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS))))
+MINIMAL_SRCS := \
+tensorflow/contrib/lite/examples/minimal/minimal.cc
+MINIMAL_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS))))
 
 # What sources we want to compile, must be kept in sync with the main Bazel
 # build files.
@@ -100,7 +99,7 @@ $(wildcard tensorflow/contrib/lite/*/*test.cc) \
 $(wildcard tensorflow/contrib/lite/*/*/*test.cc) \
 $(wildcard tensorflow/contrib/lite/*/*/*/*test.cc) \
 $(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \
-$(BENCHMARK_SRCS)
+$(MINIMAL_SRCS)
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 # File names of the intermediate files target compilation generates.
@@ -119,17 +118,17 @@ $(OBJDIR)%.o: %.c
        $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
 # The target that's compiled if there's no command-line arguments.
-all: $(LIB_PATH) $(BENCHMARK_PATH)
+all: $(LIB_PATH) $(MINIMAL_PATH)
 
 # Gathers together all the objects we've compiled into a single '.a' archive.
 $(LIB_PATH): $(LIB_OBJS)
        @mkdir -p $(dir $@)
        $(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
 
-$(BENCHMARK_PATH): $(BENCHMARK_OBJS) $(LIB_PATH)
+$(MINIMAL_PATH): $(MINIMAL_OBJS) $(LIB_PATH)
        @mkdir -p $(dir $@)
        $(CXX) $(CXXFLAGS) $(INCLUDES) \
-       -o $(BENCHMARK_PATH) $(BENCHMARK_OBJS) \
+       -o $(MINIMAL_PATH) $(MINIMAL_OBJS) \
        $(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
 # Gets rid of all generated files.
diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc
new file mode 100644 (file)
index 0000000..106e3b0
--- /dev/null
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include <cstdio>
+
+// This is a minimal example that reads a model from disk and performs
+// inference. No input data is loaded here; providing it is up to you as
+// the user.
+//
+// NOTE: Do not add any dependencies to this that cannot be built with
+// the minimal makefile. This example must remain trivial to build with
+// the minimal build tool.
+//
+// Usage: minimal <tflite model>
+
+using namespace tflite;
+
+#define TFLITE_MINIMAL_CHECK(x) \
+  if(!(x)) {                                                    \
+    fprintf(stderr, "Error at %s:%d\n",  __FILE__, __LINE__); \
+    exit(1); \
+  }
+
+
+int main(int argc, char *argv[]) {
+  if (argc != 2) {
+    fprintf(stderr, "Usage: %s <model>\n", argv[0]);
+    return 1;
+  }
+  const char* filename = argv[1];
+
+  // Load model
+  std::unique_ptr<tflite::FlatBufferModel> model
+      = tflite::FlatBufferModel::BuildFromFile(filename);
+  TFLITE_MINIMAL_CHECK(model != nullptr);
+
+  // Build the interpreter
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  InterpreterBuilder builder(*model.get(), resolver);
+  std::unique_ptr<Interpreter> interpreter;
+  builder(&interpreter);
+  TFLITE_MINIMAL_CHECK(interpreter != nullptr);
+
+  // Allocate tensor buffers.
+  TFLITE_MINIMAL_CHECK(interpreter->AllocateTensors() == kTfLiteOk);
+
+  // Fill input buffers
+  // TODO(user): Insert code to fill input tensors
+
+  // Run inference
+  TFLITE_MINIMAL_CHECK(interpreter->Invoke() == kTfLiteOk);
+
+  // Read output buffers
+  // TODO(user): Insert getting data out code.
+
+  return 0;
+}
index 7a3a231..ab50789 100644 (file)
@@ -32,7 +32,7 @@ This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc v
 
 Log in to your RPI and install the toolchain.
 ```bash
-sudo apt-get instal build-essential
+sudo apt-get install build-essential
 ```
 
 First, clone this TensorFlow repository. Run this at the root of the repository:
index f23b90d..d48178d 100644 (file)
@@ -3387,7 +3387,7 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data,
                           const int32 output_zeropoint,
                           const float output_scale) {
   // The arguments input_zeropoint and input_scale are expected to be an array
-  // that have the quantization paramaters for all the inputs to the concat
+  // that have the quantization parameters for all the inputs to the concat
   // operator.
   gemmlowp::ScopedProfilingLabel label("Concatenation");
   TFLITE_DCHECK_GT(inputs_count, 1);
index f6d8d32..62d6fe0 100644 (file)
@@ -1789,7 +1789,7 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data,
                           const int32 output_zeropoint,
                           const float output_scale) {
   // The arguments input_zeropoint and input_scale are expected to be an array
-  // that have the quantization paramaters for all the inputs to the concat
+  // that have the quantization parameters for all the inputs to the concat
   // operator.
   TFLITE_DCHECK_GT(inputs_count, 1);
   int64_t concat_size = 0;
@@ -1975,7 +1975,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // requiring a power-of-two representation interval. Thus, we should right
 // away quantize this array to a power-of-two interval; otherwise,
 // implementation will need to rescale that, losing any benefit that a tighter
-// representation interval might otherwise yield, while introducting some
+// representation interval might otherwise yield, while introducing some
 // numerical error and computational overhead.
 //
 // Now, Logistic and Tanh
index e5b640f..8bdeb03 100644 (file)
@@ -65,7 +65,7 @@ table Tensor {
   quantization:QuantizationParameters;  // Optional.
 }
 
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
 // ones, but not by much. Moreover, while custom operators accept an opaque
 // object containing configuration parameters, builtins have a predetermined
 // set of acceptable options.
index 852ea98..891d836 100644 (file)
@@ -48,7 +48,7 @@ table Tensor {
   quantization:QuantizationParameters;  // Optional.
 }
 
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
 // ones, but not by much. Moreover, while custom operators accept an opaque
 // object containing configuration parameters, builtins have a predetermined
 // set of acceptable options.
index 06cd940..b438b56 100644 (file)
@@ -53,7 +53,7 @@ table Tensor {
   quantization:QuantizationParameters;  // Optional.
 }
 
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
 // ones, but not by much. Moreover, while custom operators accept an opaque
 // object containing configuration parameters, builtins have a predetermined
 // set of acceptable options.
index 96731c8..b90408f 100644 (file)
@@ -54,7 +54,7 @@ table Tensor {
   quantization:QuantizationParameters;  // Optional.
 }
 
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
 // ones, but not by much. Moreover, while custom operators accept an opaque
 // object containing configuration parameters, builtins have a predetermined
 // set of acceptable options.
index cedefe0..020da38 100644 (file)
@@ -53,7 +53,7 @@ table Tensor {
   type:TensorType;
   // An index that refers to the buffers table at the root of the model. Or,
   // if there is no data buffer associated (i.e. intermediate results), then
-  // this is 0 (which refers to an always existant empty buffer).
+  // this is 0 (which refers to an always-present empty buffer).
   //
   // The data_buffer itself is an opaque container, with the assumption that the
   // target device is little-endian. In addition, all builtin operators assume
@@ -64,7 +64,7 @@ table Tensor {
   quantization:QuantizationParameters;  // Optional.
 }
 
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
 // ones, but not by much. Moreover, while custom operators accept an opaque
 // object containing configuration parameters, builtins have a predetermined
 // set of acceptable options.
index 07d2b28..0e036bd 100644 (file)
@@ -109,7 +109,7 @@ KNOWN_BUGS = {
 
 
 class ExtraTocoOptions(object):
-  """Additonal toco options besides input, output, shape."""
+  """Additional toco options besides input, output, shape."""
 
   def __init__(self):
     # Whether to ignore control dependency nodes.
@@ -2016,7 +2016,7 @@ def make_lstm_tests(zip_path):
     return inputs_after_split, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    """Feed inputs, assign vairables, and freeze graph."""
+    """Feed inputs, assign variables, and freeze graph."""
 
     with tf.variable_scope("", reuse=True):
       kernel = tf.get_variable("rnn/basic_lstm_cell/kernel")
index 1f07068..8cab6cd 100644 (file)
@@ -227,8 +227,8 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
   if (!IsValid()) return;
   auto* tensor = interpreter_->tensor(id);
   if (expected_output_.count(id) != 0) {
-    fprintf(stderr, "Overriden expectation for tensor %d\n", id);
-    Invalidate("Overriden expectation");
+    fprintf(stderr, "Overridden expectation for tensor %d\n", id);
+    Invalidate("Overridden expectation");
   }
   expected_output_[id].reset(new Expectation);
   switch (tensor->type) {
index 495014c..7680cdd 100644 (file)
@@ -115,7 +115,7 @@ bazel run --config=opt \
 
 In order to evaluate the possible benefit of generating a quantized graph, TOCO
 allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
-and `--default_ranges_max` accept plausable values for the min-max ranges of the
+and `--default_ranges_max` accept plausible values for the min-max ranges of the
 values in all arrays that do not have min-max information. "Dummy-quantization"
 will produce lower accuracy but will emulate the performance of a correctly
 quantized model.
@@ -338,7 +338,7 @@ below outline the use cases for each.
 ### Using `--output_format=GRAPHVIZ_DOT`
 
 The first way to get a graphviz rendering is to pass `GRAPHVIZ_DOT` into
-`--output_format`. This results in a plausable visualization of the graph. This
+`--output_format`. This results in a plausible visualization of the graph. This
 reduces the requirements that normally exist during conversion between other
 input and output formats. For example, this may be useful if conversion from
 TENSORFLOW_GRAPHDEF to TFLITE is failing.
index 50f0620..5e9c20e 100644 (file)
@@ -25,10 +25,10 @@ namespace tflite {
 
 class BaseOperator;
 
-// Return a map contained all knwo TF Lite Operators, keyed by their names.
+// Return a map containing all known TF Lite Operators, keyed by their names.
 std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap();
 
-// Return a map contained all knwo TF Lite Operators, keyed by the type of
+// Return a map containing all known TF Lite Operators, keyed by the type of
 // their tf.mini counterparts.
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap();
 
index 253f022..8589ca3 100644 (file)
@@ -127,7 +127,7 @@ message TocoFlags {
   // transformations that are necessary in order to generate inference
   // code for these graphs. Such graphs should be fixed, but as a
   // temporary work-around, setting this reorder_across_fake_quant flag
-  // allows toco to perform necessary graph transformaitons on them,
+  // allows toco to perform necessary graph transformations on them,
   // at the cost of no longer faithfully matching inference and training
   // arithmetic.
   optional bool reorder_across_fake_quant = 8;
index 37539b9..5ed8057 100644 (file)
@@ -58,7 +58,7 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
 
 
 # Creates the workers and return their sessions, graphs, train_ops.
-# Cheif worker will update at last
+# Chief worker will update last
 def _get_workers(num_workers, period, workers, moving_rate):
   sessions = []
   graphs = []
index 6cca0a8..3acd940 100644 (file)
@@ -57,7 +57,7 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
 
 
 # Creates the workers and return their sessions, graphs, train_ops.
-# Cheif worker will update at last
+# Chief worker will update last
 def _get_workers(num_workers, steps, workers):
   sessions = []
   graphs = []
@@ -146,7 +146,7 @@ class ModelAverageOptimizerTest(test.TestCase):
     self.assertAllEqual(1.0, sessions[0].run(global_var_1))
     self.assertAllEqual(0, sessions[0].run(global_step))
 
-    # iteration 2, global varibale update
+    # iteration 2, global variable update
     thread_0 = self.checkedThread(
         target=self._run, args=(train_ops[0], sessions[0]))
     thread_1 = self.checkedThread(
index 5009401..59e67e8 100644 (file)
@@ -47,7 +47,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
   Raises:
     ValueError: If `dtype` is not a floating point type.
 
-  [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+  [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
   """
   return _raised_cosine_window(name, 'hann_window', window_length, periodic,
                                dtype, 0.5, 0.5)
@@ -72,7 +72,7 @@ def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
   Raises:
     ValueError: If `dtype` is not a floating point type.
 
-  [hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+  [hamming]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
   """
   return _raised_cosine_window(name, 'hamming_window', window_length, periodic,
                                dtype, 0.54, 0.46)
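Both windows are instances of the raised cosine w[n] = a - b*cos(2*pi*n/N') computed by `_raised_cosine_window`, with (a, b) = (0.5, 0.5) for Hann and (0.54, 0.46) for Hamming. A NumPy sketch (the periodic/symmetric denominator handling is an assumption of the sketch):

    import numpy as np

    def raised_cosine(n, a, b, periodic=True):
      denom = n if periodic else n - 1
      return a - b * np.cos(2 * np.pi * np.arange(n) / denom)

    hann = raised_cosine(64, 0.5, 0.5)
    hamming = raised_cosine(64, 0.54, 0.46)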
index f2d31dc..d877831 100644 (file)
@@ -102,7 +102,7 @@ class BoundingBox(ItemHandler):
   """An ItemHandler that concatenates a set of parsed Tensors to Bounding Boxes.
   """
 
-  def __init__(self, keys=None, prefix=None):
+  def __init__(self, keys=None, prefix=''):
     """Initialize the bounding box handler.
 
     Args:
index 8a2c747..6e55b94 100644 (file)
@@ -571,7 +571,7 @@ def train(train_op,
       default, two `Boolean`, scalar ops called "should_stop" and "should_log"
       are provided.
     log_every_n_steps: The frequency, in terms of global steps, that the loss
-      and global step and logged.
+      and global step are logged.
     graph: The graph to pass to the supervisor. If no graph is supplied the
       default graph is used.
     master: The address of the tensorflow master.
index d5d8e41..cfdc884 100644 (file)
@@ -1080,14 +1080,20 @@ class SummaryDbWriter : public SummaryWriterInterface {
     // See tensorboard/plugins/histogram/summary.py and data_compat.py
     Tensor t{DT_DOUBLE, {k, 3}};
     auto data = t.flat<double>();
-    for (int i = 0; i < k; ++i) {
-      double left_edge = ((i - 1 >= 0) ? histo.bucket_limit(i - 1)
-                                       : std::numeric_limits<double>::min());
-      double right_edge = ((i + 1 < k) ? histo.bucket_limit(i + 1)
-                                       : std::numeric_limits<double>::max());
-      data(i + 0) = left_edge;
-      data(i + 1) = right_edge;
-      data(i + 2) = histo.bucket(i);
+    for (int i = 0, j = 0; i < k; ++i) {
+      // TODO(nickfelt): reconcile with TensorBoard's data_compat.py
+      // From summary.proto
+      // Parallel arrays encoding the bucket boundaries and the bucket values.
+      // bucket(i) is the count for the bucket i.  The range for
+      // a bucket is:
+      //   i == 0:  -DBL_MAX .. bucket_limit(0)
+      //   i != 0:  bucket_limit(i-1) .. bucket_limit(i)
+      double left_edge = (i == 0) ? std::numeric_limits<double>::lowest()
+                                  : histo.bucket_limit(i - 1);
+
+      data(j++) = left_edge;
+      data(j++) = histo.bucket_limit(i);
+      data(j++) = histo.bucket(i);
     }
     int64 tag_id;
     PatchPluginName(s->mutable_metadata(), kHistogramPluginName);
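For the buckets used in the new test below (limits [-30.5, -10.5, -5.5], counts [-10, 10, 20]), the loop flattens the histogram into rows of (left_edge, right_edge, count), with the first left edge being the lowest finite double (-DBL_MAX):

    # [lowest_double, -30.5, -10]   bucket 0: -DBL_MAX .. limit(0)
    # [-30.5, -10.5, 10]            bucket 1: limit(0) .. limit(1)
    # [-10.5, -5.5, 20]             bucket 2: limit(1) .. limit(2)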
index c34b676..2e8d410 100644 (file)
@@ -100,6 +100,56 @@ class SummaryDbWriterTest : public ::testing::Test {
   SummaryWriterInterface* writer_ = nullptr;
 };
 
+TEST_F(SummaryDbWriterTest, WriteHistogram_VerifyTensorValues) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "histtest", "test1", "user1", &env_,
+                                     &writer_));
+  int step = 0;
+  std::unique_ptr<Event> e{new Event};
+  e->set_step(step);
+  e->set_wall_time(123);
+  Summary::Value* s = e->mutable_summary()->add_value();
+  s->set_tag("normal/myhisto");
+
+  double dummy_value = 10.123;
+  HistogramProto* proto = s->mutable_histo();
+  proto->Clear();
+  proto->set_min(dummy_value);
+  proto->set_max(dummy_value);
+  proto->set_num(dummy_value);
+  proto->set_sum(dummy_value);
+  proto->set_sum_squares(dummy_value);
+
+  int size = 3;
+  double bucket_limits[] = {-30.5, -10.5, -5.5};
+  double bucket[] = {-10, 10, 20};
+  for (int i = 0; i < size; i++) {
+    proto->add_bucket_limit(bucket_limits[i]);
+    proto->add_bucket(bucket[i]);
+  }
+  TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
+  TF_ASSERT_OK(writer_->Flush());
+  writer_->Unref();
+  writer_ = nullptr;
+
+  // TODO(nickfelt): implement QueryTensor() to encapsulate this
+  // Verify the data
+  string result = QueryString("SELECT data FROM Tensors");
+  const double* val = reinterpret_cast<const double*>(result.data());
+  double histarray[] = {std::numeric_limits<double>::lowest(),
+                        -30.5,
+                        -10,
+                        -30.5,
+                        -10.5,
+                        10,
+                        -10.5,
+                        -5.5,
+                        20};
+  int histarray_size = 9;
+  for (int i = 0; i < histarray_size; i++) {
+    EXPECT_EQ(histarray[i], val[i]);
+  }
+}
+
 TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
                                      &writer_));
index 6d6feb3..a5d8b06 100644 (file)
@@ -67,6 +67,7 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+        ":trt_plugins",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]) + tf_custom_op_library_additional_deps(),
@@ -86,6 +87,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+        ":trt_plugins",
         ":trt_resources",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
@@ -197,10 +199,12 @@ tf_py_wrap_cc(
 tf_cuda_library(
     name = "trt_resources",
     srcs = [
+        "resources/trt_allocator.cc",
         "resources/trt_int8_calibrator.cc",
         "resources/trt_resource_manager.cc",
     ],
     hdrs = [
+        "resources/trt_allocator.h",
         "resources/trt_int8_calibrator.h",
         "resources/trt_resource_manager.h",
         "resources/trt_resources.h",
@@ -221,18 +225,25 @@ tf_cuda_library(
     srcs = [
         "convert/convert_graph.cc",
         "convert/convert_nodes.cc",
+        "convert/trt_optimization_pass.cc",
     ],
     hdrs = [
         "convert/convert_graph.h",
         "convert/convert_nodes.h",
+        "convert/trt_optimization_pass.h",
     ],
     deps = [
         ":segment",
+        ":trt_plugins",
         ":trt_logging",
         ":trt_resources",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
@@ -241,8 +252,7 @@ tf_cuda_library(
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/optimizers:constant_folding",
-        "//tensorflow/core/grappler/optimizers:layout_optimizer",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]) + tf_custom_op_library_additional_deps(),
@@ -256,7 +266,6 @@ cc_library(
         "segment/segment.h",
         "segment/union_find.h",
     ],
-    linkstatic = 1,
     deps = [
         "//tensorflow/core:graph",
         "//tensorflow/core:lib_proto_parsing",
@@ -279,6 +288,46 @@ tf_cc_test(
     ],
 )
 
+# Library for the plugin factory
+tf_cuda_library(
+    name = "trt_plugins",
+    srcs = [
+        "plugin/trt_plugin.cc",
+        "plugin/trt_plugin_factory.cc",
+        "plugin/trt_plugin_utils.cc",
+    ],
+    hdrs = [
+        "plugin/trt_plugin.h",
+        "plugin/trt_plugin_factory.h",
+        "plugin/trt_plugin_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "trt_plugin_factory_test",
+    size = "small",
+    srcs = ["plugin/trt_plugin_factory_test.cc"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":trt_plugins",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
 py_test(
     name = "tf_trt_integration_test",
     srcs = ["test/tf_trt_integration_test.py"],
index 0774027..b7b26cf 100644 (file)
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <list>
 #include <map>
@@ -24,6 +25,9 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -31,8 +35,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
-#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -75,7 +78,8 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
       // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-  return candidate_ops.count(node->type_string());
+  return (candidate_ops.count(node->type_string()) ||
+          PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
 }
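The second clause consults the plugin factory by op name. The real `PluginFactoryTensorRT` lives in `plugin/trt_plugin_factory.h` and also handles plugin creation and deserialization; as a sketch of just the lookup it performs here, a toy singleton registry could look like this (class and method names are illustrative only):

    #include <string>
    #include <unordered_set>

    // Toy stand-in for a plugin registry keyed by op name.
    class ToyPluginRegistry {
     public:
      static ToyPluginRegistry* GetInstance() {
        static ToyPluginRegistry registry;  // function-local singleton
        return &registry;
      }
      void Register(const std::string& op_name) { ops_.insert(op_name); }
      bool IsPlugin(const std::string& op_name) const {
        return ops_.count(op_name) != 0;
      }

     private:
      std::unordered_set<std::string> ops_;
    };

Ops registered this way are treated as TensorRT candidates even though they have no entry in `candidate_ops`.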
 
 void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
@@ -144,7 +148,8 @@ struct ConvertGraphParams {
       size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
       const tensorflow::grappler::GraphProperties& current_graph_properties,
       std::unordered_map<string, std::pair<int, string>>* output_edges,
-      int engine_precision_mode)
+      int engine_precision_mode, const string& device_name,
+      std::shared_ptr<nvinfer1::IGpuAllocator> allocator, int cuda_gpu_id)
       : graph(inp_graph),
         output_names(output_node_names),
         subgraph_node_ids(subgraph_node_id_numbers),
@@ -152,7 +157,10 @@ struct ConvertGraphParams {
         max_workspace_size_bytes(max_consumed_workspace_size_bytes),
         graph_properties(current_graph_properties),
         output_edge_map(output_edges),
-        precision_mode(engine_precision_mode) {}
+        precision_mode(engine_precision_mode),
+        device_name_(device_name),
+        allocator_(allocator),
+        cuda_gpu_id_(cuda_gpu_id) {}
   tensorflow::Graph& graph;
   const std::vector<string>& output_names;
   const std::set<int>& subgraph_node_ids;
@@ -161,6 +169,9 @@ struct ConvertGraphParams {
   const tensorflow::grappler::GraphProperties& graph_properties;
   std::unordered_map<string, std::pair<int, string>>* output_edge_map;
   int precision_mode;
+  string device_name_;
+  std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+  int cuda_gpu_id_;
   std::vector<std::pair<int, int>> subgraph_inputs;
   std::vector<std::pair<int, int>> subgraph_outputs;
   tensorflow::EdgeSet subgraph_incoming_edges;
@@ -194,7 +205,7 @@ static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
                              subgraph_outputs_set.begin(),
                              subgraph_outputs_set.end());
   return tensorflow::Status::OK();
-};
+}
 
 tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
   TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
@@ -203,7 +214,8 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
                    params->subgraph_inputs, params->subgraph_outputs,
                    params->max_batch_size, params->max_workspace_size_bytes,
                    params->graph_properties, params->output_edge_map,
-                   &trt_node_def, params->precision_mode);
+                   &trt_node_def, params->precision_mode, params->device_name_,
+                   params->allocator_, params->cuda_gpu_id_);
   TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
   tensorflow::Status status;
   tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -233,7 +245,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
                    params->subgraph_inputs, params->subgraph_outputs,
                    params->max_batch_size, params->max_workspace_size_bytes,
                    params->graph_properties, params->output_edge_map,
-                   &trt_node_def, params->precision_mode);
+                   &trt_node_def, params->precision_mode, params->device_name_,
+                   params->allocator_, params->cuda_gpu_id_);
   TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
   tensorflow::Status status;
   tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -331,19 +344,12 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
-  tensorflow::GraphDef gdef;
-
-  // Layout optimization
   item.graph = graph_def;
-  tensorflow::grappler::LayoutOptimizer optimizer;
-  tensorflow::grappler::Cluster* cluster;
 
-  // virtual cluster
   tensorflow::DeviceProperties device_properties;
-
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
-  cluster =
+  tensorflow::grappler::Cluster* cluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
   // single machine
@@ -351,27 +357,38 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   VLOG(2) << "cpu_cores: " << num_cpu_cores;
   VLOG(2) << "gpus: " << num_gpus;
-
-  TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
-
-  // constant folding
+  tensorflow::RewriterConfig rw_cfg;
+  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
+  tensorflow::GraphDef gdef;
+  TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef));
   item.graph = gdef;
-  tensorflow::grappler::ConstantFolding fold(nullptr);
-  TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
 
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
-  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
+  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
   // Build full graph
+
+  return ConvertAfterShapes(gdef, output_names, max_batch_size,
+                            max_workspace_size_bytes, new_graph_def,
+                            precision_mode, minimum_segment_size,
+                            static_graph_properties, nullptr);
+}
+
+tensorflow::Status ConvertAfterShapes(
+    const tensorflow::GraphDef& gdef, const std::vector<string>& output_names,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    tensorflow::GraphDef* new_graph_def, int precision_mode,
+    int minimum_segment_size,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    const tensorflow::grappler::Cluster* cluster) {
+  // Segment the graph into subgraphs that can be converted to TensorRT
+  tensorflow::tensorrt::segment::SegmentOptions segment_options;
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
                                              gdef.library());
   tensorflow::Graph graph(flib);
   TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
       tensorflow::GraphConstructorOptions(), gdef, &graph));
 
-  // Segment the graph into subgraphs that can be converted to TensorRT
-  tensorflow::tensorrt::segment::SegmentOptions segment_options;
-
   // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
   for (auto node : output_names) {
     segment_options.exclude_node_list.insert(node);
@@ -381,7 +398,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   segment_options.minimum_segment_size = minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
-      gdef, IsTensorRTCandidate, segment_options, &segments));
+      &graph, IsTensorRTCandidate, segment_options, &segments));
   if (segments.size() > 1) {
     VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
   }
@@ -391,9 +408,21 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   int count = 0;
   float total_num_nodes_in_segments = 0.;
   for (auto s : segments) {
-    total_num_nodes_in_segments += s.size();
+    total_num_nodes_in_segments += s.first.size();
   }
-  for (const std::set<string>& subgraph_node_names : segments) {
+  // We create the map here since cluster may not be available in all cases.
+  std::map<string, tensorflow::Device*> name_to_device_map;
+  if (cluster != nullptr && cluster->GetDeviceSet() != nullptr) {
+    // TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a
+    // distributed environment, devices from different workers can have same
+    // short name.
+    for (const auto dm : cluster->GetDeviceSet()->devices()) {
+      name_to_device_map[dm->name()] = dm;
+    }
+  }
+  for (const auto& segment_nodes_and_device : segments) {
+    const std::set<string>& subgraph_node_names =
+        segment_nodes_and_device.first;
     std::set<int> subgraph_node_ids;
     size_t max_mem_per_engine =
         max_workspace_size_bytes *
@@ -403,10 +432,40 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       oss << " " << node_name;
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
-    VLOG(2) << "Subgraph nodes" << oss.str();
+    VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second
+            << " : " << oss.str();
+    auto target_device =
+        name_to_device_map.find(segment_nodes_and_device.second);
+    std::shared_ptr<nvinfer1::IGpuAllocator> allocator(nullptr);
+
+    int cuda_device_id = 0;
+    if (target_device != name_to_device_map.end()) {
+      tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id);
+      CudaGpuId cuda_gpu_id;
+      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      if (!s.ok()) {
+        LOG(ERROR)
+            << "Cuda device identification failed, using device 0. Error= "
+            << s;
+      } else {
+        cuda_device_id = cuda_gpu_id.value();
+      }
+      tensorflow::GPUOptions gpuoptions;
+      // We need to use ProcessState here since in the Python path there is
+      // no way to get to the allocators.
+      auto pm = tensorflow::ProcessState::singleton();
+      // This should be instantiated by now.
+      auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
+      VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
+              << " cuda device= " << cuda_device_id << " at " << dev_allocator;
+      allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator);
+    } else {  // device unknown or not available
+      allocator = std::make_shared<TRTCudaAllocator>();
+    }
     ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
-                         max_mem_per_engine, static_graph_properties,
-                         &output_edge_map, precision_mode);
+                         max_mem_per_engine, graph_properties, &output_edge_map,
+                         precision_mode, segment_nodes_and_device.second,
+                         allocator, cuda_device_id);
     if (precision_mode == INT8MODE) {
       tensorflow::Status status = GetCalibNode(&p);
       if (status != tensorflow::Status::OK()) {
index e01e4a5..65a67d7 100644 (file)
@@ -18,6 +18,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -43,6 +45,14 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode, int minimum_segment_size);
 
+// Method to call from optimization pass
+tensorflow::Status ConvertAfterShapes(
+    const tensorflow::GraphDef& graph, const std::vector<string>& output_names,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    tensorflow::GraphDef* new_graph_def, int precision_mode,
+    int minimum_segment_size,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    const tensorflow::grappler::Cluster* cluster);
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
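A sketch of driving the new entry point, assuming the caller has already run shape inference (this mirrors the call made by the optimization pass added later in this change; the local variable names are illustrative):

    // Inside a function returning tensorflow::Status, with `item` a populated
    // tensorflow::grappler::GrapplerItem and `cluster` possibly nullptr.
    tensorflow::grappler::GraphProperties props(item);
    TF_RETURN_IF_ERROR(props.InferStatically(true));

    tensorflow::GraphDef trt_graph;
    TF_RETURN_IF_ERROR(tensorflow::tensorrt::convert::ConvertAfterShapes(
        item.graph, item.fetch, /*max_batch_size=*/8,
        /*max_workspace_size_bytes=*/1 << 30, &trt_graph,
        /*precision_mode=*/0, /*minimum_segment_size=*/3, props, cluster));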
index b81ae9d..32b211d 100644 (file)
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <algorithm>
 #include <list>
@@ -240,35 +241,49 @@ class TFAttrs {
     return attrs_.at(key);
   }
   template <typename T>
-  T get(string key) const;
+  T get(const string& key) const;
   template <typename T>
-  T get(string key, const T& default_value) const {
+  T get(const string& key, const T& default_value) const {
     return attrs_.count(key) ? this->get<T>(key) : default_value;
   }
 
+  std::vector<string> GetAllAttrKey() {
+    std::vector<string> attr_list;
+    for (const auto& attr_item : attrs_) {
+      attr_list.emplace_back(attr_item.first);
+    }
+    return attr_list;
+  }
+
  private:
   typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
   AttrMap attrs_;
 };
 
 template <>
-string TFAttrs::get<string>(string key) const {
+string TFAttrs::get<string>(const string& key) const {
   return this->at(key)->s();
 }
 
 template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
+std::vector<int> TFAttrs::get<std::vector<int>>(const string& key) const {
   auto attr = this->at(key)->list().i();
   return std::vector<int>(attr.begin(), attr.end());
 }
 
 template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
+std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
+  auto attr = this->at(key)->list().f();
+  return std::vector<float>(attr.begin(), attr.end());
+}
+
+template <>
+std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const {
   auto attr = this->at(key)->list().s();
   return std::vector<string>(attr.begin(), attr.end());
 }
 template <>
-nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(const string& key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
   dims.nbDims = values.size();
@@ -278,24 +293,25 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
 }
 
 template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
   TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
   return trt_dtype;
 }
 
 template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(
+    const string& key) const {
   return this->at(key)->type();
 }
 
 template <>
-float TFAttrs::get<float>(string key) const {
+float TFAttrs::get<float>(const string& key) const {
   return this->at(key)->f();
 }
 
 template <>
-bool TFAttrs::get<bool>(string key) const {
+bool TFAttrs::get<bool>(const string& key) const {
   return this->at(key)->b();
 }
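A short sketch of how a converter can combine the new `GetAllAttrKey()` accessor with the keyed getters, assuming a node whose attributes are all `list(float)` (as `ConvertPlugin` below expects):

    TFAttrs attrs(node_def);
    // Walk every attribute on the node and read each back as a float list
    // via the defaulted getter (the default only applies for absent keys).
    for (const string& key : attrs.GetAllAttrKey()) {
      std::vector<float> values =
          attrs.get<std::vector<float>>(key, std::vector<float>());
      VLOG(2) << key << " carries " << values.size() << " float values";
    }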
 
@@ -424,6 +440,7 @@ using OpConverter =
 class Converter {
   std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
   std::unordered_map<string, OpConverter> op_registry_;
+  OpConverter plugin_converter_;
   nvinfer1::INetworkDefinition* trt_network_;
   std::list<std::vector<uint8_t>> temp_bufs_;
   tensorflow::tensorrt::TRTWeightStore* weight_store_;
@@ -481,7 +498,7 @@ class Converter {
     weights.SetValues(weight_store_->store_.back().data());
     return weights;
   }
-  bool isFP16() { return fp16_; };
+  bool isFP16() { return fp16_; }
   TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
@@ -490,13 +507,17 @@ class Converter {
     std::vector<TRT_TensorOrWeights> inputs;
     TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
     string op = node_def.op();
-    if (!op_registry_.count(op)) {
-      return tensorflow::errors::Unimplemented(
-          "No converter registered for op: " + op);
-    }
-    OpConverter op_converter = op_registry_.at(op);
     std::vector<TRT_TensorOrWeights> outputs;
-    TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+    if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
+      TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs));
+    } else {
+      if (!op_registry_.count(op)) {
+        return tensorflow::errors::Unimplemented(
+            "No converter registered for op: " + op);
+      }
+      OpConverter op_converter = op_registry_.at(op);
+      TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+    }
     for (size_t i = 0; i < outputs.size(); ++i) {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
@@ -672,7 +693,7 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
     case OP_CATEGORY::RSQRT: {
       VLOG(2) << "RSQRT GETS DONE";
       return [](Eigen::half t) -> Eigen::half {
-        return Eigen::half(1.0 / sqrt(float(t)));
+        return Eigen::half(1.0 / sqrt(static_cast<float>(t)));
       };
     }
     case OP_CATEGORY::NEG:
@@ -1158,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1173,6 +1194,45 @@ tensorflow::Status BinaryTensorOpTensor(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertPlugin(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  // prepare input
+  std::vector<nvinfer1::ITensor*> all_inputs;
+  for (auto input : inputs) {
+    all_inputs.emplace_back(const_cast<nvinfer1::ITensor*>(input.tensor()));
+  }
+
+  // plugin is owned by PluginFactory
+  // TODO(jie): destroy plugins later (resource management)
+  PluginTensorRT* plugin =
+      PluginFactoryTensorRT::GetInstance()->CreatePlugin(node_def.op());
+
+  // passing attributes
+  // TODO(jie): support more general attribute
+  TFAttrs attrs(node_def);
+  auto attr_key_vector = attrs.GetAllAttrKey();
+  for (auto attr_key : attr_key_vector) {
+    // TODO(jie): support only list of float for toy example here.
+    auto data = attrs.get<std::vector<float>>(attr_key);
+    size_t size_data = data.size() * sizeof(float);
+    if (!plugin->SetAttribute(attr_key, static_cast<void*>(data.data()),
+                              size_data)) {
+      return tensorflow::errors::InvalidArgument("plugin SetAttribute failed");
+    }
+  }
+
+  nvinfer1::IPluginLayer* layer = ctx.network()->addPlugin(
+      &all_inputs[0], static_cast<int>(inputs.size()), *plugin);
+
+  for (int i = 0; i < layer->getNbOutputs(); i++) {
+    nvinfer1::ITensor* output_tensor = layer->getOutput(i);
+    outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  }
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertPlaceholder(
     Converter& ctx, const tensorflow::NodeDef& node_def,
     const std::vector<TRT_TensorOrWeights>& inputs,
@@ -2073,6 +2133,8 @@ void Converter::register_op_converters() {
   op_registry_["Reshape"] = ConvertReshape;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
+
+  plugin_converter_ = ConvertPlugin;
 }
 
 }  // namespace
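For context, the attributes that `ConvertPlugin` forwards are plain `list(float)` attrs on the node; a hypothetical plugin node could be described like this (the op name and attr key are made up for illustration):

    #include "tensorflow/core/framework/node_def.pb.h"

    tensorflow::NodeDef plugin_node;
    plugin_node.set_name("my_plugin_instance");
    plugin_node.set_op("MyTRTPlugin");  // assumed registered with the factory
    // A list(float) attribute, which ConvertPlugin hands to
    // PluginTensorRT::SetAttribute as a raw float buffer.
    auto* weights = (*plugin_node.mutable_attr())["weights"].mutable_list();
    weights->add_f(0.5f);
    weights->add_f(1.5f);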
@@ -2144,7 +2206,7 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   if (!status.ok() || !calib_res->calibrator_) {
     return tensorflow::errors::FailedPrecondition(
         "You must run calibration"
-        " and inference conversion in the same proces");
+        " and inference conversion in the same process");
   }
 
   calib_res->calibrator_->setDone();
@@ -2213,60 +2275,63 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
-  // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
+tensorflow::Status ReverseTopologicalSort(
+    const tensorrt::convert::SubGraphParams& s,
+    std::list<tensorflow::Node*>* order) {
   std::vector<tensorflow::Node*> order_vec;
   tensorflow::GetPostOrder(s.graph, &order_vec);
   // Select just the subgraph
-  std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
     if (s.subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to construct the
+      // We want topological order to construct the
       // network layer by layer
+      order->push_front(node);
     }
   }
-  // topological order is needed to build TRT network
-  static int static_id = 0;
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status SetInputList(
+    const tensorrt::convert::SubGraphParams& s,
+    tensorflow::NodeDefBuilder* op_builder,
+    const std::vector<string>* input_names,
+    std::vector<tensorflow::DataType>* input_dtypes) {
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  VLOG(2) << "input edge size: " << input_names->size();
+  for (size_t i = 0; i < input_names->size(); ++i) {
+    VLOG(2) << "input edges: " << i << " " << input_names->at(i);
+    int output_idx = s.input_inds.at(i).second;
+    // We wired up the input here already; it is redundant to do it again in
+    // ConvertSubGraphToTensorRT (convert_graph.cc).
+    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+        input_names->at(i), output_idx, input_dtypes->at(i));
+    income_edges.push_back(incoming_edge);
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder->Input(input_list);
+  return tensorflow::Status::OK();
+}
+
+string SubgraphNameScopeGenerator(const std::list<tensorflow::Node*>* order) {
   string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
+  if (!order->empty()) {
+    subgraph_name_scope = order->front()->name();
   }
-  for (const tensorflow::Node* node : order) {
+  for (const tensorflow::Node* node : *order) {
     subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
   }
   // TODO(sami,ben,jie): proper naming!
-  string calib_op_name =
-      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
-  static_id++;
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
-  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
-  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
-  op_res->logger_ = new tensorflow::tensorrt::Logger();
-  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
-
-  if (!op_res->builder_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT builder object");
-  }
-
-  op_res->network_ = op_res->builder_->createNetwork();
-  if (!op_res->network_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT network object");
-  }
-
-  // Build the network
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
-  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+  return subgraph_name_scope;
+}
 
-  std::vector<string> input_names;
-  std::vector<tensorflow::DataType> input_dtypes;
+tensorflow::Status ConvertSubgraph(
+    Converter& converter, tensorrt::convert::SubGraphParams& s,
+    std::list<tensorflow::Node*>* order, std::vector<string>* input_names,
+    std::vector<tensorflow::DataType>* input_dtypes,
+    std::vector<string>* output_names,
+    std::vector<tensorflow::DataType>* output_dtypes,
+    const string& engine_name) {
   for (const std::pair<int, int>& input : s.input_inds) {
     VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
@@ -2309,22 +2374,21 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 
     auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
+    input_dtypes->push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     auto type_status = ConvertDType(tf_dtype, &dtype);
     if (type_status != tensorflow::Status::OK()) {
-      LOG(WARNING) << "Data type conversion for input '" << node_name
-                   << "' failed";
+      LOG(WARNING) << "Type conversion failed for " << node_name;
       return type_status;
     }
 
-    VLOG(2) << "accessing output index of: " << output_idx
+    VLOG(2) << "Accessing output index of: " << output_idx
             << ", at node: " << node_name
-            << "with output entry from shape_map: " << op_info_vec.size();
+            << " with output entry from shape_map: " << op_info_vec.size();
     // TODO(ben,jie): update TRT input format/dimension
-    nvinfer1::DimsCHW input_dim_psuedo_chw;
-    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
+    nvinfer1::DimsCHW input_dim_pseudo_chw;
+    for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
 
     // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
     //            update the code once TRT 4.0 comes out.
@@ -2338,7 +2402,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
-      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
+      input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
     }
 
     // TODO(ben,jie): proper way to restore input tensor name?
@@ -2347,33 +2411,29 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
       input_tensor_name = StrCat(node_name, ":", output_idx);
     }
 
-    input_names.push_back(input_tensor_name);
+    input_names->push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
+        input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
 
     if (!input_tensor)
       return tensorflow::errors::InvalidArgument(
           "Failed to create Input layer");
-    VLOG(2) << "input tensor name :" << input_tensor_name;
+    VLOG(2) << "Input tensor name :" << input_tensor_name;
 
     if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
       return tensorflow::errors::AlreadyExists(
-          "output tensor already exists for op: " + input_tensor_name);
+          "Output tensor already exists for op: " + input_tensor_name);
   }
 
-  VLOG(2) << "finished sorting";
-
-  for (const tensorflow::Node* node : order) {
+  for (const tensorflow::Node* node : *order) {
     const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
+    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
-  VLOG(2) << "finished conversion";
+  VLOG(2) << "Finished conversion";
 
   // Gather output metadata
-  std::vector<string> output_names;
-  std::vector<tensorflow::DataType> output_dtypes;
   int trt_engine_op_output_idx = 0;
   for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
@@ -2388,14 +2448,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
              : StrCat(engine_name, ":", trt_engine_op_output_idx),
          {output_idx, tensor_name}});
     trt_engine_op_output_idx++;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-    VLOG(1) << "output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
+    if (output_idx != 0)
+      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
+    VLOG(2) << "Output tensor name: " << tensor_name;
+    output_names->push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
                                                  "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
@@ -2405,12 +2464,65 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     }
     converter.network()->markOutput(*tensor);
     tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
+    output_dtypes->push_back(tf_dtype);
     nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
     TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
     tensor->setType(trt_dtype);
   }
 
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
+  // Visit nodes in reverse topological order and construct the TRT network.
+  // Toposort
+  std::list<tensorflow::Node*> order;
+  TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
+
+  static int static_id = 0;
+  string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+  // TODO(sami,ben,jie): proper naming!
+  string calib_op_name =
+      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
+  static_id++;
+
+  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
+  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
+  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
+  op_res->logger_ = new tensorflow::tensorrt::Logger();
+  cudaSetDevice(s.cuda_gpu_id_);
+  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
+  op_res->allocator_ = s.allocator_;
+  if (!op_res->builder_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT builder object");
+  }
+#if NV_TENSORRT_MAJOR > 3
+  // Only touch the builder once it is known to be valid.
+  op_res->builder_->setGpuAllocator(s.allocator_.get());
+#endif
+
+  op_res->network_ = op_res->builder_->createNetwork();
+  if (!op_res->network_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT network object");
+  }
+
+  // Build the network
+  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
+  auto ws = new tensorflow::tensorrt::TRTWeightStore();
+  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
+  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
+  std::vector<string> input_names;
+  std::vector<tensorflow::DataType> input_dtypes;
+  std::vector<string> output_names;
+  std::vector<tensorflow::DataType> output_dtypes;
+  TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+                                     &input_dtypes, &output_names,
+                                     &output_dtypes, engine_name));
+
   VLOG(2) << "Finished processing outputs";
 
   // Build the engine
@@ -2422,21 +2534,8 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
   tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
-            << ":" << output_idx
-            << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
+  SetInputList(s, &op_builder, &input_names, &input_dtypes);
+
   std::vector<string> segment_names;
   segment_names.reserve(s.subgraph_node_ids.size());
   for (int i : s.subgraph_node_ids) {
@@ -2460,46 +2559,29 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensorrt::convert::SubGraphParams& s) {
   // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
-  std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
-  // Select just the subgraph
   std::list<tensorflow::Node*> order;
-  for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
-      // We want topological order to construct the
-      // network layer by layer
-      order.push_front(node);
-    }
-  }
-  // Topological order is needed to build TRT network
+  TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
 
-  tensorflow::tensorrt::Logger trt_logger;
+  static int static_id = 0;
+  string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++);
 
+  tensorflow::tensorrt::Logger trt_logger;
+  cudaSetDevice(s.cuda_gpu_id_);
   auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
   if (!trt_builder) {
     return tensorflow::errors::Internal(
         "Failed to create TensorRT builder object");
   }
-
+#if NV_TENSORRT_MAJOR > 3
+  trt_builder->setGpuAllocator(s.allocator_.get());
+#endif
   auto trt_network = infer_object(trt_builder->createNetwork());
   if (!trt_network) {
     return tensorflow::errors::Internal(
         "Failed to create TensorRT network object");
   }
 
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  static int static_id = 0;
-  // TODO(sami,ben,jie): proper naming!
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
-  engine_name = StrCat(engine_name, static_id++);
   auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
   auto weight_rmgr = trt_rmgr->getManager("WeightStore");
   auto ws = new tensorflow::tensorrt::TRTWeightStore();
@@ -2510,147 +2592,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input. Node id= " << input.first;
-    int node_id = input.first;
-    int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    auto node_name = node->name();
-    // input_names should use the node name in the graph
-    // here it should be the input tensor name -> matching the binding
-    // insert original node name without port
-    auto tensor_name = node_name;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-
-    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
-            << " idx: " << output_idx;
-
-    auto shape_inference_node_name = node_name;
-    auto shape_inference_output_idx = output_idx;
-    // rewire the shape inference to original node in the graph
-    if (s.output_edge_map->count(tensor_name)) {
-      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
-      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
-    }
-    if (shape_inference_output_idx < 0) continue;
-    VLOG(2) << "shapeinference name: " << shape_inference_node_name
-            << " idx: " << shape_inference_output_idx;
-
-    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          shape_inference_node_name);
-
-    auto op_info_vec =
-        s.graph_properties.GetOutputProperties(shape_inference_node_name);
-    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
-      return tensorflow::errors::Internal(
-          "accessing output index of: ", shape_inference_output_idx,
-          ", at node: ", shape_inference_node_name,
-          " with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(shape_inference_output_idx);
-    tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
-
-    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    auto type_status = ConvertDType(tf_dtype, &dtype);
-    if (type_status != tensorflow::Status::OK()) {
-      LOG(WARNING) << "Type conversion failed for " << node_name;
-      return type_status;
-    }
-
-    VLOG(2) << "Accessing output index of: " << output_idx
-            << ", at node: " << node_name
-            << " with output entry from shape_map: " << op_info_vec.size();
-    // TODO(ben,jie): update TRT input format/dimension
-    nvinfer1::DimsCHW input_dim_psuedo_chw;
-    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
-
-    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
-    //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4) {
-      string err_str = "Require 4 dimensional input.";
-      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
-                shape_inference_node_name);
-      return tensorflow::errors::Unimplemented(err_str);
-    }
-
-    for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      VLOG(2) << "dimension: " << i
-              << " , size: " << op_info.shape().dim(i).size();
-      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
-    }
-
-    // TODO(ben,jie): proper way to restore input tensor name?
-    auto input_tensor_name = node_name;
-    if (output_idx != 0) {
-      input_tensor_name = StrCat(node_name, ":", output_idx);
-    }
-
-    input_names.push_back(input_tensor_name);
-    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
-
-    if (!input_tensor)
-      return tensorflow::errors::InvalidArgument(
-          "Failed to create Input layer");
-    VLOG(2) << "Input tensor name :" << input_tensor_name;
-
-    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
-      return tensorflow::errors::AlreadyExists(
-          "Output tensor already exists for op: " + input_tensor_name);
-  }
-
-  VLOG(2) << "Finished sorting";
-
-  for (const tensorflow::Node* node : order) {
-    const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
-    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
-  }
-
-  VLOG(2) << "Finished conversion";
-
-  // Gather output metadata
   std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
-    int node_id = output.first;
-    int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    string op_name = node->name();
-    string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
-    if (output_idx != 0)
-      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
-    VLOG(2) << "Output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
-    auto tensor_or_weights = converter.get_tensor(tensor_name);
-    if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
-                                                 "' is weights not tensor");
-    }
-    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
-    if (!tensor) {
-      return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          tensor_name);
-    }
-    converter.network()->markOutput(*tensor);
-    tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
-    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
-    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
-    tensor->setType(trt_dtype);
-  }
+  TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+                                     &input_dtypes, &output_names,
+                                     &output_dtypes, engine_name));
 
   VLOG(2) << "Finished output";
 
@@ -2686,20 +2632,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
   // Build the TRT op
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  VLOG(2) << "input edge size: " << input_names.size();
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    VLOG(2) << "input edges: " << i << " " << input_names.at(i);
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
+  SetInputList(s, &op_builder, &input_names, &input_dtypes);
 
   VLOG(0) << "Finished op preparation";
 
@@ -2707,9 +2640,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("input_nodes", input_names)
                     .Attr("output_nodes", output_names)
                     .Attr("OutT", output_dtypes)
+                    .Device(s.device_name_)
                     .Finalize(s.trt_node);
 
-  VLOG(0) << status.ToString() << " finished op building";
+  VLOG(0) << status.ToString() << " finished op building for " << engine_name
+          << " on device " << s.device_name_;
 
   return tensorflow::Status::OK();
 }
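The `.Device()` call added above pins the generated engine op to the segment's device. A standalone sketch of the same builder pattern (illustrative only; a real `TRTEngineOp` also needs its inputs and the engine attrs shown above):

    #include "tensorflow/core/framework/node_def_builder.h"

    tensorflow::NodeDef node_def;
    tensorflow::Status status =
        tensorflow::NodeDefBuilder("my_trt_op0", "TRTEngineOp")
            .Device("/job:localhost/replica:0/task:0/device:GPU:0")
            .Finalize(&node_def);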
index 954a1e7..3f6592c 100644 (file)
@@ -22,11 +22,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
-
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
@@ -48,7 +48,9 @@ struct SubGraphParams {
       const tensorflow::grappler::GraphProperties& current_graph_properties,
       std::unordered_map<string, std::pair<int, string>>* output_edges,
       tensorflow::NodeDef* constructed_trt_node,
-      int engine_precision_mode = FP32MODE)
+      int engine_precision_mode = FP32MODE, const string& device_name = "",
+      std::shared_ptr<nvinfer1::IGpuAllocator> allocator = nullptr,
+      int cuda_gpu_id = 0)
       : graph(inp_graph),
         subgraph_node_ids(subgraph_node_id_numbers),
         input_inds(input_indices),
@@ -58,7 +60,10 @@ struct SubGraphParams {
         graph_properties(current_graph_properties),
         output_edge_map(output_edges),
         trt_node(constructed_trt_node),
-        precision_mode(engine_precision_mode) {}
+        precision_mode(engine_precision_mode),
+        device_name_(device_name),
+        allocator_(allocator),
+        cuda_gpu_id_(cuda_gpu_id) {}
 
   tensorflow::Graph& graph;
   const std::set<int>& subgraph_node_ids;
@@ -70,6 +75,9 @@ struct SubGraphParams {
   std::unordered_map<string, std::pair<int, string>>* output_edge_map;
   tensorflow::NodeDef* trt_node;
   const int precision_mode;
+  const string device_name_;
+  std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+  const int cuda_gpu_id_;
 };
 
 // TODO(sami): Replace references with const reference or pointers
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
new file mode 100644 (file)
index 0000000..8f634b1
--- /dev/null
@@ -0,0 +1,246 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+// TODO(sami): Remove VLOG messages once the code matures
+using tensorflow::str_util::Uppercase;
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
+tensorflow::Status TRTOptimizationPass::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  VLOG(1) << "Called INIT for " << name_ << " with config = " << config;
+  if (config == nullptr) {
+    maximum_workspace_size_ = 2LL << 30;  // default to 2 GiB
+    return tensorflow::Status::OK();
+  }
+  const auto params = config->parameter_map();
+  if (params.count("minimum_segment_size")) {
+    minimum_segment_size_ = params.at("minimum_segment_size").i();
+  }
+  if (params.count("max_batch_size")) {
+    maximum_batch_size_ = params.at("max_batch_size").i();
+  }
+  if (params.count("max_workspace_size_bytes"))
+    maximum_workspace_size_ = params.at("max_workspace_size_bytes").i();
+  if (params.count("precision_mode")) {
+    string pm = Uppercase(params.at("precision_mode").s());
+    if (pm == "FP32") {
+      precision_mode_ = 0;
+    } else if (pm == "FP16") {
+      precision_mode_ = 1;
+    } else if (pm == "INT8") {
+      precision_mode_ = 2;
+    } else {
+      LOG(ERROR) << "Unknown precision mode '" << pm << "'";
+      return tensorflow::errors::InvalidArgument(
+          "Unknown precision mode argument" + pm +
+          " Valid values are FP32, FP16, INT8");
+    }
+  }
+  return tensorflow::Status::OK();
+}
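These parameters arrive through the `RewriterConfig` proto. A sketch of building a matching config in C++ (the map keys are exactly the ones `Init()` reads; the batch and workspace values are arbitrary examples):

    #include "tensorflow/core/protobuf/rewriter_config.pb.h"

    tensorflow::RewriterConfig rewriter_config;
    auto* trt = rewriter_config.add_custom_optimizers();
    trt->set_name("TensorRTOptimizer");  // name used at registration below
    (*trt->mutable_parameter_map())["minimum_segment_size"].set_i(3);
    (*trt->mutable_parameter_map())["max_batch_size"].set_i(8);
    (*trt->mutable_parameter_map())["max_workspace_size_bytes"].set_i(1 << 30);
    (*trt->mutable_parameter_map())["precision_mode"].set_s("FP16");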
+
+void TRTOptimizationPass::PrintDebugInfo(
+    tensorflow::grappler::Cluster* cluster,
+    const tensorflow::grappler::GrapplerItem& item) {
+  VLOG(1) << "Cluster = " << cluster;
+  string offset("  ");
+  string offset2 = StrCat(offset, offset);
+  string offset3 = StrCat(offset2, offset);
+  string offset4 = StrCat(offset2, offset2);
+  if (cluster) {
+    VLOG(1) << offset << "type             = " << cluster->type();
+    VLOG(1) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
+    const auto dev_names = cluster->GetDeviceNames();
+    if (dev_names.size()) {
+      VLOG(1) << offset << " Device names:";
+      for (const auto& s : dev_names) {
+        VLOG(1) << offset2 << s;
+      }
+    }
+    std::unordered_map<string, uint64> peak_mem;
+    auto status = cluster->GetPeakMemoryUsage(&peak_mem);
+    if (status == tensorflow::Status::OK()) {
+      VLOG(1) << offset << "Peak Memory Usage :";
+      for (const auto& s : peak_mem) {
+        VLOG(1) << offset2 << s.first << " = " << s.second;
+      }
+    }
+
+    const auto dev_props = cluster->GetDevices();
+    if (dev_props.size()) {
+      VLOG(1) << offset << "Device properties:";
+      for (const auto& k : dev_props) {
+        VLOG(1) << offset2 << k.first;
+        const auto& dt = k.second;
+        VLOG(1) << offset3 << "type          = " << dt.type();
+        VLOG(1) << offset3 << "vendor        = " << dt.vendor();
+        VLOG(1) << offset3 << "model         = " << dt.model();
+        VLOG(1) << offset3 << "frequency     = " << dt.frequency();
+        VLOG(1) << offset3 << "num cores     = " << dt.num_cores();
+        VLOG(1) << offset3 << "num registers = " << dt.num_registers();
+        VLOG(1) << offset3 << "L1 cache size = " << dt.l1_cache_size();
+        VLOG(1) << offset3 << "L2 cache size = " << dt.l2_cache_size();
+        VLOG(1) << offset3 << "L3 cache size = " << dt.l3_cache_size();
+        VLOG(1) << offset3 << "SHMem per SMP = "
+                << dt.shared_memory_size_per_multiprocessor();
+        VLOG(1) << offset3 << "memory size   = " << dt.memory_size();
+        VLOG(1) << offset3 << "bandwidth     = " << dt.bandwidth();
+        if (dt.environment_size()) {
+          VLOG(1) << offset3 << "environment   :";
+          for (const auto& e : dt.environment()) {
+            VLOG(1) << offset4 << e.first << " = " << e.second;
+          }
+        }
+      }
+    }
+  }
+  VLOG(1) << "item: " << item.id;
+  if (item.feed.size()) {
+    VLOG(1) << offset << "Feeds  :";
+    for (const auto& f : item.feed) {
+      const auto& shape = f.second.shape();
+      VLOG(1) << offset2 << f.first << " = shaped " << shape.DebugString();
+    }
+  } else {
+    VLOG(1) << offset << "No Feeds";
+  }
+  if (item.fetch.size()) {
+    VLOG(1) << offset << "Fetches  :";
+    for (const auto& f : item.fetch) {
+      VLOG(1) << offset2 << f;
+    }
+  } else {
+    VLOG(1) << offset << "No Fetches";
+  }
+
+  if (item.init_ops.size()) {
+    VLOG(1) << offset << "init ops  :";
+    for (const auto& f : item.init_ops) {
+      VLOG(1) << offset2 << f;
+    }
+  } else {
+    VLOG(1) << offset << "No init ops";
+  }
+  VLOG(1) << "Save Op = " << item.save_op;
+  VLOG(1) << "Restore Op = " << item.restore_op;
+  VLOG(1) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
+  if (item.keep_ops.size()) {
+    VLOG(1) << offset << "keep ops  :";
+    for (const auto& f : item.keep_ops) {
+      VLOG(1) << offset2 << f;
+    }
+  } else {
+    VLOG(1) << offset << "No keep ops";
+  }
+  VLOG(3) << item.graph.DebugString();
+  if (cluster == nullptr || cluster->GetDeviceSet() == nullptr) return;
+  for (const auto dev : cluster->GetDeviceSet()->devices()) {
+    const auto& pname = dev->parsed_name();
+    VLOG(1) << "Device name= " << dev->name()
+            << " parsedname job= " << pname.job << " id= " << pname.id
+            << " has_id: " << pname.has_id << " has_job: " << pname.has_job
+            << "has_type: " << pname.has_type << " type =" << pname.type;
+  }
+}
+
+tensorflow::Status TRTOptimizationPass::Optimize(
+    tensorflow::grappler::Cluster* cluster,
+    const tensorflow::grappler::GrapplerItem& item, GraphDef* optimized_graph) {
+  VLOG(1) << "Called TRTOptimization Pass " << name_;
+  if (VLOG_IS_ON(1)) {
+    PrintDebugInfo(cluster, item);
+  }
+  int max_dim = -1;
+  if (item.feed.size()) {
+    for (const auto& f : item.feed) {
+      const auto& shape = f.second.shape();
+      if (shape.dims() > 0) {
+        if (shape.dim_size(0) > max_dim) max_dim = shape.dim_size(0);
+      }
+    }
+  }
+  if (maximum_batch_size_ < 0) {  // automatic batch size from input
+    if (max_dim > 0) {
+      maximum_batch_size_ = max_dim;
+      VLOG(1) << "Setting maximum batch size to " << max_dim;
+    } else {
+      maximum_batch_size_ = 128;
+      LOG(WARNING) << "Maximum batch size is not set"
+                      " and can't be deduced from inputs setting it to"
+                   << maximum_batch_size_
+                   << ". Suggest configuring it from configuration parameters";
+    }
+  } else {
+    if (max_dim > maximum_batch_size_) {
+      LOG(WARNING) << "Configured batch size " << maximum_batch_size_
+                   << " is less than input batch size " << max_dim
+                   << "; adjusting maximum batch size to match input batch size";
+      maximum_batch_size_ = max_dim;
+    }
+  }
+  tensorflow::grappler::GraphProperties static_graph_properties(item);
+  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
+  auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(
+      item.graph, item.fetch, maximum_batch_size_, maximum_workspace_size_,
+      optimized_graph, precision_mode_, minimum_segment_size_,
+      static_graph_properties, cluster);
+  VLOG(2) << optimized_graph->DebugString();
+  return status;
+}
+
+void TRTOptimizationPass::Feedback(
+    tensorflow::grappler::Cluster* cluster,
+    const tensorflow::grappler::GrapplerItem& item,
+    const GraphDef& optimized_graph, double result) {}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+class VerboseCustomGraphOptimizerRegistrar
+    : public tensorflow::grappler::CustomGraphOptimizerRegistrar {
+ public:
+  VerboseCustomGraphOptimizerRegistrar(
+      const tensorflow::grappler::CustomGraphOptimizerRegistry::Creator& cr,
+      const tensorflow::string& name)
+      : tensorflow::grappler::CustomGraphOptimizerRegistrar(cr, name) {
+    VLOG(1) << "Constructing a CustomOptimizationPass registration object for "
+            << name;
+  }
+};
+
+static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar(
+    []() {
+      VLOG(1)
+          << "Instantiating CustomOptimizationPass object TensorRTOptimizer";
+      return new tensorflow::tensorrt::convert::TRTOptimizationPass(
+          "TensorRTOptimizer");
+    },
+    ("TensorRTOptimizer"));
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
new file mode 100644 (file)
index 0000000..d8ecead
--- /dev/null
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
+ public:
+  explicit TRTOptimizationPass(const string& name = "TRTOptimizationPass")
+      : name_(name),
+        minimum_segment_size_(3),
+        precision_mode_(0),
+        maximum_batch_size_(-1),
+        maximum_workspace_size_(-1) {
+    VLOG(1) << "Constructing " << name_;
+  }
+
+  string name() const override { return name_; }
+
+  tensorflow::Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
+                              config = nullptr) override;
+
+  tensorflow::Status Optimize(tensorflow::grappler::Cluster* cluster,
+                              const tensorflow::grappler::GrapplerItem& item,
+                              GraphDef* optimized_graph) override;
+
+  void Feedback(tensorflow::grappler::Cluster* cluster,
+                const tensorflow::grappler::GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+  void PrintDebugInfo(tensorflow::grappler::Cluster* cluster,
+                      const tensorflow::grappler::GrapplerItem& item);
+
+ private:
+  string name_;
+  int minimum_segment_size_;
+  int precision_mode_;
+  int maximum_batch_size_;
+  int64_t maximum_workspace_size_;
+};
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
new file mode 100644 (file)
index 0000000..a89cf3a
--- /dev/null
@@ -0,0 +1,118 @@
+# Description:
+#   Example of plugin support for TensorRT
+#   (http://developer.nvidia.com/tensorrt) through TensorFlow integration.
+#   Targets TensorRT 3.0.4; these APIs are expected to change as TRT is
+#   upgraded. init_py is added as a pip package BUILD dependency so that it
+#   gets installed.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["inc_op"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "inc_op",
+    deps = [":inc_op_op_lib"],
+)
+
+tf_custom_op_library(
+    name = "_inc_op.so",
+    srcs = [
+        "inc_op_kernel.h",
+        "inc_op_plugin.cc",
+        "inc_op_plugin.h",
+        "ops/inc_op.cc",
+    ],
+    gpu_srcs = [
+        "inc_op_kernel.h",
+        "inc_op_kernel.cu.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/core:framework_lite",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_kernel_library(
+    name = "inc_op_plugin_kernel",
+    srcs = ["inc_op_plugin.cc"],
+    hdrs = [
+        "inc_op_kernel.h",
+        "inc_op_plugin.h",
+    ],
+    gpu_srcs = [
+        "inc_op_kernel.h",
+        "inc_op_kernel.cu.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/core:stream_executor_headers_lib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_custom_op_py_library(
+    name = "inc_op_loader",
+    srcs = ["inc_op.py"],
+    dso = [
+        ":_inc_op.so",
+    ],
+    kernels = [
+        ":inc_op_op_lib",
+        ":inc_op_plugin_kernel",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":inc_op",
+        ":inc_op_loader",
+    ],
+)
+
+cuda_py_test(
+    name = "plugin_test",
+    size = "small",
+    srcs = ["plugin_test.py"],
+    additional_deps = [
+        ":init_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/contrib/tensorrt:init_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:tf_optimizer",
+    ],
+    tags = [
+        "manual",
+        "noguitar",
+        "notap",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
new file mode 100644 (file)
index 0000000..363edab
--- /dev/null
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Import custom op for plugin and register it in plugin factory registry."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.custom_plugin_examples import inc_op as import_inc_op_so
+from tensorflow.contrib.tensorrt.custom_plugin_examples.ops import gen_inc_op
+
+inc_op = gen_inc_op.inc_plugin_trt
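+
+# A minimal usage sketch (illustrative; `x` is assumed to be a float32 tensor,
+# and the attribute value matches the one used in plugin_test.py):
+#
+#   y = inc_op(x, inc=[16.5])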
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
new file mode 100644 (file)
index 0000000..a007c3f
--- /dev/null
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Loader for the custom inc_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+
+if platform.system() != "Windows":
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.contrib.util import loader
+  from tensorflow.python.platform import resource_loader
+  # pylint: enable=g-import-not-at-top
+
+  _inc_op = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_inc_op.so"))
+else:
+  raise RuntimeError("Windows not supported")
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
new file mode 100644 (file)
index 0000000..988b35f
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+__global__ void VecInc(const float* vec, float inc, float* dest, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) dest[i] = vec[i] + inc;
+}
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+                     int count, cudaStream_t stream) {
+  int threads_per_block = 256;
+  int blocks_per_grid = (count + threads_per_block - 1) / threads_per_block;
+
+  VecInc<<<blocks_per_grid, threads_per_block, 0, stream>>>(d_input, inc,
+                                                            d_output, count);
+}
+
+// Note: this kernel definition is not needed by the plugin_test rule, but it
+// is required for the TF program to be correct: when the plugin is not used,
+// or when the graph is run through the TRT optimization pass, the test should
+// still work.
+class IncPluginTRT : public OpKernel {
+ public:
+  explicit IncPluginTRT(OpKernelConstruction* context) : OpKernel(context) {
+    std::vector<float> inc_list;
+    OP_REQUIRES_OK(context, context->GetAttr("inc", &inc_list));
+    OP_REQUIRES(context, inc_list.size() == 1,
+                errors::InvalidArgument(
+                    "The increment list should contain single element."));
+    inc_ = inc_list[0];
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = context->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_shape, &output_tensor));
+    const cudaStream_t* stream = CHECK_NOTNULL(
+        reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                  ->stream()
+                                                  ->implementation()
+                                                  ->CudaStreamMemberHack()));
+    IncrementKernel(input_tensor.flat<float>().data(), inc_,
+                    output_tensor->flat<float>().data(),
+                    input_shape.num_elements(), *stream);
+  }
+
+ private:
+  float inc_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("IncPluginTRT").Device(DEVICE_GPU), IncPluginTRT);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h
new file mode 100644 (file)
index 0000000..c35955e
--- /dev/null
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+                     int count, cudaStream_t stream);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
new file mode 100644 (file)
index 0000000..8d4c893
--- /dev/null
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+const char* kPluginName = "IncPluginTRT";
+
+IncOpPlugin* CreateIncPlugin() { return new IncOpPlugin(); }
+
+IncOpPlugin* CreateIncPluginDeserialize(const void* buffer, size_t length) {
+  return new IncOpPlugin(buffer, length);
+}
+
+REGISTER_TRT_PLUGIN(kPluginName, CreateIncPluginDeserialize, CreateIncPlugin);
+
+IncOpPlugin::IncOpPlugin() : plugin_name_(kPluginName) {}
+
+IncOpPlugin::IncOpPlugin(const void* serialized_data, size_t length)
+    : PluginTensorRT(serialized_data, length), plugin_name_(kPluginName) {
+  // Account for the bytes already consumed by the parent deserialization.
+  size_t consumed_data = PluginTensorRT::getSerializationSize();
+  assert(length - consumed_data >= sizeof(float));
+  const char* buffer = reinterpret_cast<const char*>(serialized_data);
+  SetAttribute("inc", buffer + consumed_data, sizeof(float));
+}
+
+bool IncOpPlugin::SetAttribute(const string& key, const void* ptr,
+                               const size_t size) {
+  if (strcmp(key.c_str(), "inc") == 0 && size == sizeof(float)) {
+    StoreAttribute(key, ptr, size);  // save the attribute to own the data;
+    inc_ = *static_cast<const float*>(ptr);
+    return true;
+  }
+  return false;
+}
+
+bool IncOpPlugin::GetAttribute(const string& key, const void** ptr,
+                               size_t* size) const {
+  const auto iter = attr_map_.find(key);
+  if (iter != attr_map_.end()) {
+    *ptr = iter->second.data();
+    *size = iter->second.size();
+    return true;
+  }
+  return false;
+}
+
+int IncOpPlugin::enqueue(int batch_size, const void* const* inputs,
+                         void** outputs, void*, cudaStream_t stream) {
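+  // Total element count = product of the per-example input dims, scaled by
+  // the runtime batch size.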
+  int count = 1;
+  for (int i = 0; i < input_dim_list_[0].nbDims; i++) {
+    count *= input_dim_list_[0].d[i];
+  }
+  count *= batch_size;
+  const float* input = reinterpret_cast<const float*>(inputs[0]);
+  float* output = reinterpret_cast<float*>(outputs[0]);
+  IncrementKernel(input, inc_, output, count, stream);
+  return 0;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
new file mode 100644 (file)
index 0000000..189e9c9
--- /dev/null
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+
+#include <cassert>
+#include <cstring>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class IncOpPlugin : public PluginTensorRT {
+ public:
+  IncOpPlugin();
+
+  IncOpPlugin(const void* serialized_data, size_t length);
+
+  const string& GetPluginName() const override { return plugin_name_; }
+
+  bool Finalize() override { return true; }
+
+  bool SetAttribute(const string& key, const void* ptr,
+                    const size_t size) override;
+
+  bool GetAttribute(const string& key, const void** ptr,
+                    size_t* size) const override;
+
+  int getNbOutputs() const override { return 1; }
+
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int num_input_dims) override {
+    assert(index == 0);
+    assert(num_input_dims == 1);
+    return inputs[0];
+  }
+
+  // Use configure() to set up the input dimensions.
+  void configure(const nvinfer1::Dims* inputs, int num_inputs,
+                 const nvinfer1::Dims* outputs, int num_outputs,
+                 int max_batch_size) override {
+    assert(num_inputs == 1);
+    PluginTensorRT::configure(inputs, num_inputs, outputs, num_outputs,
+                              max_batch_size);
+  }
+
+  int initialize() override { return 0; }
+
+  void terminate() override {}
+
+  size_t getWorkspaceSize(int max_batch_size) const override { return 0; }
+
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
+
+  size_t getSerializationSize() override {
+    return PluginTensorRT::getSerializationSize() + sizeof(float);
+  }
+
+  void serialize(void* buffer) override {
+    // Serialize parent data.
+    PluginTensorRT::serialize(buffer);
+    // Advance the buffer past the parent's serialized data.
+    buffer =
+        static_cast<char*>(buffer) + PluginTensorRT::getSerializationSize();
+    std::memcpy(buffer, &inc_, sizeof(float));
+    buffer = static_cast<char*>(buffer) + sizeof(float);
+  }
+
+ protected:
+  float inc_;
+  nvinfer1::Dims dim_;
+
+ private:
+  const string plugin_name_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc
new file mode 100644 (file)
index 0000000..d0eb0d2
--- /dev/null
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+
+REGISTER_OP("IncPluginTRT")
+    .Attr("inc: list(float)")
+    .Input("input: float32")
+    .Output("output: float32")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py
new file mode 100644 (file)
index 0000000..bc4d270
--- /dev/null
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to show usage of TensorRT custom op & plugin."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib import tensorrt
+from tensorflow.contrib.tensorrt import custom_plugin_examples
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class TrtPluginTest(test_util.TensorFlowTestCase):
+
+  def _get_plugin_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = array_ops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      relu = nn.relu(a, "relu")
+      v = nn_ops.max_pool(
+          relu, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+
+      # Insert the custom op into the graph.
+      v = custom_plugin_examples.inc_op(v, inc=[16.5], name="plugin_test")
+
+      v *= 2.0
+      v = nn.relu(v)
+      v = nn.relu(v)
+      array_ops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def _run_graph(self, gdef, dummy_inp):
+    """Run given graphdef once."""
+    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+
+    with session.Session(
+        config=config_pb2.ConfigProto(gpu_options=gpu_options),
+        graph=g) as sess:
+      val = sess.run(out, {inp: dummy_inp})
+    return val
+
+  def testIncOpPlugin(self):
+    inp_dims = (5, 24, 24, 2)
+    dummy_input = numpy.ones(inp_dims).astype(numpy.float32)
+    orig_graph = self._get_plugin_graph_def()  # graph with plugin node
+
+    # Trigger conversion. The plugin nodes were registered at import time, so
+    # the converter can create the corresponding plugin layers during
+    # conversion.
+    trt_graph = tensorrt.create_inference_graph(
+        input_graph_def=orig_graph,
+        outputs=["output"],
+        max_batch_size=inp_dims[0],
+        max_workspace_size_bytes=1 << 25,
+        precision_mode="FP32",
+        minimum_segment_size=2)
+    o2 = self._run_graph(trt_graph, dummy_input)
+    self.assertEqual(35, o2.reshape([-1])[0])
+
+
+if __name__ == "__main__":
+  test.main()
index b8f881c..9ac8047 100644 (file)
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -32,38 +33,40 @@ namespace tensorrt {
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
-  string serialized_engine;
   OP_REQUIRES_OK(context,
-                 context->GetAttr("serialized_engine", &serialized_engine));
+                 context->GetAttr("serialized_engine", &serialized_engine_));
 
   // register input output node name in trt_sub_graph
   OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
   OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
+}
 
-  // TODO(samikama) runtime should be taken from a resourcemanager as well.
-  // Only engine should be in the op and context and runtime should be taken
-  // from resourcemanager
-  // TODO(jie): cudaSetDevice make sure trt engine is allocated on the same
-  // gpu where the input/output is also located.
-  int gpu_id = context->device()->tensorflow_gpu_device_info()->gpu_id;
-  cudaSetDevice(gpu_id);
-  int device;
-  cudaGetDevice(&device);
-  if (gpu_id != device) LOG(FATAL) << "set device failed!";
-
+void TRTEngineOp::Compute(OpKernelContext* context) {
   // TODO(samikama) runtime should be taken from a resourcemanager as well.
   // Only engine should be in the op and context and runtime should be taken
   // from resourcemanager
 
-  IRuntime* infer = nvinfer1::createInferRuntime(logger);
-  trt_engine_ptr_.reset(infer->deserializeCudaEngine(
-      serialized_engine.c_str(), serialized_engine.size(), nullptr));
-  trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
-  // Runtime is safe to delete after engine creation
-  infer->destroy();
-}
-
-void TRTEngineOp::Compute(OpKernelContext* context) {
+  if (!trt_execution_context_ptr_) {
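+    // Deserialize the engine lazily, on the first call to Compute(), so that
+    // it is created with the allocator of the device actually running the op.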
+    IRuntime* infer = nvinfer1::createInferRuntime(logger);
+#if NV_TENSORRT_MAJOR > 3
+    auto device = context->device();
+    auto dev_allocator =
+        device->GetAllocator(tensorflow::AllocatorAttributes());
+    if (!dev_allocator) {
+      LOG(FATAL) << "Can't find device allocator for gpu device "
+                 << device->name();
+    }
+    allocator_ = std::make_shared<TRTDeviceAllocator>(dev_allocator);
+    infer->setGpuAllocator(allocator_.get());
+#endif
+    trt_engine_ptr_.reset(infer->deserializeCudaEngine(
+        serialized_engine_.c_str(), serialized_engine_.size(),
+        PluginFactoryTensorRT::GetInstance()));
+    trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
+    // Runtime is safe to delete after engine creation
+    infer->destroy();
+    serialized_engine_.clear();
+  }
   int num_binding = context->num_inputs() + context->num_outputs();
   std::vector<void*> buffers(num_binding);
 
@@ -154,7 +157,12 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
   VLOG(2) << "enqueue returns: " << ret;
   // sync should be done by TF.
 }
-
+
+TRTEngineOp::~TRTEngineOp() {
+  // Order matters: destroy the execution context before the engine, and the
+  // engine before the allocator it uses.
+  trt_execution_context_ptr_.reset();
+  trt_engine_ptr_.reset();
+  allocator_.reset();
+}
+
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
 
 }  // namespace tensorrt
index 0964b4b..e613a71 100644 (file)
@@ -17,25 +17,28 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 
 #include <memory>
-#include <string>
 #include <vector>
 
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
 class Logger;
 
+//  TODO(Sami): Remove this file?
 class TRTEngineOp : public OpKernel {
  public:
   explicit TRTEngineOp(OpKernelConstruction* context);
 
   void Compute(OpKernelContext* context) override;
+  ~TRTEngineOp();
 
  private:
   template <typename T>
@@ -51,6 +54,8 @@ class TRTEngineOp : public OpKernel {
 
   std::vector<string> input_nodes_;
   std::vector<string> output_nodes_;
+  std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+  string serialized_engine_;
 };
 
 }  // namespace tensorrt
index 7f3544f..96ccacb 100644 (file)
@@ -28,7 +28,7 @@ namespace tensorrt {
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
  public:
-  Logger(string name = "DefaultLogger") : name_(name){};
+  Logger(string name = "DefaultLogger") : name_(name) {}
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
  private:
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
new file mode 100644 (file)
index 0000000..062f86e
--- /dev/null
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include <cassert>
+#include <cstring>
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT::PluginTensorRT(const void* serialized_data, size_t length) {
+  const char* buffer = static_cast<const char*>(serialized_data);
+  size_t op_name_char_count = *reinterpret_cast<const size_t*>(buffer);
+  buffer += sizeof(size_t);
+  buffer += op_name_char_count;
+
+  size_t count = *reinterpret_cast<const size_t*>(buffer);
+  buffer += sizeof(size_t);
+
+  for (size_t i = 0; i < count; i++) {
+    nvinfer1::Dims dim;
+    std::memcpy(&(dim.nbDims), buffer, sizeof(dim.nbDims));
+    buffer += sizeof(dim.nbDims);
+    std::memcpy(dim.d, buffer, sizeof(dim.d));
+    buffer += sizeof(dim.d);
+    std::memcpy(dim.type, buffer, sizeof(dim.type));
+    buffer += sizeof(dim.type);
+    input_dim_list_.emplace_back(dim);
+  }
+}
+
+void PluginTensorRT::configure(const nvinfer1::Dims* inputs, int num_inputs,
+                               const nvinfer1::Dims* outputs, int num_outputs,
+                               int max_batch_size) {
+  for (int index = 0; index < num_inputs; index++) {
+    nvinfer1::Dims dim;
+    dim.nbDims = inputs[index].nbDims;
+    for (int i = 0; i < dim.nbDims; i++) {
+      dim.d[i] = inputs[index].d[i];
+      dim.type[i] = inputs[index].type[i];
+    }
+    input_dim_list_.emplace_back(dim);
+  }
+}
+
+size_t PluginTensorRT::getSerializationSize() {
+  nvinfer1::Dims dim;
+  // Header (name length + name bytes + list size), plus one fixed-size record
+  // per stored input dimension entry.
+  return sizeof(size_t) + GetPluginName().size() +
+         sizeof(input_dim_list_.size()) +
+         input_dim_list_.size() *
+             (sizeof(dim.nbDims) + sizeof(dim.d) + sizeof(dim.type));
+}
+
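+// Serialized layout, written in order below and read back by the
+// deserializing constructor above:
+//   size_t op_name_size;
+//   char   op_name[op_name_size];
+//   size_t num_entries;  // input_dim_list_.size()
+//   per entry: int nbDims; int d[MAX_DIMS]; DimensionType type[MAX_DIMS];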
+void PluginTensorRT::serialize(void* serialized_data) {
+  size_t op_name_size = GetPluginName().size();
+  char* buffer = static_cast<char*>(serialized_data);
+  std::memcpy(buffer, &op_name_size, sizeof(size_t));
+  buffer += sizeof(size_t);
+
+  std::memcpy(buffer, GetPluginName().data(), op_name_size);
+  buffer += op_name_size;
+
+  auto list_size = input_dim_list_.size();
+  std::memcpy(buffer, &list_size, sizeof(input_dim_list_.size()));
+  buffer += sizeof(input_dim_list_.size());
+
+  for (size_t i = 0; i < input_dim_list_.size(); i++) {
+    const auto& dim = input_dim_list_[i];
+    std::memcpy(buffer, &(dim.nbDims), sizeof(dim.nbDims));
+    buffer += sizeof(dim.nbDims);
+    std::memcpy(buffer, dim.d, sizeof(dim.d));
+    buffer += sizeof(dim.d);
+    std::memcpy(buffer, dim.type, sizeof(dim.type));
+    buffer += sizeof(dim.type);
+  }
+}
+
+bool PluginTensorRT::StoreAttribute(const string& key, const void* ptr,
+                                    const size_t size) {
+  if (attr_map_.count(key) != 0) return false;
+
+  attr_map_.emplace(key, std::vector<char>(size));
+  std::memcpy(attr_map_[key].data(), ptr, size);
+  return true;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
new file mode 100644 (file)
index 0000000..754920b
--- /dev/null
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// A wrapper class for TensorRT plugins.
+// User applications should inherit from this class to write custom kernels;
+// it allows a custom op to be inserted into a TensorRT engine.
+// To register a plugin with the converter, the user should also register a
+// custom PluginDeserializeFunc & PluginConstructFunc through
+// PluginFactoryTensorRT.
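+//
+// A minimal subclass sketch (illustrative only; MyPlugin and its two factory
+// functions are hypothetical names -- see custom_plugin_examples/ for a
+// complete example; REGISTER_TRT_PLUGIN lives in trt_plugin_factory.h):
+//
+//   class MyPlugin : public PluginTensorRT {
+//    public:
+//     MyPlugin() {}
+//     MyPlugin(const void* data, size_t len) : PluginTensorRT(data, len) {}
+//     const string& GetPluginName() const override { return name_; }
+//     // ... implement the remaining pure-virtual methods ...
+//   };
+//   REGISTER_TRT_PLUGIN("MyPlugin", CreateMyPluginDeserialize, CreateMyPlugin);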
+class PluginTensorRT : public nvinfer1::IPlugin {
+ public:
+  PluginTensorRT() {}
+  PluginTensorRT(const void* serialized_data, size_t length);
+
+  virtual const string& GetPluginName() const = 0;
+
+  virtual bool Finalize() = 0;
+
+  virtual bool SetAttribute(const string& key, const void* ptr,
+                            const size_t size) = 0;
+  virtual bool GetAttribute(const string& key, const void** ptr,
+                            size_t* size) const = 0;
+
+  void configure(const nvinfer1::Dims* inputs, int num_inputs,
+                 const nvinfer1::Dims* outputs, int num_outputs,
+                 int max_batch_size) override;
+
+  virtual bool StoreAttribute(const string& key, const void* ptr,
+                              const size_t size);
+
+  size_t getSerializationSize() override;
+
+  void serialize(void* buffer) override;
+
+ protected:
+  std::unordered_map<string, std::vector<char> > attr_map_;
+
+  std::vector<nvinfer1::Dims> input_dim_list_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
new file mode 100644 (file)
index 0000000..2bc5914
--- /dev/null
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  size_t parsed_byte = 0;
+  // extract op_name from serial_data
+  string encoded_op_name =
+      ExtractOpName(serial_data, serial_length, &parsed_byte);
+
+  if (!IsPlugin(encoded_op_name)) {
+    return nullptr;
+  }
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto plugin_ptr =
+      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string& op_name) {
+  if (!IsPlugin(op_name)) return nullptr;
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto plugin_ptr = plugin_registry_[op_name].second();
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const string& op_name, PluginDeserializeFunc deserialize_func,
+    PluginConstructFunc construct_func) {
+  if (IsPlugin(op_name)) return false;
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto ret = plugin_registry_.emplace(
+      op_name, std::make_pair(deserialize_func, construct_func));
+
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() {
+  tensorflow::mutex_lock lock(instance_m_);
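+  // Note: release() (rather than reset()) relinquishes ownership without
+  // deleting the plugin objects; their lifetime is assumed to be managed by
+  // whatever TensorRT engine still references them.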
+  for (auto& owned_plugin_ptr : owned_plugins_) {
+    owned_plugin_ptr.release();
+  }
+  owned_plugins_.clear();
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
new file mode 100644 (file)
index 0000000..bbae9fb
--- /dev/null
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
+ public:
+  // TODO(aaroey): this static method has to be inlined to make the singleton a
+  // unique global symbol. Find a way to fix it.
+  static PluginFactoryTensorRT* GetInstance() {
+    static PluginFactoryTensorRT* factory_instance =
+        new PluginFactoryTensorRT();
+    return factory_instance;
+  }
+
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  // Plugin construction, PluginFactoryTensorRT owns the plugin.
+  PluginTensorRT* CreatePlugin(const string& op_name);
+
+  bool RegisterPlugin(const string& op_name,
+                      PluginDeserializeFunc deserialize_func,
+                      PluginConstructFunc construct_func);
+
+  bool IsPlugin(const string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<string,
+                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
+      plugin_registry_;
+
+  // TODO(jie): Owned plugin should be associated with different sessions;
+  //            should really hand ownership of plugins to resource management;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+  tensorflow::mutex instance_m_;
+};
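+
+// Typical conversion-time flow (an illustrative sketch using this class's
+// methods; `op_name` and the attribute value are hypothetical):
+//
+//   auto* factory = PluginFactoryTensorRT::GetInstance();
+//   if (factory->IsPlugin(op_name)) {
+//     PluginTensorRT* plugin = factory->CreatePlugin(op_name);
+//     float inc = 16.5f;
+//     plugin->SetAttribute("inc", &inc, sizeof(float));
+//   }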
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const string& name, PluginDeserializeFunc deserialize_func,
+                     PluginConstructFunc construct_func) {
+    auto factory = PluginFactoryTensorRT::GetInstance();
+    QCHECK(factory->RegisterPlugin(name, deserialize_func, construct_func))
+        << "Failed to register plugin: " << name;
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
+  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
+                                  construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
+                                        construct_func)              \
+  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
+  static ::tensorflow::tensorrt::TrtPluginRegistrar trt_plugin_registrar##ctr \
+      TF_ATTRIBUTE_UNUSED = ::tensorflow::tensorrt::TrtPluginRegistrar(       \
+          name, deserialize_func, construct_func)
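+
+// Example invocation (from custom_plugin_examples/inc_op_plugin.cc):
+//   REGISTER_TRT_PLUGIN(kPluginName, CreateIncPluginDeserialize,
+//                       CreateIncPlugin);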
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
new file mode 100644 (file)
index 0000000..129bdcd
--- /dev/null
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace test {
+
+class StubPlugin : public PluginTensorRT {
+ public:
+  static const char* kPluginName;
+
+  StubPlugin() : plugin_name_(kPluginName) {}
+
+  StubPlugin(const void* serialized_data, size_t length)
+      : PluginTensorRT(serialized_data, length), plugin_name_(kPluginName) {}
+
+  const string& GetPluginName() const override { return plugin_name_; }
+
+  bool Finalize() override { return true; }
+
+  bool SetAttribute(const string& key, const void* ptr,
+                    const size_t size) override {
+    return true;
+  }
+
+  bool GetAttribute(const string& key, const void** ptr,
+                    size_t* size) const override {
+    return true;
+  }
+
+  int getNbOutputs() const override { return 1; }
+
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nbInputDims) override {
+    return inputs[0];
+  }
+
+  int initialize() override { return 0; }
+
+  void terminate() override {}
+
+  size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
+
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override {
+    return 0;
+  }
+
+ private:
+  const string plugin_name_;
+};
+
+const char* StubPlugin::kPluginName = "StubPlugin";
+
+StubPlugin* CreateStubPlugin() { return new StubPlugin(); }
+
+StubPlugin* CreateStubPluginDeserialize(const void* serialized_data,
+                                        size_t length) {
+  return new StubPlugin(serialized_data, length);
+}
+
+class TrtPluginFactoryTest : public ::testing::Test {
+ public:
+  bool RegisterStubPlugin() {
+    if (PluginFactoryTensorRT::GetInstance()->IsPlugin(
+            StubPlugin::kPluginName)) {
+      return true;
+    }
+    return PluginFactoryTensorRT::GetInstance()->RegisterPlugin(
+        StubPlugin::kPluginName, CreateStubPluginDeserialize, CreateStubPlugin);
+  }
+};
+
+TEST_F(TrtPluginFactoryTest, Registration) {
+  EXPECT_FALSE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+  EXPECT_TRUE(RegisterStubPlugin());
+
+  ASSERT_TRUE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+}
+
+TEST_F(TrtPluginFactoryTest, CreationDeletion) {
+  EXPECT_TRUE(RegisterStubPlugin());
+  ASSERT_TRUE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+
+  PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+  ASSERT_TRUE(PluginFactoryTensorRT::GetInstance()->CreatePlugin(
+      StubPlugin::kPluginName));
+  ASSERT_EQ(1, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+  PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+  ASSERT_EQ(0, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+}
+
+}  // namespace test
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
new file mode 100644 (file)
index 0000000..a8f6088
--- /dev/null
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include <cassert>
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+string ExtractOpName(const void* serial_data, size_t serial_length,
+                     size_t* incremental) {
+  size_t op_name_char_count = *static_cast<const size_t*>(serial_data);
+  *incremental = sizeof(size_t) + op_name_char_count;
+
+  assert(serial_length >= *incremental);
+
+  const char* buffer = static_cast<const char*>(serial_data) + sizeof(size_t);
+  string op_name(buffer, op_name_char_count);
+
+  return op_name;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
new file mode 100644 (file)
index 0000000..274ce42
--- /dev/null
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+
+#include <functional>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
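+// Extracts the op name from a serialized plugin blob. Per
+// PluginTensorRT::serialize(), the blob starts with a size_t holding the name
+// length followed by that many bytes of the name; *incremental is set to the
+// number of bytes consumed by this prefix.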
+// TODO(jie): work on error handling here
+string ExtractOpName(const void* serial_data, size_t serial_length,
+                     size_t* incremental);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
new file mode 100644 (file)
index 0000000..0f05083
--- /dev/null
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#if NV_TENSORRT_MAJOR > 2
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
+                                 uint32_t flags) {
+  assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
+  void* memory;
+  cudaMalloc(&memory, size);
+  return memory;
+}
+
+void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
+
+void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
+                                   uint32_t flags) {
+  assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
+  void* mem = allocator_->AllocateRaw(alignment, size);
+  VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment
+          << " @ " << mem;
+  return mem;
+}
+
+TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
+    : allocator_(allocator) {
+  VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
+}
+
+void TRTDeviceAllocator::free(void* memory) {
+  VLOG(2) << "Deallocating " << memory;
+  allocator_->DeallocateRaw(memory);
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // NV_TENSORRT_MAJOR > 2
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
new file mode 100644 (file)
index 0000000..a0c2540
--- /dev/null
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/framework/allocator.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+#if NV_TENSORRT_MAJOR == 3
+// Define interface here temporarily until TRT 4.0 is released
+namespace nvinfer1 {
+class IGpuAllocator {
+ public:
+  virtual void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) = 0;
+  virtual void free(void* memory) = 0;
+};
+}  // namespace nvinfer1
+#endif  // NV_TENSORRT_MAJOR == 3
+
+namespace tensorflow {
+namespace tensorrt {
+
+// Allocator implementation that uses the CUDA allocator directly, for cases
+// where we can't get a device allocator from TF.
+class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
+ public:
+  TRTCudaAllocator() {}
+  virtual ~TRTCudaAllocator() {}
+  void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
+  void free(void* memory) override;
+};
+
+// Allocator implementation wrapping TF device allocators.
+class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
+ public:
+  explicit TRTDeviceAllocator(tensorflow::Allocator* allocator);
+  virtual ~TRTDeviceAllocator() {}
+  void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
+  void free(void* memory) override;
+
+ private:
+  tensorflow::Allocator* allocator_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
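
TensorRT does not take ownership of an IGpuAllocator; the caller installs it and must keep it alive. A minimal wiring sketch, assuming the TensorRT 4 IBuilder::setGpuAllocator API (the TRT 3 interface declared above is only a local stand-in, with no corresponding setter), with the CUDA allocator as the fallback when no TF device allocator is available:

    #include <memory>

    #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
    #include "tensorrt/include/NvInfer.h"

    // Hypothetical helper: prefer a TF-backed allocator when one exists,
    // else fall back to raw cudaMalloc, and install it on the builder.
    std::shared_ptr<nvinfer1::IGpuAllocator> InstallAllocator(
        tensorflow::Allocator* device_allocator, nvinfer1::IBuilder* builder) {
      std::shared_ptr<nvinfer1::IGpuAllocator> alloc;
      if (device_allocator != nullptr) {
        alloc.reset(
            new tensorflow::tensorrt::TRTDeviceAllocator(device_allocator));
      } else {
        alloc.reset(new tensorflow::tensorrt::TRTCudaAllocator);
      }
      builder->setGpuAllocator(alloc.get());  // TRT keeps only the raw pointer
      return alloc;  // owner must outlive the builder (cf. the shared_ptr
                     // member added to TRTCalibrationResource below)
    }
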
index 3c85968..e346912 100644 (file)
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
 
 #include <list>
 #include <sstream>
 #include <string>
 #include <thread>
 #include <vector>
+
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
@@ -40,6 +43,11 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
         engine_(nullptr),
         logger_(nullptr),
         thr_(nullptr) {}
+
+  ~TRTCalibrationResource() {
+    VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+  }
+
   string DebugString() override {
     std::stringstream oss;
     oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl
@@ -47,16 +55,17 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
         << " Network    = " << std::hex << network_ << std::dec << std::endl
         << " Engine     = " << std::hex << engine_ << std::dec << std::endl
         << " Logger     = " << std::hex << logger_ << std::dec << std::endl
+        << " Allocator  = " << std::hex << allocator_.get() << std::dec
+        << std::endl
         << " Thread     = " << std::hex << thr_ << std::dec << std::endl;
     return oss.str();
   }
-  ~TRTCalibrationResource() {
-    VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
-  }
+
   TRTInt8Calibrator* calibrator_;
   nvinfer1::IBuilder* builder_;
   nvinfer1::INetworkDefinition* network_;
   nvinfer1::ICudaEngine* engine_;
+  std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
   tensorflow::tensorrt::Logger* logger_;
   // TODO(sami): Use threadpool threads!
   std::thread* thr_;
@@ -65,31 +74,28 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
 class TRTWeightStore : public tensorflow::ResourceBase {
  public:
   TRTWeightStore() {}
-  std::list<std::vector<uint8_t>> store_;
+
+  virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
+
   string DebugString() override {
     std::stringstream oss;
-    size_t lenBytes = 0;
+    size_t len_bytes = 0;
     for (const auto& v : store_) {
-      lenBytes += v.size() * sizeof(uint8_t);
+      len_bytes += v.size() * sizeof(uint8_t);
     }
     oss << " Number of entries     = " << store_.size() << std::endl
         << " Total number of bytes = "
-        << store_.size() * sizeof(std::vector<uint8_t>) + lenBytes << std::endl;
+        << store_.size() * sizeof(std::vector<uint8_t>) + len_bytes
+        << std::endl;
     return oss.str();
   }
-  virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
-};
 
-class TRTEngineResource : public tensorflow::ResourceBase {
- public:
-  TRTEngineResource() : runtime_(nullptr), ctx_(nullptr){};
-  string DebugString() override { return string(""); }
-  nvinfer1::IRuntime* runtime_;
-  nvinfer1::IExecutionContext* ctx_;
+  std::list<std::vector<uint8_t>> store_;
 };
 
 }  // namespace tensorrt
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCEMGR_TRTRESOURCES_H_
+
 #endif
 #endif
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
index 8fc4697..cc42913 100644 (file)
@@ -25,18 +25,239 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
+using ::tensorflow::strings::StrAppend;
+// A simple graph representation that mirrors tensorflow::Graph. This
+// structure saves memory: the segmenter modifies the graph in place, which
+// avoids creating a copy of the graph. It is composed of edges and nodes.
+// Nodes keep pointers to the original TF nodes.
+class SimpleNode;
+class SimpleGraph;
+class SimpleEdge {
+ public:
+  SimpleEdge(int id, SimpleNode* src, int src_port, SimpleNode* dst,
+             int dst_port, bool is_control = false)
+      : id_(id),
+        src_(src),
+        src_port_(src_port),
+        dst_(dst),
+        dst_port_(dst_port),
+        control_(is_control) {}
+  ~SimpleEdge() {}
+
+  SimpleNode* src() const { return src_; }
+  SimpleNode* dst() const { return dst_; }
+  int src_output() const { return src_port_; }
+  int dst_input() const { return dst_port_; }
+  int id() const { return id_; }
+  bool IsControlEdge() const { return control_; }
+
+ private:
+  int id_;
+  SimpleNode* src_;
+  int src_port_;
+  SimpleNode* dst_;
+  int dst_port_;
+  bool control_;
+};
+
+class SimpleNode {
+ public:
+  SimpleNode(const tensorflow::Node* node, const int id);
+
+  const std::vector<SimpleEdge*>& in_edges() const { return in_edges_; }
+  const std::vector<SimpleEdge*>& out_edges() const { return out_edges_; }
+  std::vector<SimpleNode*> in_nodes() const {
+    std::vector<SimpleNode*> res;
+    res.reserve(in_edges_.size());
+    for (const auto e : in_edges_) {
+      if (e) res.push_back(e->src());
+    }
+    return res;
+  }
+  const string& name() const { return node_->name(); }
+  const tensorflow::Node* tf_node() const { return node_; }
+  int id() const { return id_; }
+
+ private:
+  const tensorflow::Node* node_;
+  std::vector<SimpleEdge*> in_edges_;
+  std::vector<SimpleEdge*> out_edges_;
+  int id_;
+
+  friend class SimpleGraph;
+};
+
+class SimpleGraph {
+ public:
+  explicit SimpleGraph(const tensorflow::Graph* g);
+  ~SimpleGraph();
+
+  void AddControlEdge(SimpleNode* src, SimpleNode* dst);
+  void AddEdge(SimpleNode* src, int out_port, SimpleNode* dst, int in_port);
+  void RemoveEdge(const SimpleEdge*);
+  SimpleNode* FindNodeId(int node_id) {
+    if (node_id < 0 || node_id >= static_cast<int>(nodes_.size())) {
+      return nullptr;
+    }
+    return nodes_[node_id];
+  }
+  int num_node_ids() const { return nodes_.size(); }
+  const SimpleNode* source_node() const {
+    return nodes_[tensorflow::Graph::kSourceId];
+  }
+  const SimpleNode* sink_node() const {
+    return nodes_[tensorflow::Graph::kSinkId];
+  }
+
+ private:
+  const tensorflow::Graph* g_;
+  std::vector<SimpleNode*> nodes_;
+  std::vector<SimpleEdge*> edges_;
+  // free_edge_ids_ and free_node_ids_ contain freed indices.
+  std::set<int> free_edge_ids_;
+  std::set<int> free_node_ids_;
+};
+
+SimpleNode::SimpleNode(const tensorflow::Node* node, const int id)
+    : node_(node), id_(id) {
+  if (node_) {
+    in_edges_.reserve(node_->in_edges().size());
+    out_edges_.reserve(node_->out_edges().size());
+  }
+}
+
+SimpleGraph::SimpleGraph(const tensorflow::Graph* g) : g_(g) {
+  int n_nodes = g_->num_node_ids();
+  nodes_.resize(n_nodes, nullptr);
+  nodes_[g->kSourceId] = new SimpleNode(g->source_node(), g->kSourceId);
+  nodes_[g->kSinkId] = new SimpleNode(g->sink_node(), g->kSinkId);
+  int n_edges = g->num_edge_ids();
+  edges_.resize(n_edges, nullptr);
+  for (int i = 2; i < n_nodes; i++) {
+    const auto n = g->FindNodeId(i);
+    if (n) {
+      nodes_[i] = new SimpleNode(n, i);
+    } else {
+      free_node_ids_.insert(i);
+    }
+  }
+  for (int i = 0; i < n_edges; i++) {
+    const auto e = g->FindEdgeId(i);
+    if (e) {
+      const auto tfsrc = e->src();
+      const auto tfdst = e->dst();
+      bool is_control = e->IsControlEdge();
+      auto src = nodes_[tfsrc->id()];
+      auto dst = nodes_[tfdst->id()];
+      auto edge = new SimpleEdge(i, src, e->src_output(), dst, e->dst_input(),
+                                 is_control);
+      edges_[i] = edge;
+      src->out_edges_.push_back(edge);
+      dst->in_edges_.push_back(edge);
+    } else {
+      free_edge_ids_.insert(i);
+    }
+  }
+}
+
+void SimpleGraph::AddEdge(SimpleNode* src, int out_port, SimpleNode* dst,
+                          int in_port) {
+  int i = edges_.size();
+  if (!free_edge_ids_.empty()) {
+    auto it = free_edge_ids_.begin();
+    i = *it;
+    free_edge_ids_.erase(it);
+  } else {
+    edges_.push_back(nullptr);
+  }
+  bool is_control = (out_port == tensorflow::Graph::kControlSlot);
+  is_control |= (in_port == tensorflow::Graph::kControlSlot);
+  auto edge = new SimpleEdge(i, src, out_port, dst, in_port, is_control);
+  edges_[i] = edge;
+  src->out_edges_.push_back(edge);
+  dst->in_edges_.push_back(edge);
+}
+
+void SimpleGraph::AddControlEdge(SimpleNode* src, SimpleNode* dst) {
+  AddEdge(src, tensorflow::Graph::kControlSlot, dst,
+          tensorflow::Graph::kControlSlot);
+}
+
+void SimpleGraph::RemoveEdge(const SimpleEdge* edge) {
+  auto src = edge->src();
+  auto dst = edge->dst();
+  for (auto it = src->out_edges_.begin(); it != src->out_edges_.end(); ++it) {
+    if (*it == edge) {
+      src->out_edges_.erase(it);
+      break;
+    }
+  }
+  for (auto it = dst->in_edges_.begin(); it != dst->in_edges_.end(); ++it) {
+    if (*it == edge) {
+      dst->in_edges_.erase(it);
+      break;
+    }
+  }
+}
+
+SimpleGraph::~SimpleGraph() {
+  for (auto x : nodes_) delete x;
+  for (auto x : edges_) delete x;
+}
 
 namespace {
 
-bool CanContractEdge(const tensorflow::Edge* edge,
-                     const tensorflow::Graph& graph) {
-  const tensorflow::Node* src = edge->src();
-  const tensorflow::Node* dst = edge->dst();
+bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
+                 const std::vector<SimpleNode*>& start) {
+  // Copied from TF's ReverseDFS.
+  struct Work {
+    SimpleNode* node;
+    bool leave;  // Are we entering or leaving n?
+  };
+
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
+
+  std::vector<bool> visited(g->num_node_ids(), false);
+  while (!stack.empty()) {
+    Work w = stack.back();
+    stack.pop_back();
+
+    auto n = w.node;
+    if (w.leave) {
+      if (n == src) {
+        return true;
+      }
+      continue;
+    }
+
+    if (visited[n->id()]) continue;
+    visited[n->id()] = true;
+    // Arrange to call leave(n) when all done with descendants.
+    stack.push_back(Work{n, true});
+
+    auto nodes = n->in_nodes();
+    for (const auto node : nodes) {
+      if (!visited[node->id()]) {
+        stack.push_back(Work{node, false});
+      }
+    }
+  }
+  return false;
+}
+
+bool CanContractEdge(const SimpleEdge* edge,
+                     const std::unique_ptr<SimpleGraph>& graph) {
+  const auto src = edge->src();
+  const auto dst = edge->dst();
 
   // Can't contract edge if doing so would cause a cycle in the
   // graph. So, if there is a directed path from 'src' to 'dst', other
@@ -48,46 +269,38 @@ bool CanContractEdge(const tensorflow::Edge* edge,
   //   1. Get all nodes incoming to 'dst', excluding 'src'
   //   2. Reverse DFS from those nodes
   //   3. If reverse DFS reaches 'src' then we have a cycle
-  std::vector<tensorflow::Node*> dfs_start_nodes;
-  for (tensorflow::Node* node : dst->in_nodes()) {
+  std::vector<SimpleNode*> dfs_start_nodes;
+  for (SimpleNode* node : dst->in_nodes()) {
     if (node != src) {
       dfs_start_nodes.push_back(node);
     }
   }
 
-  bool is_cycle = false;
-  if (!dfs_start_nodes.empty()) {
-    tensorflow::ReverseDFSFrom(graph, dfs_start_nodes, {},
-                               [&is_cycle, src](tensorflow::Node* node) {
-                                 if (node == src) {
-                                   is_cycle = true;
-                                 }
-                               });
-  }
-
+  bool is_cycle = CheckCycles(graph, src, dfs_start_nodes);
   return !is_cycle;
 }
+}  // namespace
 
-void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
-                  std::vector<const tensorflow::Edge*>* remove_edges) {
+void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
+                  std::vector<const SimpleEdge*>* remove_edges) {
   // Transfer all inputs and outputs of 'dst' to 'src' except edges
   // connecting the two.
-  tensorflow::Node* src = edge->src();
-  tensorflow::Node* dst = edge->dst();
+  auto src = edge->src();
+  auto dst = edge->dst();
 
   // We can use '0' for input/output index because we don't need them
   // to be accurate for the way we are using the graph.
-  std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
-                                                dst->in_edges().end());
-  for (const tensorflow::Edge* in_edge : in_edges) {
+  std::vector<const SimpleEdge*> in_edges(dst->in_edges().begin(),
+                                          dst->in_edges().end());
+  for (const SimpleEdge* in_edge : in_edges) {
     if (in_edge->IsControlEdge()) {
       if (in_edge->src() != src) {
-        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        SimpleEdge* e = const_cast<SimpleEdge*>(in_edge);
         graph->AddControlEdge(e->src(), src);
       }
     } else {
       if (in_edge->src() != src) {
-        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        SimpleEdge* e = const_cast<SimpleEdge*>(in_edge);
         if (e->src() == graph->source_node()) {
           graph->AddEdge(e->src(), e->src_output(), src,
                          tensorflow::Graph::kControlSlot);
@@ -98,14 +311,14 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
     }
   }
 
-  std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
-                                                 dst->out_edges().end());
-  for (const tensorflow::Edge* out_edge : out_edges) {
+  std::vector<const SimpleEdge*> out_edges(dst->out_edges().begin(),
+                                           dst->out_edges().end());
+  for (const SimpleEdge* out_edge : out_edges) {
     if (out_edge->IsControlEdge()) {
-      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      SimpleEdge* e = const_cast<SimpleEdge*>(out_edge);
       graph->AddControlEdge(src, e->dst());
     } else {
-      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      SimpleEdge* e = const_cast<SimpleEdge*>(out_edge);
       if (e->dst() == graph->sink_node()) {
         VLOG(1) << " edge to sink node " << src->name() << " -> "
                 << e->dst()->name();
@@ -128,8 +341,6 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   }
 }
 
-}  // namespace
-
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
     const std::function<bool(const tensorflow::Node*)>& candidate_fn,
@@ -140,17 +351,22 @@ tensorflow::Status SegmentGraph(
   tensorflow::Graph graph(flib);
   TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
       tensorflow::GraphConstructorOptions(), gdef, &graph));
+  return SegmentGraph(&graph, candidate_fn, options, segments);
+}
 
-  // tensorflow::DumpGraph("Pre-Segment", &graph);
-
+tensorflow::Status SegmentGraph(
+    tensorflow::Graph* tf_graph,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const SegmentOptions& options, SegmentNodesVector* segments) {
+  auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph));
   // Use a union-find to collect the nodes that belong to the same
-  // segment. A node value of nullptr indicates that the node is not a
-  // candidate for TRT.
-  std::vector<UnionFind<tensorflow::Node*>> node_segments;
-  for (int i = 0; i < graph.num_node_ids(); ++i) {
-    tensorflow::Node* node = graph.FindNodeId(i);
+  // segment. A node value of nullptr indicates that the node is not a candidate
+  // for TRT.
+  std::vector<UnionFind<SimpleNode*>> node_segments;
+  for (int i = 0; i < graph->num_node_ids(); ++i) {
+    SimpleNode* node = graph->FindNodeId(i);
     if (options.exclude_node_list.count(node->name()) != 0 ||
-        !candidate_fn(node)) {
+        !candidate_fn(node->tf_node())) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
@@ -164,10 +380,16 @@ tensorflow::Status SegmentGraph(
   // a measure of how beneficial it is to include a given node in a
   // TRT subgraph then we can revisit this algorithm to take advantage
   // of that information.
-  std::vector<tensorflow::Node*> order;
-  tensorflow::GetPostOrder(graph, &order);
-
-  for (const tensorflow::Node* node : order) {
+  std::vector<tensorflow::Node*> tforder;
+  tensorflow::GetPostOrder(*tf_graph, &tforder);
+  // Use the post-order implementation from TensorFlow and construct the
+  // mirror in the internal format.
+  std::vector<SimpleNode*> order;
+  order.reserve(tforder.size());
+  for (const auto tfnode : tforder) {
+    order.push_back(graph->FindNodeId(tfnode->id()));
+  }
+  for (const SimpleNode* node : order) {
     // All output nodes of 'node' have been visited...
     VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
 
@@ -181,8 +403,8 @@ tensorflow::Status SegmentGraph(
     // nodes. Iterate since combining two nodes may unblock other
     // combining.
     while (true) {
-      std::set<const tensorflow::Edge*> contract_edges;
-      for (const tensorflow::Edge* out_edge : node->out_edges()) {
+      std::set<const SimpleEdge*> contract_edges;
+      for (const SimpleEdge* out_edge : node->out_edges()) {
         VLOG(2) << "... out node " << out_edge->dst()->name() << " ( "
                 << out_edge->dst()->id() << " <- " << node->id() << " )";
         if (out_edge->IsControlEdge()) {
@@ -210,9 +432,9 @@ tensorflow::Status SegmentGraph(
       // Contract edges and collect the adjacent nodes into the same
       // segment/subgraph.
       while (!contract_edges.empty()) {
-        const tensorflow::Edge* contract_edge = *contract_edges.begin();
-        const tensorflow::Node* src = contract_edge->src();
-        const tensorflow::Node* dst = contract_edge->dst();
+        const SimpleEdge* contract_edge = *contract_edges.begin();
+        const SimpleNode* src = contract_edge->src();
+        const SimpleNode* dst = contract_edge->dst();
 
         VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " ("
                 << src->id() << " <- " << dst->id();
@@ -221,13 +443,13 @@ tensorflow::Status SegmentGraph(
         // Contracting the edge leaves disconnected graph edges.
         // Remove these from the graph and from 'contract_edges' so we
         // don't visit them again.
-        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(contract_edge);
-        std::vector<const tensorflow::Edge*> remove_edges;
-        ContractEdge(e, &graph, &remove_edges);
+        SimpleEdge* e = const_cast<SimpleEdge*>(contract_edge);
+        std::vector<const SimpleEdge*> remove_edges;
+        ContractEdge(e, graph.get(), &remove_edges);
 
-        for (const tensorflow::Edge* r : remove_edges) {
+        for (const SimpleEdge* r : remove_edges) {
           contract_edges.erase(r);
-          graph.RemoveEdge(r);
+          graph->RemoveEdge(r);
         }
       }
     }
@@ -236,9 +458,27 @@ tensorflow::Status SegmentGraph(
   // Collect the segments/subgraphs. Each subgraph is represented by a
   // set of the names of the nodes in that subgraph.
   std::unordered_map<string, std::set<string>> sg_map;
+  std::unordered_map<string, std::set<string>> device_maps;
   for (auto& u : node_segments) {
     if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
       sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+      auto tf_node = u.Value()->tf_node();
+      // has_assigned_device_name() is expected to return true when called
+      // from the optimization pass. However, since the graph is converted
+      // back and forth between Graph and GraphDef, assigned devices get
+      // demoted to requested devices. If the graph is passed directly to
+      // this module, assigned devices will be set.
+      if (tf_node->has_assigned_device_name()) {
+        device_maps[u.ParentValue()->name()].insert(
+            tf_node->assigned_device_name());
+      } else if (!tf_node->requested_device().empty()) {
+        device_maps[u.ParentValue()->name()].insert(
+            tf_node->requested_device());
+      } else {
+        VLOG(1) << "Node " << tf_node->name()
+                << " has no device assigned requested device is: "
+                << tf_node->requested_device();
+      }
     }
   }
 
@@ -260,10 +500,35 @@ tensorflow::Status SegmentGraph(
               << segment_node_names.size() << " nodes, dropping";
       continue;
     }
-
-    segments->emplace_back(segment_node_names);
+    // TODO(sami): Make segmenter placement aware once trtscopes are in place
+    const auto& dev_itr = device_maps.find(itr.first);
+    if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
+      VLOG(1) << "No device assigned to segment " << segments->size();
+      segments->emplace_back(std::make_pair(segment_node_names, string()));
+    } else if (dev_itr->second.size() > 1) {
+      string s("Segment ");
+      StrAppend(&s, segments->size(), " has multiple devices attached: ");
+      for (const auto& dev : dev_itr->second) {
+        StrAppend(&s, dev, ", ");
+      }
+      LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin());
+      segments->emplace_back(
+          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+    } else {
+      segments->emplace_back(
+          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+    }
+  }
+  if (VLOG_IS_ON(1)) {
+    for (const auto& d : device_maps) {
+      string s("Segment ");
+      StrAppend(&s, ": '", d.first, "' ");
+      for (const auto& dd : d.second) {
+        StrAppend(&s, dd, ", ");
+      }
+      VLOG(1) << "Devices " << s;
+    }
   }
-
   return tensorflow::Status::OK();
 }
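
The CanContractEdge guard exists because contracting src->dst merges the two endpoints: if some other path src -> x -> dst exists, x would end up both upstream and downstream of the merged node, which is a cycle. The enter/leave bookkeeping in CheckCycles mimics TF's ReverseDFS callback structure; for the pure reachability question it answers, a plain reverse DFS suffices, as in this standalone sketch (illustrative only):

    #include <vector>

    // Returns true if `src` is reachable by walking in-edges backwards from
    // any node in `start`. With `start` = the in-neighbors of dst other than
    // src, this is the condition under which contracting src->dst would
    // create a cycle. in_edges[n] lists the in-neighbors of node n.
    bool WouldCreateCycle(const std::vector<std::vector<int>>& in_edges,
                          int src, std::vector<int> start) {
      std::vector<bool> visited(in_edges.size(), false);
      while (!start.empty()) {
        int n = start.back();
        start.pop_back();
        if (n == src) return true;
        if (visited[n]) continue;
        visited[n] = true;
        for (int m : in_edges[n]) start.push_back(m);
      }
      return false;
    }

For example, with edges a->b, b->c, and a->c, contracting a->c starts the reverse DFS at b (the other in-neighbor of c), which immediately reaches a, so the contraction is rejected.
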
 
index 7e8685f..1568dd9 100644 (file)
@@ -29,7 +29,9 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-using SegmentNodesVector = std::vector<std::set<string>>;
+// Vector of segments. Each entry contains the set of node names in the
+// segment and the name of the device it is assigned to.
+using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
 
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
@@ -51,6 +53,20 @@ tensorflow::Status SegmentGraph(
     const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
 
+// Get the subgraphs of a graph that can be handled by TensorRT.
+//
+// @param tf_graph tensorflow::Graph of the network.
+// @param candidate_fn A function that returns true for a Node* if
+// that node can be handled by TensorRT.
+// @param segments Returns the TensorRT segments/subgraphs. Each entry
+// in the vector describes a subgraph by giving a set of the names of
+// all the NodeDefs in that subgraph, plus the device it is assigned to.
+// @return the status.
+tensorflow::Status SegmentGraph(
+    tensorflow::Graph* tf_graph,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const SegmentOptions& options, SegmentNodesVector* segments);
+
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
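
A minimal caller sketch for the new Graph-based overload. The op filter below is illustrative (the real converter consults its own registry of supported ops), and the SegmentOptions field name is assumed from the comment in the struct above (minimum_segment_size):

    #include <set>
    #include <string>

    #include "tensorflow/contrib/tensorrt/segment/segment.h"
    #include "tensorflow/core/graph/graph.h"
    #include "tensorflow/core/lib/core/errors.h"
    #include "tensorflow/core/platform/logging.h"

    // Sketch: segment a graph and log the (nodes, device) pair per segment.
    tensorflow::Status FindTrtSegments(
        tensorflow::Graph* graph,
        tensorflow::tensorrt::segment::SegmentNodesVector* segments) {
      tensorflow::tensorrt::segment::SegmentOptions options;
      options.minimum_segment_size = 2;
      auto candidate_fn = [](const tensorflow::Node* node) {
        // Illustrative allow-list of op types.
        static const std::set<std::string>* supported =
            new std::set<std::string>{"Conv2D", "BiasAdd", "Relu"};
        return supported->count(node->type_string()) > 0;
      };
      TF_RETURN_IF_ERROR(tensorflow::tensorrt::segment::SegmentGraph(
          graph, candidate_fn, options, segments));
      for (const auto& segment : *segments) {
        // segment.first: node names in the subgraph; segment.second: device.
        VLOG(1) << "Segment with " << segment.first.size() << " nodes on '"
                << segment.second << "'";
      }
      return tensorflow::Status::OK();
    }
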
index 6f7655f..2de3923 100644 (file)
@@ -34,7 +34,7 @@ class SegmentTest : public ::testing::Test {
   TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                     TF_Status* s, const char* name);
 
-  std::function<bool(const Node*)> MakeCandidateFn(
+  std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
       const std::set<string>& node_names);
 
  protected:
@@ -59,9 +59,9 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
   return ret;
 }
 
-std::function<bool(const Node*)> SegmentTest::MakeCandidateFn(
+std::function<bool(const tensorflow::Node*)> SegmentTest::MakeCandidateFn(
     const std::set<string>& node_names) {
-  return [node_names](const Node* node) -> bool {
+  return [node_names](const tensorflow::Node* node) -> bool {
     return node_names.find(node->name()) != node_names.end();
   };
 }
@@ -164,7 +164,7 @@ TEST_F(SegmentTest, Simple) {
   ASSERT_EQ(segments.size(), 1);
   std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
   for (const auto& ex : expected) {
-    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
         << "Missing expected node " << ex;
   }
   TF_DeleteGraph(graph);
@@ -277,13 +277,13 @@ TEST_F(SegmentTest, Multiple) {
 
   std::vector<string> expected0{"add0", "add1", "add2", "add3"};
   for (const auto& ex : expected0) {
-    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
         << "Missing expected node " << ex;
   }
 
   std::vector<string> expected1{"add6", "add8"};
   for (const auto& ex : expected1) {
-    EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+    EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
         << "Missing expected node " << ex;
   }
   TF_DeleteGraph(graph);
@@ -347,13 +347,13 @@ TEST_F(SegmentTest, BigIfElse) {
 
   std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
   for (const auto& ex : expected0) {
-    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
         << "Missing expected node " << ex;
   }
 
   std::vector<string> expected1{"add0", "add1"};
   for (const auto& ex : expected1) {
-    EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+    EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
         << "Missing expected node " << ex;
   }
   TF_DeleteGraph(graph);
index 8b47517..f36495f 100644 (file)
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <string>
 #include <vector>
@@ -33,7 +34,8 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
   TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
   nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
-      serialized_engine.c_str(), serialized_engine.size(), nullptr);
+      serialized_engine.c_str(), serialized_engine.size(),
+      tensorrt::PluginFactoryTensorRT::GetInstance());
 
   int num_batch = -1;
   std::vector<::tensorflow::DataType> input_type;
index ad01bed..175ccd8 100644 (file)
@@ -18,7 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import numpy as np
+
 # normally we should do import tensorflow as tf and then
 # tf.placeholder, tf.constant, tf.nn.conv2d etc but
 # it looks like internal builds don't like it so
@@ -26,6 +28,7 @@ import numpy as np
 
 from tensorflow.contrib import tensorrt as trt
 from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.core.protobuf import rewriter_config_pb2 as rwpb2
 from tensorflow.python.client import session as csess
 from tensorflow.python.framework import constant_op as cop
 from tensorflow.python.framework import dtypes as dtypes
@@ -59,9 +62,11 @@ def get_simple_graph_def():
   return g.as_graph_def()
 
 
-def run_graph(gdef, dumm_inp):
+def execute_graph(gdef, dumm_inp):
   """Run given graphdef once."""
+  print("executing")
   gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
   ops.reset_default_graph()
   g = ops.Graph()
   with g.as_default():
@@ -69,15 +74,14 @@ def run_graph(gdef, dumm_inp):
         graph_def=gdef, return_elements=["input", "output"])
     inp = inp.outputs[0]
     out = out.outputs[0]
-  with csess.Session(
-      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+  with csess.Session(config=sessconfig, graph=g) as sess:
     val = sess.run(out, {inp: dumm_inp})
   return val
 
 
 # Use real data that is representative of the inference dataset
 # for calibration. For this test script it is random data.
-def run_calibration(gdef, dumm_inp):
+def execute_calibration(gdef, dumm_inp):
   """Run given calibration graph multiple times."""
   gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
   ops.reset_default_graph()
@@ -96,7 +100,9 @@ def run_calibration(gdef, dumm_inp):
   return val
 
 
-if "__main__" in __name__:
+def user(run_graph=execute_graph, run_calibration=execute_calibration):
+  """Example function that converts a graph to TFTRT graph."""
+
   inp_dims = (100, 24, 24, 2)
   dummy_input = np.random.random_sample(inp_dims)
   orig_graph = get_simple_graph_def()  # use a frozen graph for inference
@@ -137,3 +143,51 @@ if "__main__" in __name__:
   assert np.allclose(o1, o4)
   assert np.allclose(o1, o5)
   print("Pass")
+
+
+def auto():
+  """Run the conversion as an optimization pass."""
+  inp_dims = (100, 24, 24, 2)
+  dummy_input = np.random.random_sample(inp_dims)
+  orig_graph = get_simple_graph_def()
+  opt_config = rwpb2.RewriterConfig()
+  opt_config.optimizers.extend(["constfold", "layout"])
+  custom_op = opt_config.custom_optimizers.add()
+  custom_op.name = "TensorRTOptimizer"
+  custom_op.parameter_map["minimum_segment_size"].i = 3
+  custom_op.parameter_map["precision_mode"].s = "FP32"
+  custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
+  custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
+  print(custom_op)
+  gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
+  sessconfig = cpb2.ConfigProto(
+      gpu_options=gpu_options, graph_options=graph_options)
+  print(sessconfig)
+  g = ops.Graph()
+  ops.reset_default_graph()
+  with g.as_default():
+    inp, out = importer.import_graph_def(
+        graph_def=orig_graph, return_elements=["input", "output"])
+    inp = inp.outputs[0]
+    out = out.outputs[0]
+    with csess.Session(config=sessconfig, graph=g) as sess:
+      val = sess.run(out, {inp: dummy_input})
+  print(val.shape)
+
+
+if "__main__" in __name__:
+  P = argparse.ArgumentParser(
+      prog="tftrt_test",
+      description="Example utilization of TensorFlow-TensorRT integration")
+  P.add_argument(
+      "--automatic",
+      "-a",
+      action="store_true",
+      help="Do TRT conversion automatically",
+      default=False)
+  flags, unparsed = P.parse_known_args()
+  if flags.automatic:
+    auto()
+  else:
+    user()
index d426e9f..0403b65 100644 (file)
@@ -44,8 +44,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
     inp_dims = (100, 24, 24, 2)
     self._input = np.random.random_sample(inp_dims)
     self._original_graph = self.get_simple_graph_def()
-    self._gpu_options = cpb2.GPUOptions(
-        per_process_gpu_memory_fraction=0.50)
+    self._gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
     self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
     self._reference = self.run_graph(self._original_graph, self._input)
 
@@ -60,11 +59,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
           name="weights",
           dtype=dtypes.float32)
       conv = nn.conv2d(
-          input=a,
-          filter=e,
-          strides=[1, 2, 2, 1],
-          padding="SAME",
-          name="conv")
+          input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
       b = cop.constant(
           [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
       t = nn.bias_add(conv, b, name="biasAdd")
@@ -85,8 +80,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
       inp = inp.outputs[0]
       out = out.outputs[0]
     with self.test_session(
-        graph=g, config=self._config, use_gpu=True,
-        force_gpu=True) as sess:
+        graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess:
       val = sess.run(out, {inp: dumm_inp})
     return val
 
@@ -104,15 +98,14 @@ class IntegrationTest(test_util.TensorFlowTestCase):
       # run over real calibration data here, we are mimicking a calibration
       # set of 30 different batches. Use as much calibration data as you want
     with self.test_session(
-        graph=g, config=self._config, use_gpu=True,
-        force_gpu=True) as sess:
+        graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess:
       for _ in range(30):
         val = sess.run(out, {inp: dumm_inp})
     return val
 
   def get_trt_graph(self, mode):
     """Return trt converted graph."""
-    if mode in  ["FP32", "FP16", "INT8"]:
+    if mode in ["FP32", "FP16", "INT8"]:
       return trt.create_inference_graph(
           input_graph_def=self._original_graph,
           outputs=["output"],
@@ -120,7 +113,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
           max_workspace_size_bytes=1 << 25,
           precision_mode=mode,  # TRT Engine precision "FP32","FP16" or "INT8"
           minimum_segment_size=2  # minimum number of nodes in an engine
-          )
+      )
     return None
 
   def testFP32(self):
index 5dd7bde..5b9aeaa 100644 (file)
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-"""TPU system metdata and associated tooling."""
+"""TPU system metadata and associated tooling."""
 
 from __future__ import absolute_import
 from __future__ import division
index 4b6104a..3137bfd 100644 (file)
@@ -159,7 +159,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
        * step_id - Step ID.
        * request_index - Request index.
        * remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
-* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occured on the sender side, so it can propagate it to the upper levels.
+* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occurred on the sender side, so it can propagate it to the upper levels.
        * type - The message type.
        * name (name_size) - Name of the requested tensor.
        * step_id - Step ID.
index 2dd8e6f..3286f85 100644 (file)
@@ -2762,6 +2762,7 @@ cc_library(
     ],
     visibility = [
         "//tensorflow/compiler:__subpackages__",
+        "//tensorflow/core/kernels:__subpackages__",
         "//tensorflow/core/profiler:__subpackages__",
     ],
     deps = [":lib_internal"],
@@ -3683,7 +3684,11 @@ tf_cuda_only_cc_test(
         ":test",
         ":test_main",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl(
+        [
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
 )
 
 tf_cc_test_gpu(
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt
new file mode 100644 (file)
index 0000000..8cef243
--- /dev/null
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  in_arg {
+    name: "input"
+    description: <<END
+A string tensor of the text to be processed.
+END
+  }
+  in_arg {
+    name: "pattern"
+    description: <<END
+A scalar string tensor containing the regular expression to match the input.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A bool tensor with the same shape as `input`.
+END
+  }
+  summary: "Check if the input matches the regex pattern."
+  description: <<END
+The input is a string tensor of any shape. The pattern is a scalar
+string tensor which is applied to every element of the input tensor.
+The boolean values (True or False) of the output tensor indicate
+if the input matches the regex pattern provided.
+
+The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+END
+}
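
The documented semantics (one scalar pattern in re2 syntax, fully matched against every element of the input, producing a bool tensor of the same shape) correspond to RE2::FullMatch applied elementwise. A standalone model of that behavior using the re2 library directly (illustrative; not the kernel itself):

    #include <string>
    #include <vector>

    #include "re2/re2.h"

    // Elementwise full match: one scalar pattern, compiled once, applied to
    // every element of the input; output has the same shape as the input.
    std::vector<bool> RegexFullMatchAll(const std::vector<std::string>& input,
                                        const std::string& pattern) {
      RE2 re(pattern);
      std::vector<bool> output;
      output.reserve(input.size());
      for (const std::string& s : input) {
        output.push_back(RE2::FullMatch(s, re));
      }
      return output;
    }
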
diff --git a/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt
new file mode 100644 (file)
index 0000000..ec310c8
--- /dev/null
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  visibility: HIDDEN
+}
index 9ceff86..9646a08 100644 (file)
@@ -80,7 +80,7 @@ void Broadcaster::Run(StatusCallback done) {
 // continuing to occupy its current position.  Hence we calculate as
 // though each device's rank is actually r+1, then subtract 1 again to
 // get the descendent ranks.  If the source is not rank 0 then its
-// decendents include both {0,1} and the descendents of its current
+// descendants include both {0,1} and the descendants of its current
 // position.  Where a non-0-rank source is a descendent of another
 // device, no send to it is necessary.
 
@@ -115,7 +115,7 @@ void Broadcaster::TreeSendTo(const CollectiveParams& cp,
   DCHECK_NE(successor_rank, my_rank);
   if (cp.is_source && source_rank != 0) {
     // The source sends to rank 0,1 in addition to its positional
-    // decendents.
+    // descendants.
     if (cp.group.group_size > 1) {
       targets->push_back(0);
     }
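
The rank arithmetic in the comments above is the usual binary-heap mapping: treat the 0-based rank r as the 1-based rank q = r + 1, whose children are 2q and 2q + 1, then subtract 1 to return to 0-based ranks. A small sketch of that calculation (illustrative; the actual TreeSendTo additionally special-cases a non-0-rank source, as the hunk shows):

    #include <vector>

    // Positional descendants of 0-based rank r in a binary broadcast tree.
    std::vector<int> TreeDescendants(int r, int group_size) {
      std::vector<int> kids;
      for (int q_child : {2 * (r + 1), 2 * (r + 1) + 1}) {
        if (q_child - 1 < group_size) kids.push_back(q_child - 1);
      }
      return kids;
    }
    // E.g. group_size = 8: rank 0 -> {1, 2}, rank 1 -> {3, 4}, rank 3 -> {7}.
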
index e94e88b..9eb9f06 100644 (file)
@@ -79,7 +79,7 @@ class BufRendezvous {
                   const ProducerCallback& done);
 
   // Called to request access to a Tensor value corresponding to key.
-  // Consumer is provide with a Hook as soon as availble.
+  // Consumer is provided with a Hook as soon as available.
   void ConsumeBuf(const string& key, const ConsumerCallback& done);
 
   // Consumer must call this function when it's done reading the Hook provided
index 6b072f3..f8428f2 100644 (file)
@@ -283,7 +283,7 @@ void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
   // Note on field indexing: There are group_size_ devices in the
   // instance, implying the same number of chunks per tensor, where a
   // chunk is the unit of data transferred in a time step.  However, if
-  // a device can simultaenously send data by 2 or more independent
+  // a device can simultaneously send data by 2 or more independent
   // channels we can speed up the transfer by subdividing chunks and
   // processing multiple subdivisions at once.  So the actual number
   // of RingFields is group_size_ * num_subdivs_.
index be79cc4..c045596 100644 (file)
@@ -104,7 +104,7 @@ ScopedAllocatorContainer::~ScopedAllocatorContainer() {
   // contents deleted via Drop.  When when a step ends early
   // (e.g. through abnormal termination) we need to clean up
   // explicitly.  So long as graph execution of the associated step has
-  // completey terminated this should be safe.
+  // completely terminated this should be safe.
   for (auto& it : allocators_) {
     if (it.second.field_index == ScopedAllocator::kBackingIndex) {
       delete it.second.scoped_allocator;
index 4998a7a..03a011f 100644 (file)
@@ -52,7 +52,7 @@ namespace {
 
 // Creates an Event proto representing a chunk of a Tensor. This method only
 // populates the field of the Event proto that represent the envelope
-// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype,
+// information (e.g., timestamp, device_name, num_chunks, chunk_index, dtype,
 // shape). It does not set the value.tensor field, which should be set by the
 // caller separately.
 Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
index 18998bb..b9f21ea 100644 (file)
@@ -115,7 +115,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
   size_t AssignWorkerToThread(const string& target) {
     // Round-robin target assignment, but keeps the same target on the same
-    // polling thread always, as this is important for gRPC performace
+    // polling thread always, as this is important for gRPC performance.
     mutex_lock lock(assignment_mu_);
     auto it = target_assignments_.find(target);
     if (it == target_assignments_.end()) {
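
The assignment policy the comment describes is round-robin for first-seen targets but sticky thereafter, so a given gRPC channel is always polled by the same completion-queue thread. A standalone sketch of that pattern (hypothetical names; the real class keeps its own mutex and map):

    #include <cstddef>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    // Sticky round-robin: new targets take the next thread in rotation;
    // known targets always get the thread they were first assigned.
    class ThreadAssigner {
     public:
      explicit ThreadAssigner(size_t num_threads) : num_threads_(num_threads) {}

      size_t AssignWorkerToThread(const std::string& target) {
        std::lock_guard<std::mutex> lock(mu_);
        auto it = assignments_.find(target);
        if (it == assignments_.end()) {
          it = assignments_.emplace(target, next_thread_).first;
          next_thread_ = (next_thread_ + 1) % num_threads_;
        }
        return it->second;
      }

     private:
      std::mutex mu_;
      const size_t num_threads_;
      size_t next_thread_ = 0;
      std::unordered_map<std::string, size_t> assignments_;
    };
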
index b2b7232..e7142a4 100644 (file)
@@ -7,7 +7,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ExampleProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.example";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
 package tensorflow;
 
 // An Example is a mostly-normalized data format for storing data for
index 15846c0..b2c115d 100644 (file)
@@ -6,6 +6,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ExampleParserConfigurationProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.example";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
 package tensorflow;
 
 import "tensorflow/core/framework/tensor_shape.proto";
index da3dc59..6d81974 100644 (file)
@@ -58,7 +58,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "FeatureProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.example";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
 package tensorflow;
 
 // Containers to hold repeated fundamental values.
index bb1037c..64133b0 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "AllocationDescriptionProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 message AllocationDescription {
   // Total number of bytes requested
index e878ab6..3f8dd27 100644 (file)
@@ -8,6 +8,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ApiDefProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/attr_value.proto";
 
 // Used to specify and override the default API & behavior in the
index 62f0a90..054e3ec 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "AttrValueProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
index 7885b01..19d765c 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "CostGraphProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 
index 0b3c0d5..44236ca 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "DeviceAttributesProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 message InterconnectLink {
   int32 device_id = 1;
index 72e3c43..e69d393 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "FunctionProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/attr_value.proto";
 import "tensorflow/core/framework/node_def.proto";
 import "tensorflow/core/framework/op_def.proto";
index 7d6e16d..76d3589 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "GraphProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/node_def.proto";
 import "tensorflow/core/framework/function.proto";
 import "tensorflow/core/framework/versions.proto";
index 41dd54d..232297d 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "GraphTransferInfoProto";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/types.proto";
 
 message GraphTransferNodeInput {
index 7e5f5ea..f015342 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "IteratorProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.util";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // Protocol buffer representing the metadata for an iterator's state stored
 // as a Variant tensor.
index 65e9ef0..a17b9c8 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "KernelDefProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/attr_value.proto";
 
 message KernelDef {
index d1e1263..7f37ead 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "LogMemoryProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/tensor_description.proto";
 
 message MemoryLogStep {
index 8fcee32..0a095f9 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "NodeProto";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/attr_value.proto";
 
 message NodeDef {
index ca0e5e7..aea2d2b 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "OpDefProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/attr_value.proto";
 import "tensorflow/core/framework/types.proto";
 
index ff38e4b..533dd64 100644 (file)
@@ -59,14 +59,14 @@ class ApiDefMap {
   // You can call this method multiple times to load multiple
   // sets of files. Api definitions are merged if the same
   // op definition is loaded multiple times. Later-loaded
-  // definitions take precedense.
+  // definitions take precedence.
   // ApiDefs loaded from files must contain a subset of ops defined
   // in the OpList passed to the constructor.
   Status LoadFileList(Env* env, const std::vector<string>& filenames);
 
   // Load a single file. Api definitions are merged if the same
   // op definition is loaded multiple times. Later-loaded
-  // definitions take precedense.
+  // definitions take precedence.
   // ApiDefs loaded from file must contain a subset of ops defined
   // in the OpList passed to the constructor.
   Status LoadFile(Env* env, const string& filename);
index 6794337..f577664 100644 (file)
@@ -534,7 +534,7 @@ class OpKernelContext {
     Rendezvous* rendezvous = nullptr;
 
     // Mechanism for executing a collective op that needs to coordinate
-    // with parallel instances runing on other devices.
+    // with parallel instances running on other devices.
     CollectiveExecutor* collective_executor = nullptr;
 
     // The session state for this op.
index 1b8b965..9e187cf 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ReaderBaseProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // For serializing and restoring the state of ReaderBase, see
 // reader_base.h for details.
index 946da40..eb689ec 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RemoteFusedGraphExecuteInfoProto";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+// add go_package externally
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
index b192133..a54d3d9 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ResourceHandle";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // Protocol buffer representing a handle to a tensorflow resource. Handles are
 // not valid across executions, but can be serialized back and forth from within
index 65c8089..d98999c 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "StepStatsProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/allocation_description.proto";
 import "tensorflow/core/framework/tensor_description.proto";
 
index 55879f8..532e4fc 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "SummaryProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/tensor.proto";
 
 // Metadata associated with a series of Summary data
index abbf16e..55921af 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TensorProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/resource_handle.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
index 6ac3c1b..4c23c7e 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TensorDescriptionProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/allocation_description.proto";
index 1ec3c53..286156a 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TensorShapeProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 package tensorflow;
 
index 24b0166..a5c366e 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TensorSliceProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 package tensorflow;
 
index e003fd0..03835d1 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TypesProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // LINT.IfChange
 enum DataType {
index e0df01c..93ae423 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "VariableProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // Protocol buffer representing a Variable.
 message VariableDef {
index 7d5e58a..dd2ec55 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "VersionsProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
 // Version information for a piece of serialized data
 //
index 5e2a465..029cdcf 100644 (file)
@@ -2022,6 +2022,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2051,6 +2052,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
@@ -2069,6 +2071,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
@@ -2095,6 +2098,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
@@ -2125,6 +2129,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2151,6 +2156,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2178,6 +2184,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2204,6 +2211,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2233,6 +2241,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
       "node { name: 'E' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2272,6 +2281,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2289,6 +2299,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['F', 'B', 'E']}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2319,6 +2330,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }");
@@ -2341,6 +2353,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2348,6 +2361,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
@@ -2370,6 +2384,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
       " input: ['B', 'C'] }");
@@ -2389,6 +2404,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }");
@@ -2411,6 +2427,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['B', 'A', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }");
@@ -2477,6 +2494,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2529,6 +2547,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2536,6 +2555,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['C', 'D']}"
       "node { name: 'G' op: 'Const' "
       " attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2572,6 +2592,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D']}"
@@ -2634,6 +2655,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2641,6 +2663,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['C', 'D']}"
       "node { name: 'G' op: 'Const' "
       " attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2678,6 +2701,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D']}"
@@ -3274,6 +3298,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }",
@@ -3296,6 +3321,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
       "node { name: 'E' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -3323,6 +3349,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }",
index 5944e36..2a83eb7 100644 (file)
@@ -31,7 +31,7 @@ namespace tensorflow {
 // future to support these features.
 //
 // TODO(skyewm): de/serialize in MetaGraphDef so imported while loops will be
-// differentiable. Figure out backwards compatability story.
+// differentiable. Figure out backwards compatibility story.
 class WhileContext {
  public:
   WhileContext(StringPiece frame_name, std::vector<Node*> enter_nodes,
index 4941fb2..203f7b0 100644 (file)
@@ -1082,7 +1082,7 @@ Status GraphProperties::PropagateShapes(
     const std::unordered_map<const NodeDef*, const NodeDef*>& resource_handles,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
-  // incorrect shape functions. The algoritm should converge in at most
+  // incorrect shape functions. The algorithm should converge in at most
   // num_nested_loops^2 * max_rank. We approximate max_rank with the constant 4.
   // The same applies to resources.
   VLOG(1) << "Propagating " << new_shapes->size() << " new shapes through "
index 67bf1e6..34d4881 100644 (file)
@@ -328,7 +328,7 @@ class VirtualScheduler {
   Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
-  // Auxilliary data structures for constructing NodeState and DeviceState.
+  // Auxiliary data structures for constructing NodeState and DeviceState.
   GraphProperties graph_properties_;
   Cluster* cluster_;  // Not owned.
 
index 87ab460..e08ab1e 100644 (file)
@@ -2183,7 +2183,7 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   TuningConfig config;
   config.no_gemm = true;
-  // TODO(yaozhang): Enable tuning with various TuningConfig choices wtih
+  // TODO(yaozhang): Enable tuning with various TuningConfig choices with
   // the measurement-based estimator.
   status = Tune(item, graph_properties, config, output);
   if (!status.ok()) {
index b2b631a..5948f8d 100644 (file)
@@ -4248,6 +4248,7 @@ cc_library(
         ":as_string_op",
         ":base64_ops",
         ":reduce_join_op",
+        ":regex_full_match_op",
         ":regex_replace_op",
         ":string_join_op",
         ":string_split_op",
@@ -4285,6 +4286,12 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
+    name = "regex_full_match_op",
+    prefix = "regex_full_match_op",
+    deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
+)
+
+tf_kernel_library(
     name = "regex_replace_op",
     prefix = "regex_replace_op",
     deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
@@ -5174,6 +5181,7 @@ filegroup(
             "debug_ops.*",
             "mutex_ops.*",
             "batch_kernels.*",
+            "regex_full_match_op.cc",
             "regex_replace_op.cc",
         ],
     ),
index a1c03f9..475bda8 100644 (file)
@@ -329,6 +329,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
       c_ptrs.push_back(&c_device_memory.back());
     }
 
+    typedef Scalar Coefficient;
+
     // Cublas does
     // C = A x B
     // where A, B and C are assumed to be in column major.
@@ -352,9 +354,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
         bool blas_launch_status =
             stream
                 ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
-                               static_cast<Scalar>(1.0), *(a_ptrs[0]),
+                               static_cast<Coefficient>(1.0), *(a_ptrs[0]),
                                adj_x ? m : k, *(b_ptrs[0]), 1,
-                               static_cast<Scalar>(0.0), c_ptrs[0], 1)
+                               static_cast<Coefficient>(0.0), c_ptrs[0], 1)
                 .ok();
         if (!blas_launch_status) {
           context->SetStatus(errors::Internal(
@@ -366,9 +368,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
         bool blas_launch_status =
             stream
                 ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
-                               static_cast<Scalar>(1.0), *(b_ptrs[0]),
+                               static_cast<Coefficient>(1.0), *(b_ptrs[0]),
                                adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
-                               static_cast<Scalar>(0.0), c_ptrs[0], n)
+                               static_cast<Coefficient>(0.0), c_ptrs[0], n)
                 .ok();
         if (!blas_launch_status) {
           context->SetStatus(errors::Internal(
@@ -383,8 +385,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
           stream
               ->ThenBlasGemmBatchedWithScratch(
                   blas_transpose_b, blas_transpose_a, n, m, k,
-                  static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
-                  adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n,
+                  static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+                  adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
                   batch_size, &scratch_allocator)
               .ok();
       if (!blas_launch_status) {
@@ -398,6 +400,98 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
   }
 };
 
+template <>
+struct LaunchBatchMatMul<GPUDevice, Eigen::half> {
+  static void Launch(OpKernelContext* context, const Tensor& in_x,
+                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+    typedef Eigen::half Scalar;
+    constexpr perftools::gputools::blas::Transpose kTranspose =
+        is_complex<Scalar>::value
+            ? perftools::gputools::blas::Transpose::kConjugateTranspose
+            : perftools::gputools::blas::Transpose::kTranspose;
+    perftools::gputools::blas::Transpose trans[] = {
+        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+    const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
+    const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
+    const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
+    const uint64 batch_size = in_x.dim_size(0);
+    auto blas_transpose_a = trans[adj_x];
+    auto blas_transpose_b = trans[adj_y];
+
+    auto* stream = context->op_device_context()->stream();
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+    std::vector<DeviceMemoryType> a_device_memory;
+    std::vector<DeviceMemoryType> b_device_memory;
+    std::vector<DeviceMemoryType> c_device_memory;
+    std::vector<DeviceMemoryType*> a_ptrs;
+    std::vector<DeviceMemoryType*> b_ptrs;
+    std::vector<DeviceMemoryType*> c_ptrs;
+    a_device_memory.reserve(batch_size);
+    b_device_memory.reserve(batch_size);
+    c_device_memory.reserve(batch_size);
+    a_ptrs.reserve(batch_size);
+    b_ptrs.reserve(batch_size);
+    c_ptrs.reserve(batch_size);
+    auto* a_base_ptr = in_x.template flat<Scalar>().data();
+    auto* b_base_ptr = in_y.template flat<Scalar>().data();
+    auto* c_base_ptr = out->template flat<Scalar>().data();
+    for (int64 i = 0; i < batch_size; ++i) {
+      a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
+      b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
+      c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
+      a_ptrs.push_back(&a_device_memory.back());
+      b_ptrs.push_back(&b_device_memory.back());
+      c_ptrs.push_back(&c_device_memory.back());
+    }
+
+    typedef float Coefficient;
+
+    // Cublas does
+    // C = A x B
+    // where A, B and C are assumed to be in column major.
+    // We want the output to be in row-major, so we can compute
+    // C' = B' x A', where ' stands for transpose (not adjoint).
+    // TODO(yangzihao): Choose the best of the three strategies using autotune.
+    if (batch_size == 1) {
+      // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
+      // overhead of the scratch allocator and the batch interface.
+      // TODO(benbarsdell): Use fp16 Gemv if it becomes supported by CUBLAS
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
+                             static_cast<Coefficient>(1.0), *(b_ptrs[0]),
+                             adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
+                             static_cast<Coefficient>(0.0), c_ptrs[0], n)
+              .ok();
+      if (!blas_launch_status) {
+        context->SetStatus(errors::Internal(
+            "Blas xGEMM launch failed : a.shape=", in_x.shape().DebugString(),
+            ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n,
+            ", k=", k));
+      }
+    } else {
+      CublasScratchAllocator scratch_allocator(context);
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemmBatchedWithScratch(
+                  blas_transpose_b, blas_transpose_a, n, m, k,
+                  static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+                  adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
+                  batch_size, &scratch_allocator)
+              .ok();
+      if (!blas_launch_status) {
+        context->SetStatus(
+            errors::Internal("Blas xGEMMBatched launch failed : a.shape=",
+                             in_x.shape().DebugString(), ", b.shape=",
+                             in_y.shape().DebugString(), ", m=", m, ", n=", n,
+                             ", k=", k, ", batch_size=", batch_size));
+      }
+    }
+  }
+};
+
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
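
The `Eigen::half` specialization above keeps the operands in fp16 but passes `float` (`Coefficient`) scaling factors to cuBLAS. A rough Python-level check of the path it serves (a sketch; it assumes a CUDA build, where batched `tf.matmul` on half inputs dispatches to this specialization):

```python
import numpy as np
import tensorflow as tf

# Batched fp16 operands: 8 independent [32, 64] x [64, 16] products.
a = tf.constant(np.random.rand(8, 32, 64).astype(np.float16))
b = tf.constant(np.random.rand(8, 64, 16).astype(np.float16))

# On GPU this lowers to BatchMatMul, i.e. the new
# LaunchBatchMatMul<GPUDevice, Eigen::half> specialization.
c = tf.matmul(a, b)

with tf.Session() as sess:
    print(sess.run(c).shape)  # (8, 32, 16)
```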
index 97cec3a..87a0795 100644 (file)
@@ -15,6 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/batch_matmul_op_impl.h"
 
+#if GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 
 #if !defined(INTEL_MKL)
index c6119b5..b77c14d 100644 (file)
@@ -76,7 +76,7 @@ class AdaptiveSharedBatchScheduler
           AdaptiveSharedBatchScheduler<TaskType>> {
  public:
   ~AdaptiveSharedBatchScheduler() {
-    // Finish processing batches before destorying other class members.
+    // Finish processing batches before destroying other class members.
     batch_thread_pool_.reset();
   }
 
index 9edc6d4..980b106 100644 (file)
@@ -195,8 +195,8 @@ class Conv3DBackpropInputOp : public OpKernel {
     TensorShape input_shape;
     if (takes_shape_) {
       const Tensor& input_sizes = context->input(0);
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  input_sizes.vec<int32>(), &input_shape));
+      // MakeShape is able to handle both DT_INT32 and DT_INT64 for input_sizes.
+      OP_REQUIRES_OK(context, MakeShape(input_sizes, &input_shape));
     } else {
       input_shape = context->input(0).shape();
     }
index 180531b..a2e7342 100644 (file)
@@ -595,7 +595,7 @@ constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
   // For a tile size combination (longside, shortside), lying on the frontier
   // implies that (longside, shortside) is on or within the frontier but
   // (longside*2, shortside) or (longside, shortside+1) is not. With the above
-  // critereon, we simply need to use !TileSizeOnLongSideFrontier to ensure that
+  // criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
   // it is not on the long side frontier.
   return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
          (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
index 7f12eb9..0e43cc1 100644 (file)
@@ -114,7 +114,7 @@ struct NthElementFunctor<CPUDevice, T> {
 
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
     // The average time complexity of partition-based nth_element (BFPRT) is
-    // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+    // O(n), although the worst time complexity could be O(n^2). Here, 20 is a
     // empirical factor of cost_per_unit.
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
           20 * last_dim, SubNthElement);
diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc
new file mode 100644 (file)
index 0000000..5863a2c
--- /dev/null
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "re2/re2.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class RegexFullMatchOp : public OpKernel {
+ public:
+  explicit RegexFullMatchOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<string>();
+
+    const Tensor* pattern_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("pattern", &pattern_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
+                errors::InvalidArgument("Pattern must be scalar, but received ",
+                                        pattern_tensor->shape().DebugString()));
+    const string pattern = pattern_tensor->flat<string>()(0);
+    const RE2 match(pattern);
+    OP_REQUIRES(ctx, match.ok(),
+                errors::InvalidArgument("Invalid pattern: ", pattern,
+                                        ", error: ", match.error()));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
+                                             &output_tensor));
+    auto output_flat = output_tensor->flat<bool>();
+    for (size_t i = 0; i < input_flat.size(); ++i) {
+      output_flat(i) = RE2::FullMatch(input_flat(i), match);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU),
+                        RegexFullMatchOp);
+
+}  // namespace tensorflow
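
A sketch of the kernel's behavior from Python. The public endpoint name below is an assumption: this commit only adds the C++ kernel, and the wrapper surfaced as `tf.strings.regex_full_match` in a later release:

```python
import tensorflow as tf

texts = tf.constant(["abc123", "abc", "123"])

# The pattern must be a scalar; it is compiled once with RE2 and applied
# element-wise via RE2::FullMatch, so partial matches return False.
matches = tf.strings.regex_full_match(texts, "[a-z]+[0-9]+")

with tf.Session() as sess:
    print(sess.run(matches))  # [ True False False]
```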
index 96f94d8..722116f 100644 (file)
@@ -84,7 +84,7 @@ void DoRoll(OpKernelContext* context, const int64 num_elements,
   // Shard
   auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
   // 15 - expiramentally determined with float and bool types
-  const int cost_per_element = 15 * sizeof(T);  // rough esitmate
+  const int cost_per_element = 15 * sizeof(T);  // rough estimate
   Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
         cost_per_element, std::move(work));
 }
index c87ce78..2328fc6 100644 (file)
@@ -320,7 +320,9 @@ class SegmentSumGPUOp : public AsyncOpKernel {
   REGISTER_CPU_KERNEL_SEGMENT("SegmentSum", Eigen::internal::SumReducer<type>, \
                               type, index_type, 0);                            \
   REGISTER_CPU_KERNEL_SEGMENT(                                                 \
-      "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1)
+      "SegmentMean", Eigen::internal::MeanReducer<type>, type, index_type, 0); \
+  REGISTER_CPU_KERNEL_SEGMENT(                                                 \
+      "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1);
 
 #define REGISTER_REAL_CPU_KERNELS_ALL(type) \
   REGISTER_REAL_CPU_KERNELS(type, int32);   \
index 4abfbfb..7796bf3 100644 (file)
@@ -130,4 +130,4 @@ struct Highest {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
index b82d389..5ced65a 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ErrorCodesProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/lib/core";
 
 // The canonical error codes for TensorFlow APIs.
 //
index 82330ec..d949e70 100644 (file)
@@ -435,6 +435,25 @@ REGISTER_OP("DrawBoundingBoxes")
     .Output("output: T")
     .Attr("T: {float, half} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
+      // The rank of images should be 4.
+      ShapeHandle images;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &images));
+      // Channel depth should be either 1 (GRY), 3 (RGB), or 4 (RGBA).
+      if (c->ValueKnown(c->Dim(images, 3))) {
+        int64 depth = c->Value(c->Dim(images, 3));
+        if (!(depth == 1 || depth == 3 || depth == 4)) {
+          return errors::InvalidArgument("Channel depth should be either 1 (GRY), "
+                                         "3 (RGB), or 4 (RGBA)");
+        }
+      }
+
+      // The rank of boxes is 3: [batch, num_bounding_boxes, 4].
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &boxes));
+      // The last value of boxes shape is 4.
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused));
+
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
index 5f0b391..517af26 100644 (file)
@@ -312,4 +312,23 @@ TEST(ImageOpsTest, QuantizedResizeBilinear_ShapeFn) {
   INFER_OK(op, "[1,?,3,?];[2];[];[]", "[d0_0,20,30,d0_3];[];[]");
 }
 
+TEST(ImageOpsTest, DrawBoundingBoxes_ShapeFn) {
+  ShapeInferenceTestOp op("DrawBoundingBoxes");
+  op.input_tensors.resize(2);
+
+  // Check images.
+  INFER_ERROR("must be rank 4", op, "[1,?,3];?");
+  INFER_ERROR("should be either 1 (GRY), 3 (RGB), or 4 (RGBA)",
+      op, "[1,?,?,5];?");
+
+  // Check boxes.
+  INFER_ERROR("must be rank 3", op, "[1,?,?,4];[1,4]");
+  INFER_ERROR("Dimension must be 4", op, "[1,?,?,4];[1,2,2]");
+
+  // OK shapes.
+  INFER_OK(op, "[4,?,?,4];?", "in0");
+  INFER_OK(op, "[?,?,?,?];[?,?,?]", "in0");
+  INFER_OK(op, "[4,?,?,4];[?,?,?]", "in0");
+  INFER_OK(op, "[4,?,?,4];[?,?,4]", "in0");
+}
 }  // end namespace tensorflow
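
The same constraints, seen from Python through `tf.image.draw_bounding_boxes`, which wraps this op (a sketch; boxes are normalized `[y_min, x_min, y_max, x_max]`):

```python
import tensorflow as tf

# Rank-4 images with a supported channel depth (1, 3, or 4).
images = tf.zeros([1, 16, 16, 3], dtype=tf.float32)
# Rank-3 boxes of shape [batch, num_bounding_boxes, 4].
boxes = tf.constant([[[0.1, 0.2, 0.5, 0.9]]], dtype=tf.float32)

drawn = tf.image.draw_bounding_boxes(images, boxes)

with tf.Session() as sess:
    print(sess.run(drawn).shape)  # (1, 16, 16, 3)
```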
index 8f8443a..8c0b073 100644 (file)
@@ -1017,7 +1017,7 @@ REGISTER_OP("SegmentMean")
     .Input("data: T")
     .Input("segment_ids: Tindices")
     .Output("output: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
     .SetShapeFn(SegmentReductionShapeFn);
 
index bb46daf..fc60e80 100644 (file)
@@ -547,7 +547,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     });
 
 REGISTER_OP("Conv3DBackpropInputV2")
-    .Input("input_sizes: int32")
+    .Input("input_sizes: Tshape")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
@@ -556,6 +556,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .Attr("Tshape: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
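
With `input_sizes` now typed `Tshape: {int32, int64}`, 64-bit shape tensors are accepted (matching the `MakeShape` change in the kernel above). A hedged sketch via `tf.nn.conv3d_transpose`, which lowers to `Conv3DBackpropInputV2`, assuming the Python wrapper forwards the dtype of `output_shape` unchanged:

```python
import tensorflow as tf

value = tf.zeros([1, 4, 4, 4, 8], dtype=tf.float32)     # NDHWC input
filters = tf.zeros([3, 3, 3, 16, 8], dtype=tf.float32)  # D, H, W, out_c, in_c
# Previously input_sizes had to be int32; int64 is now allowed too.
output_shape = tf.constant([1, 4, 4, 4, 16], dtype=tf.int64)

out = tf.nn.conv3d_transpose(value, filters, output_shape,
                             strides=[1, 1, 1, 1, 1], padding='SAME')

with tf.Session() as sess:
    print(sess.run(out).shape)  # (1, 4, 4, 4, 16)
```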
index 416ce9c..80ffae5 100644 (file)
@@ -72,7 +72,15 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      // Parameters must be 0-d or 1-d.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(3), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(4), 1, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("TruncatedNormal")
     .Input("shape: T")
index 469f193..1d5c743 100644 (file)
@@ -37,6 +37,17 @@ REGISTER_OP("RegexReplace")
       return Status::OK();
     });
 
+REGISTER_OP("RegexFullMatch")
+    .Input("input: string")
+    .Input("pattern: string")
+    .Output("output: bool")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    });
+
 REGISTER_OP("StringToHashBucketFast")
     .Input("input: string")
     .Output("output: int64")
index 8307758..dc12c78 100644 (file)
@@ -103,7 +103,7 @@ constexpr char kResolveCacheSecs[] = "GCS_RESOLVE_REFRESH_SECS";
 // The environment variable to configure the http request's connection timeout.
 constexpr char kRequestConnectionTimeout[] =
     "GCS_REQUEST_CONNECTION_TIMEOUT_SECS";
-// The environment varaible to configure the http request's idle timeout.
+// The environment variable to configure the http request's idle timeout.
 constexpr char kRequestIdleTimeout[] = "GCS_REQUEST_IDLE_TIMEOUT_SECS";
 // The environment variable to configure the overall request timeout for
 // metadata requests.
index 97a858e..8c9e2e0 100644 (file)
@@ -132,7 +132,7 @@ class GcsThrottle {
    * UpdateState updates the available_tokens_ and last_updated_secs_ variables.
    *
    * UpdateState should be called in order to mark the passage of time, and
-   * therefore add tokens to the availble_tokens_ pool.
+   * therefore add tokens to the available_tokens_ pool.
    */
   void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
index bbaf55e..cc6d9de 100644 (file)
@@ -82,7 +82,7 @@ bazel-bin/tensorflow/core/profiler/profiler \
 #
 # Alternatively, user can pass separate files.
 #
-# --graph_path contains the model architecutre and tensor shapes.
+# --graph_path contains the model architecture and tensor shapes.
 # --run_meta_path contains the memory and time information.
 # --op_log_path contains float operation and code traces.
 # --checkpoint_path contains the model checkpoint data.
index 33c87ee..c696d34 100644 (file)
@@ -20,6 +20,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ClusterProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 // This file contains protos to be used when defining a TensorFlow
 // cluster.
index 410ad22..9a48f43 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
index 3c05b4f..5f44878 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ControlFlowProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 // Control flow context related protocol buffers.
 
index 0b3f531..7954e7b 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "CriticalSectionProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 // Protocol buffer representing a CriticalSection.
 message CriticalSectionDef {
index 56983f3..499900f 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "DebugProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 // EXPERIMENTAL. Option for watching a node.
 message DebugTensorWatch {
index 3bd3015..11e1258 100644 (file)
@@ -18,6 +18,7 @@ syntax = "proto3";
 package tensorflow;
 option cc_enable_arenas = true;
 option java_outer_classname = "DevicePropertiesProtos";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 message DeviceProperties {
   // Device type (CPU, GPU, ...)
index 96c9153..0302287 100644 (file)
@@ -20,7 +20,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "DistributedRuntimeProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
index 1170611..ce0e4f6 100644 (file)
@@ -19,7 +19,7 @@ package tensorflow.grpc;
 option java_outer_classname = "MasterServiceProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/protobuf/master.proto";
 
 ////////////////////////////////////////////////////////////////////////////////
index fd86c0d..75a2a88 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "MetaGraphProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "google/protobuf/any.proto";
 
 import "tensorflow/core/framework/graph.proto";
index dd4976e..6e2f7fe 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "NamedTensorProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/framework/tensor.proto";
 
 // A pair of tensor name and tensor values.
index 05a48d0..f4df649 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "QueueRunnerProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/lib/core/error_codes.proto";
 
 // Protocol buffer representing a QueueRunner.
index 10bfe30..45e5759 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 import "tensorflow/core/framework/attr_value.proto";
 
@@ -31,7 +32,7 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
-  // Enum controling the number of times to run optimizers. The default is to
+  // Enum controlling the number of times to run optimizers. The default is to
   // run them once.
   enum NumIterationsType {
     DEFAULT_NUM_ITERS = 0;
index c2595dd..03789d3 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "SavedModelProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/protobuf/meta_graph.proto";
 
 // SavedModel is the high level serialization format for TensorFlow Models.
index a757d3f..4245386 100644 (file)
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "SaverProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.util";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 
 // Protocol buffer representing the configuration of a Saver.
 message SaverDef {
index 80e87f1..681c01b 100644 (file)
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "TensorBundleProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.util";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/tensor_slice.proto";
 import "tensorflow/core/framework/types.proto";
index 6199e70..be25804 100644 (file)
@@ -23,7 +23,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ServerProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 // Defines the configuration of a single TensorFlow server.
 message ServerDef {
   // The cluster of which this server is a member.
index b400638..a3bc2f4 100644 (file)
@@ -20,7 +20,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "WorkerProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "google/protobuf/any.proto";
 import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
index e0c27f3..9ebbd55 100644 (file)
@@ -19,7 +19,7 @@ package tensorflow.grpc;
 option java_outer_classname = "WorkerServiceProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
 import "tensorflow/core/protobuf/worker.proto";
 
 ////////////////////////////////////////////////////////////////////////////////
index ba69efb..522a9d8 100644 (file)
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
index f2d4e47..b91f8bb 100644 (file)
@@ -537,7 +537,7 @@ __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
   return atomicSub(ptr, value);
 }
 
-// Specializations of substraction which add the negative value.
+// Specializations of subtraction which add the negative value.
 __device__ inline float CudaAtomicSub(float* ptr, float value) {
   return CudaAtomicAdd(ptr, -value);
 }
index 50a8e30..8105121 100644 (file)
@@ -1359,7 +1359,7 @@ inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
 /// Map MklDnn memory::dims object into TensorShape object.
 ///
 /// This function will simply map input shape in MKL-DNN memory::dims format
-/// in Tensorflow's TensorShape object by perserving dimension order.
+/// in Tensorflow's TensorShape object by preserving dimension order.
 ///
 /// @input MKL-DNN memory::dims object
 /// @output TensorShape corresponding to memory::dims
index 58bc79a..d3d5602 100644 (file)
@@ -75,7 +75,7 @@ enum FilterTensorFormat {
   FORMAT_OIHW = 1,
 
   // OIHW_VECT_I is the most performant tensor format for cudnn6's quantized
-  // int8 convolution and fused convolution. It is analagous to the NCHW_VECT_C
+  // int8 convolution and fused convolution. It is analogous to the NCHW_VECT_C
   // data format. It is laid out in the same order as OIHW, except that the size
   // of the Input Channels dimension is divided by 4, and a new dimension of
   // size 4 is appended, which packs 4 adjacent input channel weights into an
index b3ca958..5bbbfd3 100644 (file)
@@ -184,7 +184,7 @@ The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}
     dataset = dataset.map(decode)
 ```
 
-To acomplish the same task with a queue based input pipeline requires the following code 
+Accomplishing the same task with a queue-based input pipeline requires the following code
 (using the same `decode` function from the above example): 
 
 ``` python
index 67856ce..153ef4a 100644 (file)
@@ -1,14 +1,14 @@
 # Defining and Running Benchmarks
 
-This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to TensorFlow github repo, then we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/.
+This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to the TensorFlow GitHub repo, we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/.
 
 [TOC]
 
 
 ## Defining a Benchmark
 
-Defining a TensorFlow benchmark requires extending from `tf.test.Benchmark`
-class and calling `self.report_benchmark` method. For example, take a look at the sample benchmark code below:
+Defining a TensorFlow benchmark requires extending the `tf.test.Benchmark`
+class and calling the `self.report_benchmark` method. Below, you'll find an example of benchmark code:
 
 ```python
 import time
@@ -54,20 +54,20 @@ Key points to note in the example above:
 
 ## Running with Python
 
-Use the `--benchmarks` flag to run the benchmark with python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed.
+Use the `--benchmarks` flag to run the benchmark with Python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed.
 
 ```
 python sample_benchmark.py --benchmarks=SampleBenchmark
 ```
 
-Setting the flag as `--benchmarks=.` or `--benchmarks=all` would work as well.
+Setting the flag as `--benchmarks=.` or `--benchmarks=all` works as well.
 
-(Please ensure that Tensorflow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with bazel.)
+(Please ensure that TensorFlow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with Bazel.)
 
 
 ## Adding a `bazel` Target
 
-We have a special target called `tf_py_logged_benchmark` for benchmarks defined under TensorFlow github repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` would print a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with TensorFlow continuous build.
+We have a special target called `tf_py_logged_benchmark` for benchmarks defined under the TensorFlow GitHub repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` prints a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with the TensorFlow continuous build.
 
 First, define a regular `py_test` target. See example below:
 
@@ -82,7 +82,7 @@ py_test(
 )
 ```
 
-You can run benchmarks in a `py_test` target by passing `--benchmarks` flag. The benchmark should just print out a [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto.
+You can run benchmarks in a `py_test` target by passing the `--benchmarks` flag. The benchmark should just print out a [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto.
 
 ```shell
 bazel test :sample_benchmark --test_arg=--benchmarks=all
@@ -90,7 +90,7 @@ bazel test :sample_benchmark --test_arg=--benchmarks=all
 
 
 Now, add the `tf_py_logged_benchmark` target (if available). This target would
-pass in `--benchmarks=all` to the wrapped `py_test` target and provide a way to store output for our TensorFlow continuous build. `tf_py_logged_benchmark` target should be available in TensorFlow repository.
+pass in `--benchmarks=all` to the wrapped `py_test` target and provide a way to store output for our TensorFlow continuous build. The target `tf_py_logged_benchmark` should be available in the TensorFlow repository.
 
 ```build
 load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
index e5e9e8e..d1625d3 100644 (file)
@@ -8,7 +8,7 @@ Welcome to the Swift for TensorFlow development community!
 
 Swift for TensorFlow is a new way to develop machine learning models. It
 gives you the power of
-[TensorFlow](programmers_guide/eager) directly
+[TensorFlow](https://www.tensorflow.org) directly
 integrated into the [Swift programming language](https://swift.org/about).
 With Swift, you can write the following imperative code, and Swift
 automatically turns it into **a single TensorFlow Graph** and runs it
index ef3b030..9ef9674 100644 (file)
@@ -1,6 +1,6 @@
 # How to run TensorFlow on S3
 
-Tensorflow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitious, and can help in situations where data must accessed by multiple actors, such as in distributed training.
+TensorFlow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitous, and can help in situations where data must be accessed by multiple actors, such as in distributed training.
 
 This document guides you through the required setup, and provides examples on usage.
 
index c379549..1b028be 100644 (file)
@@ -863,48 +863,53 @@ REGISTER_OP("ZeroOut")
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
 registration (`REGISTER_KERNEL_BUILDER` call) per overload.
-<pre class="prettyprint"><code class="lang-cpp">
-<b>template &lt;typename T&gt;</b>
+```c++
+template <typename T>
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {}
+  
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat<b>&lt;T&gt;</b>();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<T>();
+    
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat<b>&lt;T&gt;</b>();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<T>();
+    
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+    
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;int32&gt;("T"),
-    <b>ZeroOutOp&lt;int32&gt;</b>);
-REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOp<int32>);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
-    <b>ZeroOutOp&lt;float&gt;</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
+    ZeroOutOp<float>);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;double&gt;("T"),
-    ZeroOutOp&lt;double&gt;);
-</b></code></pre>
+    .Device(DEVICE_CPU)
+    .TypeConstraint<double>("T"),
+    ZeroOutOp<double>);
+```
 
 If you have more than a couple overloads, you can put the registration in a
 macro.
index c0fc714..c8f522a 100644 (file)
@@ -4,8 +4,8 @@ We designed TensorFlow for large-scale distributed training and inference, but
 it is also flexible enough to support experimentation with new machine
 learning models and system-level optimizations.
 
-This document describes the system architecture that makes possible this
-combination of scale and flexibility. It assumes that you have basic familiarity
+This document describes the system architecture that makes this
+combination of scale and flexibility possible. It assumes that you have basic familiarity
 with TensorFlow programming concepts such as the computation graph, operations,
 and sessions. See @{$programmers_guide/low_level_intro$this document}
 for an introduction to these topics. Some familiarity
@@ -15,8 +15,8 @@ will also be helpful.
 This document is for developers who want to extend TensorFlow in some way not
 supported by current APIs, hardware engineers who want to optimize for
 TensorFlow, implementers of machine learning systems working on scaling and
-distribution, or anyone who wants to look under Tensorflow's hood. After
-reading it you should understand TensorFlow architecture well enough to read
+distribution, or anyone who wants to look under TensorFlow's hood. By the end of this document
+you should understand the TensorFlow architecture well enough to read
 and modify the core TensorFlow code.
 
 ## Overview
@@ -35,7 +35,7 @@ This document focuses on the following layers:
 *  **Client**:
    *  Defines the computation as a dataflow graph.
    *  Initiates graph execution using a [**session**](
-      https://www.tensorflow.org/code/tensorflow/python/client/session.py)
+      https://www.tensorflow.org/code/tensorflow/python/client/session.py).
 *  **Distributed Master**
    *  Prunes a specific subgraph from the graph, as defined by the arguments
       to Session.run().
@@ -55,7 +55,7 @@ Figure 2 illustrates the interaction of these components. "/job:worker/task:0" a
 server": a task responsible for storing and updating the model's parameters.
 Other tasks send updates to these parameters as they work on optimizing the
 parameters. This particular division of labor between tasks is not required, but
-it is common for distributed training.
+is common for distributed training.
 
 ![TensorFlow Architecture Diagram](https://www.tensorflow.org/images/diag1.svg){: width="500"}
 
@@ -193,7 +193,7 @@ https://www.tensorflow.org/code/tensorflow/contrib/nccl/python/ops/nccl_ops.py))
 
 ## Kernel Implementations
 
-The runtime contains over 200 standard operations, including mathematical, array
+The runtime contains over 200 standard operations including mathematical, array
 manipulation, control flow, and state management operations. Each of these
 operations can have kernel implementations optimized for a variety of devices.
 Many of the operation kernels are implemented using Eigen::Tensor, which uses
index 8c165aa..1abd840 100644 (file)
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
index 26cbcc9..52a2a3f 100644 (file)
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
index 05b2878..1256fb9 100644 (file)
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0-rc1</version>
+  <version>1.8.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.8.0-rc1</version>
+                 <version>1.8.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.8.0-rc1</version>
+  <version>1.8.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.8.0-rc1</version>
+  <version>1.8.0</version>
 </dependency>
 ```
 
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -175,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
   3. Extract this .zip file.
 
 
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
index 9d9322d..0ed8160 100644 (file)
@@ -438,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -684,14 +684,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -703,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -722,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -741,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
index 0906b55..29a867a 100644 (file)
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -522,7 +522,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
 </pre>
 
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
 </pre>
index 8bbdf01..5ba522b 100644 (file)
@@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.8.0rc1 on Linux:
+for TensorFlow 1.8.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
index 1b0b9b4..241f01d 100644 (file)
@@ -212,7 +212,7 @@ handle the task then it will be difficult to train a computer to do better.
 
 After you’ve solved any fundamental issues with your use case, you need to
 create a labeled dataset to define what problem you’re trying to solve. This
-step is extremely important, moreso than picking which model to use. You want it
+step is extremely important, more so than picking which model to use. You want it
 to be as representative as possible of your actual use case, since the model
 will only be effective at the task you teach it. It’s also worth investing in
 tools to make labeling the data as efficient and accurate as possible. For
index 01881cc..5622034 100644 (file)
@@ -155,7 +155,7 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-<img src="/images/tflite-architecture.jpg"
+<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
      alt="TensorFlow Lite architecture diagram"
      style="max-width:600px;">
 
index 51c1a1e..b6291a9 100644 (file)
@@ -72,7 +72,7 @@ tensors in the execution of a step.
 
 If `t` is a @{tf.Tensor} object,
 @{tf.Tensor.eval} is shorthand for
-@{tf.Session.run} (where `sess` is the
+@{tf.Session.run} where `sess` is the
 current @{tf.get_default_session}. The
 two following snippets of code are equivalent:
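For reference, a minimal sketch of the equivalence described above (assuming TF 1.x graph mode; `t` and `sess` are illustrative names):

```python
import tensorflow as tf

t = tf.constant([1.0, 2.0])
sess = tf.Session()
with sess.as_default():
    print(t.eval())     # shorthand for...
    print(sess.run(t))  # ...this, while `sess` is the default session
sess.close()
```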
 
@@ -101,9 +101,8 @@ sessions, it may be more straightforward to make explicit calls to
 Sessions can own resources, such as
 @{tf.Variable},
 @{tf.QueueBase}, and
-@{tf.ReaderBase}; and these resources can use
-a significant amount of memory. These resources (and the associated memory) are
-released when the session is closed, by calling
+@{tf.ReaderBase}. These resources can sometimes use
+a significant amount of memory, and can be released when the session is closed by calling
 @{tf.Session.close}.
 
 The intermediate tensors that are created as part of a call to
@@ -137,7 +136,7 @@ TensorFlow also has a
 to help build support for more client languages.  We invite contributions of new
 language bindings.
 
-Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the opensource community build on top of the C API supported by the TensorFlow maintainers.
+Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the open source community build on top of the C API supported by the TensorFlow maintainers.
 
 #### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
 
@@ -210,8 +209,8 @@ a new tensor with a different dynamic shape.
 
 #### How do I build a graph that works with variable batch sizes?
 
-It is often useful to build a graph that works with variable batch sizes, for
-example so that the same code can be used for (mini-)batch training, and
+It is often useful to build a graph that works with variable batch sizes
+so that the same code can be used for (mini-)batch training and
 single-instance inference. The resulting graph can be
 @{tf.Graph.as_graph_def$saved as a protocol buffer}
 and
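A minimal sketch of the variable-batch-size pattern described above (TF 1.x graph mode; shapes are illustrative):

```python
import tensorflow as tf

# A `None` batch dimension lets the same graph handle any batch size,
# so it can serve both (mini-)batch training and single-instance inference.
x = tf.placeholder(tf.float32, shape=[None, 784])
logits = tf.layers.dense(x, 10)
```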
@@ -260,7 +259,7 @@ See the how-to documentation for
 There are three main options for dealing with data in a custom format.
 
 The easiest option is to write parsing code in Python that transforms the data
-into a numpy array. Then use @{tf.data.Dataset.from_tensor_slices} to
+into a numpy array. Then, use @{tf.data.Dataset.from_tensor_slices} to
 create an input pipeline from the in-memory data.
 
 If your data doesn't fit in memory, try doing the parsing in the Dataset
@@ -274,7 +273,7 @@ If your data is not easily parsable with the built-in TensorFlow operations,
 consider converting it, offline, to a format that is easily parsable, such
 as @{tf.python_io.TFRecordWriter$`TFRecord`} format.
 
-The more efficient method to customize the parsing behavior is to
+The most efficient method to customize the parsing behavior is to
 @{$adding_an_op$add a new op written in C++} that parses your
 data format. The @{$new_data_formats$guide to handling new data formats} has
 more information about the steps for doing this.
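A minimal sketch of the "easiest option" above, with hypothetical pre-parsed arrays standing in for the custom format:

```python
import numpy as np
import tensorflow as tf

# Step 1: parse the custom format in Python (fake parsed data here).
features = np.random.rand(100, 4).astype(np.float32)
labels = np.random.randint(0, 2, size=100).astype(np.int64)

# Step 2: build an input pipeline from the in-memory data.
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(32)
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()
```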
index 58a80d5..1248c3c 100644 (file)
@@ -265,7 +265,7 @@ example:
 ```python
 constant = tf.constant([1, 2, 3])
 tensor = constant * constant
-print tensor.eval()
+print(tensor.eval())
 ```
 
 The `eval` method only works when a default `tf.Session` is active (see
@@ -306,8 +306,8 @@ Note that you rarely want to use the following pattern when printing a
 
 ``` python
 t = <<some tensorflow operation>>
-print t  # This will print the symbolic tensor when the graph is being built.
-         # This tensor does not have a value in this context.
+print(t)  # This will print the symbolic tensor when the graph is being built.
+          # This tensor does not have a value in this context.
 ```
 
 This code prints the `tf.Tensor` object (which represents deferred computation)
index e8cf771..cd8c4b5 100644 (file)
@@ -237,7 +237,7 @@ TensorFlow supports two ways of sharing variables:
 While code which explicitly passes variables around is very clear, it is
 sometimes convenient to write TensorFlow functions that implicitly use
 variables in their implementations. Most of the functional layers from
-`tf.layer` use this approach, as well as all `tf.metrics`, and a few other
+`tf.layers` use this approach, as well as all `tf.metrics`, and a few other
 library utilities.
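A minimal sketch of the implicit-variable pattern described above (TF 1.x; names are illustrative):

```python
import tensorflow as tf

def classifier(x):
    # tf.layers.dense creates its weight variables implicitly on first call.
    return tf.layers.dense(x, 10, name="logits")

x = tf.placeholder(tf.float32, [None, 4])
with tf.variable_scope("model"):
    y_train = classifier(x)
with tf.variable_scope("model", reuse=True):
    y_eval = classifier(x)  # reuses the implicitly created weights
```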
 
 Variable scopes allow you to control variable reuse when calling functions which
index ead5a63..0f17899 100644 (file)
@@ -209,7 +209,6 @@ for two-dimensional image data expect input tensors to have a shape of
 *   _`channels`_. Number of color channels in the example images. For color
     images, the number of channels is 3 (red, green, blue). For monochrome
     images, there is just 1 channel (black).
-*   _`image_height`_. Height of the example images.
 *   _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
       `channels_last` corresponds to inputs with shape
       `(batch, ..., channels)` while `channels_first` corresponds to
index 9e21aee..a40a9ea 100644 (file)
@@ -73,7 +73,7 @@ def cnn_model(features, labels, mode):
         kernel_size=FILTER_SHAPE2,
         padding='VALID')
     # Max across each filter to get useful features for classification.
-    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])
+    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])
 
   # Apply regular WX + B and classification.
   logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
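The `squeeze_dims` → `axis` change above is a pure argument rename; a quick sketch of the identical semantics:

```python
import tensorflow as tf

t = tf.zeros([2, 1, 3])
print(tf.squeeze(t, axis=[1]).shape)  # (2, 3); same as squeeze_dims=[1]
```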
index a503b3b..36db3dd 100644 (file)
@@ -21321,7 +21321,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 //    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 //
 // The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 // Each element must be in the range `[0, 255]` (It represents the value of a
 // pixel in the output image).  Non-finite values in the input tensor are
 // replaced by this tensor in the output image.  The default value is the color
index 9af2e9b..32e0802 100644 (file)
@@ -103,7 +103,7 @@ def is_sequence(seq):
   NOTE(mrry): This differs from `tensorflow.python.util.nest.is_sequence()`,
   which *does* treat a Python list as a sequence. For ergonomic
   reasons, `tf.data` users would prefer to treat lists as
-  implict `tf.Tensor` objects, and dicts as (nested) sequences.
+  implicit `tf.Tensor` objects, and dicts as (nested) sequences.
 
   Args:
     seq: an input sequence.
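A quick sketch of the list-vs-dict behavior this docstring describes (TF 1.x `tf.data`; printed reprs are approximate):

```python
import tensorflow as tf

# A Python list is treated as an implicit tensor: one rank-1 component.
ds_list = tf.data.Dataset.from_tensors([1, 2, 3])
print(ds_list.output_shapes)  # (3,)

# A dict is treated as a (nested) structure of separate components.
ds_dict = tf.data.Dataset.from_tensors({"a": 1, "b": 2})
print(ds_dict.output_shapes)  # {'a': TensorShape([]), 'b': TensorShape([])}
```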
index f66cefb..7b87972 100644 (file)
@@ -190,8 +190,6 @@ class ScrollBar(object):
     return layout
 
   def get_click_command(self, mouse_y):
-    # TODO(cais): Support continuous scrolling when the mouse button is held
-    # down.
     if self._output_num_rows <= 1:
       return None
     elif mouse_y == self._min_y:
@@ -271,6 +269,10 @@ class CursesUI(base_ui.BaseUI):
 
   _UI_WAIT_MESSAGE = "Processing..."
 
+  # The delay (in ms) between each update of the scroll bar when the mouse
+  # button is held down on the scroll bar. Controls how fast the screen scrolls.
+  _MOUSE_SCROLL_DELAY_MS = 100
+
   _single_instance_lock = threading.Lock()
 
   def __init__(self, on_ui_exit=None, config=None):
@@ -855,7 +857,30 @@ class CursesUI(base_ui.BaseUI):
       except curses.error:
         mouse_event_type = None
 
-      if mouse_event_type == curses.BUTTON1_RELEASED:
+      if mouse_event_type == curses.BUTTON1_PRESSED:
+        # Logic for held mouse-triggered scrolling.
+        if mouse_x >= self._max_x - 2:
+          # Disable blocking on checking for user input.
+          self._command_window.nodelay(True)
+
+          # Loop while mouse button is pressed.
+          while mouse_event_type == curses.BUTTON1_PRESSED:
+            # Sleep for a bit.
+            curses.napms(self._MOUSE_SCROLL_DELAY_MS)
+            scroll_command = self._scroll_bar.get_click_command(mouse_y)
+            if scroll_command in (_SCROLL_UP_A_LINE, _SCROLL_DOWN_A_LINE):
+              self._scroll_output(scroll_command)
+
+            # Check to see if different mouse event is in queue.
+            self._command_window.getch()
+            try:
+              _, _, _, _, mouse_event_type = self._screen_getmouse()
+            except curses.error:
+              pass
+
+          self._command_window.nodelay(False)
+          return x
+      elif mouse_event_type == curses.BUTTON1_RELEASED:
         # Logic for mouse-triggered scrolling.
         if mouse_x >= self._max_x - 2:
           scroll_command = self._scroll_bar.get_click_command(mouse_y)
@@ -1677,4 +1702,7 @@ class CursesUI(base_ui.BaseUI):
       self._redraw_output()
 
   def _screen_set_mousemask(self):
-    curses.mousemask(self._mouse_enabled)
+    if self._mouse_enabled:
+      curses.mousemask(curses.BUTTON1_RELEASED | curses.BUTTON1_PRESSED)
+    else:
+      curses.mousemask(0)
index 9b4b866..347a760 100644 (file)
@@ -1163,7 +1163,7 @@ class Estimator(object):
             model_fn_lib.ModeKeys.TRAIN,
             self.config)
 
-        # TODO(anjalisridhar): Figure out how to resolve the folowing scaffold
+        # TODO(anjalisridhar): Figure out how to resolve the following scaffold
         # parameters: init_feed_dict, init_fn.
         scaffold_list = self._distribution.unwrap(
             grouped_estimator_spec.scaffold)
index 8e5d814..8e2ec83 100644 (file)
@@ -52,7 +52,7 @@ def _fill_array(arr, seq, fillvalue=0):
   If length of seq is less than arr padded length, fillvalue used.
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
-    seq: Non-padded list of data sampels of shape
+    seq: Non-padded list of data samples of shape
       [batch_size, ..., padded_dim(None)]
     fillvalue: Default fillvalue to use.
   """
index 9961fa7..7bcf3d8 100644 (file)
@@ -74,7 +74,7 @@ def _any_variable_initalized():
   """Check if any variable has been initialized in the Keras model.
 
   Returns:
-    boolean, True if at least one variable has been initalized, else False.
+    boolean, True if at least one variable has been initialized, else False.
   """
   variables = variables_module.global_variables()
   for v in variables:
index 08fff3b..522662c 100644 (file)
@@ -597,7 +597,7 @@ class _TrainingExecutor(object):
     # max_steps, the evaluator will send the final export signal. There is a
     # small chance that the Estimator.train stopping logic sees a different
     # global_step value (due to global step race condition and the fact the
-    # saver sees a larger value for checkpoing saving), which does not end
+    # saver sees a larger value for checkpoint saving), which does not end
     # the training. When the training ends, a new checkpoint is generated, which
     # triggers the listener again. So, it could be the case the final export is
     # triggered twice.
index ede6e0d..ffcb999 100644 (file)
@@ -48,7 +48,7 @@ should choose depends on (1) the feature type and (2) the model type.
 
       embedded_dept_column = embedding_column(
           categorical_column_with_vocabulary_list(
-              "department", ["math", "philosphy", ...]), dimension=10)
+              "department", ["math", "philosophy", ...]), dimension=10)
 
   * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
 
@@ -280,7 +280,7 @@ def input_layer(features,
 # TODO(akshayka): InputLayer should be a subclass of Layer, and it
 # should implement the logic in input_layer using Layer's build-and-call
 # paradigm; input_layer should create an instance of InputLayer and
-# return the result of inovking its apply method, just as functional layers do.
+# return the result of invoking its apply method, just as functional layers do.
 class InputLayer(object):
   """An object-oriented version of `input_layer` that reuses variables."""
 
@@ -834,7 +834,7 @@ def shared_embedding_columns(
     tensor_name_in_ckpt=None, max_norm=None, trainable=True):
   """List of dense columns that convert from sparse, categorical input.
 
-  This is similar to `embedding_column`, except that that it produces a list of
+  This is similar to `embedding_column`, except that it produces a list of
   embedding columns that share the same embedding weights.
 
   Use this when your inputs are sparse and of the same type (e.g. watched and
index 1992831..17d112a 100644 (file)
@@ -7,6 +7,18 @@ cimport numpy as np
 from tensorflow.python.util import compat
 
 
+def AppendFloat16ArrayToTensorProto(
+    # For numpy, npy_half is a typedef for npy_uint16,
+    # see: https://github.com/numpy/numpy/blob/master/doc/source/reference/c-api.coremath.rst#half-precision-functions
+    # Because np.float16_t doesn't exist in cython, we use uint16_t here.
+    # TODO: Use np.float16_t when cython supports it.
+    tensor_proto, np.ndarray[np.uint16_t, ndim=1] nparray):
+  cdef long i, n
+  n = nparray.size
+  for i in range(n):
+    tensor_proto.half_val.append(nparray[i])
+
+
 def AppendFloat32ArrayToTensorProto(
     tensor_proto, np.ndarray[np.float32_t, ndim=1] nparray):
   cdef long i, n
index 80140e4..9fc8136 100644 (file)
@@ -2582,7 +2582,7 @@ def set_shape_and_handle_data_for_outputs(op):
 
   When _USE_C_API = True, this is lazily called when a tensor's shape is first
   requested. Usually this should work automatically, but some edge cases may
-  require manaully calling this first to make sure Tensor._shape_val and
+  require manually calling this first to make sure Tensor._shape_val and
   Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
   Tensor).
   """
@@ -5426,36 +5426,30 @@ def enable_eager_execution(config=None, device_policy=None,
       in which operations are executed. Note that @{tf.ConfigProto} is also
       used to configure graph execution (via @{tf.Session}) and many options
       within `tf.ConfigProto` are not implemented (or are irrelevant) when
-     eager execution is enabled.
+      eager execution is enabled.
     device_policy: (Optional.) Policy controlling how operations requiring
-     inputs on a specific device (e.g., a GPU 0) handle inputs on a different
-     device  (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
-     picked automatically. The value picked may change between TensorFlow
-     releases.
-     Valid values:
-
+      inputs on a specific device (e.g., a GPU 0) handle inputs on a different
+      device (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
+      picked automatically. The value picked may change between TensorFlow
+      releases.
+      Valid values:
       - tf.contrib.eager.DEVICE_PLACEMENT_EXPLICIT: raises an error if the
         placement is not correct.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_WARN: copies the tensors which are not
         on the right device but logs a warning.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_SILENT: silently copies the tensors.
         Note that this may hide performance problems as there is no notification
         provided when operations are blocked on the tensor being copied between
         devices.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies
         int32 tensors, raising errors on the other ones.
     execution_mode: (Optional.) Policy controlling how operations dispatched are
       actually executed. When set to None, an appropriate value will be picked
       automatically. The value picked may change between TensorFlow releases.
       Valid values:
-
-        - tf.contrib.eager.SYNC: executes each operation synchronously.
-
-        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
-          operations may return "non-ready" handles.
+      - tf.contrib.eager.SYNC: executes each operation synchronously.
+      - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+        operations may return "non-ready" handles.
 
   Raises:
     ValueError: If eager execution is enabled after creating/executing a
index 8cf2420..ca63efb 100644 (file)
@@ -50,6 +50,13 @@ def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
       [ExtractBitsFromFloat16(x) for x in proto_values])
 
 
+def _MediumAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  # TODO: Remove the conversion if cython supports np.float16_t
+  fast_tensor_util.AppendFloat16ArrayToTensorProto(
+      tensor_proto,
+      np.asarray(proto_values, dtype=np.float16).view(np.uint16))
+
+
 def ExtractBitsFromBFloat16(x):
   return np.asscalar(
       np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
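Both helpers above rely on reinterpreting 16-bit float bit patterns as `uint16`; a quick plain-numpy sketch of that view trick:

```python
import numpy as np

arr = np.array([1.0, -2.0], dtype=np.float16)
print(arr.view(np.uint16))  # [15360 49152], i.e. [0x3C00 0xC000]
```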
@@ -64,11 +71,8 @@ if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
       dtypes.bfloat16.as_numpy_dtype:
           SlowAppendBFloat16ArrayToTensorProto,
-      # TODO(sesse): We should have a
-      # fast_tensor_util.AppendFloat16ArrayToTensorProto,
-      # but it seems np.float16_t doesn't exist?
       np.float16:
-          SlowAppendFloat16ArrayToTensorProto,
+          _MediumAppendFloat16ArrayToTensorProto,
       np.float32:
           fast_tensor_util.AppendFloat32ArrayToTensorProto,
       np.float64:
index 97cd22e..5b01df4 100644 (file)
@@ -682,7 +682,7 @@ def run_in_graph_and_eager_modes(__unused__=None,
 
 
   Args:
-    __unused__: Prevents sliently skipping tests.
+    __unused__: Prevents silently skipping tests.
     config: An optional config_pb2.ConfigProto to use to configure the
       session when executing graphs.
     use_gpu: If True, attempt to run as many operations as possible on GPU.
index 7b5eecc..69337b6 100644 (file)
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
 from tensorflow.python.keras.utils.generic_utils import custom_object_scope
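A minimal usage sketch for the newly exported `OrderedEnqueuer` (assuming the `data_utils` API; the toy `Sequence` is illustrative):

```python
from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer, Sequence

class ToySequence(Sequence):
    def __len__(self):
        return 4  # number of batches per epoch
    def __getitem__(self, idx):
        return [idx], [idx * 2]  # (batch_inputs, batch_targets)

enqueuer = OrderedEnqueuer(ToySequence(), use_multiprocessing=False)
enqueuer.start(workers=1, max_queue_size=2)
batches = enqueuer.get()  # generator yielding batches in order
print(next(batches))      # ([0], [0])
enqueuer.stop()
```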
index 72cc357..3dfad9c 100644 (file)
@@ -742,6 +742,18 @@ tf_py_test(
 )
 
 tf_py_test(
+    name = "regex_full_match_op_test",
+    size = "small",
+    srcs = ["regex_full_match_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
     name = "save_restore_ops_test",
     size = "small",
     srcs = ["save_restore_ops_test.py"],
index e2e6205..fcba456 100644 (file)
@@ -31,9 +31,7 @@ class Conv1DTest(test.TestCase):
 
   def testBasic(self):
     """Test that argument passing to conv1d is handled properly."""
-    # TODO(yongtang): dtypes.float64 can only be enabled once conv2d support
-    # dtypes.float64, as conv1d implicitly calls conv2d after expand_dims.
-    for dtype in [dtypes.float16, dtypes.float32]:
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant([1, 2, 3, 4], dtype=dtype)
       x = array_ops.expand_dims(x, 0)  # Add batch dimension
       x = array_ops.expand_dims(x, 2)  # And depth dimension
index 8973a45..289ae29 100644 (file)
@@ -131,6 +131,23 @@ class Conv3DTransposeTest(test.TestCase):
     nn_ops.conv3d_transpose(
         x_value, f_value, y_shape, strides, data_format='NCDHW')
 
+  def testConv3DTransposeOutputShapeType(self):
+    # Test case for GitHub issue 18887
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session():
+        x_shape = [2, 5, 6, 4, 3]
+        y_shape = [2, 5, 6, 4, 2]
+        f_shape = [3, 3, 3, 2, 3]
+        strides = [1, 1, 1, 1, 1]
+        x_value = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f_value = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+        output = nn_ops.conv3d_transpose(
+            x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
+            strides=strides, padding="SAME")
+        output.eval()
+
   def testConv3DTransposeValid(self):
     with self.test_session():
       strides = [1, 2, 2, 2, 1]
index 8e5556d..63d19c1 100644 (file)
@@ -735,7 +735,7 @@ class FillTriangularTest(test.TestCase):
       raise ValueError("Invalid shape.")
     n = np.int32(n)
     # We can't do: `x[..., -(n**2-m):]` because this doesn't correctly handle
-    # `m == n == 1`. Hence, we do absoulte indexing.
+    # `m == n == 1`. Hence, we do absolute indexing.
     x_tail = x[..., (m - (n * n - m)):]
     y = np.concatenate(
         [x, x_tail[..., ::-1]] if upper else [x_tail, x[..., ::-1]],
index f314267..dc3ea38 100644 (file)
@@ -93,7 +93,7 @@ class RollTest(test_util.TensorFlowTestCase):
   def testNegativeAxis(self):
     self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
     self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
-    # Make sure negative axis shoudl be 0 <= axis + dims < dims
+    # Make sure a negative axis satisfies 0 <= axis + dims < dims.
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "is out of range"):
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
new file mode 100644 (file)
index 0000000..5daae1b
--- /dev/null
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RegexFullMatch op from string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class RegexFullMatchOpTest(test.TestCase):
+
+  def testRegexFullMatch(self):
+    values = ["abaaba", "abcdabcde"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      matched = string_ops.regex_full_match(input_vector, "a.*a").eval()
+      self.assertAllEqual([True, False], matched)
+
+  def testEmptyMatch(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      matched = string_ops.regex_full_match(input_vector, "").eval()
+      self.assertAllEqual([False, False], matched)
+
+  def testInvalidPattern(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      invalid_pattern = "A["
+      matched = string_ops.regex_full_match(input_vector, invalid_pattern)
+      with self.assertRaisesOpError("Invalid pattern"):
+        matched.eval()
+
+
+if __name__ == "__main__":
+  test.main()
index 3bca5fa..794be09 100644 (file)
@@ -91,16 +91,18 @@ class SegmentReductionOpTest(SegmentReductionHelper):
     ]
 
     # Each item is np_op1, np_op2, tf_op
-    ops_list = [(np.add, None, math_ops.segment_sum), (self._mean_cum_op,
-                                                       self._mean_reduce_op,
-                                                       math_ops.segment_mean),
+    ops_list = [(np.add, None, math_ops.segment_sum),
+                (self._mean_cum_op, self._mean_reduce_op,
+                 math_ops.segment_mean),
                 (np.ndarray.__mul__, None, math_ops.segment_prod),
                 (np.minimum, None, math_ops.segment_min),
                 (np.maximum, None, math_ops.segment_max)]
 
     # A subset of ops has been enabled for complex numbers
     complex_ops_list = [(np.add, None, math_ops.segment_sum),
-                        (np.ndarray.__mul__, None, math_ops.segment_prod)]
+                        (np.ndarray.__mul__, None, math_ops.segment_prod),
+                        (self._mean_cum_op, self._mean_reduce_op,
+                         math_ops.segment_mean)]
 
     n = 10
     shape = [n, 2]
index 340c34f..eda036e 100644 (file)
@@ -191,6 +191,16 @@ class Layer(base_layer.Layer):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
+
+    def _should_add_regularizer(variable, existing_variable_set):
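+      # Only add a regularizer for newly created variables; for a
+      # PartitionedVariable, skip if any of its partitions already existed.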
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        for var in variable:
+          if var in existing_variable_set:
+            return False
+        return True
+      else:
+        return variable not in existing_variable_set
+
     init_graph = None
     if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
@@ -233,7 +243,8 @@ class Layer(base_layer.Layer):
             getter=vs.get_variable)
 
         if regularizer:
-          if context.executing_eagerly() or variable not in existing_variables:
+          if context.executing_eagerly() or _should_add_regularizer(
+              variable, existing_variables):
             self._handle_weight_regularization(name, variable, regularizer)
 
         if init_graph is not None:
@@ -353,4 +364,3 @@ def _add_elements_to_collection(elements, collection_list):
     for element in elements:
       if element not in collection_set:
         collection.append(element)
-
index f08b552..ab49e37 100644 (file)
@@ -30,6 +30,7 @@ from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -95,6 +96,21 @@ class BaseLayerTest(test.TestCase):
           regularizer=regularizer)
       self.assertEqual(len(layer.losses), 1)
 
+  def testReusePartitionedVariablesAndRegularizers(self):
+    regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
+    partitioner = partitioned_variables.fixed_size_partitioner(3)
+    for reuse in [False, True]:
+      with variable_scope.variable_scope(variable_scope.get_variable_scope(),
+                                         partitioner=partitioner,
+                                         reuse=reuse):
+        layer = base_layers.Layer(name='my_layer')
+        variable = layer.add_variable(
+            'reg_part_var', [4, 4],
+            initializer=init_ops.zeros_initializer(),
+            regularizer=regularizer)
+    self.assertEqual(
+        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 3)
+
   def testNoEagerActivityRegularizer(self):
     with context.eager_mode():
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
index ab5997e..3a31ef7 100644 (file)
@@ -1285,7 +1285,7 @@ def reduce_sum(input_tensor,
     The reduced tensor, of the same dtype as the input_tensor.
 
   @compatibility(numpy)
-  Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+  Equivalent to np.sum apart from the fact that numpy upcasts uint8 and int32 to
   int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
index 1271ee5..ae79c01 100644 (file)
@@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
+# Expose regex_full_match in strings namespace
+tf_export("strings.regex_full_match")(regex_full_match)
 
 @tf_export("string_split")
 def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=invalid-name
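A minimal usage sketch of the alias exported above, mirroring the new kernel test added in this change (TF 1.x graph mode):

```python
import tensorflow as tf

x = tf.constant(["abaaba", "abcdabcde"])
matched = tf.strings.regex_full_match(x, "a.*a")
with tf.Session() as sess:
    print(sess.run(matched))  # [ True False]
```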
index 75580fc..9e49188 100644 (file)
@@ -232,7 +232,12 @@ class PrintModelAnalysisTest(test.TestCase):
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertLess(145660, tfprof_node.total_float_ops)
+        # The graph is modified when MKL is enabled, so total_float_ops will
+        # be different.
+        if test_util.IsMklEnabled():
+          self.assertLess(101600, tfprof_node.total_float_ops)
+        else:
+          self.assertLess(145660, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
index 4b39826..24a13c0 100644 (file)
@@ -130,7 +130,8 @@ class SavedModelBuilder(object):
       if not file_io.file_exists(asset_destination_filepath):
         file_io.copy(asset_source_filepath, asset_destination_filepath)
 
-    tf_logging.info("Assets written to: %s", assets_destination_dir)
+    tf_logging.info("Assets written to: %s",
+                    compat.as_text(assets_destination_dir))
 
   def _maybe_add_legacy_init_op(self, legacy_init_op=None):
     """Add legacy init op to the SavedModel.
@@ -461,7 +462,7 @@ class SavedModelBuilder(object):
           compat.as_bytes(self._export_dir),
           compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
       file_io.write_string_to_file(path, self._saved_model.SerializeToString())
-    tf_logging.info("SavedModel written to: %s", path)
+    tf_logging.info("SavedModel written to: %s", compat.as_text(path))
 
     return path
 
index 6d05a2e..ab8b37b 100644 (file)
@@ -750,7 +750,7 @@ class DistributionStrategy(object):
     `fn` may call `tf.get_tower_context()` to access methods such as
     `tower_id()` and `merge_call()`.
 
-    `merge_call()` is used to communicate betwen the towers and
+    `merge_call()` is used to communicate between the towers and
     re-enter the cross-tower context. All towers pause their execution
     having encountered a `merge_call()` call. After that the
     `merge_fn`-function is executed. Its results are then unwrapped and
index fc89f88..4d46413 100644 (file)
@@ -1743,7 +1743,7 @@ class Saver(object):
       return
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
-    logging.info("Restoring parameters from %s", save_path)
+    logging.info("Restoring parameters from %s", compat.as_text(save_path))
     try:
       if context.executing_eagerly():
         self._build_eager(save_path, build_save=False, build_restore=True)
index 5faf644..fbd6561 100644 (file)
@@ -232,7 +232,7 @@ def getcallargs(func, *positional, **named):
   it. If no attached decorators modify argspec, the final unwrapped target's
   argspec will be used.
   """
-  argspec = getargspec(func)
+  argspec = getfullargspec(func)
   call_args = named.copy()
   this = getattr(func, 'im_self', None) or getattr(func, '__self__', None)
   if ismethod(func) and this:
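A quick sketch of why the switch to `getfullargspec` matters (plain-`inspect` illustration; `getargspec` cannot describe Python 3 keyword-only arguments):

```python
import inspect

def f(a, *, b=1):  # keyword-only argument `b`
    return a

# inspect.getargspec(f) raises ValueError here;
# getfullargspec handles the full Python 3 signature.
print(inspect.getfullargspec(f).kwonlyargs)  # ['b']
```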
index 2b33d10..0f465ed 100644 (file)
@@ -320,7 +320,7 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
 
 // Returns true iff there were no "internal" errors. In other words,
 // errors that has nothing to do with structure checking.
-// If an "internal" error occured, the appropriate Python error will be
+// If an "internal" error occurred, the appropriate Python error will be
 // set and the caller can propagate it directly to the user.
 //
 // Both `error_msg` and `is_type_error` must be non-null. `error_msg` must
index 9851c11..70efc10 100644 (file)
@@ -97,7 +97,7 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 // used instead. The same convention is followed in `pack_sequence_as`. This
 // correctly repacks dicts and `OrderedDict`s after they have been flattened,
 // and also allows flattening an `OrderedDict` and then repacking it back using
-// a correponding plain dict, or vice-versa.
+// a corresponding plain dict, or vice-versa.
 // Dictionaries with non-sortable keys cannot be flattened.
 //
 // Args:
index be0b0bf..ea87744 100644 (file)
@@ -1086,6 +1086,13 @@ class BlasSupport {
   virtual bool DoBlasGemmBatched(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
       uint64 n, uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator) = 0;
+  virtual bool DoBlasGemmBatched(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, float alpha,
       const port::ArraySlice<DeviceMemory<float> *> &a, int lda,
       const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,
       const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
@@ -1948,6 +1955,13 @@ class BlasSupport {
   bool DoBlasGemmBatched(                                                      \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
       uint64 m, uint64 n, uint64 k, float alpha,                               \
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,         \
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,         \
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,      \
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
+  bool DoBlasGemmBatched(                                                      \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, float alpha,                               \
       const port::ArraySlice<DeviceMemory<float> *> &a, int lda,               \
       const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,   \
       const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,               \
index 3e9a23c..08fe153 100644 (file)
@@ -286,6 +286,10 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
 #endif
 
+#if CUDA_VERSION >= 9010
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmBatchedEx)
+#endif
+
 }  // namespace wrap
 
 static string ToString(cublasStatus_t status) {
@@ -2330,13 +2334,23 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
       computation_type, algorithm, output_profile_result);
 }
 
-template <typename T, typename FuncT>
+template <typename T>
+struct HalfAsFloat {
+  typedef T type;
+};
+
+template <>
+struct HalfAsFloat<Eigen::half> {
+  typedef float type;
+};
+
+template <typename T, typename Scalar, typename FuncT>
 port::Status CUDABlas::DoBlasGemmBatchedInternal(
     FuncT cublas_func, Stream *stream, blas::Transpose transa,
-    blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+    blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
     const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
     const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
-    T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+    Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
   for (int i = 0; i < batch_count; ++i) {
@@ -2345,7 +2359,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
   }
 
-  typedef typename CUDAComplexT<T>::type CUDA_T;
+  typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
 
   const size_t size = batch_count * sizeof(CUDA_T *);
 
@@ -2397,18 +2411,84 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
                         "CUDABlas::DoBlasGemmBatched");
   }
 
-  bool ok = DoBlasInternal(
-      cublas_func, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
-      const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+  cudaDataType_t data_type = CUDADataType<T>::type;
 
-  if (ok) {
+#if CUDA_VERSION >= 9010
+  int cc_major, cc_minor;
+  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
+          &cc_major, &cc_minor) &&
+      cc_major >= 5) {
+    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
+    cublasGemmAlgo_t algo =
+        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
+    cudaDataType_t compute_type =
+        (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
+    const void **a_void_ptrs = reinterpret_cast<const void **>(
+        const_cast<const CUDA_T **>(CUDAMemory(a)));
+    const void **b_void_ptrs = reinterpret_cast<const void **>(
+        const_cast<const CUDA_T **>(CUDAMemory(b)));
+    void **c_void_ptrs =
+        reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+    bool ok;
+    ok = DoBlasInternalImpl(
+        wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
+        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
+        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
+        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
+        batch_count, compute_type, algo);
+    if (ok) {
+      return port::Status::OK();
+    }
+    return port::Status(port::error::INTERNAL,
+                        "failed BLAS call, see log for details");
+  }
+#endif
+  // either CUDA_VERSION < 9.1 or SM < 5.0
+  if (data_type != CUDA_R_16F) {
+    bool ok = DoBlasInternal(
+        cublas_func, stream, true /* = pointer_mode_host */,
+        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
+        CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
+        const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
+        const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+    if (ok) {
+      return port::Status::OK();
+    }
+    return port::Status(port::error::INTERNAL,
+                        "failed BLAS call, see log for details");
+  } else {
+    // Fall back to a loop for fp16
+    for (int b = 0; b < batch_count; ++b) {
+      const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
+      const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
+      DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
+      bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
+                           lda, b_matrix, ldb, beta, c_matrix, ldc);
+      if (!ok) {
+        return port::Status(port::error::INTERNAL,
+                            "failed BLAS call, see log for details");
+      }
+    }
     return port::Status::OK();
   }
-  return port::Status(port::error::INTERNAL,
-                      "failed BLAS call, see log for details");
+}
+
+bool CUDABlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
+    float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  // Note: The func passed here (cublasSgemmBatched) is not actually called,
+  // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
+      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
 bool CUDABlas::DoBlasGemmBatched(
index 12dc5e4..42b3fde 100644 (file)
@@ -107,12 +107,12 @@ class CUDABlas : public blas::BlasSupport {
 
   // A helper function to implement DoBlasGemmBatched interfaces for generic
   // types.
-  template <typename T, typename FuncT>
+  template <typename T, typename Scalar, typename FuncT>
   port::Status DoBlasGemmBatchedInternal(
       FuncT cublas_func, Stream *stream, blas::Transpose transa,
-      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
       const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
-      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, Scalar beta,
       const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
       int batch_count, ScratchAllocator *scratch_allocator);
 
index 2bc9b6b..4a98cfe 100644 (file)
@@ -4482,6 +4482,40 @@ Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
 
 Stream &Stream::ThenBlasGemmBatched(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count) {
+  return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
+                                        b, ldb, beta, c, ldc, batch_count,
+                                        /*scratch_allocator=*/nullptr);
+}
+
+Stream &Stream::ThenBlasGemmBatchedWithScratch(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
+
+  ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
+               const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+               const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+               float, const port::ArraySlice<DeviceMemory<Eigen::half> *> &,
+               int, int, ScratchAllocator *>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
+              k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
+              scratch_allocator);
+}
+
+Stream &Stream::ThenBlasGemmBatched(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
     uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
     int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
     float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
index c6e37da..3da1b85 100644 (file)
@@ -1471,6 +1471,13 @@ class Stream {
       blas::ProfileResult *output_profile_result);
 
   // See BlasSupport::DoBlasGemmBatched.
+  Stream &ThenBlasGemmBatched(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count);
   Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb,
                               uint64 m, uint64 n, uint64 k, float alpha,
                               const port::ArraySlice<DeviceMemory<float> *> &a,
@@ -1505,6 +1512,13 @@ class Stream {
       int batch_count);
   Stream &ThenBlasGemmBatchedWithScratch(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator);
+  Stream &ThenBlasGemmBatchedWithScratch(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
       uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
       int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
       float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
index 880ec05..d71fd71 100644 (file)
@@ -1300,7 +1300,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
     native.cc_library(
         name=basename + "_gpu",
         srcs=gpu_srcs,
-        copts=_cuda_copts(),
+        copts=_cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
         deps=deps + if_cuda(cuda_deps))
     cuda_deps.extend([":" + basename + "_gpu"])
 
@@ -1483,7 +1483,7 @@ def tf_py_wrap_cc(name,
 # This macro is for running python tests against system installed pip package
 # on Windows.
 #
-# py_test is built as an exectuable python zip file on Windows, which contains all
+# py_test is built as an executable python zip file on Windows, which contains all
 # dependencies of the target. Because of the C++ extensions, it would be very
 # inefficient if the py_test zips all runfiles, plus we don't need them when running
 # tests against system installed pip package. So we'd like to get rid of the deps
index e58de5b..f46bb4b 100644 (file)
@@ -101,6 +101,7 @@ genrule(
         "api/profiler/__init__.py",
         "api/python_io/__init__.py",
         "api/resource_loader/__init__.py",
+        "api/strings/__init__.py",
         "api/saved_model/__init__.py",
         "api/saved_model/builder/__init__.py",
         "api/saved_model/constants/__init__.py",
index 74b1b39..dc2bd40 100644 (file)
@@ -501,6 +501,10 @@ tf_module {
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
+    name: "strings"
+    mtype: "<type \'module\'>"
+  }
+  member {
     name: "summary"
     mtype: "<type \'module\'>"
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
new file mode 100644 (file)
index 0000000..a3fbe95
--- /dev/null
@@ -0,0 +1,7 @@
+path: "tensorflow.strings"
+tf_module {
+  member_method {
+    name: "regex_full_match"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
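
For context, a minimal graph-mode sketch of the regex_full_match method recorded in this golden file (tensor values are illustrative):

    import tensorflow as tf

    # Returns True where the whole string matches the RE2 pattern,
    # per the argspec above: args=['input', 'pattern', 'name'].
    inputs = tf.constant(["TensorFlow", "tensorboard"])
    matches = tf.strings.regex_full_match(inputs, "Tensor.*")

    with tf.Session() as sess:
        print(sess.run(matches))  # [ True False]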
index 5aaf544..982161c 100755 (executable)
 set -e
 
 # We don't apt-get install so that we can install a newer version of pip.
-# Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
-if $(cat /etc/*-release | grep -q 14.04); then
-  easy_install -U pip==9.0.3
-  easy_install3 -U pip==9.0.3
-else
-  pip2 install --upgrade pip==9.0.3
-  pip3 install --upgrade pip==9.0.3
-fi
+# Only needed for Ubuntu 14.04 and 16.04; possibly not needed for Ubuntu 18.04 or Debian 8/9.
+easy_install -U pip==9.0.3
+easy_install3 -U pip==9.0.3
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
index b999639..406d134 100644 (file)
@@ -85,7 +85,7 @@ RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.g
 ENV CI_BUILD_PYTHON python
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
         # For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
         # For ivy-bridge or sandy-bridge
         # --copt=-march="ivybridge" \
index c65e0b7..a6cd44c 100644 (file)
@@ -35,10 +35,10 @@ ENV CI_BUILD_PYTHON=python \
     PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
     CC_OPT_FLAGS='-march=native' \
     TF_NEED_JEMALLOC=0 \
-    TF_NEED_GCP=0 \
+    TF_NEED_GCP=1 \
     TF_NEED_CUDA=0 \
     TF_NEED_HDFS=0 \
-    TF_NEED_S3=0 \
+    TF_NEED_S3=1 \
     TF_NEED_OPENCL=0 \
     TF_NEED_GDR=0 \
     TF_ENABLE_XLA=0 \
index 7e5e6ef..2fe47f3 100644 (file)
@@ -98,7 +98,7 @@ ENV TF_CUDNN_VERSION=7
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
     tensorflow/tools/ci_build/builds/configured GPU \
-    bazel build -c opt --config=cuda \
+    bazel build -c opt --copt=-mavx --config=cuda \
        --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
         tensorflow/tools/pip_package:build_pip_package && \
     rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
index 67badb4..9f6f553 100644 (file)
@@ -388,7 +388,7 @@ input is collapsed down into a simple constant.
 Args:
 
 *   clear_output_shapes: Clears tensor shape information saved as attributes.
-    Some older graphs containes out-of-date information and may cause import
+    Some older graphs contain out-of-date information and may cause import
     errors. Defaults to true.
 
 Prerequisites: None
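
As a hedged illustration of how clear_output_shapes is exercised (the file path and node names below are placeholders), the transform can be driven from the Python wrapper:

    import tensorflow as tf
    from tensorflow.tools.graph_transforms import TransformGraph

    # Load a frozen GraphDef; "frozen_graph.pb" is a placeholder path.
    with tf.gfile.GFile("frozen_graph.pb", "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # Fold constants and clear possibly out-of-date shape attributes.
    transformed = TransformGraph(
        graph_def, ["input"], ["output"],
        ["fold_constants(clear_output_shapes=true)"])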
index b66d5bd..1a83c6e 100755 (executable)
@@ -24,7 +24,7 @@ function real_path() {
 function cp_external() {
   local src_dir=$1
   local dest_dir=$2
-  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
+  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do
     cp -R "$f" "$dest_dir"
   done
   mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/"
index f7385e5..319878e 100644 (file)
@@ -31,7 +31,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.8.0-rc1'
+_VERSION = '1.8.0'
 
 _SHORT_DESCRIPTION = ('TensorFlow is an open source machine learning framework '
                       'for everyone.')
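
The '-' stripping mentioned in the comment above amounts to the following one-liner (values illustrative):

    # pip rejects the semver pre-release separator, so it is removed
    # before the version string is handed to setuptools.
    _VERSION = '1.8.0-rc1'
    pip_version = _VERSION.replace('-', '')
    print(pip_version)  # -> 1.8.0rc1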
@@ -55,7 +55,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorboard >= 1.7.0, < 1.8.0',
+    'tensorboard >= 1.8.0, < 1.9.0',
     'termcolor >= 1.1.0',
 ]
 
index 7f477d1..fbb1fde 100644 (file)
@@ -70,7 +70,7 @@ Other eager execution examples can be found under [tensorflow/contrib/eager/pyth
 - After training, you may use the model to perform inference on input data in
   the SNLI data format. The premise and hypotheses sentences are specified with
   the command-line flags `--inference_premise` and `--inference_hypothesis`,
-  respecitvely. Each sentence should include the words, as well as parentheses
+  respectively. Each sentence should include the words, as well as parentheses
   representing a binary parsing of the sentence. The words and parentheses
   should all be separated by spaces. For instance,
 
index 50d1b77..c90c669 100644 (file)
@@ -604,7 +604,7 @@ def _find_cupti_header_dir(repository_ctx, cuda_config):
   for relative_path in CUPTI_HEADER_PATHS:
     if repository_ctx.path("%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
         return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail("Cannot find cupti.h under %s" % cuda_toolkit_path)
+  auto_configure_fail("Cannot find cupti.h under %s" % ", ".join([cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
 
 
 def _find_cupti_lib(repository_ctx, cuda_config):
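
The new failure message enumerates every searched location rather than only the toolkit root; a small sketch with assumed values (CUPTI_HEADER_PATHS is defined elsewhere in cuda_configure.bzl) shows the resulting text:

    # Assumed values for illustration only.
    cuda_toolkit_path = "/usr/local/cuda"
    CUPTI_HEADER_PATHS = ["extras/CUPTI/include/", "include/"]
    print("Cannot find cupti.h under %s" %
          ", ".join([cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
    # -> Cannot find cupti.h under /usr/local/cuda/extras/CUPTI/include/,
    #    /usr/local/cuda/include/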
index c2adf57..a058c46 100644 (file)
@@ -34,6 +34,7 @@ filegroup(
         "@org_tensorflow//tensorflow:windows": [
             "@mkl_windows//:LICENSE",
         ],
+        "//conditions:default": [],
     }),
     visibility = ["//visibility:public"],
 )
@@ -54,5 +55,6 @@ cc_library(
             "@mkl_windows//:mkl_headers",
             "@mkl_windows//:mkl_libs_windows",
         ],
+        "//conditions:default": [],
     }),
 )