From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 29 Jun 2020 04:23:28 -0700 Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE. Fix list of cxx_builtin_include_directories. Only a few are needed, but those are more complicated (mix of symlinked and real paths). Properly return error from crosstool wrapper. PiperOrigin-RevId: 318788040 Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5 --- .bazelrc | 17 ++++ tensorflow/core/util/gpu_launch_config.h | 40 ++------- ....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++ .../ci_build/Dockerfile.rbe.rocm-ubuntu16.04 | 37 --------- ...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++ .../bin/crosstool_wrapper_driver_rocm.tpl | 19 ++++- third_party/gpus/rocm_configure.bzl | 83 +++---------------- .../preconfig/generate/containers.bzl | 2 +- .../toolchains/remote_config/configs.bzl | 12 +-- .../toolchains/remote_config/containers.bzl | 10 ++- 10 files changed, 184 insertions(+), 149 deletions(-) create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04 create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h index 4dfaf333d4bf0..0b943e917da01 100644 --- a/tensorflow/core/util/gpu_launch_config.h +++ b/tensorflow/core/util/gpu_launch_config.h @@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count, block_size_limit); CHECK_EQ(err, cudaSuccess); #elif TENSORFLOW_USE_ROCM - // Earlier versions of this HIP routine incorrectly returned void. - // TODO re-enable hipError_t error checking when HIP is fixed. - // ROCm interface uses unsigned int, convert after checking - uint32_t block_count_uint = 0; - uint32_t thread_per_block_uint = 0; - CHECK_GE(block_size_limit, 0); - uint32_t block_size_limit_uint = static_cast(block_size_limit); - hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint, - func, dynamic_shared_memory_size, - block_size_limit_uint); - block_count = static_cast(block_count_uint); - thread_per_block = static_cast(thread_per_block_uint); + hipError_t err = hipOccupancyMaxPotentialBlockSize( + &block_count, &thread_per_block, func, dynamic_shared_memory_size, + block_size_limit); + CHECK_EQ(err, hipSuccess); #endif block_count = @@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize( cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &block_count, func, fixed_block_size, dynamic_shared_memory_size); CHECK_EQ(err, cudaSuccess); - block_count = std::min(block_count * d.getNumGpuMultiProcessors(), - DivUp(work_element_count, fixed_block_size)); #elif TENSORFLOW_USE_ROCM - // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor - // is implemented - // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - // &block_count, &thread_per_block, func, dynamic_shared_memory_size, - // block_size_limit); - // CHECK_EQ(err, hipSuccess); - - // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&) - // that the kernel is quite simple and will largely be memory-limited. - const int physical_thread_count = std::min( - d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), - work_element_count); - // Assume the kernel be simple enough that it is okay to use 1024 threads - // per workgroup. - int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock()); - block_count = std::min(DivUp(physical_thread_count, thread_per_block), - d.getNumGpuMultiProcessors()); + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &block_count, func, fixed_block_size, dynamic_shared_memory_size); + CHECK_EQ(err, hipSuccess); #endif + block_count = std::min(block_count * d.getNumGpuMultiProcessors(), + DivUp(work_element_count, fixed_block_size)); config.virtual_thread_count = work_element_count; config.thread_per_block = fixed_block_size;