diff options
Diffstat (limited to 'fix_occupancy_block.patch')
-rw-r--r-- | fix_occupancy_block.patch | 87 |
1 files changed, 0 insertions, 87 deletions
diff --git a/fix_occupancy_block.patch b/fix_occupancy_block.patch deleted file mode 100644 index 137b4e56ea55..000000000000 --- a/fix_occupancy_block.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001 -From: Christian Sigg <csigg@google.com> -Date: Mon, 29 Jun 2020 04:23:28 -0700 -Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE. - -Fix list of cxx_builtin_include_directories. Only a few are needed, but those are more complicated (mix of symlinked and real paths). - -Properly return error from crosstool wrapper. - -PiperOrigin-RevId: 318788040 -Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5 ---- - .bazelrc | 17 ++++ - tensorflow/core/util/gpu_launch_config.h | 40 ++------- - ....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++ - .../ci_build/Dockerfile.rbe.rocm-ubuntu16.04 | 37 --------- - ...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++ - .../bin/crosstool_wrapper_driver_rocm.tpl | 19 ++++- - third_party/gpus/rocm_configure.bzl | 83 +++---------------- - .../preconfig/generate/containers.bzl | 2 +- - .../toolchains/remote_config/configs.bzl | 12 +-- - .../toolchains/remote_config/containers.bzl | 10 ++- - 10 files changed, 184 insertions(+), 149 deletions(-) - create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010 - delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04 - create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython - -diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h -index 4dfaf333d4bf0..0b943e917da01 100644 ---- a/tensorflow/core/util/gpu_launch_config.h -+++ b/tensorflow/core/util/gpu_launch_config.h -@@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count, - block_size_limit); - CHECK_EQ(err, cudaSuccess); - #elif TENSORFLOW_USE_ROCM -- // Earlier versions of this HIP routine incorrectly returned void. -- // TODO re-enable hipError_t error checking when HIP is fixed. -- // ROCm interface uses unsigned int, convert after checking -- uint32_t block_count_uint = 0; -- uint32_t thread_per_block_uint = 0; -- CHECK_GE(block_size_limit, 0); -- uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit); -- hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint, -- func, dynamic_shared_memory_size, -- block_size_limit_uint); -- block_count = static_cast<int>(block_count_uint); -- thread_per_block = static_cast<int>(thread_per_block_uint); -+ hipError_t err = hipOccupancyMaxPotentialBlockSize( -+ &block_count, &thread_per_block, func, dynamic_shared_memory_size, -+ block_size_limit); -+ CHECK_EQ(err, hipSuccess); - #endif - - block_count = -@@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize( - cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &block_count, func, fixed_block_size, dynamic_shared_memory_size); - CHECK_EQ(err, cudaSuccess); -- block_count = std::min(block_count * d.getNumGpuMultiProcessors(), -- DivUp(work_element_count, fixed_block_size)); - #elif TENSORFLOW_USE_ROCM -- // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor -- // is implemented -- // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( -- // &block_count, &thread_per_block, func, dynamic_shared_memory_size, -- // block_size_limit); -- // CHECK_EQ(err, hipSuccess); -- -- // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&) -- // that the kernel is quite simple and will largely be memory-limited. -- const int physical_thread_count = std::min( -- d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), -- work_element_count); -- // Assume the kernel be simple enough that it is okay to use 1024 threads -- // per workgroup. -- int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock()); -- block_count = std::min(DivUp(physical_thread_count, thread_per_block), -- d.getNumGpuMultiProcessors()); -+ hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( -+ &block_count, func, fixed_block_size, dynamic_shared_memory_size); -+ CHECK_EQ(err, hipSuccess); - #endif -+ block_count = std::min(block_count * d.getNumGpuMultiProcessors(), -+ DivUp(work_element_count, fixed_block_size)); - - config.virtual_thread_count = work_element_count; - config.thread_per_block = fixed_block_size; |