Diffstat (limited to 'fix_occupancy_block.patch')
 fix_occupancy_block.patch | 87 ---------------------------------------------
 1 file changed, 0 insertions(+), 87 deletions(-)
diff --git a/fix_occupancy_block.patch b/fix_occupancy_block.patch
deleted file mode 100644
index 137b4e56ea55..000000000000
--- a/fix_occupancy_block.patch
+++ /dev/null
@@ -1,87 +0,0 @@
-From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001
-From: Christian Sigg <csigg@google.com>
-Date: Mon, 29 Jun 2020 04:23:28 -0700
-Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE.
-
-Fix list of cxx_builtin_include_directories. Only a few are needed, but those are more complicated (mix of symlinked and real paths).
-
-Properly return error from crosstool wrapper.
-
-PiperOrigin-RevId: 318788040
-Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5
----
- .bazelrc | 17 ++++
- tensorflow/core/util/gpu_launch_config.h | 40 ++-------
- ....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++
- .../ci_build/Dockerfile.rbe.rocm-ubuntu16.04 | 37 ---------
- ...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++
- .../bin/crosstool_wrapper_driver_rocm.tpl | 19 ++++-
- third_party/gpus/rocm_configure.bzl | 83 +++----------------
- .../preconfig/generate/containers.bzl | 2 +-
- .../toolchains/remote_config/configs.bzl | 12 +--
- .../toolchains/remote_config/containers.bzl | 10 ++-
- 10 files changed, 184 insertions(+), 149 deletions(-)
- create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010
- delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04
- create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
-
-diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
-index 4dfaf333d4bf0..0b943e917da01 100644
---- a/tensorflow/core/util/gpu_launch_config.h
-+++ b/tensorflow/core/util/gpu_launch_config.h
-@@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
- block_size_limit);
- CHECK_EQ(err, cudaSuccess);
- #elif TENSORFLOW_USE_ROCM
-- // Earlier versions of this HIP routine incorrectly returned void.
-- // TODO re-enable hipError_t error checking when HIP is fixed.
-- // ROCm interface uses unsigned int, convert after checking
-- uint32_t block_count_uint = 0;
-- uint32_t thread_per_block_uint = 0;
-- CHECK_GE(block_size_limit, 0);
-- uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-- hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-- func, dynamic_shared_memory_size,
-- block_size_limit_uint);
-- block_count = static_cast<int>(block_count_uint);
-- thread_per_block = static_cast<int>(thread_per_block_uint);
-+ hipError_t err = hipOccupancyMaxPotentialBlockSize(
-+ &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-+ block_size_limit);
-+ CHECK_EQ(err, hipSuccess);
- #endif
-
- block_count =
-@@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
- cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
- &block_count, func, fixed_block_size, dynamic_shared_memory_size);
- CHECK_EQ(err, cudaSuccess);
-- block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
-- DivUp(work_element_count, fixed_block_size));
- #elif TENSORFLOW_USE_ROCM
-- // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
-- // is implemented
-- // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-- // &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-- // block_size_limit);
-- // CHECK_EQ(err, hipSuccess);
--
-- // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-- // that the kernel is quite simple and will largely be memory-limited.
-- const int physical_thread_count = std::min(
-- d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-- work_element_count);
-- // Assume the kernel is simple enough that it is okay to use 1024 threads
-- // per workgroup.
-- int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-- block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-- d.getNumGpuMultiProcessors());
-+ hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-+ &block_count, func, fixed_block_size, dynamic_shared_memory_size);
-+ CHECK_EQ(err, hipSuccess);
- #endif
-+ block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
-+ DivUp(work_element_count, fixed_block_size));
-
- config.virtual_thread_count = work_element_count;
- config.thread_per_block = fixed_block_size;
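
For context: as of ROCm 3.5, both HIP occupancy queries used in the deleted patch return a hipError_t, which is why the patch could drop the uint32_t conversion workaround and check the result directly (CHECK_EQ(err, hipSuccess)). Below is a minimal standalone sketch of the same query-then-cap pattern, assuming a ROCm toolchain; the kernel ScaleKernel and the workload size are hypothetical illustrations, not TensorFlow code.

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdio>

// Hypothetical kernel, present only so the occupancy APIs have a function to query.
__global__ void ScaleKernel(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * 2.0f;
}

int main() {
  // As in GetGpuLaunchConfig: let HIP pick the block size that maximizes
  // occupancy, and check the returned hipError_t instead of discarding it.
  int block_count = 0, thread_per_block = 0;
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, ScaleKernel,
      /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
  if (err != hipSuccess) return 1;

  // As in GetGpuLaunchConfigFixedBlockSize: for a fixed block size, ask how
  // many blocks can be resident on each multiprocessor.
  const int fixed_block_size = 256;
  int blocks_per_sm = 0;
  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, ScaleKernel, fixed_block_size,
      /*dynSharedMemPerBlk=*/0);
  if (err != hipSuccess) return 1;

  // Mirror the patch's capping heuristic: resident blocks across all SMs,
  // bounded by the number of blocks the workload actually needs (DivUp).
  hipDeviceProp_t props;
  err = hipGetDeviceProperties(&props, /*deviceId=*/0);
  if (err != hipSuccess) return 1;
  const int work_element_count = 1 << 20;  // hypothetical workload size
  int capped = std::min(
      blocks_per_sm * props.multiProcessorCount,
      (work_element_count + fixed_block_size - 1) / fixed_block_size);

  printf("max occupancy: %d blocks x %d threads; capped grid: %d blocks\n",
         block_count, thread_per_block, capped);
  return 0;
}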