Diffstat (limited to 'fix_occupancy_block.patch')
 fix_occupancy_block.patch | 87 ---------------------------------------------
 1 file changed, 0 insertions(+), 87 deletions(-)
diff --git a/fix_occupancy_block.patch b/fix_occupancy_block.patch
deleted file mode 100644
index 137b4e56ea55..000000000000
--- a/fix_occupancy_block.patch
+++ /dev/null
@@ -1,87 +0,0 @@
-From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001
-From: Christian Sigg <csigg@google.com>
-Date: Mon, 29 Jun 2020 04:23:28 -0700
-Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE.
-
-Fix list of cxx_builtin_include_directories. Only a few are needed, but those are more complicated (mix of symlinked and real paths).
-
-Properly return error from crosstool wrapper.
-
-PiperOrigin-RevId: 318788040
-Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5
----
- .bazelrc | 17 ++++
- tensorflow/core/util/gpu_launch_config.h | 40 ++-------
- ....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++
- .../ci_build/Dockerfile.rbe.rocm-ubuntu16.04 | 37 ---------
- ...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++
- .../bin/crosstool_wrapper_driver_rocm.tpl | 19 ++++-
- third_party/gpus/rocm_configure.bzl | 83 +++----------------
- .../preconfig/generate/containers.bzl | 2 +-
- .../toolchains/remote_config/configs.bzl | 12 +--
- .../toolchains/remote_config/containers.bzl | 10 ++-
- 10 files changed, 184 insertions(+), 149 deletions(-)
- create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010
- delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04
- create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
-
-diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
-index 4dfaf333d4bf0..0b943e917da01 100644
---- a/tensorflow/core/util/gpu_launch_config.h
-+++ b/tensorflow/core/util/gpu_launch_config.h
-@@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
- block_size_limit);
- CHECK_EQ(err, cudaSuccess);
- #elif TENSORFLOW_USE_ROCM
-- // Earlier versions of this HIP routine incorrectly returned void.
-- // TODO re-enable hipError_t error checking when HIP is fixed.
-- // ROCm interface uses unsigned int, convert after checking
-- uint32_t block_count_uint = 0;
-- uint32_t thread_per_block_uint = 0;
-- CHECK_GE(block_size_limit, 0);
-- uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-- hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-- func, dynamic_shared_memory_size,
-- block_size_limit_uint);
-- block_count = static_cast<int>(block_count_uint);
-- thread_per_block = static_cast<int>(thread_per_block_uint);
-+ hipError_t err = hipOccupancyMaxPotentialBlockSize(
-+ &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-+ block_size_limit);
-+ CHECK_EQ(err, hipSuccess);
- #endif
-
- block_count =
-@@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
- cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
- &block_count, func, fixed_block_size, dynamic_shared_memory_size);
- CHECK_EQ(err, cudaSuccess);
-- block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
-- DivUp(work_element_count, fixed_block_size));
- #elif TENSORFLOW_USE_ROCM
-- // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
-- // is implemented
-- // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-- // &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-- // block_size_limit);
-- // CHECK_EQ(err, hipSuccess);
--
-- // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-- // that the kernel is quite simple and will largely be memory-limited.
-- const int physical_thread_count = std::min(
-- d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-- work_element_count);
-- // Assume the kernel is simple enough that it is okay to use 1024 threads
-- // per workgroup.
-- int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-- block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-- d.getNumGpuMultiProcessors());
-+ hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-+ &block_count, func, fixed_block_size, dynamic_shared_memory_size);
-+ CHECK_EQ(err, hipSuccess);
- #endif
-+ block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
-+ DivUp(work_element_count, fixed_block_size));
-
- config.virtual_thread_count = work_element_count;
- config.thread_per_block = fixed_block_size;
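
For context: as of ROCm 3.5, both HIP occupancy queries used in the deleted patch return a hipError_t, which is why the patch could drop the uint32_t conversion workaround and check the result directly (CHECK_EQ(err, hipSuccess)). Below is a minimal standalone sketch of the same query-then-cap pattern, assuming a ROCm toolchain; the kernel ScaleKernel and the workload size are hypothetical illustrations, not TensorFlow code.

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdio>

// Hypothetical kernel, present only so the occupancy APIs have a function to query.
__global__ void ScaleKernel(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * 2.0f;
}

int main() {
  // As in GetGpuLaunchConfig: let HIP pick the block size that maximizes
  // occupancy, and check the returned hipError_t instead of discarding it.
  int block_count = 0, thread_per_block = 0;
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, ScaleKernel,
      /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
  if (err != hipSuccess) return 1;

  // As in GetGpuLaunchConfigFixedBlockSize: for a fixed block size, ask how
  // many blocks can be resident on each multiprocessor.
  const int fixed_block_size = 256;
  int blocks_per_sm = 0;
  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, ScaleKernel, fixed_block_size,
      /*dynSharedMemPerBlk=*/0);
  if (err != hipSuccess) return 1;

  // Mirror the patch's capping heuristic: resident blocks across all SMs,
  // bounded by the number of blocks the workload actually needs (DivUp).
  hipDeviceProp_t props;
  err = hipGetDeviceProperties(&props, /*deviceId=*/0);
  if (err != hipSuccess) return 1;
  const int work_element_count = 1 << 20;  // hypothetical workload size
  int capped = std::min(
      blocks_per_sm * props.multiProcessorCount,
      (work_element_count + fixed_block_size - 1) / fixed_block_size);

  printf("max occupancy: %d blocks x %d threads; capped grid: %d blocks\n",
         block_count, thread_per_block, capped);
  return 0;
}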