From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Mon, 29 Jun 2020 04:23:28 -0700
Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE.

Fix the list of cxx_builtin_include_directories. Only a few entries are needed, but those few are more complicated to compute (a mix of symlinked and real paths).

Properly return errors from the crosstool wrapper.

PiperOrigin-RevId: 318788040
Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5
---
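Note on the first hunk below: per the removed comment, earlier HIP releases
declared hipOccupancyMaxPotentialBlockSize as returning void and taking
uint32_t* outputs, which forced the conversion workaround this patch deletes.
From ROCm 3.5 the signature matches CUDA's. A minimal standalone sketch of the
new call, using a hypothetical kernel that is not part of this patch:

  #include <cstdio>
  #include <hip/hip_runtime.h>

  __global__ void dummy_kernel(float* out) {
    out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
  }

  int main() {
    int block_count = 0;       // suggested grid size, in blocks
    int thread_per_block = 0;  // suggested block size, in threads
    // ROCm >= 3.5: takes int* outputs and returns hipError_t, like CUDA.
    hipError_t err = hipOccupancyMaxPotentialBlockSize(
        &block_count, &thread_per_block, dummy_kernel,
        /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
    if (err != hipSuccess) {
      fprintf(stderr, "occupancy query failed: %s\n", hipGetErrorString(err));
      return 1;
    }
    printf("blocks=%d threads/block=%d\n", block_count, thread_per_block);
    return 0;
  }
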
 .bazelrc                                      | 17 ++++
 tensorflow/core/util/gpu_launch_config.h      | 40 ++-------
 ....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++
 .../ci_build/Dockerfile.rbe.rocm-ubuntu16.04  | 37 ---------
 ...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++
 .../bin/crosstool_wrapper_driver_rocm.tpl     | 19 ++++-
 third_party/gpus/rocm_configure.bzl           | 83 +++----------------
 .../preconfig/generate/containers.bzl         |  2 +-
 .../toolchains/remote_config/configs.bzl      | 12 +--
 .../toolchains/remote_config/containers.bzl   | 10 ++-
 10 files changed, 184 insertions(+), 149 deletions(-)
 create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04
 create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython

diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 4dfaf333d4bf0..0b943e917da01 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // Earlier versions of this HIP routine incorrectly returned void.
-  // TODO re-enable hipError_t error checking when HIP is fixed.
-  // ROCm interface uses unsigned int, convert after checking
-  uint32_t block_count_uint = 0;
-  uint32_t thread_per_block_uint = 0;
-  CHECK_GE(block_size_limit, 0);
-  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-                                    func, dynamic_shared_memory_size,
-                                    block_size_limit_uint);
-  block_count = static_cast<int>(block_count_uint);
-  thread_per_block = static_cast<int>(thread_per_block_uint);
+  hipError_t err = hipOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, hipSuccess);
 #endif

   block_count =
@@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
-  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
-                         DivUp(work_element_count, fixed_block_size));
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
-  // is implemented
-  // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-  //    block_size_limit);
-  // CHECK_EQ(err, hipSuccess);
-
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
+  hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &block_count, func, fixed_block_size, dynamic_shared_memory_size);
+  CHECK_EQ(err, hipSuccess);
 #endif
+  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
+                         DivUp(work_element_count, fixed_block_size));

   config.virtual_thread_count = work_element_count;
   config.thread_per_block = fixed_block_size;
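
Note on the second hunk: with hipOccupancyMaxActiveBlocksPerMultiprocessor
now implemented, both backends can share the same clamping arithmetic, which
the patch hoists out of the #if blocks. A minimal standalone sketch of that
path, with a hypothetical kernel and sizes; div_up stands in for TensorFlow's
DivUp helper:

  #include <algorithm>
  #include <cstdio>
  #include <hip/hip_runtime.h>

  __global__ void dummy_kernel(float* out) {
    out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
  }

  static int div_up(int a, int b) { return (a + b - 1) / b; }

  int main() {
    const int fixed_block_size = 256;        // hypothetical
    const int work_element_count = 1 << 20;  // hypothetical

    // Max resident blocks per multiprocessor for this kernel/block size.
    int blocks_per_sm = 0;
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocks_per_sm, dummy_kernel, fixed_block_size,
        /*dynSharedMemPerBlk=*/0);
    if (err != hipSuccess) return 1;

    hipDeviceProp_t prop;
    if (hipGetDeviceProperties(&prop, /*deviceId=*/0) != hipSuccess) return 1;

    // Saturate the device, but never launch more blocks than there is work:
    // the same std::min(...) clamp the patch moves below the #endif.
    int block_count = std::min(blocks_per_sm * prop.multiProcessorCount,
                               div_up(work_element_count, fixed_block_size));
    printf("block_count=%d\n", block_count);
    return 0;
  }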