From 22def20bae7be6d5b790b360abed5919385b16c2 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Mon, 29 Jun 2020 04:23:28 -0700
Subject: [PATCH] New ROCm 3.5 RBE docker based on Ubuntu 18.04, re-enable RBE.
Fix list of cxx_builtin_include_directories. Only a few are needed, but those are more complicated (mix of symlinked and real paths).
Properly return error from crosstool wrapper.
PiperOrigin-RevId: 318788040
Change-Id: Ia66898e98a9a4d8fb479c7e75317f4114f6081e5
---
.bazelrc | 17 ++++
tensorflow/core/util/gpu_launch_config.h | 40 ++-------
....local-toolchain-ubuntu18.04-manylinux2010 | 34 ++++++++
.../ci_build/Dockerfile.rbe.rocm-ubuntu16.04 | 37 ---------
...rocm-ubuntu18.04-manylinux2010-multipython | 79 ++++++++++++++++++
.../bin/crosstool_wrapper_driver_rocm.tpl | 19 ++++-
third_party/gpus/rocm_configure.bzl | 83 +++----------------
.../preconfig/generate/containers.bzl | 2 +-
.../toolchains/remote_config/configs.bzl | 12 +--
.../toolchains/remote_config/containers.bzl | 10 ++-
10 files changed, 184 insertions(+), 149 deletions(-)
create mode 100644 tensorflow/tools/ci_build/Dockerfile.local-toolchain-ubuntu18.04-manylinux2010
delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04
create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 4dfaf333d4bf0..0b943e917da01 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -168,18 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
block_size_limit);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
- // Earlier versions of this HIP routine incorrectly returned void.
- // TODO re-enable hipError_t error checking when HIP is fixed.
- // ROCm interface uses unsigned int, convert after checking
- uint32_t block_count_uint = 0;
- uint32_t thread_per_block_uint = 0;
- CHECK_GE(block_size_limit, 0);
- uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
- hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
- func, dynamic_shared_memory_size,
- block_size_limit_uint);
- block_count = static_cast<int>(block_count_uint);
- thread_per_block = static_cast<int>(thread_per_block_uint);
+ hipError_t err = hipOccupancyMaxPotentialBlockSize(
+ &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+ block_size_limit);
+ CHECK_EQ(err, hipSuccess);
#endif

  block_count =
@@ -208,27 +200,13 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
CHECK_EQ(err, cudaSuccess);
- block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
- DivUp(work_element_count, fixed_block_size));
#elif TENSORFLOW_USE_ROCM
- // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
- // is implemented
- // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
- // &block_count, &thread_per_block, func, dynamic_shared_memory_size,
- // block_size_limit);
- // CHECK_EQ(err, hipSuccess);
-
- // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
- // that the kernel is quite simple and will largely be memory-limited.
- const int physical_thread_count = std::min(
- d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
- work_element_count);
- // Assume the kernel be simple enough that it is okay to use 1024 threads
- // per workgroup.
- int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
- block_count = std::min(DivUp(physical_thread_count, thread_per_block),
- d.getNumGpuMultiProcessors());
+ hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ &block_count, func, fixed_block_size, dynamic_shared_memory_size);
+ CHECK_EQ(err, hipSuccess);
#endif
+ block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
DivUp(work_element_count, fixed_block_size));

  config.virtual_thread_count = work_element_count;
config.thread_per_block = fixed_block_size;
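
For readers skimming the diff: the substance of the gpu_launch_config.h change is that the ROCm path now mirrors the CUDA path, querying the runtime's occupancy API and CHECK-ing the returned hipError_t instead of hand-rolling a block-count heuristic, and the std::min(...) clamp that caps block_count by the available work moves out of the #if/#elif so both backends share it. Below is a minimal standalone sketch of that pattern; the AddOne kernel, the main() harness, and the error-printing are illustrative stand-ins (not code from the patch), while hipOccupancyMaxPotentialBlockSize and the DivUp-based clamp are the same calls the diff uses.

    // Illustrative sketch (not from the patch): picking a launch
    // configuration with HIP's occupancy API, mirroring the new ROCm path.
    #include <hip/hip_runtime.h>
    #include <algorithm>
    #include <cstdio>

    // Hypothetical kernel, used only for this example.
    __global__ void AddOne(float* data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] += 1.0f;
    }

    // Same rounding-up division the TensorFlow code relies on.
    static int DivUp(int a, int b) { return (a + b - 1) / b; }

    int main() {
      const int work_element_count = 1 << 20;

      int block_count = 0;
      int thread_per_block = 0;
      // As in the patched GetGpuLaunchConfig: let the runtime suggest a
      // grid and block size, and check the hipError_t instead of
      // discarding it.
      hipError_t err = hipOccupancyMaxPotentialBlockSize(
          &block_count, &thread_per_block, AddOne,
          /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
      if (err != hipSuccess) {
        fprintf(stderr, "hipOccupancyMaxPotentialBlockSize: %s\n",
                hipGetErrorString(err));
        return 1;
      }

      // Cap the grid so no more blocks are launched than there is work,
      // matching the std::min(..., DivUp(...)) clamp in the diff.
      block_count =
          std::min(block_count, DivUp(work_element_count, thread_per_block));
      printf("blocks=%d threads/block=%d\n", block_count, thread_per_block);
      return 0;
    }

Compile with hipcc. The CUDA branch of the same function is symmetric, using cudaOccupancyMaxPotentialBlockSize and cudaSuccess, which is what makes hoisting the shared clamp out of the #if/#elif possible.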