summarylogtreecommitdiffstats
diff options
context:
space:
mode:
author acxz 2020-09-30 10:09:59 -0400
committer acxz 2020-09-30 10:10:56 -0400
commit ad6bd8ba4d83cf71c450fa15b9c68559749f4739 (patch)
tree b2c24ce0af30baf0464d46c86defc8c9cd030b88
parent 1c2a2f45cdd06ea6442d24b14824bf1fec5aa9a8 (diff)
download aur-ad6bd8ba4d83cf71c450fa15b9c68559749f4739.tar.gz
add patch for using newer rocm versions
-rw-r--r-- .SRCINFO 6
-rw-r--r-- PKGBUILD 11
-rw-r--r-- new-rocm.patch 692
3 files changed, 701 insertions, 8 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 30b438cc38f0..6ca94b2e930e 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,7 +1,7 @@
pkgbase = tensorflow-rocm
pkgdesc = Library for computation using data flow graphs for scalable machine learning
pkgver = 2.3.1
- pkgrel = 1
+ pkgrel = 2
url = https://www.tensorflow.org/
arch = x86_64
license = APACHE
@@ -31,7 +31,7 @@ pkgbase = tensorflow-rocm
source = fix_gpu_atomic_redef.patch::https://github.com/tensorflow/tensorflow/commit/c054f40f66fa625f51085a20c48554c61d05c5fd.patch
source = fix_ldexp_float.patch::https://github.com/tensorflow/tensorflow/commit/655ce09f679a90ecd561538227c703b42d0fc5fa.patch
source = fix_occupancy_block.patch
- source = rocm-3.7.patch::https://github.com/tensorflow/tensorflow/pull/42689.patch
+ source = new-rocm.patch
sha512sums = e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba70c7f803f8ca06faf2c6b5396e60b1e0e9b97bfbd667e733b08b6e6d70ef0
sha512sums = df2e0373e2f63b8766f31933f7db57f6a7559b8f03af1db51644fba87731451a7cd3895529a3192e5394612fcb42f245b794b1c9ca3c05881ca03a547c8c9acc
sha512sums = e51e3f3dced121db3a09fbdaefd33555536095584b72a5eb6f302fa6fa68ab56ea45e8a847ec90ff4ba076db312c06f91ff672e08e95263c658526582494ce08
@@ -40,7 +40,7 @@ pkgbase = tensorflow-rocm
sha512sums = 75972acf0ec53b28aa6c93de77a385acaf675c0d0ae93b6545f67414e9895cbd1074a5d65b211390846b736df271a567b49ec4c992883ad83c060f708bbe0d20
sha512sums = 42fc09bc15412f3b9a82f36485735faed0dcc2f47d72c5bfc451bc09a2aad472db59edb387455fb6594b1606de3a7789917e1fb31280c7044898097ec37db3d5
sha512sums = 88c04ed7a766193687d7079102332e3c63d6f0accbda777836abe5e03e9ebb83fd1aeaa9e4adca70310ce18bf3c6c3907f1f8a11c13e67e3ef79497b91bbf126
- sha512sums = SKIP
+ sha512sums = 080fd9d4e1228ceb04901a0caceb18b965ef199704196a9b7711fcada3a8cfc2f65c529c4c0e05960ab1e469d203727bf0bbded82d895c13e0e2ab29ae524317
pkgname = tensorflow-rocm
pkgdesc = Library for computation using data flow graphs for scalable machine learning (with ROCM)
diff --git a/PKGBUILD b/PKGBUILD
index 52e2e0dd7a7f..5e6f3b4383ab 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -16,7 +16,7 @@ pkgname=()
pkgver=2.3.1
_pkgver=2.3.1
-pkgrel=1
+pkgrel=2
pkgdesc="Library for computation using data flow graphs for scalable machine learning"
url="https://www.tensorflow.org/"
license=('APACHE')
@@ -34,7 +34,7 @@ source=("$pkgname-$pkgver.tar.gz::https://github.com/tensorflow/tensorflow/archi
fix_gpu_atomic_redef.patch::https://github.com/tensorflow/tensorflow/commit/c054f40f66fa625f51085a20c48554c61d05c5fd.patch
fix_ldexp_float.patch::https://github.com/tensorflow/tensorflow/commit/655ce09f679a90ecd561538227c703b42d0fc5fa.patch
fix_occupancy_block.patch
- rocm-3.7.patch::https://github.com/tensorflow/tensorflow/pull/42689.patch)
+ new-rocm.patch)
sha512sums=('e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba70c7f803f8ca06faf2c6b5396e60b1e0e9b97bfbd667e733b08b6e6d70ef0'
'df2e0373e2f63b8766f31933f7db57f6a7559b8f03af1db51644fba87731451a7cd3895529a3192e5394612fcb42f245b794b1c9ca3c05881ca03a547c8c9acc'
@@ -44,7 +44,7 @@ sha512sums=('e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba
'75972acf0ec53b28aa6c93de77a385acaf675c0d0ae93b6545f67414e9895cbd1074a5d65b211390846b736df271a567b49ec4c992883ad83c060f708bbe0d20'
'42fc09bc15412f3b9a82f36485735faed0dcc2f47d72c5bfc451bc09a2aad472db59edb387455fb6594b1606de3a7789917e1fb31280c7044898097ec37db3d5'
'88c04ed7a766193687d7079102332e3c63d6f0accbda777836abe5e03e9ebb83fd1aeaa9e4adca70310ce18bf3c6c3907f1f8a11c13e67e3ef79497b91bbf126'
- 'SKIP')
+ '080fd9d4e1228ceb04901a0caceb18b965ef199704196a9b7711fcada3a8cfc2f65c529c4c0e05960ab1e469d203727bf0bbded82d895c13e0e2ab29ae524317')
get_pyver () {
python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
@@ -89,8 +89,9 @@ prepare() {
# https://github.com/tensorflow/tensorflow/commit/22def20bae7be6d5b790b360abed5919385b16c2
patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/fix_occupancy_block.patch
- # Update codebase for ROCm 3.7
- patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/rocm-3.7.patch
+ # Patch for ROCm 3.7 and later
+ # https://github.com/tensorflow/tensorflow/pull/42689
+ patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/new-rocm.patch
cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-rocm
cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-opt-rocm
diff --git a/new-rocm.patch b/new-rocm.patch
new file mode 100644
index 000000000000..01eb2b4fab8c
--- /dev/null
+++ b/new-rocm.patch
@@ -0,0 +1,692 @@
+From fcc2de09eb38f45b678a5457f594ca594f2572c9 Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Thu, 16 Jul 2020 19:38:03 +0000
+Subject: [PATCH 1/8] Change references to libhip_hcc.so to refer to
+ libamdhip64.so instead
+
+With the switch to the new hipclang-vdi runtime (in ROCm 3.5), the new name for the HIP runtime library is libamdhip64.so.
+
+For backwards compatibility, ROCm 3.5 and ROCm 3.6 include a "libhip_hcc.so" softlink, which points to libamdhip64.so. That softlink will be going away starting with ROCm 3.7(?).
+
+This commit updates references to libhip_hcc.so (in the TF build) to use libamdhip64.so instead.
+
+See following JIRA tickets for further details:
+
+* http://ontrack-internal.amd.com/browse/SWDEV-244762
+* http://ontrack-internal.amd.com/browse/SWDEV-238533
+---
+ tensorflow/stream_executor/platform/default/dso_loader.cc | 2 +-
+ .../crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl | 7 -------
+ third_party/gpus/rocm_configure.bzl | 8 +++-----
+ 3 files changed, 4 insertions(+), 13 deletions(-)
+
+diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
+index 70b1ebe070a76..84293b7767a20 100644
+--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
++++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
+@@ -140,7 +140,7 @@ port::StatusOr<void*> GetHipsparseDsoHandle() {
+ return GetDsoHandle("hipsparse", "");
+ }
+
+-port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
++port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); }
+
+ } // namespace DsoLoader
+
+diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+index 8848bd32c2e1d..d5bfe78c6449d 100755
+--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
++++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+@@ -34,8 +34,6 @@ HIPCC_ENV = '%{hipcc_env}'
+ HIPCC_IS_HIPCLANG = '%{hipcc_is_hipclang}'=="True"
+ HIP_RUNTIME_PATH = '%{hip_runtime_path}'
+ HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}'
+-HCC_RUNTIME_PATH = '%{hcc_runtime_path}'
+-HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}'
+ ROCR_RUNTIME_PATH = '%{rocr_runtime_path}'
+ ROCR_RUNTIME_LIBRARY = '%{rocr_runtime_library}'
+ VERBOSE = '%{crosstool_verbose}'=='1'
+@@ -267,11 +265,6 @@ def main():
+ gpu_linker_flags.append('-L' + ROCR_RUNTIME_PATH)
+ gpu_linker_flags.append('-Wl,-rpath=' + ROCR_RUNTIME_PATH)
+ gpu_linker_flags.append('-l' + ROCR_RUNTIME_LIBRARY)
+- # do not link with HCC runtime library in case hip-clang toolchain is used
+- if not HIPCC_IS_HIPCLANG:
+- gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH)
+- gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH)
+- gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY)
+ gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH)
+ gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
+ gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY)
+diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
+index 1312574f0aa46..0508279518894 100644
+--- a/third_party/gpus/rocm_configure.bzl
++++ b/third_party/gpus/rocm_configure.bzl
+@@ -390,7 +390,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
+ libs_paths = [
+ (name, _rocm_lib_paths(repository_ctx, name, path))
+ for name, path in [
+- ("hip_hcc", rocm_config.rocm_toolkit_path + "/hip"),
++ ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"),
+ ("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"),
+ ("rocfft", rocm_config.rocm_toolkit_path + "/rocfft"),
+ ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"),
+@@ -646,7 +646,7 @@ def _create_local_rocm_repository(repository_ctx):
+ "rocm/BUILD",
+ tpl_paths["rocm:BUILD"],
+ {
+- "%{hip_lib}": rocm_libs["hip_hcc"].file_name,
++ "%{hip_lib}": rocm_libs["amdhip64"].file_name,
+ "%{rocblas_lib}": rocm_libs["rocblas"].file_name,
+ "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
+ "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
+@@ -733,9 +733,7 @@ def _create_local_rocm_repository(repository_ctx):
+ "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib",
+ "%{rocr_runtime_library}": "hsa-runtime64",
+ "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib",
+- "%{hip_runtime_library}": "hip_hcc",
+- "%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib",
+- "%{hcc_runtime_library}": "mcwamp",
++ "%{hip_runtime_library}": "amdhip64",
+ "%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
+ "%{gcc_host_compiler_path}": str(cc),
+ },
+
+From 77fb7fd1c68f81c416fd909b6677277b3637be05 Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Fri, 17 Jul 2020 01:04:58 +0000
+Subject: [PATCH 2/8] Removing references to `*StaticCompiledGEMM` from TF code
+
+This commit is in conjunction with this MIOpen PR which removes scgemm from MIOpen
+https://github.com/ROCmSoftwarePlatform/MIOpen/pull/325
+
+The MIOpen release that includes that change will be included in the next ROCm release.
+This commit removes references to `*StaticCompiledGEMM` from TF code to prepare for switching to the next ROCm release (3.7)
+---
+ tensorflow/stream_executor/rocm/rocm_dnn.cc | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc
+index 80306105d4adf..4c5a740dfb090 100644
+--- a/tensorflow/stream_executor/rocm/rocm_dnn.cc
++++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc
+@@ -113,9 +113,6 @@ string ToString(miopenConvFwdAlgorithm_t algorithm) {
+ case miopenConvolutionFwdAlgoImplicitGEMM:
+ s = "Implicit GEMM";
+ break;
+- case miopenConvolutionFwdAlgoStaticCompiledGEMM:
+- s = "Static Compiled GEMM";
+- break;
+ }
+ return s;
+ }
+@@ -182,9 +179,6 @@ string ToString(miopenConvAlgorithm_t algorithm) {
+ case miopenConvolutionAlgoImplicitGEMM:
+ s = "Implicit GEMM";
+ break;
+- case miopenConvolutionAlgoStaticCompiledGEMM:
+- s = "Static Compiled GEMM";
+- break;
+ }
+ return s;
+ }
+
+From 566d2a95c6140322241bce20fcfea952e837fda1 Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Tue, 11 Aug 2020 02:09:46 +0000
+Subject: [PATCH 3/8] Reverting "Provide ldexp float overload for HIP, it's
+ missing in their headers. "
+
+---
+ tensorflow/core/kernels/cwise_ops_gpu_common.cu.h | 6 ------
+ tensorflow/core/kernels/rnn/blas_gemm.h | 5 -----
+ 2 files changed, 11 deletions(-)
+
+diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+index 8849c3f4eddbb..ecc58da315f6b 100644
+--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
++++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+@@ -30,12 +30,6 @@ limitations under the License.
+ #include "tensorflow/core/platform/types.h"
+
+ #include "tensorflow/core/platform/logging.h"
+-
+-#ifdef __HIP_DEVICE_COMPILE__
+-// Provide ldexp float overload for HIP, it's missing in their headers.
+-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
+-#endif
+-
+ namespace tensorflow {
+ namespace functor {
+
+diff --git a/tensorflow/core/kernels/rnn/blas_gemm.h b/tensorflow/core/kernels/rnn/blas_gemm.h
+index 74f4cd2bb39a4..126e1edef17a9 100644
+--- a/tensorflow/core/kernels/rnn/blas_gemm.h
++++ b/tensorflow/core/kernels/rnn/blas_gemm.h
+@@ -25,11 +25,6 @@ limitations under the License.
+ #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+ #endif
+
+-#ifdef __HIP_DEVICE_COMPILE__
+-// Provide ldexp float overload for HIP, it's missing in their headers.
+-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
+-#endif
+-
+ namespace tensorflow {
+ class OpKernelContext;
+ namespace functor {
+
+From 9dcaad456e194bf8d1e3962cd6ad272f4879d7f3 Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Wed, 12 Aug 2020 00:39:02 +0000
+Subject: [PATCH 4/8] updating ROCM CI scripts to use ROCm 3.7
+
+---
+ .../tools/ci_build/linux/rocm/run_cc_core.sh | 34 +++++++++++++------
+ .../ci_build/linux/rocm/run_csb_tests.sh | 27 ++++++++++-----
+ .../tools/ci_build/linux/rocm/run_py3_core.sh | 23 +++++++++----
+ .../tools/ci_build/xla/linux/rocm/run_py3.sh | 33 ++++++++++++------
+ 4 files changed, 79 insertions(+), 38 deletions(-)
+
+diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
+index 1f4a36f8de0f5..92d21cb133be9 100755
+--- a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
++++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
+@@ -18,20 +18,27 @@
+ set -e
+ set -x
+
+-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++TF_TESTS_PER_GPU=1
++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+ echo ""
+-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+ echo ""
+
++# First positional argument (if any) specifies the ROCM_INSTALL_DIR
++ROCM_INSTALL_DIR=/opt/rocm-3.7.0
++if [[ -n $1 ]]; then
++ ROCM_INSTALL_DIR=$1
++fi
++
+ # Run configure.
+ export PYTHON_BIN_PATH=`which python3`
+ export CC_OPT_FLAGS='-mavx'
+
+ export TF_NEED_ROCM=1
+-export ROCM_PATH=/opt/rocm-3.3.0
+-export TF_GPU_COUNT=${N_GPUS}
++export ROCM_PATH=$ROCM_INSTALL_DIR
+
+ yes "" | $PYTHON_BIN_PATH configure.py
+
+@@ -39,15 +46,17 @@ yes "" | $PYTHON_BIN_PATH configure.py
+ bazel test \
+ --config=rocm \
+ -k \
+- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
++ --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-multi_gpu,-v1only \
+ --test_lang_filters=cc \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=${TF_GPU_COUNT}\
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --build_tests_only \
+ --test_output=errors \
+ --test_sharding_strategy=disabled \
+- --test_size_filters=small,medium \
++ --test_size_filters=small,medium,large \
+ --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
+ -- \
+ //tensorflow/... \
+@@ -59,11 +68,14 @@ bazel test \
+ --config=rocm \
+ -k \
+ --test_tag_filters=gpu \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=1 \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --build_tests_only \
+ --test_output=errors \
+ --test_sharding_strategy=disabled \
++ --test_size_filters=small,medium,large \
+ -- \
+ //tensorflow/core/nccl:nccl_manager_test
+diff --git a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
+index 4962b2789b1c0..80c0686e64724 100755
+--- a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
++++ b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
+@@ -18,20 +18,27 @@
+ set -e
+ set -x
+
+-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++TF_TESTS_PER_GPU=1
++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+ echo ""
+-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+ echo ""
+
++# First positional argument (if any) specifies the ROCM_INSTALL_DIR
++ROCM_INSTALL_DIR=/opt/rocm-3.7.0
++if [[ -n $1 ]]; then
++ ROCM_INSTALL_DIR=$1
++fi
++
+ # Run configure.
+ export PYTHON_BIN_PATH=`which python3`
+ export CC_OPT_FLAGS='-mavx'
+
+ export TF_NEED_ROCM=1
+-export ROCM_PATH=/opt/rocm-3.3.0
+-export TF_GPU_COUNT=${N_GPUS}
++export ROCM_PATH=$ROCM_INSTALL_DIR
+
+ yes "" | $PYTHON_BIN_PATH configure.py
+
+@@ -40,8 +47,10 @@ bazel test \
+ --config=rocm \
+ -k \
+ --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=${TF_GPU_COUNT} \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --test_output=errors \
+ --test_sharding_strategy=disabled \
+@@ -60,8 +69,8 @@ bazel test \
+ --test_tag_filters=gpu \
+ --test_timeout 600,900,2400,7200 \
+ --test_output=errors \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=1 \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
+ --test_sharding_strategy=disabled \
+ -- \
+ //tensorflow/core/nccl:nccl_manager_test
+diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+index 7ea866f8e2032..3a09081dd6ac6 100755
+--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
++++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+@@ -18,20 +18,27 @@
+ set -e
+ set -x
+
+-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++TF_TESTS_PER_GPU=1
++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+ echo ""
+-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+ echo ""
+
++# First positional argument (if any) specifies the ROCM_INSTALL_DIR
++ROCM_INSTALL_DIR=/opt/rocm-3.7.0
++if [[ -n $1 ]]; then
++ ROCM_INSTALL_DIR=$1
++fi
++
+ # Run configure.
+ export PYTHON_BIN_PATH=`which python3`
+ export CC_OPT_FLAGS='-mavx'
+
+ export TF_NEED_ROCM=1
+-export ROCM_PATH=/opt/rocm-3.3.0
+-export TF_GPU_COUNT=${N_GPUS}
++export ROCM_PATH=$ROCM_INSTALL_DIR
+
+ yes "" | $PYTHON_BIN_PATH configure.py
+
+@@ -41,8 +48,10 @@ bazel test \
+ -k \
+ --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+ --test_lang_filters=py \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=${TF_GPU_COUNT} \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --build_tests_only \
+ --test_output=errors \
+diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
+index 6ce1fad9cc754..d623b77d5333d 100755
+--- a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
++++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
+@@ -18,20 +18,27 @@
+ set -e
+ set -x
+
+-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
++TF_TESTS_PER_GPU=1
++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+ echo ""
+-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+ echo ""
+
++# First positional argument (if any) specifies the ROCM_INSTALL_DIR
++ROCM_INSTALL_DIR=/opt/rocm-3.7.0
++if [[ -n $1 ]]; then
++ ROCM_INSTALL_DIR=$1
++fi
++
+ # Run configure.
+ export PYTHON_BIN_PATH=`which python3`
+ export CC_OPT_FLAGS='-mavx'
+
+ export TF_NEED_ROCM=1
+-export ROCM_PATH=/opt/rocm-3.3.0
+-export TF_GPU_COUNT=${N_GPUS}
++export ROCM_PATH=$ROCM_INSTALL_DIR
+
+ yes "" | $PYTHON_BIN_PATH configure.py
+ echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
+@@ -41,9 +48,11 @@ bazel test \
+ --config=rocm \
+ --config=xla \
+ -k \
+- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=${TF_GPU_COUNT} \
++ --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --build_tests_only \
+ --test_output=errors \
+@@ -65,9 +74,11 @@ bazel test \
+ --config=rocm \
+ --config=xla \
+ -k \
+- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+- --jobs=${N_JOBS} \
+- --local_test_jobs=${TF_GPU_COUNT} \
++ --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
++ --jobs=${N_BUILD_JOBS} \
++ --local_test_jobs=${N_TEST_JOBS} \
++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+ --test_timeout 600,900,2400,7200 \
+ --build_tests_only \
+ --test_output=errors \
+
+From 4b76a49a1a5741dece6d368b30f7125e20c12878 Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Wed, 26 Aug 2020 15:21:31 +0000
+Subject: [PATCH 5/8] Updating Dockerfile.rocm to use ROCm 3.7
+
+---
+ tensorflow/tools/ci_build/Dockerfile.rocm | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
+index 4f5d3ae7291b1..d209173258ada 100644
+--- a/tensorflow/tools/ci_build/Dockerfile.rocm
++++ b/tensorflow/tools/ci_build/Dockerfile.rocm
+@@ -3,8 +3,10 @@
+ FROM ubuntu:bionic
+ MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>
+
+-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
+-ARG ROCM_PATH=/opt/rocm-3.3.0
++ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/3.7/
++ARG ROCM_BUILD_NAME=xenial
++ARG ROCM_BUILD_NUM=main
++ARG ROCM_PATH=/opt/rocm-3.7.0
+
+ ENV DEBIAN_FRONTEND noninteractive
+ ENV TF_NEED_ROCM 1
+@@ -13,8 +15,12 @@ RUN apt update && apt install -y wget software-properties-common
+
+ # Add rocm repository
+ RUN apt-get clean all
+-RUN wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
+-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.list.d/rocm.list"
++RUN bin/bash -c 'if [[ $ROCM_DEB_REPO == http://repo.radeon.com/rocm/* ]] ; then \
++ wget -qO - $ROCM_DEB_REPO/rocm.gpg.key | apt-key add -; \
++ echo "deb [arch=amd64] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list; \
++ else \
++ echo "deb [arch=amd64 trusted=yes] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list ; \
++ fi'
+
+ # Install misc pkgs
+ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+
+From f5a822d2012bc3e1cea1de97ff8189404688f84e Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Wed, 12 Aug 2020 15:51:34 +0000
+Subject: [PATCH 6/8] Updating TF to account for the (ROCm 3.7) change in
+ hipDeviceGetStreamPriorityRange
+
+Starting with ROCm 3.7, the `hipDeviceGetStreamPriorityRange` API returns a range of `[-1,1]`.
+This is a departure from the `[0,2]` range that was returned by this API in ROCm 3.3 and prior.
+
+Updating the TF unit test, that has checks based on the range returned by this API, to account for change in the returned range
+---
+ .../common_runtime/gpu/gpu_device_test.cc | 34 +++++--------------
+ 1 file changed, 8 insertions(+), 26 deletions(-)
+
+diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+index 6448fc56af7a1..21c75244b5feb 100644
+--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
++++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+@@ -230,9 +230,9 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) {
+ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+ {
+ #if TENSORFLOW_USE_ROCM
+- // Priority outside the range (0, 2) for AMD GPUs
++ // Priority outside the range (-1, 1) for AMD GPUs
+ SessionOptions opts =
+- MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
++ MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 1}});
+ #else
+ // Priority outside the range (-2, 0) for NVidia GPUs
+ SessionOptions opts =
+@@ -245,7 +245,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+ #if TENSORFLOW_USE_ROCM
+ ExpectErrorMessageSubstr(
+ status,
+- "Priority -1 is outside the range of supported priorities [0,2] for"
++ "Priority -2 is outside the range of supported priorities [-1,1] for"
+ " virtual device 0 on GPU# 0");
+ #else
+ ExpectErrorMessageSubstr(
+@@ -254,8 +254,8 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+ }
+ {
+ #if TENSORFLOW_USE_ROCM
+- // Priority outside the range (0, 2) for AMD GPUs
+- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 3}});
++ // Priority outside the range (-1, 1) for AMD GPUs
++ SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
+ #else
+ // Priority outside the range (-2, 0) for NVidia GPUs
+ SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
+@@ -267,7 +267,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+ #if TENSORFLOW_USE_ROCM
+ ExpectErrorMessageSubstr(
+ status,
+- "Priority 3 is outside the range of supported priorities [0,2] for"
++ "Priority 2 is outside the range of supported priorities [-1,1] for"
+ " virtual device 0 on GPU# 0");
+ #else
+ ExpectErrorMessageSubstr(
+@@ -288,26 +288,17 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) {
+ }
+
+ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
+-#if TENSORFLOW_USE_ROCM
+- // Valid range for priority values on AMD GPUs in (0,2)
+- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
+-#else
++ // Valid range for priority values on AMD GPUs in (-1,1)
+ // Valid range for priority values on NVidia GPUs in (-2, 0)
+ SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}});
+-#endif
+ std::vector<std::unique_ptr<Device>> devices;
+ TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+ opts, kDeviceNamePrefix, &devices));
+ EXPECT_EQ(2, devices.size());
+ EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+ EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+-#if TENSORFLOW_USE_ROCM
+- EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+- EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+-#else
+ EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+ EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+-#endif
+ ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
+ ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
+ EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
+@@ -339,27 +330,18 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
+ }
+ {
+ // Multile virtual devices with matching priority.
+-#if TENSORFLOW_USE_ROCM
+- // Valid range for priority values on AMD GPUs in (0,2)
+- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{2, 1}});
+-#else
++ // Valid range for priority values on AMD GPUs in (-1,1)
+ // Valid range for priority values on NVidia GPUs in (-2, 0)
+ SessionOptions opts =
+ MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}});
+-#endif
+ std::vector<std::unique_ptr<Device>> devices;
+ TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+ opts, kDeviceNamePrefix, &devices));
+ EXPECT_EQ(2, devices.size());
+ EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+ EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+-#if TENSORFLOW_USE_ROCM
+- EXPECT_EQ(2, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+- EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+-#else
+ EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+ EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+-#endif
+ }
+ }
+
+
+From ae9e3bd2fb8c3e042742b8c534c9020732c2c66d Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Wed, 12 Aug 2020 23:05:32 +0000
+Subject: [PATCH 7/8] Commenting out subtests that are failing due to JIRA
+ ticket 236756, and also removing the no_rocm tag from the tests that contain
+ those subtests
+
+---
+ tensorflow/python/ops/parallel_for/math_test.py | 5 +++++
+ tensorflow/python/ops/ragged/ragged_dispatch_test.py | 5 +++++
+ 2 files changed, 10 insertions(+)
+
+diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
+index 933ce765cdbfa..367f40d341115 100644
+--- a/tensorflow/python/ops/parallel_for/math_test.py
++++ b/tensorflow/python/ops/parallel_for/math_test.py
+@@ -82,6 +82,11 @@ def test_unary_cwise_complex_ops(self):
+ self._test_unary_cwise_ops(complex_ops, True)
+
+ def test_unary_cwise_real_ops_1(self):
++ if test.is_built_with_rocm():
++ # TODO(rocm):
++ # This fails on ROCm...see JIRA ticket 236756
++ self.skipTest('Fails on ROCM')
++
+ real_ops = [
+ lambda x: math_ops.acosh(1 + math_ops.square(x)),
+ math_ops.abs,
+diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+index 0237624aa451d..7a1d7c1882af1 100644
+--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
++++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+@@ -139,6 +139,11 @@ def assertSameShape(self, x, y):
+ ]
+ ) # pyformat: disable
+ def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
++ if test_util.IsBuiltWithROCm():
++ # TODO(rocm):
++ # This fails on ROCm...see JIRA ticket 236756
++ self.skipTest('Fails on ROCM')
++
+ result = op(x, **extra_args)
+
+ # Run the wrapped op on the dense values, for comparison.
+
+From d4b8e68a3675bfb2d7465205420bd5ad15701d0b Mon Sep 17 00:00:00 2001
+From: Deven Desai <deven.desai.amd@gmail.com>
+Date: Wed, 26 Aug 2020 22:01:18 +0000
+Subject: [PATCH 8/8] Adding no_rocm tag to unit-tests that will not pass with
+ ROCm 3.7 until PR #42288 gets merged
+
+---
+ tensorflow/python/BUILD | 1 +
+ tensorflow/python/keras/optimizer_v2/BUILD | 2 ++
+ 2 files changed, 3 insertions(+)
+
+diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
+index a111237e0565d..5252ebbed6e4b 100644
+--- a/tensorflow/python/BUILD
++++ b/tensorflow/python/BUILD
+@@ -5423,6 +5423,7 @@ cuda_py_test(
+ python_version = "PY3",
+ shard_count = 10,
+ tags = [
++ "no_rocm",
+ "no_windows_gpu",
+ "noasan", # b/159332048
+ "nomsan", # b/148630708
+diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
+index b208e2e1e1e6b..11966ce8211d2 100644
+--- a/tensorflow/python/keras/optimizer_v2/BUILD
++++ b/tensorflow/python/keras/optimizer_v2/BUILD
+@@ -157,6 +157,7 @@ cuda_py_test(
+ size = "medium",
+ srcs = ["adadelta_test.py"],
+ shard_count = 4,
++ tags = ["no_rocm"],
+ deps = [
+ ":optimizer_v2",
+ "//tensorflow/python:client_testlib",
+@@ -298,6 +299,7 @@ cuda_py_test(
+ size = "medium",
+ srcs = ["rmsprop_test.py"],
+ shard_count = 2,
++ tags = ["no_rocm"],
+ deps = [
+ ":optimizer_v2",
+ "//tensorflow/python:array_ops",