diff options
author | acxz | 2020-09-30 10:09:59 -0400 |
---|---|---|
committer | acxz | 2020-09-30 10:10:56 -0400 |
commit | ad6bd8ba4d83cf71c450fa15b9c68559749f4739 (patch) | |
tree | b2c24ce0af30baf0464d46c86defc8c9cd030b88 | |
parent | 1c2a2f45cdd06ea6442d24b14824bf1fec5aa9a8 (diff) | |
download | aur-ad6bd8ba4d83cf71c450fa15b9c68559749f4739.tar.gz |
add patch for using newer rocm versions
-rw-r--r-- | .SRCINFO | 6 | ||||
-rw-r--r-- | PKGBUILD | 11 | ||||
-rw-r--r-- | new-rocm.patch | 692 |
3 files changed, 701 insertions, 8 deletions
@@ -1,7 +1,7 @@ pkgbase = tensorflow-rocm pkgdesc = Library for computation using data flow graphs for scalable machine learning pkgver = 2.3.1 - pkgrel = 1 + pkgrel = 2 url = https://www.tensorflow.org/ arch = x86_64 license = APACHE @@ -31,7 +31,7 @@ pkgbase = tensorflow-rocm source = fix_gpu_atomic_redef.patch::https://github.com/tensorflow/tensorflow/commit/c054f40f66fa625f51085a20c48554c61d05c5fd.patch source = fix_ldexp_float.patch::https://github.com/tensorflow/tensorflow/commit/655ce09f679a90ecd561538227c703b42d0fc5fa.patch source = fix_occupancy_block.patch - source = rocm-3.7.patch::https://github.com/tensorflow/tensorflow/pull/42689.patch + source = new-rocm.patch sha512sums = e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba70c7f803f8ca06faf2c6b5396e60b1e0e9b97bfbd667e733b08b6e6d70ef0 sha512sums = df2e0373e2f63b8766f31933f7db57f6a7559b8f03af1db51644fba87731451a7cd3895529a3192e5394612fcb42f245b794b1c9ca3c05881ca03a547c8c9acc sha512sums = e51e3f3dced121db3a09fbdaefd33555536095584b72a5eb6f302fa6fa68ab56ea45e8a847ec90ff4ba076db312c06f91ff672e08e95263c658526582494ce08 @@ -40,7 +40,7 @@ pkgbase = tensorflow-rocm sha512sums = 75972acf0ec53b28aa6c93de77a385acaf675c0d0ae93b6545f67414e9895cbd1074a5d65b211390846b736df271a567b49ec4c992883ad83c060f708bbe0d20 sha512sums = 42fc09bc15412f3b9a82f36485735faed0dcc2f47d72c5bfc451bc09a2aad472db59edb387455fb6594b1606de3a7789917e1fb31280c7044898097ec37db3d5 sha512sums = 88c04ed7a766193687d7079102332e3c63d6f0accbda777836abe5e03e9ebb83fd1aeaa9e4adca70310ce18bf3c6c3907f1f8a11c13e67e3ef79497b91bbf126 - sha512sums = SKIP + sha512sums = 080fd9d4e1228ceb04901a0caceb18b965ef199704196a9b7711fcada3a8cfc2f65c529c4c0e05960ab1e469d203727bf0bbded82d895c13e0e2ab29ae524317 pkgname = tensorflow-rocm pkgdesc = Library for computation using data flow graphs for scalable machine learning (with ROCM) @@ -16,7 +16,7 @@ pkgname=() pkgver=2.3.1 _pkgver=2.3.1 -pkgrel=1 +pkgrel=2 pkgdesc="Library for computation using data flow graphs for scalable machine learning" url="https://www.tensorflow.org/" license=('APACHE') @@ -34,7 +34,7 @@ source=("$pkgname-$pkgver.tar.gz::https://github.com/tensorflow/tensorflow/archi fix_gpu_atomic_redef.patch::https://github.com/tensorflow/tensorflow/commit/c054f40f66fa625f51085a20c48554c61d05c5fd.patch fix_ldexp_float.patch::https://github.com/tensorflow/tensorflow/commit/655ce09f679a90ecd561538227c703b42d0fc5fa.patch fix_occupancy_block.patch - rocm-3.7.patch::https://github.com/tensorflow/tensorflow/pull/42689.patch) + new-rocm.patch) sha512sums=('e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba70c7f803f8ca06faf2c6b5396e60b1e0e9b97bfbd667e733b08b6e6d70ef0' 'df2e0373e2f63b8766f31933f7db57f6a7559b8f03af1db51644fba87731451a7cd3895529a3192e5394612fcb42f245b794b1c9ca3c05881ca03a547c8c9acc' @@ -44,7 +44,7 @@ sha512sums=('e497ef4564f50abf9f918be4522cf702f4cf945cb1ebf83af1386ac4ddc7373b3ba '75972acf0ec53b28aa6c93de77a385acaf675c0d0ae93b6545f67414e9895cbd1074a5d65b211390846b736df271a567b49ec4c992883ad83c060f708bbe0d20' '42fc09bc15412f3b9a82f36485735faed0dcc2f47d72c5bfc451bc09a2aad472db59edb387455fb6594b1606de3a7789917e1fb31280c7044898097ec37db3d5' '88c04ed7a766193687d7079102332e3c63d6f0accbda777836abe5e03e9ebb83fd1aeaa9e4adca70310ce18bf3c6c3907f1f8a11c13e67e3ef79497b91bbf126' - 'SKIP') + '080fd9d4e1228ceb04901a0caceb18b965ef199704196a9b7711fcada3a8cfc2f65c529c4c0e05960ab1e469d203727bf0bbded82d895c13e0e2ab29ae524317') get_pyver () { python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))' @@ -89,8 +89,9 @@ prepare() { # https://github.com/tensorflow/tensorflow/commit/22def20bae7be6d5b790b360abed5919385b16c2 patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/fix_occupancy_block.patch - # Update codebase for ROCm 3.7 - patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/rocm-3.7.patch + # Patch for ROCm 3.7 and later + # https://github.com/tensorflow/tensorflow/pull/42689 + patch -Np1 -d tensorflow-${_pkgver} -i "$srcdir"/new-rocm.patch cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-rocm cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-opt-rocm diff --git a/new-rocm.patch b/new-rocm.patch new file mode 100644 index 000000000000..01eb2b4fab8c --- /dev/null +++ b/new-rocm.patch @@ -0,0 +1,692 @@ +From fcc2de09eb38f45b678a5457f594ca594f2572c9 Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Thu, 16 Jul 2020 19:38:03 +0000 +Subject: [PATCH 1/8] Change references to libhip_hcc.so to refer to + libamdhip64.so instead + +With the switch to the new hipclang-vdi runtime (in ROCm 3.5), the new name for the HIP runtime library is libamdhip64.so. + +For backwards compatibility, ROCm 3.5 and ROCm 3.6 include a "libhip_hcc.so" softlink, which points to libamdhip64.so. That softlink will be going away starting with ROCm 3.7(?). + +This commit updates references to libhip_hcc.so (in the TF build) to use libamdhip64.so instead. + +See following JIRA tickets for further details: + +* http://ontrack-internal.amd.com/browse/SWDEV-244762 +* http://ontrack-internal.amd.com/browse/SWDEV-238533 +--- + tensorflow/stream_executor/platform/default/dso_loader.cc | 2 +- + .../crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl | 7 ------- + third_party/gpus/rocm_configure.bzl | 8 +++----- + 3 files changed, 4 insertions(+), 13 deletions(-) + +diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc +index 70b1ebe070a76..84293b7767a20 100644 +--- a/tensorflow/stream_executor/platform/default/dso_loader.cc ++++ b/tensorflow/stream_executor/platform/default/dso_loader.cc +@@ -140,7 +140,7 @@ port::StatusOr<void*> GetHipsparseDsoHandle() { + return GetDsoHandle("hipsparse", ""); + } + +-port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); } ++port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); } + + } // namespace DsoLoader + +diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +index 8848bd32c2e1d..d5bfe78c6449d 100755 +--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl ++++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +@@ -34,8 +34,6 @@ HIPCC_ENV = '%{hipcc_env}' + HIPCC_IS_HIPCLANG = '%{hipcc_is_hipclang}'=="True" + HIP_RUNTIME_PATH = '%{hip_runtime_path}' + HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}' +-HCC_RUNTIME_PATH = '%{hcc_runtime_path}' +-HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}' + ROCR_RUNTIME_PATH = '%{rocr_runtime_path}' + ROCR_RUNTIME_LIBRARY = '%{rocr_runtime_library}' + VERBOSE = '%{crosstool_verbose}'=='1' +@@ -267,11 +265,6 @@ def main(): + gpu_linker_flags.append('-L' + ROCR_RUNTIME_PATH) + gpu_linker_flags.append('-Wl,-rpath=' + ROCR_RUNTIME_PATH) + gpu_linker_flags.append('-l' + ROCR_RUNTIME_LIBRARY) +- # do not link with HCC runtime library in case hip-clang toolchain is used +- if not HIPCC_IS_HIPCLANG: +- gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH) +- gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH) +- gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY) + gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH) + gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH) + gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY) +diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl +index 1312574f0aa46..0508279518894 100644 +--- a/third_party/gpus/rocm_configure.bzl ++++ b/third_party/gpus/rocm_configure.bzl +@@ -390,7 +390,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin): + libs_paths = [ + (name, _rocm_lib_paths(repository_ctx, name, path)) + for name, path in [ +- ("hip_hcc", rocm_config.rocm_toolkit_path + "/hip"), ++ ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"), + ("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"), + ("rocfft", rocm_config.rocm_toolkit_path + "/rocfft"), + ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"), +@@ -646,7 +646,7 @@ def _create_local_rocm_repository(repository_ctx): + "rocm/BUILD", + tpl_paths["rocm:BUILD"], + { +- "%{hip_lib}": rocm_libs["hip_hcc"].file_name, ++ "%{hip_lib}": rocm_libs["amdhip64"].file_name, + "%{rocblas_lib}": rocm_libs["rocblas"].file_name, + "%{rocfft_lib}": rocm_libs["rocfft"].file_name, + "%{hiprand_lib}": rocm_libs["hiprand"].file_name, +@@ -733,9 +733,7 @@ def _create_local_rocm_repository(repository_ctx): + "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", + "%{rocr_runtime_library}": "hsa-runtime64", + "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib", +- "%{hip_runtime_library}": "hip_hcc", +- "%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib", +- "%{hcc_runtime_library}": "mcwamp", ++ "%{hip_runtime_library}": "amdhip64", + "%{crosstool_verbose}": _crosstool_verbose(repository_ctx), + "%{gcc_host_compiler_path}": str(cc), + }, + +From 77fb7fd1c68f81c416fd909b6677277b3637be05 Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Fri, 17 Jul 2020 01:04:58 +0000 +Subject: [PATCH 2/8] Removing references to `*StaticCompiledGEMM` from TF code + +This commit is in conjunction with this MIOpen PR which removes scgemm from MIOpen +https://github.com/ROCmSoftwarePlatform/MIOpen/pull/325 + +The MIOpen release that includes that change will be included in the next ROCm release. +This commit removes references to `*StaticCompiledGEMM` from TF code to prepare for switching to the next ROCm release (3.7) +--- + tensorflow/stream_executor/rocm/rocm_dnn.cc | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc +index 80306105d4adf..4c5a740dfb090 100644 +--- a/tensorflow/stream_executor/rocm/rocm_dnn.cc ++++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc +@@ -113,9 +113,6 @@ string ToString(miopenConvFwdAlgorithm_t algorithm) { + case miopenConvolutionFwdAlgoImplicitGEMM: + s = "Implicit GEMM"; + break; +- case miopenConvolutionFwdAlgoStaticCompiledGEMM: +- s = "Static Compiled GEMM"; +- break; + } + return s; + } +@@ -182,9 +179,6 @@ string ToString(miopenConvAlgorithm_t algorithm) { + case miopenConvolutionAlgoImplicitGEMM: + s = "Implicit GEMM"; + break; +- case miopenConvolutionAlgoStaticCompiledGEMM: +- s = "Static Compiled GEMM"; +- break; + } + return s; + } + +From 566d2a95c6140322241bce20fcfea952e837fda1 Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Tue, 11 Aug 2020 02:09:46 +0000 +Subject: [PATCH 3/8] Reverting "Provide ldexp float overload for HIP, it's + missing in their headers. " + +--- + tensorflow/core/kernels/cwise_ops_gpu_common.cu.h | 6 ------ + tensorflow/core/kernels/rnn/blas_gemm.h | 5 ----- + 2 files changed, 11 deletions(-) + +diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h +index 8849c3f4eddbb..ecc58da315f6b 100644 +--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h ++++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h +@@ -30,12 +30,6 @@ limitations under the License. + #include "tensorflow/core/platform/types.h" + + #include "tensorflow/core/platform/logging.h" +- +-#ifdef __HIP_DEVICE_COMPILE__ +-// Provide ldexp float overload for HIP, it's missing in their headers. +-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); } +-#endif +- + namespace tensorflow { + namespace functor { + +diff --git a/tensorflow/core/kernels/rnn/blas_gemm.h b/tensorflow/core/kernels/rnn/blas_gemm.h +index 74f4cd2bb39a4..126e1edef17a9 100644 +--- a/tensorflow/core/kernels/rnn/blas_gemm.h ++++ b/tensorflow/core/kernels/rnn/blas_gemm.h +@@ -25,11 +25,6 @@ limitations under the License. + #include "tensorflow/core/kernels/eigen_contraction_kernel.h" + #endif + +-#ifdef __HIP_DEVICE_COMPILE__ +-// Provide ldexp float overload for HIP, it's missing in their headers. +-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); } +-#endif +- + namespace tensorflow { + class OpKernelContext; + namespace functor { + +From 9dcaad456e194bf8d1e3962cd6ad272f4879d7f3 Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Wed, 12 Aug 2020 00:39:02 +0000 +Subject: [PATCH 4/8] updating ROCM CI scripts to use ROCm 3.7 + +--- + .../tools/ci_build/linux/rocm/run_cc_core.sh | 34 +++++++++++++------ + .../ci_build/linux/rocm/run_csb_tests.sh | 27 ++++++++++----- + .../tools/ci_build/linux/rocm/run_py3_core.sh | 23 +++++++++---- + .../tools/ci_build/xla/linux/rocm/run_py3.sh | 33 ++++++++++++------ + 4 files changed, 79 insertions(+), 38 deletions(-) + +diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh +index 1f4a36f8de0f5..92d21cb133be9 100755 +--- a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh ++++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh +@@ -18,20 +18,27 @@ + set -e + set -x + +-N_JOBS=$(grep -c ^processor /proc/cpuinfo) +-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo) ++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++TF_TESTS_PER_GPU=1 ++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU}) + + echo "" +-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)." ++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)." + echo "" + ++# First positional argument (if any) specifies the ROCM_INSTALL_DIR ++ROCM_INSTALL_DIR=/opt/rocm-3.7.0 ++if [[ -n $1 ]]; then ++ ROCM_INSTALL_DIR=$1 ++fi ++ + # Run configure. + export PYTHON_BIN_PATH=`which python3` + export CC_OPT_FLAGS='-mavx' + + export TF_NEED_ROCM=1 +-export ROCM_PATH=/opt/rocm-3.3.0 +-export TF_GPU_COUNT=${N_GPUS} ++export ROCM_PATH=$ROCM_INSTALL_DIR + + yes "" | $PYTHON_BIN_PATH configure.py + +@@ -39,15 +46,17 @@ yes "" | $PYTHON_BIN_PATH configure.py + bazel test \ + --config=rocm \ + -k \ +- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ ++ --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-multi_gpu,-v1only \ + --test_lang_filters=cc \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=${TF_GPU_COUNT}\ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --build_tests_only \ + --test_output=errors \ + --test_sharding_strategy=disabled \ +- --test_size_filters=small,medium \ ++ --test_size_filters=small,medium,large \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ + -- \ + //tensorflow/... \ +@@ -59,11 +68,14 @@ bazel test \ + --config=rocm \ + -k \ + --test_tag_filters=gpu \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=1 \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --build_tests_only \ + --test_output=errors \ + --test_sharding_strategy=disabled \ ++ --test_size_filters=small,medium,large \ + -- \ + //tensorflow/core/nccl:nccl_manager_test +diff --git a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh +index 4962b2789b1c0..80c0686e64724 100755 +--- a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh ++++ b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh +@@ -18,20 +18,27 @@ + set -e + set -x + +-N_JOBS=$(grep -c ^processor /proc/cpuinfo) +-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo) ++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++TF_TESTS_PER_GPU=1 ++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU}) + + echo "" +-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)." ++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)." + echo "" + ++# First positional argument (if any) specifies the ROCM_INSTALL_DIR ++ROCM_INSTALL_DIR=/opt/rocm-3.7.0 ++if [[ -n $1 ]]; then ++ ROCM_INSTALL_DIR=$1 ++fi ++ + # Run configure. + export PYTHON_BIN_PATH=`which python3` + export CC_OPT_FLAGS='-mavx' + + export TF_NEED_ROCM=1 +-export ROCM_PATH=/opt/rocm-3.3.0 +-export TF_GPU_COUNT=${N_GPUS} ++export ROCM_PATH=$ROCM_INSTALL_DIR + + yes "" | $PYTHON_BIN_PATH configure.py + +@@ -40,8 +47,10 @@ bazel test \ + --config=rocm \ + -k \ + --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=${TF_GPU_COUNT} \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --test_output=errors \ + --test_sharding_strategy=disabled \ +@@ -60,8 +69,8 @@ bazel test \ + --test_tag_filters=gpu \ + --test_timeout 600,900,2400,7200 \ + --test_output=errors \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=1 \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ + --test_sharding_strategy=disabled \ + -- \ + //tensorflow/core/nccl:nccl_manager_test +diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh +index 7ea866f8e2032..3a09081dd6ac6 100755 +--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh ++++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh +@@ -18,20 +18,27 @@ + set -e + set -x + +-N_JOBS=$(grep -c ^processor /proc/cpuinfo) +-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo) ++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++TF_TESTS_PER_GPU=1 ++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU}) + + echo "" +-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)." ++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)." + echo "" + ++# First positional argument (if any) specifies the ROCM_INSTALL_DIR ++ROCM_INSTALL_DIR=/opt/rocm-3.7.0 ++if [[ -n $1 ]]; then ++ ROCM_INSTALL_DIR=$1 ++fi ++ + # Run configure. + export PYTHON_BIN_PATH=`which python3` + export CC_OPT_FLAGS='-mavx' + + export TF_NEED_ROCM=1 +-export ROCM_PATH=/opt/rocm-3.3.0 +-export TF_GPU_COUNT=${N_GPUS} ++export ROCM_PATH=$ROCM_INSTALL_DIR + + yes "" | $PYTHON_BIN_PATH configure.py + +@@ -41,8 +48,10 @@ bazel test \ + -k \ + --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ + --test_lang_filters=py \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=${TF_GPU_COUNT} \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --build_tests_only \ + --test_output=errors \ +diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh +index 6ce1fad9cc754..d623b77d5333d 100755 +--- a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh ++++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh +@@ -18,20 +18,27 @@ + set -e + set -x + +-N_JOBS=$(grep -c ^processor /proc/cpuinfo) +-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo) ++TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) ++TF_TESTS_PER_GPU=1 ++N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU}) + + echo "" +-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)." ++echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)." + echo "" + ++# First positional argument (if any) specifies the ROCM_INSTALL_DIR ++ROCM_INSTALL_DIR=/opt/rocm-3.7.0 ++if [[ -n $1 ]]; then ++ ROCM_INSTALL_DIR=$1 ++fi ++ + # Run configure. + export PYTHON_BIN_PATH=`which python3` + export CC_OPT_FLAGS='-mavx' + + export TF_NEED_ROCM=1 +-export ROCM_PATH=/opt/rocm-3.3.0 +-export TF_GPU_COUNT=${N_GPUS} ++export ROCM_PATH=$ROCM_INSTALL_DIR + + yes "" | $PYTHON_BIN_PATH configure.py + echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc +@@ -41,9 +48,11 @@ bazel test \ + --config=rocm \ + --config=xla \ + -k \ +- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=${TF_GPU_COUNT} \ ++ --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --build_tests_only \ + --test_output=errors \ +@@ -65,9 +74,11 @@ bazel test \ + --config=rocm \ + --config=xla \ + -k \ +- --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ +- --jobs=${N_JOBS} \ +- --local_test_jobs=${TF_GPU_COUNT} \ ++ --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ ++ --jobs=${N_BUILD_JOBS} \ ++ --local_test_jobs=${N_TEST_JOBS} \ ++ --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \ ++ --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \ + --test_timeout 600,900,2400,7200 \ + --build_tests_only \ + --test_output=errors \ + +From 4b76a49a1a5741dece6d368b30f7125e20c12878 Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Wed, 26 Aug 2020 15:21:31 +0000 +Subject: [PATCH 5/8] Updating Dockerfile.rocm to use ROCm 3.7 + +--- + tensorflow/tools/ci_build/Dockerfile.rocm | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm +index 4f5d3ae7291b1..d209173258ada 100644 +--- a/tensorflow/tools/ci_build/Dockerfile.rocm ++++ b/tensorflow/tools/ci_build/Dockerfile.rocm +@@ -3,8 +3,10 @@ + FROM ubuntu:bionic + MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com> + +-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/ +-ARG ROCM_PATH=/opt/rocm-3.3.0 ++ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/3.7/ ++ARG ROCM_BUILD_NAME=xenial ++ARG ROCM_BUILD_NUM=main ++ARG ROCM_PATH=/opt/rocm-3.7.0 + + ENV DEBIAN_FRONTEND noninteractive + ENV TF_NEED_ROCM 1 +@@ -13,8 +15,12 @@ RUN apt update && apt install -y wget software-properties-common + + # Add rocm repository + RUN apt-get clean all +-RUN wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add - +-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.list.d/rocm.list" ++RUN bin/bash -c 'if [[ $ROCM_DEB_REPO == http://repo.radeon.com/rocm/* ]] ; then \ ++ wget -qO - $ROCM_DEB_REPO/rocm.gpg.key | apt-key add -; \ ++ echo "deb [arch=amd64] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list; \ ++ else \ ++ echo "deb [arch=amd64 trusted=yes] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list ; \ ++ fi' + + # Install misc pkgs + RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + +From f5a822d2012bc3e1cea1de97ff8189404688f84e Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Wed, 12 Aug 2020 15:51:34 +0000 +Subject: [PATCH 6/8] Updating TF to acccount for the (ROCm 3.7) change in + hipDeviceGetStreamPriorityRange + +Starting with ROCm 3.7, the `hipDeviceGetStreamPriorityRange` API returns a range of `[-1,1]`. +This is a departure from the `[0,2]` range that was returned by this API in ROCm 3.3 and prior. + +Updating the TF unit test, that has checks based on the range returned by this API, to account for change in the returned range +--- + .../common_runtime/gpu/gpu_device_test.cc | 34 +++++-------------- + 1 file changed, 8 insertions(+), 26 deletions(-) + +diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +index 6448fc56af7a1..21c75244b5feb 100644 +--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc ++++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +@@ -230,9 +230,9 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) { + TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { + { + #if TENSORFLOW_USE_ROCM +- // Priority outside the range (0, 2) for AMD GPUs ++ // Priority outside the range (-1, 1) for AMD GPUs + SessionOptions opts = +- MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}}); ++ MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 1}}); + #else + // Priority outside the range (-2, 0) for NVidia GPUs + SessionOptions opts = +@@ -245,7 +245,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { + #if TENSORFLOW_USE_ROCM + ExpectErrorMessageSubstr( + status, +- "Priority -1 is outside the range of supported priorities [0,2] for" ++ "Priority -2 is outside the range of supported priorities [-1,1] for" + " virtual device 0 on GPU# 0"); + #else + ExpectErrorMessageSubstr( +@@ -254,8 +254,8 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { + } + { + #if TENSORFLOW_USE_ROCM +- // Priority outside the range (0, 2) for AMD GPUs +- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 3}}); ++ // Priority outside the range (-1, 1) for AMD GPUs ++ SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}}); + #else + // Priority outside the range (-2, 0) for NVidia GPUs + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}}); +@@ -267,7 +267,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { + #if TENSORFLOW_USE_ROCM + ExpectErrorMessageSubstr( + status, +- "Priority 3 is outside the range of supported priorities [0,2] for" ++ "Priority 2 is outside the range of supported priorities [-1,1] for" + " virtual device 0 on GPU# 0"); + #else + ExpectErrorMessageSubstr( +@@ -288,26 +288,17 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) { + } + + TEST_F(GPUDeviceTest, MultipleVirtualDevices) { +-#if TENSORFLOW_USE_ROCM +- // Valid range for priority values on AMD GPUs in (0,2) +- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}}); +-#else ++ // Valid range for priority values on AMD GPUs in (-1,1) + // Valid range for priority values on NVidia GPUs in (-2, 0) + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}}); +-#endif + std::vector<std::unique_ptr<Device>> devices; + TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices)); + EXPECT_EQ(2, devices.size()); + EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); + EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); +-#if TENSORFLOW_USE_ROCM +- EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority()); +- EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority()); +-#else + EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority()); + EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority()); +-#endif + ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size()); + ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size()); + EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id()); +@@ -339,27 +330,18 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) { + } + { + // Multile virtual devices with matching priority. +-#if TENSORFLOW_USE_ROCM +- // Valid range for priority values on AMD GPUs in (0,2) +- SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{2, 1}}); +-#else ++ // Valid range for priority values on AMD GPUs in (-1,1) + // Valid range for priority values on NVidia GPUs in (-2, 0) + SessionOptions opts = + MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}}); +-#endif + std::vector<std::unique_ptr<Device>> devices; + TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices)); + EXPECT_EQ(2, devices.size()); + EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); + EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); +-#if TENSORFLOW_USE_ROCM +- EXPECT_EQ(2, static_cast<BaseGPUDevice*>(devices[0].get())->priority()); +- EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority()); +-#else + EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority()); + EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority()); +-#endif + } + } + + +From ae9e3bd2fb8c3e042742b8c534c9020732c2c66d Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Wed, 12 Aug 2020 23:05:32 +0000 +Subject: [PATCH 7/8] Commeting out subtests that are failing due to JIRA + ticket 236756, and also removing the no_rocm tag from the tests that contain + those subtests + +--- + tensorflow/python/ops/parallel_for/math_test.py | 5 +++++ + tensorflow/python/ops/ragged/ragged_dispatch_test.py | 5 +++++ + 2 files changed, 10 insertions(+) + +diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py +index 933ce765cdbfa..367f40d341115 100644 +--- a/tensorflow/python/ops/parallel_for/math_test.py ++++ b/tensorflow/python/ops/parallel_for/math_test.py +@@ -82,6 +82,11 @@ def test_unary_cwise_complex_ops(self): + self._test_unary_cwise_ops(complex_ops, True) + + def test_unary_cwise_real_ops_1(self): ++ if test.is_built_with_rocm(): ++ # TODO(rocm): ++ # This fails on ROCm...see JIRA ticket 236756 ++ self.skipTest('Fails on ROCM') ++ + real_ops = [ + lambda x: math_ops.acosh(1 + math_ops.square(x)), + math_ops.abs, +diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py +index 0237624aa451d..7a1d7c1882af1 100644 +--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py ++++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py +@@ -139,6 +139,11 @@ def assertSameShape(self, x, y): + ] + ) # pyformat: disable + def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args): ++ if test_util.IsBuiltWithROCm(): ++ # TODO(rocm): ++ # This fails on ROCm...see JIRA ticket 236756 ++ self.skipTest('Fails on ROCM') ++ + result = op(x, **extra_args) + + # Run the wrapped op on the dense values, for comparison. + +From d4b8e68a3675bfb2d7465205420bd5ad15701d0b Mon Sep 17 00:00:00 2001 +From: Deven Desai <deven.desai.amd@gmail.com> +Date: Wed, 26 Aug 2020 22:01:18 +0000 +Subject: [PATCH 8/8] Adding no_rocm tag to unit-tests that will not pass with + ROCm 3.7 until PR #42288 gets merged + +--- + tensorflow/python/BUILD | 1 + + tensorflow/python/keras/optimizer_v2/BUILD | 2 ++ + 2 files changed, 3 insertions(+) + +diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD +index a111237e0565d..5252ebbed6e4b 100644 +--- a/tensorflow/python/BUILD ++++ b/tensorflow/python/BUILD +@@ -5423,6 +5423,7 @@ cuda_py_test( + python_version = "PY3", + shard_count = 10, + tags = [ ++ "no_rocm", + "no_windows_gpu", + "noasan", # b/159332048 + "nomsan", # b/148630708 +diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD +index b208e2e1e1e6b..11966ce8211d2 100644 +--- a/tensorflow/python/keras/optimizer_v2/BUILD ++++ b/tensorflow/python/keras/optimizer_v2/BUILD +@@ -157,6 +157,7 @@ cuda_py_test( + size = "medium", + srcs = ["adadelta_test.py"], + shard_count = 4, ++ tags = ["no_rocm"], + deps = [ + ":optimizer_v2", + "//tensorflow/python:client_testlib", +@@ -298,6 +299,7 @@ cuda_py_test( + size = "medium", + srcs = ["rmsprop_test.py"], + shard_count = 2, ++ tags = ["no_rocm"], + deps = [ + ":optimizer_v2", + "//tensorflow/python:array_ops", |