aboutsummarylogtreecommitdiffstats
path: root/new-rocm.patch
blob: 01eb2b4fab8cc0b6ed6926856aca10b492ac336a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
From fcc2de09eb38f45b678a5457f594ca594f2572c9 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Thu, 16 Jul 2020 19:38:03 +0000
Subject: [PATCH 1/8] Change references to libhip_hcc.so to refer to
 libamdhip64.so instead

With the switch to the new hipclang-vdi runtime (in ROCm 3.5), the new name for the HIP runtime library is libamdhip64.so.

For backwards compatibility, ROCm 3.5 and ROCm 3.6 include a "libhip_hcc.so" softlink, which points to libamdhip64.so. That softlink will be going away starting with ROCm 3.7(?).

This commit updates references to libhip_hcc.so (in the TF build) to use libamdhip64.so instead.

See following JIRA tickets for further details:

* http://ontrack-internal.amd.com/browse/SWDEV-244762
* http://ontrack-internal.amd.com/browse/SWDEV-238533
---
 tensorflow/stream_executor/platform/default/dso_loader.cc | 2 +-
 .../crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl | 7 -------
 third_party/gpus/rocm_configure.bzl                       | 8 +++-----
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index 70b1ebe070a76..84293b7767a20 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -140,7 +140,7 @@ port::StatusOr<void*> GetHipsparseDsoHandle() {
   return GetDsoHandle("hipsparse", "");
 }
 
-port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
+port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); }
 
 }  // namespace DsoLoader
 
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
index 8848bd32c2e1d..d5bfe78c6449d 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
@@ -34,8 +34,6 @@ HIPCC_ENV = '%{hipcc_env}'
 HIPCC_IS_HIPCLANG = '%{hipcc_is_hipclang}'=="True"
 HIP_RUNTIME_PATH = '%{hip_runtime_path}'
 HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}'
-HCC_RUNTIME_PATH = '%{hcc_runtime_path}'
-HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}'
 ROCR_RUNTIME_PATH = '%{rocr_runtime_path}'
 ROCR_RUNTIME_LIBRARY = '%{rocr_runtime_library}'
 VERBOSE = '%{crosstool_verbose}'=='1'
@@ -267,11 +265,6 @@ def main():
     gpu_linker_flags.append('-L' + ROCR_RUNTIME_PATH)
     gpu_linker_flags.append('-Wl,-rpath=' + ROCR_RUNTIME_PATH)
     gpu_linker_flags.append('-l' + ROCR_RUNTIME_LIBRARY)
-    # do not link with HCC runtime library in case hip-clang toolchain is used
-    if not HIPCC_IS_HIPCLANG:
-      gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH)
-      gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH)
-      gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY)
     gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH)
     gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
     gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY)
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 1312574f0aa46..0508279518894 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -390,7 +390,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
     libs_paths = [
         (name, _rocm_lib_paths(repository_ctx, name, path))
         for name, path in [
-            ("hip_hcc", rocm_config.rocm_toolkit_path + "/hip"),
+            ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"),
             ("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"),
             ("rocfft", rocm_config.rocm_toolkit_path + "/rocfft"),
             ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"),
@@ -646,7 +646,7 @@ def _create_local_rocm_repository(repository_ctx):
         "rocm/BUILD",
         tpl_paths["rocm:BUILD"],
         {
-            "%{hip_lib}": rocm_libs["hip_hcc"].file_name,
+            "%{hip_lib}": rocm_libs["amdhip64"].file_name,
             "%{rocblas_lib}": rocm_libs["rocblas"].file_name,
             "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
             "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
@@ -733,9 +733,7 @@ def _create_local_rocm_repository(repository_ctx):
             "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib",
             "%{rocr_runtime_library}": "hsa-runtime64",
             "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib",
-            "%{hip_runtime_library}": "hip_hcc",
-            "%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib",
-            "%{hcc_runtime_library}": "mcwamp",
+            "%{hip_runtime_library}": "amdhip64",
             "%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
             "%{gcc_host_compiler_path}": str(cc),
         },

From 77fb7fd1c68f81c416fd909b6677277b3637be05 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Fri, 17 Jul 2020 01:04:58 +0000
Subject: [PATCH 2/8] Removing references to `*StaticCompiledGEMM` from TF code

This commit goes together with the following MIOpen PR, which removes scgemm from MIOpen:
https://github.com/ROCmSoftwarePlatform/MIOpen/pull/325

The MIOpen release that includes that change will be included in the next ROCm release.
This commit removes references to `*StaticCompiledGEMM` from TF code to prepare for switching to the next ROCm release (3.7)
---
 tensorflow/stream_executor/rocm/rocm_dnn.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc
index 80306105d4adf..4c5a740dfb090 100644
--- a/tensorflow/stream_executor/rocm/rocm_dnn.cc
+++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc
@@ -113,9 +113,6 @@ string ToString(miopenConvFwdAlgorithm_t algorithm) {
     case miopenConvolutionFwdAlgoImplicitGEMM:
       s = "Implicit GEMM";
       break;
-    case miopenConvolutionFwdAlgoStaticCompiledGEMM:
-      s = "Static Compiled GEMM";
-      break;
   }
   return s;
 }
@@ -182,9 +179,6 @@ string ToString(miopenConvAlgorithm_t algorithm) {
     case miopenConvolutionAlgoImplicitGEMM:
       s = "Implicit GEMM";
       break;
-    case miopenConvolutionAlgoStaticCompiledGEMM:
-      s = "Static Compiled GEMM";
-      break;
   }
   return s;
 }

From 566d2a95c6140322241bce20fcfea952e837fda1 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Tue, 11 Aug 2020 02:09:46 +0000
Subject: [PATCH 3/8] Reverting "Provide ldexp float overload for HIP, it's
 missing in their headers. "

---
 tensorflow/core/kernels/cwise_ops_gpu_common.cu.h | 6 ------
 tensorflow/core/kernels/rnn/blas_gemm.h           | 5 -----
 2 files changed, 11 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index 8849c3f4eddbb..ecc58da315f6b 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -30,12 +30,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 #include "tensorflow/core/platform/logging.h"
-
-#ifdef __HIP_DEVICE_COMPILE__
-// Provide ldexp float overload for HIP, it's missing in their headers.
-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
-#endif
-
 namespace tensorflow {
 namespace functor {
 
diff --git a/tensorflow/core/kernels/rnn/blas_gemm.h b/tensorflow/core/kernels/rnn/blas_gemm.h
index 74f4cd2bb39a4..126e1edef17a9 100644
--- a/tensorflow/core/kernels/rnn/blas_gemm.h
+++ b/tensorflow/core/kernels/rnn/blas_gemm.h
@@ -25,11 +25,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
 
-#ifdef __HIP_DEVICE_COMPILE__
-// Provide ldexp float overload for HIP, it's missing in their headers.
-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
-#endif
-
 namespace tensorflow {
 class OpKernelContext;
 namespace functor {

From 9dcaad456e194bf8d1e3962cd6ad272f4879d7f3 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 12 Aug 2020 00:39:02 +0000
Subject: [PATCH 4/8] updating ROCM CI scripts to use ROCm 3.7

---
 .../tools/ci_build/linux/rocm/run_cc_core.sh  | 34 +++++++++++++------
 .../ci_build/linux/rocm/run_csb_tests.sh      | 27 ++++++++++-----
 .../tools/ci_build/linux/rocm/run_py3_core.sh | 23 +++++++++----
 .../tools/ci_build/xla/linux/rocm/run_py3.sh  | 33 ++++++++++++------
 4 files changed, 79 insertions(+), 38 deletions(-)

diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
index 1f4a36f8de0f5..92d21cb133be9 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
@@ -18,20 +18,27 @@
 set -e
 set -x
 
-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""
 
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
@@ -39,15 +46,17 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test \
       --config=rocm \
       -k \
-      --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+      --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-multi_gpu,-v1only \
       --test_lang_filters=cc \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=${TF_GPU_COUNT}\
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --build_tests_only \
       --test_output=errors \
       --test_sharding_strategy=disabled \
-      --test_size_filters=small,medium \
+      --test_size_filters=small,medium,large \
       --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
       -- \
       //tensorflow/... \
@@ -59,11 +68,14 @@ bazel test \
       --config=rocm \
       -k \
       --test_tag_filters=gpu \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=1 \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --build_tests_only \
       --test_output=errors \
       --test_sharding_strategy=disabled \
+      --test_size_filters=small,medium,large \
       -- \
       //tensorflow/core/nccl:nccl_manager_test
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
index 4962b2789b1c0..80c0686e64724 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
@@ -18,20 +18,27 @@
 set -e
 set -x
 
-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""
 
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
@@ -40,8 +47,10 @@ bazel test \
       --config=rocm \
       -k \
       --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=${TF_GPU_COUNT} \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --test_output=errors \
       --test_sharding_strategy=disabled \
@@ -60,8 +69,8 @@ bazel test \
       --test_tag_filters=gpu \
       --test_timeout 600,900,2400,7200 \
       --test_output=errors \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=1 \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
       --test_sharding_strategy=disabled \
       -- \
       //tensorflow/core/nccl:nccl_manager_test
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
index 7ea866f8e2032..3a09081dd6ac6 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -18,20 +18,27 @@
 set -e
 set -x
 
-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""
 
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
@@ -41,8 +48,10 @@ bazel test \
       -k \
       --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
       --test_lang_filters=py \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=${TF_GPU_COUNT} \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --build_tests_only \
       --test_output=errors \
diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
index 6ce1fad9cc754..d623b77d5333d 100755
--- a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
@@ -18,20 +18,27 @@
 set -e
 set -x
 
-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""
 
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR
 
 yes "" | $PYTHON_BIN_PATH configure.py
 echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
@@ -41,9 +48,11 @@ bazel test \
       --config=rocm \
       --config=xla \
       -k \
-      --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=${TF_GPU_COUNT} \
+      --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --build_tests_only \
       --test_output=errors \
@@ -65,9 +74,11 @@ bazel test \
       --config=rocm \
       --config=xla \
       -k \
-      --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-      --jobs=${N_JOBS} \
-      --local_test_jobs=${TF_GPU_COUNT} \
+      --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+      --jobs=${N_BUILD_JOBS} \
+      --local_test_jobs=${N_TEST_JOBS} \
+      --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+      --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
       --test_timeout 600,900,2400,7200 \
       --build_tests_only \
       --test_output=errors \

From 4b76a49a1a5741dece6d368b30f7125e20c12878 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 26 Aug 2020 15:21:31 +0000
Subject: [PATCH 5/8] Updating Dockerfile.rocm to use ROCm 3.7

---
 tensorflow/tools/ci_build/Dockerfile.rocm | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index 4f5d3ae7291b1..d209173258ada 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -3,8 +3,10 @@
 FROM ubuntu:bionic
 MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>
 
-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
-ARG ROCM_PATH=/opt/rocm-3.3.0
+ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/3.7/
+ARG ROCM_BUILD_NAME=xenial
+ARG ROCM_BUILD_NUM=main
+ARG ROCM_PATH=/opt/rocm-3.7.0
 
 ENV DEBIAN_FRONTEND noninteractive
 ENV TF_NEED_ROCM 1
@@ -13,8 +15,12 @@ RUN apt update && apt install -y wget software-properties-common
 
 # Add rocm repository
 RUN apt-get clean all
-RUN wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
-RUN sh -c  "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.list.d/rocm.list"
+RUN bin/bash -c 'if [[ $ROCM_DEB_REPO == http://repo.radeon.com/rocm/*  ]] ; then \
+      wget -qO - $ROCM_DEB_REPO/rocm.gpg.key | apt-key add -; \
+      echo "deb [arch=amd64] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list; \
+    else \
+      echo "deb [arch=amd64 trusted=yes] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list ; \
+    fi'
 
 # Install misc pkgs
 RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \

From f5a822d2012bc3e1cea1de97ff8189404688f84e Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 12 Aug 2020 15:51:34 +0000
Subject: [PATCH 6/8] Updating TF to account for the (ROCm 3.7) change in
 hipDeviceGetStreamPriorityRange

Starting with ROCm 3.7, the `hipDeviceGetStreamPriorityRange` API returns a range of `[-1,1]`.
This is a departure from the `[0,2]` range that was returned by this API in ROCm 3.3 and prior.

Updating the TF unit test that has checks based on the range returned by this API, to account for the change in the returned range.
---
 .../common_runtime/gpu/gpu_device_test.cc     | 34 +++++--------------
 1 file changed, 8 insertions(+), 26 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 6448fc56af7a1..21c75244b5feb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -230,9 +230,9 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) {
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
   {
 #if TENSORFLOW_USE_ROCM
-    // Priority outside the range (0, 2) for AMD GPUs
+    // Priority outside the range (-1, 1) for AMD GPUs
     SessionOptions opts =
-        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 1}});
 #else
     // Priority outside the range (-2, 0) for NVidia GPUs
     SessionOptions opts =
@@ -245,7 +245,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
 #if TENSORFLOW_USE_ROCM
     ExpectErrorMessageSubstr(
         status,
-        "Priority -1 is outside the range of supported priorities [0,2] for"
+        "Priority -2 is outside the range of supported priorities [-1,1] for"
         " virtual device 0 on GPU# 0");
 #else
     ExpectErrorMessageSubstr(
@@ -254,8 +254,8 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
   }
   {
 #if TENSORFLOW_USE_ROCM
-    // Priority outside the range (0, 2) for AMD GPUs
-    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 3}});
+    // Priority outside the range (-1, 1) for AMD GPUs
+    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
 #else
     // Priority outside the range (-2, 0) for NVidia GPUs
     SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
@@ -267,7 +267,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
 #if TENSORFLOW_USE_ROCM
     ExpectErrorMessageSubstr(
         status,
-        "Priority 3 is outside the range of supported priorities [0,2] for"
+        "Priority 2 is outside the range of supported priorities [-1,1] for"
         " virtual device 0 on GPU# 0");
 #else
     ExpectErrorMessageSubstr(
@@ -288,26 +288,17 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) {
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
-#if TENSORFLOW_USE_ROCM
-  // Valid range for priority values on AMD GPUs in (0,2)
-  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
-#else
+  // Valid range for priority values on AMD GPUs in (-1,1)
   // Valid range for priority values on NVidia GPUs in (-2, 0)
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}});
-#endif
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
   EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
-#if TENSORFLOW_USE_ROCM
-  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
-  EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#else
   EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
   EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#endif
   ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
   ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
   EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
@@ -339,27 +330,18 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
   }
   {
     // Multile virtual devices with matching priority.
-#if TENSORFLOW_USE_ROCM
-    // Valid range for priority values on AMD GPUs in (0,2)
-    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{2, 1}});
-#else
+    // Valid range for priority values on AMD GPUs in (-1,1)
     // Valid range for priority values on NVidia GPUs in (-2, 0)
     SessionOptions opts =
         MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}});
-#endif
     std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
         opts, kDeviceNamePrefix, &devices));
     EXPECT_EQ(2, devices.size());
     EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
     EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
-#if TENSORFLOW_USE_ROCM
-    EXPECT_EQ(2, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
-    EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#else
     EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
     EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#endif
   }
 }
 

From ae9e3bd2fb8c3e042742b8c534c9020732c2c66d Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 12 Aug 2020 23:05:32 +0000
Subject: [PATCH 7/8] Commenting out subtests that are failing due to JIRA
 ticket 236756, and also removing the no_rocm tag from the tests that contain
 those subtests

---
 tensorflow/python/ops/parallel_for/math_test.py      | 5 +++++
 tensorflow/python/ops/ragged/ragged_dispatch_test.py | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
index 933ce765cdbfa..367f40d341115 100644
--- a/tensorflow/python/ops/parallel_for/math_test.py
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -82,6 +82,11 @@ def test_unary_cwise_complex_ops(self):
     self._test_unary_cwise_ops(complex_ops, True)
 
   def test_unary_cwise_real_ops_1(self):
+    if test.is_built_with_rocm():
+      # TODO(rocm):
+      # This fails on ROCm...see JIRA ticket 236756
+      self.skipTest('Fails on ROCM')
+
     real_ops = [
         lambda x: math_ops.acosh(1 + math_ops.square(x)),
         math_ops.abs,
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 0237624aa451d..7a1d7c1882af1 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -139,6 +139,11 @@ def assertSameShape(self, x, y):
       ]
       )  # pyformat: disable
   def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
+    if test_util.IsBuiltWithROCm():
+      # TODO(rocm):
+      # This fails on ROCm...see JIRA ticket 236756
+      self.skipTest('Fails on ROCM')
+
     result = op(x, **extra_args)
 
     # Run the wrapped op on the dense values, for comparison.

From d4b8e68a3675bfb2d7465205420bd5ad15701d0b Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 26 Aug 2020 22:01:18 +0000
Subject: [PATCH 8/8] Adding no_rocm tag to unit-tests that will not pass with
 ROCm 3.7 until PR #42288 gets merged

---
 tensorflow/python/BUILD                    | 1 +
 tensorflow/python/keras/optimizer_v2/BUILD | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a111237e0565d..5252ebbed6e4b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -5423,6 +5423,7 @@ cuda_py_test(
     python_version = "PY3",
     shard_count = 10,
     tags = [
+        "no_rocm",
         "no_windows_gpu",
         "noasan",  # b/159332048
         "nomsan",  # b/148630708
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index b208e2e1e1e6b..11966ce8211d2 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -157,6 +157,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["adadelta_test.py"],
     shard_count = 4,
+    tags = ["no_rocm"],
     deps = [
         ":optimizer_v2",
         "//tensorflow/python:client_testlib",
@@ -298,6 +299,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["rmsprop_test.py"],
     shard_count = 2,
+    tags = ["no_rocm"],
     deps = [
         ":optimizer_v2",
         "//tensorflow/python:array_ops",