summary | log | tree | commit | diff | stats
path: root/gpu-fix.patch
blob: 066adf5d80fb6f6bba1f66c8a44a598cd0de1034 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
From e9ad6b2e9fb35b107d6b1462ddc7101b94122d3c Mon Sep 17 00:00:00 2001
From: Anton <anton.j.15.le@gmail.com>
Date: Mon, 15 Sep 2025 19:08:30 +1200
Subject: [PATCH] respect GPU in exclusive process compute mode

---
 gpu.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/gpu.c b/gpu.c
index fb277a9..01913e8 100755
--- a/gpu.c
+++ b/gpu.c
@@ -3,11 +3,11 @@
 //
 
 #include <stdlib.h>
+#include <stdbool.h>
 #include <nvml.h>
 #include <string.h>
 
 #include "main.h"
-
 #define TS_VISIBLE_DEVICES "TS_VISIBLE_DEVICES"
 
 static int free_percentage = 90;
@@ -90,6 +90,7 @@ int * getGpuList(int *num) {
 
         nvmlMemory_t mem;
         nvmlDevice_t dev;
+        nvmlComputeMode_t computeMode;
         result = nvmlDeviceGetHandleByIndex_v2(visible[i], &dev);
         if (result != 0) {
             warning("Failed to get GPU handle for GPU %d: %s", visible[i], nvmlErrorString(result));
@@ -101,8 +102,33 @@ int * getGpuList(int *num) {
             warning("Failed to get GPU memory for GPU %d: %s", visible[i], nvmlErrorString(result));
             goto Error;
         }
+        
+        result = nvmlDeviceGetComputeMode(dev, &computeMode);
+        if (result != 0) {
+            warning("Failed to get GPU compute mode for GPU %d: %s", visible[i], nvmlErrorString(result));
+            goto Error;
+        }
+
+        // Check if there are any running processes on the GPU
+        bool anyRunningProcesses = false;
+        unsigned int infoCount = 0;
+        result = nvmlDeviceGetComputeRunningProcesses(dev, &infoCount, NULL);
+        if (result == NVML_ERROR_INSUFFICIENT_SIZE){
+            anyRunningProcesses = true;
+        } else if (result != 0) {
+            warning("Failed to get GPU compute processes for GPU %d: %s", visible[i], nvmlErrorString(result));
+            goto Error;
+        }
+
+        // Check if the GPU memory is free enough
+        bool isMemFree = mem.free > free_percentage / 100. * mem.total;
+        // When the compute mode is prohibited, or is exclusive-process mode with a
+        // process already running, the GPU is not free to use.
+        bool blockedByComputeMode = (computeMode == NVML_COMPUTEMODE_PROHIBITED) ||
+                                    (computeMode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS && anyRunningProcesses);
 
-        if (mem.free > free_percentage / 100. * mem.total)
+        bool isFree = isMemFree && !blockedByComputeMode;
+        if (isFree)
             gpuList[count++] = visible[i];
     }
     free(visible);