aboutsummarylogtreecommitdiffstats
path: root/nvidia-egpu-hotplug.sh
blob: dfd39ddaa429bcc21eba924ca994d8494557d335 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/bin/bash
# NVIDIA eGPU hotplug handler script
# Handles module loading/unloading for Thunderbolt eGPU hotplug

# File that every invocation of this handler appends its log lines to.
LOGFILE=/var/log/nvidia-egpu-hotplug.log
# First argument: the hotplug action ("add" or "remove" are the values the
# dispatcher at the bottom of this script acts on).
ACTION=$1
# Second argument: identifier of the affected device; only used in log text.
DEVICE=$2

# Append one timestamped line to the log file.
# Globals:   ACTION (read)  - shown in square brackets after the timestamp
#            LOGFILE (read) - file the line is appended to
# Arguments: the message; multiple arguments are joined with single spaces
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s [%s] %s\n' "$stamp" "$ACTION" "$*" >> "$LOGFILE"
}

# Count the NVIDIA display-class PCI devices currently listed in sysfs.
#
# A device is counted when its vendor file reads 0x10de and its class file
# reads exactly 0x030000, 0x030200 or 0x030800. Every inspected device is
# logged, so the log shows how the final count was reached.
# NOTE(review): 0x030800 is kept as in the original list; the PCI "other
# display controller" class would read 0x038000 - confirm which was intended.
#
# Arguments: $1 - directory holding one sub-directory per PCI device
#                 (optional; defaults to /sys/bus/pci/devices)
# Outputs:   the count on stdout; diagnostics go through log()
count_nvidia_gpus() {
    local pci_root="${1:-/sys/bus/pci/devices}"
    local count=0
    # Loop state is local so that a direct (non-subshell) call cannot clobber
    # same-named variables belonging to the caller.
    local dir vendor class

    for dir in "$pci_root"/*; do
        # Entries without both attribute files cannot be classified.
        if [ ! -f "$dir/vendor" ] || [ ! -f "$dir/class" ]; then
            continue
        fi

        vendor=$(cat "$dir/vendor" 2>/dev/null)
        class=$(cat "$dir/class" 2>/dev/null)
        log "DEBUG: $dir vendor=$vendor class=$class"

        # Only NVIDIA (0x10de) devices are of interest.
        if [ "$vendor" != "0x10de" ]; then
            continue
        fi

        case "$class" in
            0x030000|0x030200|0x030800)
                count=$((count + 1))
                log "Found NVIDIA GPU at $dir (class=$class)"
                ;;
            *)
                # NVIDIA function that is not a display device.
                log "Found NVIDIA non-GPU at $dir (class=$class)"
                ;;
        esac
    done

    log "DEBUG: Total NVIDIA GPUs counted: $count"
    printf '%s\n' "$count"
}

# Print the PIDs of processes that hold an NVIDIA device node open
# (without using fuser).
#
# A process is reported when at least one entry in its <proc>/<pid>/fd
# directory is a symlink whose target starts with /dev/nvidia. Matching the
# link target itself (rather than grepping a directory listing) keeps
# unrelated paths that merely contain "/dev/nvidia" from being reported.
#
# Arguments: $1 - procfs root to scan (optional; defaults to /proc)
# Outputs:   unique PIDs in ascending numeric order on one line, each
#            followed by a space; nothing at all when no process matches, so
#            callers can rely on -n / -z tests of the captured output
get_nvidia_pids() {
    local proc_root="${1:-/proc}"

    # One find over every fd directory; errors from processes that exit
    # mid-scan (or from an unmatched glob) are expected and discarded.
    # awk takes the path component two places before the fd number, i.e. the
    # PID in <proc>/<pid>/fd/<n>.
    find "$proc_root"/[0-9]*/fd -maxdepth 1 -lname '/dev/nvidia*' 2>/dev/null \
        | awk -F/ '{ print $(NF - 2) }' \
        | sort -un \
        | tr '\n' ' '
}

# Stop every process that still uses an NVIDIA device, then unload the NVIDIA
# kernel modules.
#
# Sequence: wait 5 s for the driver's surprise-removal handling, send SIGTERM
# to the processes reported by get_nvidia_pids, poll once per second for up
# to 10 s, send SIGKILL to whatever is left, and finally remove nvidia_uvm,
# nvidia_drm, nvidia_modeset and nvidia (in that order) when lsmod lists them.
#
# Globals:   LOGFILE (read) - receives modprobe's error output
# Arguments: none
# Outputs:   progress messages via log()
delayed_unload() {
    local -a pids=()
    local pid mod
    local waited=0

    log "Starting delayed module unload sequence..."

    # Wait for driver to finish surprise removal handling
    sleep 5

    # Get processes using nvidia (without fuser - it triggers RM calls).
    # The unquoted expansion is deliberate: word splitting turns the
    # space-separated list into an array, so an answer consisting only of
    # whitespace counts as "no processes" instead of a non-empty string.
    # shellcheck disable=SC2207
    pids=( $(get_nvidia_pids) )

    if [ "${#pids[@]}" -gt 0 ]; then
        log "Sending SIGTERM to processes using NVIDIA: ${pids[*]}"
        for pid in "${pids[@]}"; do
            kill -TERM "$pid" 2>/dev/null
        done

        # Wait up to 10 seconds for processes to exit
        while [ "$waited" -lt 10 ]; do
            sleep 1
            waited=$((waited + 1))
            # shellcheck disable=SC2207
            pids=( $(get_nvidia_pids) )
            if [ "${#pids[@]}" -eq 0 ]; then
                log "All NVIDIA processes exited"
                break
            fi
            log "Waiting for processes to exit... ($waited/10s)"
        done

        # pids holds the result of the last poll: empty after the break
        # above, non-empty when the 10 s budget ran out.
        if [ "${#pids[@]}" -gt 0 ]; then
            log "Force killing remaining processes: ${pids[*]}"
            for pid in "${pids[@]}"; do
                kill -KILL "$pid" 2>/dev/null
            done
            # Give the killed processes a moment to release the device.
            sleep 2
        fi
    else
        log "No processes using NVIDIA devices"
    fi

    # Now unload modules
    log "Unloading NVIDIA modules..."
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        # The trailing space keeps "nvidia" from matching "nvidia_drm" etc.
        if lsmod | grep -q "^$mod "; then
            log "Unloading $mod..."
            modprobe -r "$mod" 2>> "$LOGFILE" || log "Failed to unload $mod"
        fi
    done

    log "Module unload sequence complete"
}

# Load the NVIDIA kernel modules: nvidia, nvidia_modeset, nvidia_drm and
# nvidia_uvm, in that order.
#
# Every module is attempted even when an earlier one failed. modprobe's error
# output is appended to $LOGFILE and each failure gets its own log line, so a
# failed load is visible instead of being reported as complete.
#
# Globals:   LOGFILE (read) - receives modprobe's error output
# Arguments: none
# Returns:   0 when every modprobe succeeded, 1 otherwise
load_nvidia_modules() {
    local mod
    local failed=0

    log "Loading NVIDIA modules..."
    for mod in nvidia nvidia_modeset nvidia_drm nvidia_uvm; do
        if ! modprobe "$mod" 2>> "$LOGFILE"; then
            log "Failed to load $mod"
            failed=1
        fi
    done

    if [ "$failed" -ne 0 ]; then
        log "Module load finished with errors"
        return 1
    fi
    log "Module load complete"
}

# Dispatch on the requested hotplug action.
#   remove: once no NVIDIA GPU is left in sysfs, hand process cleanup and
#           module unloading over to a transient systemd unit.
#   add:    make sure the NVIDIA modules are loaded.
#   other:  log the unknown action and exit with status 1.
case "$ACTION" in
    remove)
        log "GPU removed: $DEVICE"

        # Wait for kernel to fully process the removal
        # Thunderbolt removal can take several seconds to propagate
        sleep 5

        remaining=$(count_nvidia_gpus)
        log "Remaining NVIDIA GPUs: $remaining"

        if [ "$remaining" -eq 0 ]; then
            log "No NVIDIA GPUs remaining, starting cleanup..."
            # Run unload via systemd-run to escape udev process killing
            # udev kills background processes, so we must use systemd-run
            #
            # The script handed to bash -c is a single double-quoted string:
            # $LOGFILE is expanded here, while every escaped dollar sign and
            # escaped double quote is passed through for the child bash to
            # evaluate. The script is self-contained because the child shell
            # cannot see the functions defined in this file.
            systemd-run --no-block --unit="nvidia-egpu-unload-$$" /bin/bash -c "
                exec >> \"$LOGFILE\" 2>&1
                echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Delayed unload started via systemd\"
                sleep 1

                # Check for processes using nvidia. The pattern is anchored
                # to the symlink arrow so only descriptors whose target
                # begins with /dev/nvidia count, not any path that merely
                # contains that text.
                for fd_dir in /proc/[0-9]*/fd; do
                    pid=\$(echo \"\$fd_dir\" | cut -d'/' -f3)
                    if ls -la \"\$fd_dir\" 2>/dev/null | grep -q -- '-> /dev/nvidia'; then
                        echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Sending SIGTERM to \$pid\"
                        kill -TERM \"\$pid\" 2>/dev/null
                    fi
                done

                sleep 2

                # Force kill if still running
                for fd_dir in /proc/[0-9]*/fd; do
                    pid=\$(echo \"\$fd_dir\" | cut -d'/' -f3)
                    if ls -la \"\$fd_dir\" 2>/dev/null | grep -q -- '-> /dev/nvidia'; then
                        echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Sending SIGKILL to \$pid\"
                        kill -KILL \"\$pid\" 2>/dev/null
                    fi
                done

                sleep 1

                # Check if GPU came back (rapid replug scenario)
                gpu_back=0
                for dir in /sys/bus/pci/devices/*; do
                    if [ -f \"\$dir/vendor\" ]; then
                        vendor=\$(cat \"\$dir/vendor\" 2>/dev/null)
                        class=\$(cat \"\$dir/class\" 2>/dev/null)
                        if [ \"\$vendor\" = \"0x10de\" ]; then
                            case \"\$class\" in
                                0x030000|0x030200|0x030800) gpu_back=1 ;;
                            esac
                        fi
                    fi
                done

                if [ \"\$gpu_back\" -eq 1 ]; then
                    echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] GPU detected again, aborting unload\"
                    exit 0
                fi

                # Unload modules
                for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
                    if lsmod | grep -q \"^\$mod \"; then
                        echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Unloading \$mod...\"
                        modprobe -r \"\$mod\" 2>&1 || echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Failed to unload \$mod\"
                    fi
                done

                echo \"\$(date '+%Y-%m-%d %H:%M:%S') [unload] Module unload sequence complete\"
            "
            log "Spawned nvidia-egpu-unload-$$ via systemd-run"
        else
            log "Other NVIDIA GPUs still present, keeping modules loaded"
        fi
        ;;

    add)
        log "GPU added: $DEVICE"
        # Modules should auto-load, but ensure they're loaded
        if ! lsmod | grep -q "^nvidia "; then
            load_nvidia_modules
        fi
        ;;

    *)
        log "Unknown action: $ACTION"
        exit 1
        ;;
esac

exit 0