summarylogtreecommitdiffstats
path: root/nvx
blob: 5e577a5e2409e988c0fda1111d397e49651f8bd0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/bin/bash

### utility functions ###

# Print json strings containing pci bus data and their nvidia display devices.
# One compact JSON object is printed per line: every node of the lshw tree
# whose id contains "pci" and that has children, with the children narrowed to
# the "display" devices whose vendor contains "NVIDIA". Nodes left without any
# such child are not printed.
function _pci_devices {
    local nvidia_bus_filter='
        .. |
        objects |
        select(.id | strings | contains("pci")) |
        select(.children) |
        .children |= map(select(.id == "display") | select(.vendor | strings | contains("NVIDIA"))) |
        select(.children | length > 0)
    '
    sudo lshw -json | jq -c "$nvidia_bus_filter"
}

# Print which processes are accessing nvidia device files.
# Output is one "COMMAND PID" pair per line, each pair printed once.
function _process_list {
    # Skip the lsof header row, then use the pair itself as an array key so a
    # process that holds several device files open is still listed only once.
    sudo lsof /dev/nvidia* |
        awk 'NR > 1 { holders[$1 " " $2]++ }
             END { for (holder in holders) print holder }'
}

### setup functions ###

# Print "on" or "off" depending on whether the gpu is enabled or not.
# The gpu counts as enabled when _pci_devices prints anything at all.
function pci_status() {
    if [ -z "$(_pci_devices)" ]; then
        echo "off"
    else
        echo "on"
    fi
}

# Kill processes that are accessing nvidia device files.
# The affected processes are listed first, then each one is sent the default
# kill signal. When nothing is using the device files the function only
# reports that and returns, without asking for confirmation.
# $1?: any non empty string parameter will cause the function to ask confirmation before killing processes
# Returns: always 0; a failing kill is reported by kill itself.
function process_kill {
    echo "# kill nvidia processes"
    local -a processes=()
    local process name pid response
    # _process_list prints one "NAME PID" pair per line. Reading whole lines
    # into an array keeps every pair intact for both loops below and avoids
    # having to change (and then restore) the global IFS.
    mapfile -t processes < <(_process_list)
    if [ "${#processes[@]}" -eq 0 ]; then
        echo "-- no processes found"
        return 0
    fi
    for process in "${processes[@]}"; do
        # Split on a space explicitly so the result does not depend on
        # whatever value IFS currently has.
        IFS=' ' read -r name pid _ <<<"$process"
        echo "-- $name -> pid: $pid"
    done
    if [ -n "$1" ]; then
        read -r -p "-- kill all? [Y/n] " response
        case $response in
        n | N)
            echo "-- aborting"
            return 0
            ;;
        *) ;;
        esac
    fi
    for process in "${processes[@]}"; do
        IFS=' ' read -r name pid _ <<<"$process"
        echo "-- kill process $name -> pid: $pid"
        kill "$pid"
    done
    # Same exit status as before: the remaining steps of the caller must run
    # even if one of the processes could not be killed.
    return 0
}

# Remove gpu devices from the bus and change their pci power state to "auto".
# The process may hang if not all processes using the gpu are stopped.
# Returns: always 0; failed sysfs writes are reported by sudo/tee.
function turn_off {
    echo "# turn off gpu"
    local -a buses=() devices=()
    local pci device pci_name pci_bus device_name device_bus
    # _pci_devices prints one JSON object per line. Whole lines are read into
    # an array so the global IFS never has to be modified.
    mapfile -t buses < <(_pci_devices)
    for pci in "${buses[@]}"; do
        # Without --raw-output the name is printed as a JSON string, quotes included.
        pci_name=$(jq '.description + " - " + .product' <<<"$pci")
        # [4:] drops the first four characters of businfo, presumably the
        # "pci@" prefix, leaving the name used under /sys/bus/pci/devices.
        pci_bus=$(jq --raw-output '.businfo[4:]' <<<"$pci")
        echo "-- pci $pci_name -> $pci_bus"
        mapfile -t devices < <(jq -c '.children | .[]' <<<"$pci")
        for device in "${devices[@]}"; do
            device_name=$(jq '.description + " - " + .product' <<<"$device")
            device_bus=$(jq --raw-output '.businfo[4:]' <<<"$device")
            echo "   -- device remove $device_name -> $device_bus"
            sudo tee "/sys/bus/pci/devices/$device_bus/remove" <<<1 >/dev/null
        done
        echo "   -- power control auto"
        sudo tee "/sys/bus/pci/devices/$pci_bus/power/control" <<<auto >/dev/null
    done
    # Same exit status as before this function stopped touching IFS.
    return 0
}

# Rescan pci devices enabling gpu devices and changing their pci power state to "on".
# After the rescan, the power state is set first on every pci bus reported by
# _pci_devices and then on each of its nvidia display devices.
# Returns: always 0; failed sysfs writes are reported by sudo/tee.
function turn_on {
    echo "# turn on gpu"
    echo "-- pci rescan and wait 1 second"
    sudo tee /sys/bus/pci/rescan <<<1 >/dev/null
    sleep 1
    local -a buses=() devices=()
    local pci device pci_name pci_bus device_name device_bus
    # _pci_devices prints one JSON object per line. Whole lines are read into
    # an array so the global IFS never has to be modified.
    mapfile -t buses < <(_pci_devices)
    for pci in "${buses[@]}"; do
        # Without --raw-output the name is printed as a JSON string, quotes included.
        pci_name=$(jq '.description + " - " + .product' <<<"$pci")
        # [4:] drops the first four characters of businfo, presumably the
        # "pci@" prefix, leaving the name used under /sys/bus/pci/devices.
        pci_bus=$(jq --raw-output '.businfo[4:]' <<<"$pci")
        echo "-- pci $pci_name -> $pci_bus"
        echo "   -- pci power control on and wait 1 second"
        sudo tee "/sys/bus/pci/devices/$pci_bus/power/control" <<<on >/dev/null
        sleep 1
        mapfile -t devices < <(jq -c '.children | .[]' <<<"$pci")
        for device in "${devices[@]}"; do
            device_name=$(jq '.description + " - " + .product' <<<"$device")
            device_bus=$(jq --raw-output '.businfo[4:]' <<<"$device")
            echo "   -- device enable $device_name -> $device_bus"
            sudo tee "/sys/bus/pci/devices/$device_bus/power/control" <<<on >/dev/null
        done
    done
    # Same exit status as before this function stopped touching IFS.
    return 0
}

# Remove the nvidia kernel modules with "modprobe -r", in the order
# nvidia_drm, nvidia_modeset, nvidia_uvm, nvidia.
# Every module is attempted even when removing an earlier one failed.
function unload_modules {
    echo "# unload modules"
    echo "-- some modules may fail to unload, that is normal"
    local kmod
    for kmod in nvidia_drm nvidia_modeset nvidia_uvm nvidia; do
        echo "-- module $kmod"
        sudo modprobe -r "$kmod"
    done
}

# Load the nvidia kernel modules with modprobe, in the order
# nvidia, nvidia_uvm, nvidia_modeset, nvidia_drm.
# Every module is attempted even when loading an earlier one failed.
function load_modules {
    echo "# load modules"
    local kmod
    for kmod in nvidia nvidia_uvm nvidia_modeset nvidia_drm; do
        echo "   -- module $kmod"
        sudo modprobe "$kmod"
    done
}

### execution functions ###

# Register one more user of the gpu.
# The number of users is kept in /tmp/nvx.open (created empty when missing, an
# empty file counting as zero). The counter is incremented and written back;
# only when it becomes 1 is the gpu turned on and are the modules loaded.
function start {
    [ -f /tmp/nvx.open ] || touch /tmp/nvx.open
    OPEN=$(cat /tmp/nvx.open)
    OPEN=$((OPEN + 1))
    echo "$OPEN" >/tmp/nvx.open
    # Somebody else already brought the gpu up: nothing more to do.
    [ "$OPEN" == 1 ] || return 0
    turn_on
    load_modules
}

# Unregister one user of the gpu.
# The counter in /tmp/nvx.open (created empty when missing, an empty file
# counting as zero) is decremented and written back. Once it drops below 1 the
# counter file is deleted, the processes using the gpu are killed, the modules
# are unloaded and the gpu is turned off.
function stop {
    [ -f /tmp/nvx.open ] || touch /tmp/nvx.open
    OPEN=$(cat /tmp/nvx.open)
    OPEN=$((OPEN - 1))
    echo "$OPEN" >/tmp/nvx.open
    # Other users are still registered: leave the gpu alone.
    [ "$OPEN" -lt 1 ] || return 0
    rm /tmp/nvx.open
    process_kill
    unload_modules
    turn_off
}

# Command line entry point: the first argument selects the action; anything
# else (including no argument) prints the usage line.
case "$1" in
start)
    # Everything after "start" is the command to run. It is launched with the
    # three nvidia offload variables set, between a start/stop pair; a failing
    # command does not prevent the stop.
    shift
    start
    __NV_PRIME_RENDER_OFFLOAD=1 __VK_LAYER_NV_optimus=NVIDIA_only __GLX_VENDOR_LIBRARY_NAME=nvidia sudo -u $USER "$@" || true
    stop
    ;;
on)
    turn_on
    load_modules
    ;;
off)
    unload_modules
    turn_off
    ;;
off-kill)
    process_kill
    unload_modules
    turn_off
    ;;
status)
    pci_status
    ;;
processes)
    _process_list
    ;;
kill)
    # Only this action asks for confirmation before killing.
    process_kill "confirm"
    ;;
*)
    echo "Usage: $0 [start|on|off|off-kill|status|processes|kill]"
    ;;
esac