summarylogtreecommitdiffstats
path: root/0001-GPU-reset.patch
blob: 32651f44f5e2147716148ddc92b5585f3d6301ef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
From 69dfc1d574d2faddf770a40cf5d1f9715da33fd2 Mon Sep 17 00:00:00 2001
From: Shashank Sharma <shashank.sharma@amd.com>
Date: Mon, 7 Mar 2022 12:32:49 +0100
Subject: [PATCH 1/3] drm: Add GPU reset sysfs event

This patch adds a new sysfs event, which will indicate
the userland about a GPU reset, and can also provide
some information like:
- process ID of the process involved with the GPU reset
- process name of the involved process
- the GPU status info (using flags)

This patch also introduces the first flag of the flags
bitmap, which can be appended as and when required.

V2: Addressed review comments from Christian and Amar
   - move the reset information structure to DRM layer
   - drop _ctx from struct name
   - make pid 32 bit(than 64)
   - set flag when VRAM invalid (than valid)
   - add process name as well (Amar)

Cc: Alexandar Deucher <alexander.deucher@amd.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Amaranath Somalapuram <amaranath.somalapuram@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
(cherry picked from commit 90230bd9d9c7d979038547460c9a2cbbeff8d6b9)
[Forward port to 6.0]
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
---
 drivers/gpu/drm/drm_sysfs.c | 31 +++++++++++++++++++++++++++++++
 include/drm/drm_sysfs.h     | 10 ++++++++++
 2 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 430e00b16eeca..4cf1b9b40e707 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -434,6 +434,37 @@ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event);
 
+/**
+ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
+ * @dev: DRM device
+ * @reset_info: The contextual information about the reset (like PID, flags)
+ *
+ * Send a uevent for the DRM device specified by @dev. This informs
+ * user that a GPU reset has occurred, so that an interested client
+ * can take any recovery or profiling measure.
+ */
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info)
+{
+	unsigned char pid_str[13];
+	unsigned char flags_str[15];
+	unsigned char pname_str[TASK_COMM_LEN + 6];
+	unsigned char reset_str[] = "RESET=1";
+	char *envp[] = { reset_str, pid_str, pname_str, flags_str, NULL };
+
+	if (!reset_info) {
+		DRM_WARN("No reset info, not sending the event\n");
+		return;
+	}
+
+	DRM_DEBUG("generating reset event\n");
+
+	snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%u", reset_info->pid);
+	snprintf(pname_str, ARRAY_SIZE(pname_str), "NAME=%s", reset_info->pname);
+	snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", reset_info->flags);
+	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(drm_sysfs_reset_event);
+
 /**
  * drm_sysfs_connector_status_event - generate a DRM uevent for connector
  * property status change
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
index 6273cac44e479..8c37d6a529328 100644
--- a/include/drm/drm_sysfs.h
+++ b/include/drm/drm_sysfs.h
@@ -1,17 +1,27 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _DRM_SYSFS_H_
 #define _DRM_SYSFS_H_
+#include <linux/sched.h>
+
+#define DRM_GPU_RESET_FLAG_VRAM_INVALID (1 << 0)
 
 struct drm_device;
 struct device;
 struct drm_connector;
 struct drm_property;
 
+struct drm_reset_event {
+	uint32_t pid;
+	uint32_t flags;
+	char pname[TASK_COMM_LEN];
+};
+
 int drm_class_device_register(struct device *dev);
 void drm_class_device_unregister(struct device *dev);
 
 void drm_sysfs_hotplug_event(struct drm_device *dev);
 void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info);
 void drm_sysfs_connector_status_event(struct drm_connector *connector,
 				      struct drm_property *property);
 #endif
-- 
2.41.0


From b717d62ae484fe8fc2f87478960e95cad838f574 Mon Sep 17 00:00:00 2001
From: Shashank Sharma <shashank.sharma@amd.com>
Date: Mon, 7 Mar 2022 15:33:00 +0100
Subject: [PATCH 2/3] drm/amdgpu: add work function for GPU reset event

This patch adds a work function, which sends a GPU reset
uevent and some contextual infomration, like the PID and
some status flags. This work should be scheduled during
a GPU reset.

The userspace can do some recovery and post-processing work
based on this event and information.

V2: Addressed review comments from Christian
- Changed the name of the work to gpu_reset_event_work
- Added a structure to accommodate some additional information
  (like a PID and some flags)
- Do not add new structure in amdgpu.h

Cc: Alexander Deucher <alexander.deucher@amd.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Amaranath Somalapuram <amaranath.somalapuram@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
(cherry picked from commit f63b09e78126f7da67b69409e2cce1d3ab2d7f46)
[Forward port to 6.0]
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2eca58220550e..5c376f7f51f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -60,6 +60,7 @@
 #include <drm/amdgpu_drm.h>
 #include <drm/drm_gem.h>
 #include <drm/drm_ioctl.h>
+#include <drm/drm_sysfs.h>
 
 #include <kgd_kfd_interface.h>
 #include "dm_pp_interface.h"
@@ -1002,6 +1003,7 @@ struct amdgpu_device {
 
 	int asic_reset_res;
 	struct work_struct		xgmi_reset_work;
+	struct work_struct		gpu_reset_event_work;
 	struct list_head		reset_list;
 
 	long				gfx_timeout;
@@ -1035,6 +1037,7 @@ struct amdgpu_device {
 	pci_channel_state_t		pci_channel_state;
 
 	struct amdgpu_reset_control     *reset_cntl;
+	struct drm_reset_event		reset_event_info;
 	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
 	bool				ram_is_direct_mapped;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f1e9663b40510..08aa72743dfce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -76,6 +76,7 @@
 #include <linux/pm_runtime.h>
 
 #include <drm/drm_drv.h>
+#include <drm/drm_sysfs.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -3355,6 +3356,17 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
 }
 
+static void amdgpu_device_reset_event_func(struct work_struct *__work)
+{
+	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
+						  gpu_reset_event_work);
+	/*
+	 * A GPU reset has happened, inform the userspace and pass the
+	 * reset related information.
+	 */
+	drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info);
+}
+
 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
@@ -3606,6 +3618,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 			  amdgpu_device_delay_enable_gfx_off);
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->gfx.gfx_off_residency = 0;
-- 
2.41.0


From b65228799e5ac96211b0e4c342d0a4478a1b6410 Mon Sep 17 00:00:00 2001
From: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
Date: Thu, 10 Mar 2022 11:31:44 +0530
Subject: [PATCH 3/3] drm/amdgpu: schedule GPU reset event work function

Schedule work function with valid PID, process name,
and vram lost status during a GPU reset/ recovery.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
(cherry picked from commit 293c019a84c6402b08db9579819b555b01cd613b)
[Forward ported to 6.0]
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 08aa72743dfce..47ff7a56c6848 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4924,6 +4924,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 						reset_context->job->vm->task_info;
 				amdgpu_reset_capture_coredumpm(tmp_adev);
 #endif
+				if (reset_context->job && reset_context->job->vm) {
+					tmp_adev->reset_event_info.pid =
+						reset_context->job->vm->task_info.pid;
+					memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN);
+					strcpy(tmp_adev->reset_event_info.pname,
+						reset_context->job->vm->task_info.process_name);
+				} else {
+					tmp_adev->reset_event_info.pid = 0;
+					memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN);
+				}
+
+				tmp_adev->reset_event_info.flags = vram_lost;
+				schedule_work(&tmp_adev->gpu_reset_event_work);
+
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					amdgpu_inc_vram_lost(tmp_adev);
-- 
2.41.0