From 69dfc1d574d2faddf770a40cf5d1f9715da33fd2 Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Mon, 7 Mar 2022 12:32:49 +0100 Subject: [PATCH 1/3] drm: Add GPU reset sysfs event This patch adds a new sysfs event, which will indicate the userland about a GPU reset, and can also provide some information like: - process ID of the process involved with the GPU reset - process name of the involved process - the GPU status info (using flags) This patch also introduces the first flag of the flags bitmap, which can be appended as and when required. V2: Addressed review comments from Christian and Amar - move the reset information structure to DRM layer - drop _ctx from struct name - make pid 32 bit(than 64) - set flag when VRAM invalid (than valid) - add process name as well (Amar) Cc: Alexandar Deucher Cc: Christian Koenig Cc: Amaranath Somalapuram Signed-off-by: Shashank Sharma (cherry picked from commit 90230bd9d9c7d979038547460c9a2cbbeff8d6b9) [Forward port to 6.0] Signed-off-by: Cristian Ciocaltea --- drivers/gpu/drm/drm_sysfs.c | 31 +++++++++++++++++++++++++++++++ include/drm/drm_sysfs.h | 10 ++++++++++ 2 files changed, 41 insertions(+) diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index 430e00b16eeca..4cf1b9b40e707 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -434,6 +434,37 @@ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector) } EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event); +/** + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset + * @dev: DRM device + * @reset_info: The contextual information about the reset (like PID, flags) + * + * Send a uevent for the DRM device specified by @dev. This informs + * user that a GPU reset has occurred, so that an interested client + * can take any recovery or profiling measure. + */ +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info) +{ + unsigned char pid_str[13]; + unsigned char flags_str[15]; + unsigned char pname_str[TASK_COMM_LEN + 6]; + unsigned char reset_str[] = "RESET=1"; + char *envp[] = { reset_str, pid_str, pname_str, flags_str, NULL }; + + if (!reset_info) { + DRM_WARN("No reset info, not sending the event\n"); + return; + } + + DRM_DEBUG("generating reset event\n"); + + snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%u", reset_info->pid); + snprintf(pname_str, ARRAY_SIZE(pname_str), "NAME=%s", reset_info->pname); + snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", reset_info->flags); + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); +} +EXPORT_SYMBOL(drm_sysfs_reset_event); + /** * drm_sysfs_connector_status_event - generate a DRM uevent for connector * property status change diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h index 6273cac44e479..8c37d6a529328 100644 --- a/include/drm/drm_sysfs.h +++ b/include/drm/drm_sysfs.h @@ -1,17 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DRM_SYSFS_H_ #define _DRM_SYSFS_H_ +#include + +#define DRM_GPU_RESET_FLAG_VRAM_INVALID (1 << 0) struct drm_device; struct device; struct drm_connector; struct drm_property; +struct drm_reset_event { + uint32_t pid; + uint32_t flags; + char pname[TASK_COMM_LEN]; +}; + int drm_class_device_register(struct device *dev); void drm_class_device_unregister(struct device *dev); void drm_sysfs_hotplug_event(struct drm_device *dev); void drm_sysfs_connector_hotplug_event(struct drm_connector *connector); +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info); void drm_sysfs_connector_status_event(struct drm_connector *connector, struct drm_property *property); #endif -- 2.41.0 From b717d62ae484fe8fc2f87478960e95cad838f574 Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Mon, 7 Mar 2022 15:33:00 +0100 Subject: [PATCH 2/3] drm/amdgpu: add work function for GPU reset event This patch adds a work function, which sends a GPU reset uevent and some contextual infomration, like the PID and some status flags. This work should be scheduled during a GPU reset. The userspace can do some recovery and post-processing work based on this event and information. V2: Addressed review comments from Christian - Changed the name of the work to gpu_reset_event_work - Added a structure to accommodate some additional information (like a PID and some flags) - Do not add new structure in amdgpu.h Cc: Alexander Deucher Cc: Christian Koenig Cc: Amaranath Somalapuram Signed-off-by: Shashank Sharma (cherry picked from commit f63b09e78126f7da67b69409e2cce1d3ab2d7f46) [Forward port to 6.0] Signed-off-by: Cristian Ciocaltea --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2eca58220550e..5c376f7f51f4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -60,6 +60,7 @@ #include #include #include +#include #include #include "dm_pp_interface.h" @@ -1002,6 +1003,7 @@ struct amdgpu_device { int asic_reset_res; struct work_struct xgmi_reset_work; + struct work_struct gpu_reset_event_work; struct list_head reset_list; long gfx_timeout; @@ -1035,6 +1037,7 @@ struct amdgpu_device { pci_channel_state_t pci_channel_state; struct amdgpu_reset_control *reset_cntl; + struct drm_reset_event reset_event_info; uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; bool ram_is_direct_mapped; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f1e9663b40510..08aa72743dfce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -76,6 +76,7 @@ #include #include +#include MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -3355,6 +3356,17 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) return amdgpu_device_asic_has_dc_support(adev->asic_type); } +static void amdgpu_device_reset_event_func(struct work_struct *__work) +{ + struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, + gpu_reset_event_work); + /* + * A GPU reset has happened, inform the userspace and pass the + * reset related information. + */ + drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info); +} + static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = @@ -3606,6 +3618,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_device_delay_enable_gfx_off); INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func); adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_residency = 0; -- 2.41.0 From b65228799e5ac96211b0e4c342d0a4478a1b6410 Mon Sep 17 00:00:00 2001 From: Somalapuram Amaranath Date: Thu, 10 Mar 2022 11:31:44 +0530 Subject: [PATCH 3/3] drm/amdgpu: schedule GPU reset event work function Schedule work function with valid PID, process name, and vram lost status during a GPU reset/ recovery. Signed-off-by: Somalapuram Amaranath (cherry picked from commit 293c019a84c6402b08db9579819b555b01cd613b) [Forward ported to 6.0] Signed-off-by: Cristian Ciocaltea --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 08aa72743dfce..47ff7a56c6848 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4924,6 +4924,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, reset_context->job->vm->task_info; amdgpu_reset_capture_coredumpm(tmp_adev); #endif + if (reset_context->job && reset_context->job->vm) { + tmp_adev->reset_event_info.pid = + reset_context->job->vm->task_info.pid; + memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); + strcpy(tmp_adev->reset_event_info.pname, + reset_context->job->vm->task_info.process_name); + } else { + tmp_adev->reset_event_info.pid = 0; + memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); + } + + tmp_adev->reset_event_info.flags = vram_lost; + schedule_work(&tmp_adev->gpu_reset_event_work); + if (vram_lost) { DRM_INFO("VRAM is lost due to GPU reset!\n"); amdgpu_inc_vram_lost(tmp_adev); -- 2.41.0