[PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS

All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed

From: Mukul Joshi <mukul.joshi@amd.com>
To: <linux-edac@vger.kernel.org>, <x86@kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
	<mingo@redhat.com>, <mchehab@kernel.org>, <yazen.ghannam@amd.com>,
	<amd-gfx@lists.freedesktop.org>,
	Mukul Joshi <mukul.joshi@amd.com>
Subject: [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS
Date: Sun, 12 Sep 2021 22:13:11 -0400	[thread overview]
Message-ID: <20210913021311.12896-2-mukul.joshi@amd.com> (raw)
In-Reply-To: <20210913021311.12896-1-mukul.joshi@amd.com>

On Aldebaran, GPU driver will handle bad page retirement
even though UMC is host managed. As a result, register a
bad page retirement handler on the mce notifier chain to
retire bad pages on Aldebaran.

v1->v2:
- Use smca_get_bank_type() to determine MCA bank.
- Envelope the changes under #ifdef CONFIG_X86_MCE_AMD.
- Use MCE_PRIORITY_UC instead of MCE_PRIO_ACCEL as we are
  only handling uncorrectable errors.
- Use macros to determine UMC instance and channel instance
  where the uncorrectable error occured.
- Update the headline.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Link: https://lore.kernel.org/amd-gfx/20210512013058.6827-1-mukul.joshi@amd.com/
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 142 ++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..35cfcc71ff94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,7 +35,11 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#ifdef CONFIG_X86_MCE_AMD
+#include <asm/mce.h>
 
+static bool notifier_registered;
+#endif
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -86,6 +90,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+#ifdef CONFIG_X86_MCE_AMD
+static void amdgpu_register_bad_pages_mca_notifier(void);
+#endif
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -2018,6 +2025,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
 	}
 
+#ifdef CONFIG_X86_MCE_AMD
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+#endif
 	return 0;
 
 free:
@@ -2511,3 +2523,133 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+#ifdef CONFIG_X86_MCE_AMD
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
+#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
+#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
+#define GPU_ID_OFFSET		8
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t ch_inst, channel_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+		    (XEC(m->status, 0x1f) == 0x0)))
+		return NOTIFY_DONE;
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		dev_warn(adev->dev, "%s: Unable to find adev for gpu_id: %d\n",
+				     __func__, gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is correctable error, return.
+	 */
+	if (mce_is_correctable(m)) {
+		return NOTIFY_OK;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	umc_inst = GET_UMC_INST(m->ipid);
+	ch_inst = GET_CHAN_INDEX(m->ipid);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
+			     umc_inst, ch_inst);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	channel_index =
+		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
+					  + ch_inst];
+
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+			ADDR_OF_256B_BLOCK(channel_index) |
+			OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = channel_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_UC,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier only once
+	 * with MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
+#endif
-- 
2.17.1

next prev parent reply	other threads:[~2021-09-13  2:14 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-11 15:25 [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Naveen Krishna Chatradhi
2021-05-11 15:25 ` [PATCH 2/3] x86/MCE/AMD: Helper function to check UMC v2 Naveen Krishna Chatradhi
2021-05-11 17:34   ` Borislav Petkov
2021-05-12  1:40     ` Joshi, Mukul
2021-05-12  7:26       ` Borislav Petkov
2021-09-13  2:13   ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Mukul Joshi
2021-09-13  2:13     ` Mukul Joshi [this message]
2021-09-22 11:40       ` [PATCHv2 2/2] drm/amdgpu: Register MCE notifier for Aldebaran RAS Borislav Petkov
2021-09-22 19:43         ` Joshi, Mukul
2021-09-22 19:36       ` [PATCHv3 " Mukul Joshi
2021-09-23 14:29         ` Yazen Ghannam
2021-09-23 14:37           ` Borislav Petkov
2021-09-23 15:31             ` Joshi, Mukul
2021-09-23 15:30           ` Joshi, Mukul
2021-09-23 17:23             ` Yazen Ghannam
2021-09-23 18:14               ` Borislav Petkov
2021-09-24 19:46                 ` Yazen Ghannam
2021-09-25 11:20                   ` Borislav Petkov
2021-09-27 18:37                     ` Yazen Ghannam
2021-09-23 18:34               ` Joshi, Mukul
2021-09-23 22:04         ` [PATCHv4 " Mukul Joshi
2021-09-24 19:53           ` Yazen Ghannam
2021-09-22 11:33     ` [PATCHv2 1/2] x86/MCE/AMD: Export smca_get_bank_type symbol Borislav Petkov
2021-09-22 16:27       ` Deucher, Alexander
2021-09-22 16:43         ` Borislav Petkov
2021-09-22 16:47           ` Joshi, Mukul
2021-05-11 15:25 ` [PATCH 3/3] x86/mce: Add MCE priority for Accelerator devices Naveen Krishna Chatradhi
2021-05-11 17:27 ` [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Borislav Petkov
2021-05-24 16:41   ` Chatradhi, Naveen Krishna
2021-05-25 18:02     ` Borislav Petkov
2021-05-25 20:03       ` Yazen Ghannam
2021-05-25 20:12         ` Borislav Petkov
2021-05-26 16:46 ` [PATCH v2] " Naveen Krishna Chatradhi
2021-05-28 15:25   ` [tip: ras/core] " tip-bot2 for Muralidhara M K

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:b5332db4d28 dfblob:35cfcc71ff9
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210913021311.12896-2-mukul.joshi@amd.com \
    --to=mukul.joshi@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=mingo@redhat.com \
    --cc=x86@kernel.org \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.