All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: add mutex to protect ras shared memory
@ 2024-04-28  7:08 YiPeng Chai
  2024-04-28  7:47 ` Wang, Yang(Kevin)
  2024-04-29  7:30 ` Lazar, Lijo
  0 siblings, 2 replies; 4+ messages in thread
From: YiPeng Chai @ 2024-04-28  7:08 UTC (permalink / raw
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Add mutex to protect ras shared memory.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
 3 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 5583e2d1b12f..fa4fea00f6b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
 	}
 }
 
+static int psp_ras_send_cmd(struct psp_context *psp,
+		enum ras_command cmd_id, void *in, void *out)
+{
+	struct ta_ras_shared_memory *ras_cmd;
+	uint32_t cmd = cmd_id;
+	int ret = 0;
+
+	mutex_lock(&psp->ras_context.mutex);
+	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
+	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+	switch (cmd) {
+	case TA_RAS_COMMAND__ENABLE_FEATURES:
+	case TA_RAS_COMMAND__DISABLE_FEATURES:
+		memcpy(&ras_cmd->ras_in_message,
+			in, sizeof(ras_cmd->ras_in_message));
+		break;
+	case TA_RAS_COMMAND__TRIGGER_ERROR:
+		memcpy(&ras_cmd->ras_in_message.trigger_error,
+			in, sizeof(ras_cmd->ras_in_message.trigger_error));
+		break;
+	case TA_RAS_COMMAND__QUERY_ADDRESS:
+		memcpy(&ras_cmd->ras_in_message.address,
+			in, sizeof(ras_cmd->ras_in_message.address));
+		break;
+	default:
+		dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ras_cmd->cmd_id = cmd;
+	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+
+	switch (cmd) {
+	case TA_RAS_COMMAND__TRIGGER_ERROR:
+		if (out) {
+			uint32_t *ras_status = (uint32_t *)out;
+
+			*ras_status = ras_cmd->ras_status;
+		}
+		break;
+	case TA_RAS_COMMAND__QUERY_ADDRESS:
+		if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+			ret = -EINVAL;
+		else if (out)
+			memcpy(out,
+				&ras_cmd->ras_out_message.address,
+				sizeof(ras_cmd->ras_out_message.address));
+		break;
+	default:
+		break;
+	}
+
+err_out:
+	mutex_unlock(&psp->ras_context.mutex);
+
+	return ret;
+}
+
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 {
 	struct ta_ras_shared_memory *ras_cmd;
@@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 int psp_ras_enable_features(struct psp_context *psp,
 		union ta_ras_cmd_input *info, bool enable)
 {
-	struct ta_ras_shared_memory *ras_cmd;
+	enum ras_command cmd_id;
 	int ret;
 
-	if (!psp->ras_context.context.initialized)
+	if (!psp->ras_context.context.initialized || !info)
 		return -EINVAL;
 
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-	if (enable)
-		ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
-	else
-		ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
-
-	ras_cmd->ras_in_message = *info;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+	cmd_id = enable ?
+		TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
+	ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
 	if (ret)
 		return -EINVAL;
 
@@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)
 
 	psp->ras_context.context.initialized = false;
 
+	mutex_destroy(&psp->ras_context.mutex);
+
 	return ret;
 }
 
@@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)
 
 	ret = psp_ta_load(psp, &psp->ras_context.context);
 
-	if (!ret && !ras_cmd->ras_status)
+	if (!ret && !ras_cmd->ras_status) {
 		psp->ras_context.context.initialized = true;
-	else {
+		mutex_init(&psp->ras_context.mutex);
+	} else {
 		if (ras_cmd->ras_status)
 			dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);
 
@@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)
 int psp_ras_trigger_error(struct psp_context *psp,
 			  struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
 {
-	struct ta_ras_shared_memory *ras_cmd;
 	struct amdgpu_device *adev = psp->adev;
 	int ret;
 	uint32_t dev_mask;
+	uint32_t ras_status;
 
-	if (!psp->ras_context.context.initialized)
+	if (!psp->ras_context.context.initialized || !info)
 		return -EINVAL;
 
 	switch (info->block_id) {
@@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
 	dev_mask &= AMDGPU_RAS_INST_MASK;
 	info->sub_block_index |= dev_mask;
 
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-	ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
-	ras_cmd->ras_in_message.trigger_error = *info;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+	ret = psp_ras_send_cmd(psp,
+			TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
 	if (ret)
 		return -EINVAL;
 
@@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
 	if (amdgpu_ras_intr_triggered())
 		return 0;
 
-	if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
+	if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
 		return -EACCES;
-	else if (ras_cmd->ras_status)
+	else if (ras_status)
 		return -EINVAL;
 
 	return 0;
@@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
 			  struct ta_ras_query_address_input *addr_in,
 			  struct ta_ras_query_address_output *addr_out)
 {
-	struct ta_ras_shared_memory *ras_cmd;
 	int ret;
 
-	if (!psp->ras_context.context.initialized)
-		return -EINVAL;
-
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-	ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
-	ras_cmd->ras_in_message.address = *addr_in;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
-	if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+	if (!psp->ras_context.context.initialized ||
+		!addr_in || !addr_out)
 		return -EINVAL;
 
-	*addr_out = ras_cmd->ras_out_message.address;
+	ret = psp_ras_send_cmd(psp,
+			TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);
 
-	return 0;
+	return ret;
 }
 // ras end
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index ee16f134ae92..686023918ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -197,6 +197,7 @@ struct psp_xgmi_context {
 struct psp_ras_context {
 	struct ta_context		context;
 	struct amdgpu_ras		*ras;
+	struct mutex			mutex;
 };
 
 #define MEM_TRAIN_SYSTEM_SIGNATURE		0x54534942
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index ca5c86e5f7cd..87f213f92d83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
 
 	context->session_id = ta_id;
 
+	mutex_lock(&psp->ras_context.mutex);
 	ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
 	if (ret)
 		goto err_free_shared_buf;
@@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
 		ret = -EFAULT;
 
 err_free_shared_buf:
+	mutex_unlock(&psp->ras_context.mutex);
 	kfree(shared_buf);
 
 	return ret;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH] drm/amdgpu: add mutex to protect ras shared memory
  2024-04-28  7:08 [PATCH] drm/amdgpu: add mutex to protect ras shared memory YiPeng Chai
@ 2024-04-28  7:47 ` Wang, Yang(Kevin)
  2024-04-29  3:09   ` Chai, Thomas
  2024-04-29  7:30 ` Lazar, Lijo
  1 sibling, 1 reply; 4+ messages in thread
From: Wang, Yang(Kevin) @ 2024-04-28  7:47 UTC (permalink / raw
  To: Chai, Thomas, amd-gfx@lists.freedesktop.org
  Cc: Zhang, Hawking, Zhou1, Tao, Li, Candice, Yang, Stanley

[AMD Official Use Only - General]

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Sunday, April 28, 2024 3:08 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

Add mutex to protect ras shared memory.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
 3 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 5583e2d1b12f..fa4fea00f6b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
        }
 }

+static int psp_ras_send_cmd(struct psp_context *psp,
+               enum ras_command cmd_id, void *in, void *out) {
+       struct ta_ras_shared_memory *ras_cmd;
+       uint32_t cmd = cmd_id;
+       int ret = 0;
+
+       mutex_lock(&psp->ras_context.mutex);
+       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
+       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+       switch (cmd) {
+       case TA_RAS_COMMAND__ENABLE_FEATURES:
+       case TA_RAS_COMMAND__DISABLE_FEATURES:
+               memcpy(&ras_cmd->ras_in_message,
+                       in, sizeof(ras_cmd->ras_in_message));
+               break;
+       case TA_RAS_COMMAND__TRIGGER_ERROR:
+               memcpy(&ras_cmd->ras_in_message.trigger_error,
+                       in, sizeof(ras_cmd->ras_in_message.trigger_error));
+               break;
+       case TA_RAS_COMMAND__QUERY_ADDRESS:
+               memcpy(&ras_cmd->ras_in_message.address,
+                       in, sizeof(ras_cmd->ras_in_message.address));
+               break;
+       default:
+               dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
+               ret = -EINVAL;
+               goto err_out;
+       }
+
+       ras_cmd->cmd_id = cmd;
+       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+
+       switch (cmd) {
+       case TA_RAS_COMMAND__TRIGGER_ERROR:
+               if (out) {
+                       uint32_t *ras_status = (uint32_t *)out;
[Kevin]:
It's better to check 'ret' value first before use this 'out' data.

Best Regards,
Kevin
+
+                       *ras_status = ras_cmd->ras_status;
+               }
+               break;
+       case TA_RAS_COMMAND__QUERY_ADDRESS:
+               if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+                       ret = -EINVAL;
+               else if (out)
+                       memcpy(out,
+                               &ras_cmd->ras_out_message.address,
+                               sizeof(ras_cmd->ras_out_message.address));
+               break;
+       default:
+               break;
+       }
+
+err_out:
+       mutex_unlock(&psp->ras_context.mutex);
+
+       return ret;
+}
+
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)  {
        struct ta_ras_shared_memory *ras_cmd;
@@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)  int psp_ras_enable_features(struct psp_context *psp,
                union ta_ras_cmd_input *info, bool enable)  {
-       struct ta_ras_shared_memory *ras_cmd;
+       enum ras_command cmd_id;
        int ret;

-       if (!psp->ras_context.context.initialized)
+       if (!psp->ras_context.context.initialized || !info)
                return -EINVAL;

-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       if (enable)
-               ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
-       else
-               ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
-
-       ras_cmd->ras_in_message = *info;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+       cmd_id = enable ?
+               TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
+       ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
        if (ret)
                return -EINVAL;

@@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)

        psp->ras_context.context.initialized = false;

+       mutex_destroy(&psp->ras_context.mutex);
+
        return ret;
 }

@@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)

        ret = psp_ta_load(psp, &psp->ras_context.context);

-       if (!ret && !ras_cmd->ras_status)
+       if (!ret && !ras_cmd->ras_status) {
                psp->ras_context.context.initialized = true;
-       else {
+               mutex_init(&psp->ras_context.mutex);
+       } else {
                if (ras_cmd->ras_status)
                        dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);

@@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)  int psp_ras_trigger_error(struct psp_context *psp,
                          struct ta_ras_trigger_error_input *info, uint32_t instance_mask)  {
-       struct ta_ras_shared_memory *ras_cmd;
        struct amdgpu_device *adev = psp->adev;
        int ret;
        uint32_t dev_mask;
+       uint32_t ras_status;

-       if (!psp->ras_context.context.initialized)
+       if (!psp->ras_context.context.initialized || !info)
                return -EINVAL;

        switch (info->block_id) {
@@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
        dev_mask &= AMDGPU_RAS_INST_MASK;
        info->sub_block_index |= dev_mask;

-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
-       ras_cmd->ras_in_message.trigger_error = *info;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+       ret = psp_ras_send_cmd(psp,
+                       TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
        if (ret)
                return -EINVAL;

@@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
        if (amdgpu_ras_intr_triggered())
                return 0;

-       if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
+       if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
                return -EACCES;
-       else if (ras_cmd->ras_status)
+       else if (ras_status)
                return -EINVAL;

        return 0;
@@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
                          struct ta_ras_query_address_input *addr_in,
                          struct ta_ras_query_address_output *addr_out)  {
-       struct ta_ras_shared_memory *ras_cmd;
        int ret;

-       if (!psp->ras_context.context.initialized)
-               return -EINVAL;
-
-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
-       ras_cmd->ras_in_message.address = *addr_in;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
-       if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+       if (!psp->ras_context.context.initialized ||
+               !addr_in || !addr_out)
                return -EINVAL;

-       *addr_out = ras_cmd->ras_out_message.address;
+       ret = psp_ras_send_cmd(psp,
+                       TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);

-       return 0;
+       return ret;
 }
 // ras end

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index ee16f134ae92..686023918ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -197,6 +197,7 @@ struct psp_xgmi_context {  struct psp_ras_context {
        struct ta_context               context;
        struct amdgpu_ras               *ras;
+       struct mutex                    mutex;
 };

 #define MEM_TRAIN_SYSTEM_SIGNATURE             0x54534942
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index ca5c86e5f7cd..87f213f92d83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size

        context->session_id = ta_id;

+       mutex_lock(&psp->ras_context.mutex);
        ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
        if (ret)
                goto err_free_shared_buf;
@@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
                ret = -EFAULT;

 err_free_shared_buf:
+       mutex_unlock(&psp->ras_context.mutex);
        kfree(shared_buf);

        return ret;
--
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH] drm/amdgpu: add mutex to protect ras shared memory
  2024-04-28  7:47 ` Wang, Yang(Kevin)
@ 2024-04-29  3:09   ` Chai, Thomas
  0 siblings, 0 replies; 4+ messages in thread
From: Chai, Thomas @ 2024-04-29  3:09 UTC (permalink / raw
  To: Wang, Yang(Kevin), amd-gfx@lists.freedesktop.org
  Cc: Zhang, Hawking, Zhou1, Tao, Li, Candice, Yang, Stanley

[AMD Official Use Only - General]

OK


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
Sent: Sunday, April 28, 2024 3:48 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

[AMD Official Use Only - General]

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Sunday, April 28, 2024 3:08 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

Add mutex to protect ras shared memory.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
 3 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 5583e2d1b12f..fa4fea00f6b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
        }
 }

+static int psp_ras_send_cmd(struct psp_context *psp,
+               enum ras_command cmd_id, void *in, void *out) {
+       struct ta_ras_shared_memory *ras_cmd;
+       uint32_t cmd = cmd_id;
+       int ret = 0;
+
+       mutex_lock(&psp->ras_context.mutex);
+       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
+       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+       switch (cmd) {
+       case TA_RAS_COMMAND__ENABLE_FEATURES:
+       case TA_RAS_COMMAND__DISABLE_FEATURES:
+               memcpy(&ras_cmd->ras_in_message,
+                       in, sizeof(ras_cmd->ras_in_message));
+               break;
+       case TA_RAS_COMMAND__TRIGGER_ERROR:
+               memcpy(&ras_cmd->ras_in_message.trigger_error,
+                       in, sizeof(ras_cmd->ras_in_message.trigger_error));
+               break;
+       case TA_RAS_COMMAND__QUERY_ADDRESS:
+               memcpy(&ras_cmd->ras_in_message.address,
+                       in, sizeof(ras_cmd->ras_in_message.address));
+               break;
+       default:
+               dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
+               ret = -EINVAL;
+               goto err_out;
+       }
+
+       ras_cmd->cmd_id = cmd;
+       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+
+       switch (cmd) {
+       case TA_RAS_COMMAND__TRIGGER_ERROR:
+               if (out) {
+                       uint32_t *ras_status = (uint32_t *)out;
[Kevin]:
It's better to check 'ret' value first before use this 'out' data.

Best Regards,
Kevin
+
+                       *ras_status = ras_cmd->ras_status;
+               }
+               break;
+       case TA_RAS_COMMAND__QUERY_ADDRESS:
+               if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+                       ret = -EINVAL;
+               else if (out)
+                       memcpy(out,
+                               &ras_cmd->ras_out_message.address,
+                               sizeof(ras_cmd->ras_out_message.address));
+               break;
+       default:
+               break;
+       }
+
+err_out:
+       mutex_unlock(&psp->ras_context.mutex);
+
+       return ret;
+}
+
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)  {
        struct ta_ras_shared_memory *ras_cmd; @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)  int psp_ras_enable_features(struct psp_context *psp,
                union ta_ras_cmd_input *info, bool enable)  {
-       struct ta_ras_shared_memory *ras_cmd;
+       enum ras_command cmd_id;
        int ret;

-       if (!psp->ras_context.context.initialized)
+       if (!psp->ras_context.context.initialized || !info)
                return -EINVAL;

-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       if (enable)
-               ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
-       else
-               ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
-
-       ras_cmd->ras_in_message = *info;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+       cmd_id = enable ?
+               TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
+       ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
        if (ret)
                return -EINVAL;

@@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)

        psp->ras_context.context.initialized = false;

+       mutex_destroy(&psp->ras_context.mutex);
+
        return ret;
 }

@@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)

        ret = psp_ta_load(psp, &psp->ras_context.context);

-       if (!ret && !ras_cmd->ras_status)
+       if (!ret && !ras_cmd->ras_status) {
                psp->ras_context.context.initialized = true;
-       else {
+               mutex_init(&psp->ras_context.mutex);
+       } else {
                if (ras_cmd->ras_status)
                        dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);

@@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)  int psp_ras_trigger_error(struct psp_context *psp,
                          struct ta_ras_trigger_error_input *info, uint32_t instance_mask)  {
-       struct ta_ras_shared_memory *ras_cmd;
        struct amdgpu_device *adev = psp->adev;
        int ret;
        uint32_t dev_mask;
+       uint32_t ras_status;

-       if (!psp->ras_context.context.initialized)
+       if (!psp->ras_context.context.initialized || !info)
                return -EINVAL;

        switch (info->block_id) {
@@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
        dev_mask &= AMDGPU_RAS_INST_MASK;
        info->sub_block_index |= dev_mask;

-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
-       ras_cmd->ras_in_message.trigger_error = *info;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+       ret = psp_ras_send_cmd(psp,
+                       TA_RAS_COMMAND__TRIGGER_ERROR, info,
+ &ras_status);
        if (ret)
                return -EINVAL;

@@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
        if (amdgpu_ras_intr_triggered())
                return 0;

-       if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
+       if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
                return -EACCES;
-       else if (ras_cmd->ras_status)
+       else if (ras_status)
                return -EINVAL;

        return 0;
@@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
                          struct ta_ras_query_address_input *addr_in,
                          struct ta_ras_query_address_output *addr_out)  {
-       struct ta_ras_shared_memory *ras_cmd;
        int ret;

-       if (!psp->ras_context.context.initialized)
-               return -EINVAL;
-
-       ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-       memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-       ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
-       ras_cmd->ras_in_message.address = *addr_in;
-
-       ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
-       if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+       if (!psp->ras_context.context.initialized ||
+               !addr_in || !addr_out)
                return -EINVAL;

-       *addr_out = ras_cmd->ras_out_message.address;
+       ret = psp_ras_send_cmd(psp,
+                       TA_RAS_COMMAND__QUERY_ADDRESS, addr_in,
+ addr_out);

-       return 0;
+       return ret;
 }
 // ras end

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index ee16f134ae92..686023918ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -197,6 +197,7 @@ struct psp_xgmi_context {  struct psp_ras_context {
        struct ta_context               context;
        struct amdgpu_ras               *ras;
+       struct mutex                    mutex;
 };

 #define MEM_TRAIN_SYSTEM_SIGNATURE             0x54534942
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index ca5c86e5f7cd..87f213f92d83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size

        context->session_id = ta_id;

+       mutex_lock(&psp->ras_context.mutex);
        ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
        if (ret)
                goto err_free_shared_buf; @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
                ret = -EFAULT;

 err_free_shared_buf:
+       mutex_unlock(&psp->ras_context.mutex);
        kfree(shared_buf);

        return ret;
--
2.34.1



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdgpu: add mutex to protect ras shared memory
  2024-04-28  7:08 [PATCH] drm/amdgpu: add mutex to protect ras shared memory YiPeng Chai
  2024-04-28  7:47 ` Wang, Yang(Kevin)
@ 2024-04-29  7:30 ` Lazar, Lijo
  1 sibling, 0 replies; 4+ messages in thread
From: Lazar, Lijo @ 2024-04-29  7:30 UTC (permalink / raw
  To: YiPeng Chai, amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang



On 4/28/2024 12:38 PM, YiPeng Chai wrote:
> Add mutex to protect ras shared memory.
> 
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
>  3 files changed, 84 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 5583e2d1b12f..fa4fea00f6b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
>  	}
>  }
>  
> +static int psp_ras_send_cmd(struct psp_context *psp,
> +		enum ras_command cmd_id, void *in, void *out)
> +{
> +	struct ta_ras_shared_memory *ras_cmd;
> +	uint32_t cmd = cmd_id;
> +	int ret = 0;
> +
> +	mutex_lock(&psp->ras_context.mutex);
> +	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> +	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> +
> +	switch (cmd) {
> +	case TA_RAS_COMMAND__ENABLE_FEATURES:
> +	case TA_RAS_COMMAND__DISABLE_FEATURES:
> +		memcpy(&ras_cmd->ras_in_message,
> +			in, sizeof(ras_cmd->ras_in_message));
> +		break;
> +	case TA_RAS_COMMAND__TRIGGER_ERROR:
> +		memcpy(&ras_cmd->ras_in_message.trigger_error,
> +			in, sizeof(ras_cmd->ras_in_message.trigger_error));
> +		break;
> +	case TA_RAS_COMMAND__QUERY_ADDRESS:
> +		memcpy(&ras_cmd->ras_in_message.address,
> +			in, sizeof(ras_cmd->ras_in_message.address));
> +		break;
> +	default:
> +		dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
> +		ret = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	ras_cmd->cmd_id = cmd;
> +	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +
> +	switch (cmd) {
> +	case TA_RAS_COMMAND__TRIGGER_ERROR:
> +		if (out) {

As NULL check for 'out' is done before copying, better to do the same
for 'in' also.
> +			uint32_t *ras_status = (uint32_t *)out;
> +
> +			*ras_status = ras_cmd->ras_status;
> +		}
> +		break;
> +	case TA_RAS_COMMAND__QUERY_ADDRESS:
> +		if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +			ret = -EINVAL;
> +		else if (out)
> +			memcpy(out,
> +				&ras_cmd->ras_out_message.address,
> +				sizeof(ras_cmd->ras_out_message.address));
> +		break;
> +	default:
> +		break;
> +	}
> +
> +err_out:
> +	mutex_unlock(&psp->ras_context.mutex);
> +
> +	return ret;
> +}
> +
>  int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
>  {
>  	struct ta_ras_shared_memory *ras_cmd;
> @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
>  int psp_ras_enable_features(struct psp_context *psp,
>  		union ta_ras_cmd_input *info, bool enable)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
> +	enum ras_command cmd_id;
>  	int ret;
>  
> -	if (!psp->ras_context.context.initialized)
> +	if (!psp->ras_context.context.initialized || !info)
>  		return -EINVAL;
>  
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	if (enable)
> -		ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
> -	else
> -		ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
> -
> -	ras_cmd->ras_in_message = *info;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +	cmd_id = enable ?
> +		TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
> +	ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
>  	if (ret)
>  		return -EINVAL;
>  
> @@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)
>  
>  	psp->ras_context.context.initialized = false;
>  
> +	mutex_destroy(&psp->ras_context.mutex);
> +
>  	return ret;
>  }
>  
> @@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)
>  
>  	ret = psp_ta_load(psp, &psp->ras_context.context);
>  
> -	if (!ret && !ras_cmd->ras_status)
> +	if (!ret && !ras_cmd->ras_status) {
>  		psp->ras_context.context.initialized = true;
> -	else {
> +		mutex_init(&psp->ras_context.mutex);
> +	} else {
>  		if (ras_cmd->ras_status)
>  			dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);
>  
> @@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)
>  int psp_ras_trigger_error(struct psp_context *psp,
>  			  struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
>  	struct amdgpu_device *adev = psp->adev;
>  	int ret;
>  	uint32_t dev_mask;
> +	uint32_t ras_status;
>  
> -	if (!psp->ras_context.context.initialized)
> +	if (!psp->ras_context.context.initialized || !info)
>  		return -EINVAL;
>  
>  	switch (info->block_id) {
> @@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
>  	dev_mask &= AMDGPU_RAS_INST_MASK;
>  	info->sub_block_index |= dev_mask;
>  
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
> -	ras_cmd->ras_in_message.trigger_error = *info;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +	ret = psp_ras_send_cmd(psp,
> +			TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
>  	if (ret)
>  		return -EINVAL;
>  
> @@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
>  	if (amdgpu_ras_intr_triggered())
>  		return 0;
>  
> -	if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
> +	if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
>  		return -EACCES;
> -	else if (ras_cmd->ras_status)
> +	else if (ras_status)
>  		return -EINVAL;
>  
>  	return 0;
> @@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
>  			  struct ta_ras_query_address_input *addr_in,
>  			  struct ta_ras_query_address_output *addr_out)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
>  	int ret;
>  
> -	if (!psp->ras_context.context.initialized)
> -		return -EINVAL;
> -
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
> -	ras_cmd->ras_in_message.address = *addr_in;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> -	if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +	if (!psp->ras_context.context.initialized ||
> +		!addr_in || !addr_out)
>  		return -EINVAL;
>  
> -	*addr_out = ras_cmd->ras_out_message.address;
> +	ret = psp_ras_send_cmd(psp,
> +			TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);

return psp_ras_send_cmd() will do.
>  
> -	return 0;
> +	return ret;
>  }
>  // ras end
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index ee16f134ae92..686023918ce3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -197,6 +197,7 @@ struct psp_xgmi_context {
>  struct psp_ras_context {
>  	struct ta_context		context;
>  	struct amdgpu_ras		*ras;
> +	struct mutex			mutex;
>  };
>  
>  #define MEM_TRAIN_SYSTEM_SIGNATURE		0x54534942
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> index ca5c86e5f7cd..87f213f92d83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
>  
>  	context->session_id = ta_id;
>  
> +	mutex_lock(&psp->ras_context.mutex);
>  	ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
>  	if (ret)
>  		goto err_free_shared_buf;
> @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
>  		ret = -EFAULT;
>  
>  err_free_shared_buf:

This error label is used at other places as well and that happens even
before acquiring the mutex.

Thanks,
Lijo
> +	mutex_unlock(&psp->ras_context.mutex);
>  	kfree(shared_buf);
>  
>  	return ret;

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-04-29  7:30 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-28  7:08 [PATCH] drm/amdgpu: add mutex to protect ras shared memory YiPeng Chai
2024-04-28  7:47 ` Wang, Yang(Kevin)
2024-04-29  3:09   ` Chai, Thomas
2024-04-29  7:30 ` Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.