LKML Archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] habanalabs: add uapi to retrieve engines status
@ 2022-07-21  8:41 Oded Gabbay
  2022-07-21  8:41 ` [PATCH 2/2] habanalabs/gaudi2: mark PCIE access error as fatal Oded Gabbay
  0 siblings, 1 reply; 2+ messages in thread
From: Oded Gabbay @ 2022-07-21  8:41 UTC (permalink / raw
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Currently, to get the engines status, a user needs to read a debugfs file,
which requires root permissions.

This new uapi allows user space apps to retrieve the status. For example,
in case of a failure, the status can be retrieved immediately by the
application itself, which runs without root permissions.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c      |  3 +-
 .../misc/habanalabs/common/habanalabs_ioctl.c | 40 +++++++++++++++++++
 include/uapi/misc/habanalabs.h                |  9 +++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 90c91c1b2c10..c297248748d3 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -17,7 +17,6 @@
 #define MMU_ASID_BUF_SIZE	10
 #define MMU_KBUF_SIZE		(MMU_ADDR_BUF_SIZE + MMU_ASID_BUF_SIZE)
 #define I2C_MAX_TRANSACTION_LEN	8
-#define ENGINES_DATA_MAX_SIZE	SZ_16K
 
 static struct dentry *hl_debug_root;
 
@@ -626,7 +625,7 @@ static int engines_show(struct seq_file *s, void *data)
 	}
 
 	eng_data.actual_size = 0;
-	eng_data.allocated_buf_size = ENGINES_DATA_MAX_SIZE;
+	eng_data.allocated_buf_size = HL_ENGINES_DATA_MAX_SIZE;
 	eng_data.buf = vmalloc(eng_data.allocated_buf_size);
 	if (!eng_data.buf)
 		return -ENOMEM;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 6a30bd98ab5e..ec55c66fedd6 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
 	[HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr),
@@ -697,6 +698,42 @@ static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	return 0;
 }
 
+static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	u32 status_buf_size = args->return_size;
+	struct hl_device *hdev = hpriv->hdev;
+	struct engines_data eng_data;
+	int rc;
+
+	if ((status_buf_size < SZ_1K) || (status_buf_size > HL_ENGINES_DATA_MAX_SIZE) || (!out))
+		return -EINVAL;
+
+	eng_data.actual_size = 0;
+	eng_data.allocated_buf_size = status_buf_size;
+	eng_data.buf = vmalloc(status_buf_size);
+	if (!eng_data.buf)
+		return -ENOMEM;
+
+	hdev->asic_funcs->is_device_idle(hdev, NULL, 0, &eng_data);
+
+	if (eng_data.actual_size > eng_data.allocated_buf_size) {
+		dev_err(hdev->dev,
+			"Engines data size (%d Bytes) is bigger than allocated size (%u Bytes)\n",
+			eng_data.actual_size, status_buf_size);
+		vfree(eng_data.buf);
+		return -ENOMEM;
+	}
+
+	args->user_buffer_actual_size = eng_data.actual_size;
+	rc = copy_to_user(out, eng_data.buf, min_t(size_t, status_buf_size, eng_data.actual_size)) ?
+				-EFAULT : 0;
+
+	vfree(eng_data.buf);
+
+	return rc;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -812,6 +849,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_UNREGISTER_EVENTFD:
 		return eventfd_unregister(hpriv, args);
 
+	case HL_INFO_ENGINE_STATUS:
+		return engine_status_info(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -EINVAL;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index be06b1307c44..83ca6f40f4ba 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -787,10 +787,14 @@ enum hl_server_type {
 #define HL_INFO_UNREGISTER_EVENTFD		29
 #define HL_INFO_GET_EVENTS			30
 #define HL_INFO_UNDEFINED_OPCODE_EVENT		31
+#define HL_INFO_ENGINE_STATUS			32
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
 
+/* Maximum buffer size for retrieving engines status */
+#define HL_ENGINES_DATA_MAX_SIZE	SZ_1M
+
 /**
  * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC
  * @sram_base_address: The first SRAM physical base address that is free to be
@@ -1130,6 +1134,10 @@ enum gaudi_dcores {
  *             resolution. Currently not in use.
  * @pll_index: Index as defined in hl_<asic type>_pll_index enumeration.
  * @eventfd: event file descriptor for event notifications.
+ * @user_buffer_actual_size: Actual size of the data that the driver copied to the user-allocated
+ *                           buffer. The user may allocate a buffer larger than needed, so the
+ *                           driver updates this field to report the exact number of bytes
+ *                           it copied to the buffer.
  * @pad: Padding to 64 bit.
  */
 struct hl_info_args {
@@ -1143,6 +1151,7 @@ struct hl_info_args {
 		__u32 period_ms;
 		__u32 pll_index;
 		__u32 eventfd;
+		__u32 user_buffer_actual_size;
 	};
 
 	__u32 pad;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH 2/2] habanalabs/gaudi2: mark PCIE access error as fatal
  2022-07-21  8:41 [PATCH 1/2] habanalabs: add uapi to retrieve engines status Oded Gabbay
@ 2022-07-21  8:41 ` Oded Gabbay
  0 siblings, 0 replies; 2+ messages in thread
From: Oded Gabbay @ 2022-07-21  8:41 UTC (permalink / raw
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

F/W events are enabled in a late phase of the device init, so an event
for a PCIE access error that occurs during the init can be received after
the init is already done and considered successful.
A resulting device reset, which does the same H/W init, can end
similarly, with this event arriving right after the reset is done and
considered successful, and this sequence can repeat in a loop.

To avoid this, mark the PCIE access error as a fatal event, so that after 2
consecutive events no more resets will be done.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 2c43ed403509..68ab407fa6ba 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8532,6 +8532,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR:
 		gaudi2_print_pcie_addr_dec_info(hdev,
 				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		break;
 
 	case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-07-21  8:42 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-07-21  8:41 [PATCH 1/2] habanalabs: add uapi to retrieve engines status Oded Gabbay
2022-07-21  8:41 ` [PATCH 2/2] habanalabs/gaudi2: mark PCIE access error as fatal Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).