NVDIMM Device and Persistent Memory development
 help / color / mirror / Atom feed
* [ndctl PATCH] libndctl/msft: Improve "smart" state reporting
@ 2022-11-17 21:09 Alexander Motin
  2022-11-17 22:07 ` Alison Schofield
  0 siblings, 1 reply; 4+ messages in thread
From: Alexander Motin @ 2022-11-17 21:09 UTC (permalink / raw
  To: nvdimm; +Cc: Alexander Motin

Previous code reported "smart" state based on number of bits
set in the module health field.  But actually any single bit
set there already means critical failure.  Rework the logic
according to specification, properly reporting non-critical
state in case of warning threshold reached, critical in case
of any module health bit set or error threshold reached and
fatal if NVDIMM exhausted its life time.  In attempt to
report the cause of failure in absence of better methods,
report reached thresholds as more or less matching alarms.

While there clean up the code, making it more uniform with
others and allowing more methods to be implemented later.

Signed-off-by: Alexander Motin <mav@ixsystems.com>
---
 ndctl/lib/msft.c | 111 ++++++++++++++++++++++++++++++++---------------
 ndctl/lib/msft.h |  13 ++----
 2 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
index 3112799..8f66c97 100644
--- a/ndctl/lib/msft.c
+++ b/ndctl/lib/msft.c
@@ -2,6 +2,7 @@
 // Copyright (C) 2016-2017 Dell, Inc.
 // Copyright (C) 2016 Hewlett Packard Enterprise Development LP
 // Copyright (C) 2016-2020, Intel Corporation.
+/* Copyright (C) 2022 iXsystems, Inc. */
 #include <stdlib.h>
 #include <limits.h>
 #include <util/log.h>
@@ -12,12 +13,39 @@
 #define CMD_MSFT(_c) ((_c)->msft)
 #define CMD_MSFT_SMART(_c) (CMD_MSFT(_c)->u.smart.data)
 
+static const char *msft_cmd_desc(int fn)
+{
+	static const char * const descs[] = {
+		[NDN_MSFT_CMD_CHEALTH] = "critical_health",
+		[NDN_MSFT_CMD_NHEALTH] = "nvdimm_health",
+		[NDN_MSFT_CMD_EHEALTH] = "es_health",
+	};
+	const char *desc;
+
+	if (fn >= (int) ARRAY_SIZE(descs))
+		return "unknown";
+	desc = descs[fn];
+	if (!desc)
+		return "unknown";
+	return desc;
+}
+
+static bool msft_cmd_is_supported(struct ndctl_dimm *dimm, int cmd)
+{
+	/* Handle this separately to support monitor mode */
+	if (cmd == ND_CMD_SMART)
+		return true;
+
+	return !!(dimm->cmd_mask & (1ULL << cmd));
+}
+
 static u32 msft_get_firmware_status(struct ndctl_cmd *cmd)
 {
 	return cmd->msft->u.smart.status;
 }
 
-static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
+static struct ndctl_cmd *alloc_msft_cmd(struct ndctl_dimm *dimm,
+		unsigned int func, size_t in_size, size_t out_size)
 {
 	struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
 	struct ndctl_ctx *ctx = ndctl_bus_get_ctx(bus);
@@ -30,12 +58,12 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
 		return NULL;
 	}
 
-	if (test_dimm_dsm(dimm, NDN_MSFT_CMD_SMART) == DIMM_DSM_UNSUPPORTED) {
+	if (test_dimm_dsm(dimm, func) == DIMM_DSM_UNSUPPORTED) {
 		dbg(ctx, "unsupported function\n");
 		return NULL;
 	}
 
-	size = sizeof(*cmd) + sizeof(struct ndn_pkg_msft);
+	size = sizeof(*cmd) + sizeof(struct nd_cmd_pkg) + in_size + out_size;
 	cmd = calloc(1, size);
 	if (!cmd)
 		return NULL;
@@ -45,25 +73,30 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
 	cmd->type = ND_CMD_CALL;
 	cmd->size = size;
 	cmd->status = 1;
+	cmd->get_firmware_status = msft_get_firmware_status;
 
 	msft = CMD_MSFT(cmd);
 	msft->gen.nd_family = NVDIMM_FAMILY_MSFT;
-	msft->gen.nd_command = NDN_MSFT_CMD_SMART;
+	msft->gen.nd_command = func;
 	msft->gen.nd_fw_size = 0;
-	msft->gen.nd_size_in = offsetof(struct ndn_msft_smart, status);
-	msft->gen.nd_size_out = sizeof(msft->u.smart);
+	msft->gen.nd_size_in = in_size;
+	msft->gen.nd_size_out = out_size;
 	msft->u.smart.status = 0;
-	cmd->get_firmware_status = msft_get_firmware_status;
 
 	return cmd;
 }
 
+static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
+{
+	return (alloc_msft_cmd(dimm, NDN_MSFT_CMD_NHEALTH,
+			0, sizeof(struct ndn_msft_smart)));
+}
+
 static int msft_smart_valid(struct ndctl_cmd *cmd)
 {
 	if (cmd->type != ND_CMD_CALL ||
-	    cmd->size != sizeof(*cmd) + sizeof(struct ndn_pkg_msft) ||
 	    CMD_MSFT(cmd)->gen.nd_family != NVDIMM_FAMILY_MSFT ||
-	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_SMART ||
+	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_NHEALTH ||
 	    cmd->status != 0)
 		return cmd->status < 0 ? cmd->status : -EINVAL;
 	return 0;
@@ -80,28 +113,33 @@ static unsigned int msft_cmd_smart_get_flags(struct ndctl_cmd *cmd)
 	}
 
 	/* below health data can be retrieved via MSFT _DSM function 11 */
-	return NDN_MSFT_SMART_HEALTH_VALID |
-		NDN_MSFT_SMART_TEMP_VALID |
-		NDN_MSFT_SMART_USED_VALID;
+	return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
+	    ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
 }
 
-static unsigned int num_set_bit_health(__u16 num)
+static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
 {
-	int i;
-	__u16 n = num & 0x7FFF;
-	unsigned int count = 0;
+	unsigned int health = 0;
+	int rc;
 
-	for (i = 0; i < 15; i++)
-		if (!!(n & (1 << i)))
-			count++;
+	rc = msft_smart_valid(cmd);
+	if (rc < 0) {
+		errno = -rc;
+		return UINT_MAX;
+	}
 
-	return count;
+	if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
+		health |= ND_SMART_FATAL_HEALTH;
+	if (CMD_MSFT_SMART(cmd)->health != 0 ||
+	    CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
+		health |= ND_SMART_CRITICAL_HEALTH;
+	if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
+		health |= ND_SMART_NON_CRITICAL_HEALTH;
+	return health;
 }
 
-static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
+static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
 {
-	unsigned int health;
-	unsigned int num;
 	int rc;
 
 	rc = msft_smart_valid(cmd);
@@ -110,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
 		return UINT_MAX;
 	}
 
-	num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
-	if (num == 0)
-		health = 0;
-	else if (num < 2)
-		health = ND_SMART_NON_CRITICAL_HEALTH;
-	else if (num < 3)
-		health = ND_SMART_CRITICAL_HEALTH;
-	else
-		health = ND_SMART_FATAL_HEALTH;
-
-	return health;
+	return CMD_MSFT_SMART(cmd)->temp * 16;
 }
 
-static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
+static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
 {
+	__u8 stat;
+	unsigned int flags = 0;
 	int rc;
 
 	rc = msft_smart_valid(cmd);
@@ -133,7 +163,13 @@ static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
 		return UINT_MAX;
 	}
 
-	return CMD_MSFT_SMART(cmd)->temp * 16;
+	stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
+	    CMD_MSFT_SMART(cmd)->warn_thresh_stat;
+	if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
+		flags |= ND_SMART_SPARE_TRIP;
+	if (stat & 4) /* ES_TEMP */
+		flags |= ND_SMART_CTEMP_TRIP;
+	return flags;
 }
 
 static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
@@ -171,10 +207,13 @@ static int msft_cmd_xlat_firmware_status(struct ndctl_cmd *cmd)
 }
 
 struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
+	.cmd_desc = msft_cmd_desc,
+	.cmd_is_supported = msft_cmd_is_supported,
 	.new_smart = msft_dimm_cmd_new_smart,
 	.smart_get_flags = msft_cmd_smart_get_flags,
 	.smart_get_health = msft_cmd_smart_get_health,
 	.smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
+	.smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
 	.smart_get_life_used = msft_cmd_smart_get_life_used,
 	.xlat_firmware_status = msft_cmd_xlat_firmware_status,
 };
diff --git a/ndctl/lib/msft.h b/ndctl/lib/msft.h
index 978cc11..8d246a5 100644
--- a/ndctl/lib/msft.h
+++ b/ndctl/lib/msft.h
@@ -2,21 +2,16 @@
 /* Copyright (C) 2016-2017 Dell, Inc. */
 /* Copyright (C) 2016 Hewlett Packard Enterprise Development LP */
 /* Copyright (C) 2014-2020, Intel Corporation. */
+/* Copyright (C) 2022 iXsystems, Inc. */
 #ifndef __NDCTL_MSFT_H__
 #define __NDCTL_MSFT_H__
 
 enum {
-	NDN_MSFT_CMD_QUERY = 0,
-
-	/* non-root commands */
-	NDN_MSFT_CMD_SMART = 11,
+	NDN_MSFT_CMD_CHEALTH = 10,
+	NDN_MSFT_CMD_NHEALTH = 11,
+	NDN_MSFT_CMD_EHEALTH = 12,
 };
 
-/* NDN_MSFT_CMD_SMART */
-#define NDN_MSFT_SMART_HEALTH_VALID	ND_SMART_HEALTH_VALID
-#define NDN_MSFT_SMART_TEMP_VALID	ND_SMART_TEMP_VALID
-#define NDN_MSFT_SMART_USED_VALID	ND_SMART_USED_VALID
-
 /*
  * This is actually function 11 data,
  * This is the closest I can find to match smart
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [ndctl PATCH] libndctl/msft: Improve "smart" state reporting
  2022-11-17 21:09 [ndctl PATCH] libndctl/msft: Improve "smart" state reporting Alexander Motin
@ 2022-11-17 22:07 ` Alison Schofield
  2022-11-17 22:41   ` Alexander Motin
  0 siblings, 1 reply; 4+ messages in thread
From: Alison Schofield @ 2022-11-17 22:07 UTC (permalink / raw
  To: Alexander Motin; +Cc: nvdimm

On Thu, Nov 17, 2022 at 04:09:36PM -0500, Alexander Motin wrote:
> Previous code reported "smart" state based on number of bits
> set in the module health field.  But actually any single bit
> set there already means critical failure.  Rework the logic
> according to specification, properly reporting non-critical
> state in case of warning threshold reached, critical in case
> of any module health bit set or error threshold reached and
> fatal if NVDIMM exhausted its life time.  In attempt to
> report the cause of failure in absence of better methods,
> report reached thresholds as more or less matching alarms.
> 
> While there clean up the code, making it more uniform with
> others and allowing more methods to be implemented later.

Hi Alexander,

Perhaps this would be better presented in 2 patches:
1)the cleanup and then 2) improvement of smart state reporting.

Alison

> 
> Signed-off-by: Alexander Motin <mav@ixsystems.com>
> ---
>  ndctl/lib/msft.c | 111 ++++++++++++++++++++++++++++++++---------------
>  ndctl/lib/msft.h |  13 ++----
>  2 files changed, 79 insertions(+), 45 deletions(-)
> 
> diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
> index 3112799..8f66c97 100644
> --- a/ndctl/lib/msft.c
> +++ b/ndctl/lib/msft.c
> @@ -2,6 +2,7 @@
>  // Copyright (C) 2016-2017 Dell, Inc.
>  // Copyright (C) 2016 Hewlett Packard Enterprise Development LP
>  // Copyright (C) 2016-2020, Intel Corporation.
> +/* Copyright (C) 2022 iXsystems, Inc. */
>  #include <stdlib.h>
>  #include <limits.h>
>  #include <util/log.h>
> @@ -12,12 +13,39 @@
>  #define CMD_MSFT(_c) ((_c)->msft)
>  #define CMD_MSFT_SMART(_c) (CMD_MSFT(_c)->u.smart.data)
>  
> +static const char *msft_cmd_desc(int fn)
> +{
> +	static const char * const descs[] = {
> +		[NDN_MSFT_CMD_CHEALTH] = "critical_health",
> +		[NDN_MSFT_CMD_NHEALTH] = "nvdimm_health",
> +		[NDN_MSFT_CMD_EHEALTH] = "es_health",
> +	};
> +	const char *desc;
> +
> +	if (fn >= (int) ARRAY_SIZE(descs))
> +		return "unknown";
> +	desc = descs[fn];
> +	if (!desc)
> +		return "unknown";
> +	return desc;
> +}
> +
> +static bool msft_cmd_is_supported(struct ndctl_dimm *dimm, int cmd)
> +{
> +	/* Handle this separately to support monitor mode */
> +	if (cmd == ND_CMD_SMART)
> +		return true;
> +
> +	return !!(dimm->cmd_mask & (1ULL << cmd));
> +}
> +
>  static u32 msft_get_firmware_status(struct ndctl_cmd *cmd)
>  {
>  	return cmd->msft->u.smart.status;
>  }
>  
> -static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
> +static struct ndctl_cmd *alloc_msft_cmd(struct ndctl_dimm *dimm,
> +		unsigned int func, size_t in_size, size_t out_size)
>  {
>  	struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
>  	struct ndctl_ctx *ctx = ndctl_bus_get_ctx(bus);
> @@ -30,12 +58,12 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>  		return NULL;
>  	}
>  
> -	if (test_dimm_dsm(dimm, NDN_MSFT_CMD_SMART) == DIMM_DSM_UNSUPPORTED) {
> +	if (test_dimm_dsm(dimm, func) == DIMM_DSM_UNSUPPORTED) {
>  		dbg(ctx, "unsupported function\n");
>  		return NULL;
>  	}
>  
> -	size = sizeof(*cmd) + sizeof(struct ndn_pkg_msft);
> +	size = sizeof(*cmd) + sizeof(struct nd_cmd_pkg) + in_size + out_size;
>  	cmd = calloc(1, size);
>  	if (!cmd)
>  		return NULL;
> @@ -45,25 +73,30 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>  	cmd->type = ND_CMD_CALL;
>  	cmd->size = size;
>  	cmd->status = 1;
> +	cmd->get_firmware_status = msft_get_firmware_status;
>  
>  	msft = CMD_MSFT(cmd);
>  	msft->gen.nd_family = NVDIMM_FAMILY_MSFT;
> -	msft->gen.nd_command = NDN_MSFT_CMD_SMART;
> +	msft->gen.nd_command = func;
>  	msft->gen.nd_fw_size = 0;
> -	msft->gen.nd_size_in = offsetof(struct ndn_msft_smart, status);
> -	msft->gen.nd_size_out = sizeof(msft->u.smart);
> +	msft->gen.nd_size_in = in_size;
> +	msft->gen.nd_size_out = out_size;
>  	msft->u.smart.status = 0;
> -	cmd->get_firmware_status = msft_get_firmware_status;
>  
>  	return cmd;
>  }
>  
> +static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
> +{
> +	return (alloc_msft_cmd(dimm, NDN_MSFT_CMD_NHEALTH,
> +			0, sizeof(struct ndn_msft_smart)));
> +}
> +
>  static int msft_smart_valid(struct ndctl_cmd *cmd)
>  {
>  	if (cmd->type != ND_CMD_CALL ||
> -	    cmd->size != sizeof(*cmd) + sizeof(struct ndn_pkg_msft) ||
>  	    CMD_MSFT(cmd)->gen.nd_family != NVDIMM_FAMILY_MSFT ||
> -	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_SMART ||
> +	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_NHEALTH ||
>  	    cmd->status != 0)
>  		return cmd->status < 0 ? cmd->status : -EINVAL;
>  	return 0;
> @@ -80,28 +113,33 @@ static unsigned int msft_cmd_smart_get_flags(struct ndctl_cmd *cmd)
>  	}
>  
>  	/* below health data can be retrieved via MSFT _DSM function 11 */
> -	return NDN_MSFT_SMART_HEALTH_VALID |
> -		NDN_MSFT_SMART_TEMP_VALID |
> -		NDN_MSFT_SMART_USED_VALID;
> +	return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
> +	    ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
>  }
>  
> -static unsigned int num_set_bit_health(__u16 num)
> +static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>  {
> -	int i;
> -	__u16 n = num & 0x7FFF;
> -	unsigned int count = 0;
> +	unsigned int health = 0;
> +	int rc;
>  
> -	for (i = 0; i < 15; i++)
> -		if (!!(n & (1 << i)))
> -			count++;
> +	rc = msft_smart_valid(cmd);
> +	if (rc < 0) {
> +		errno = -rc;
> +		return UINT_MAX;
> +	}
>  
> -	return count;
> +	if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
> +		health |= ND_SMART_FATAL_HEALTH;
> +	if (CMD_MSFT_SMART(cmd)->health != 0 ||
> +	    CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
> +		health |= ND_SMART_CRITICAL_HEALTH;
> +	if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
> +		health |= ND_SMART_NON_CRITICAL_HEALTH;
> +	return health;
>  }
>  
> -static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
> +static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>  {
> -	unsigned int health;
> -	unsigned int num;
>  	int rc;
>  
>  	rc = msft_smart_valid(cmd);
> @@ -110,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>  		return UINT_MAX;
>  	}
>  
> -	num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
> -	if (num == 0)
> -		health = 0;
> -	else if (num < 2)
> -		health = ND_SMART_NON_CRITICAL_HEALTH;
> -	else if (num < 3)
> -		health = ND_SMART_CRITICAL_HEALTH;
> -	else
> -		health = ND_SMART_FATAL_HEALTH;
> -
> -	return health;
> +	return CMD_MSFT_SMART(cmd)->temp * 16;
>  }
>  
> -static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
> +static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
>  {
> +	__u8 stat;
> +	unsigned int flags = 0;
>  	int rc;
>  
>  	rc = msft_smart_valid(cmd);
> @@ -133,7 +163,13 @@ static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>  		return UINT_MAX;
>  	}
>  
> -	return CMD_MSFT_SMART(cmd)->temp * 16;
> +	stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
> +	    CMD_MSFT_SMART(cmd)->warn_thresh_stat;
> +	if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
> +		flags |= ND_SMART_SPARE_TRIP;
> +	if (stat & 4) /* ES_TEMP */
> +		flags |= ND_SMART_CTEMP_TRIP;
> +	return flags;
>  }
>  
>  static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
> @@ -171,10 +207,13 @@ static int msft_cmd_xlat_firmware_status(struct ndctl_cmd *cmd)
>  }
>  
>  struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
> +	.cmd_desc = msft_cmd_desc,
> +	.cmd_is_supported = msft_cmd_is_supported,
>  	.new_smart = msft_dimm_cmd_new_smart,
>  	.smart_get_flags = msft_cmd_smart_get_flags,
>  	.smart_get_health = msft_cmd_smart_get_health,
>  	.smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
> +	.smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
>  	.smart_get_life_used = msft_cmd_smart_get_life_used,
>  	.xlat_firmware_status = msft_cmd_xlat_firmware_status,
>  };
> diff --git a/ndctl/lib/msft.h b/ndctl/lib/msft.h
> index 978cc11..8d246a5 100644
> --- a/ndctl/lib/msft.h
> +++ b/ndctl/lib/msft.h
> @@ -2,21 +2,16 @@
>  /* Copyright (C) 2016-2017 Dell, Inc. */
>  /* Copyright (C) 2016 Hewlett Packard Enterprise Development LP */
>  /* Copyright (C) 2014-2020, Intel Corporation. */
> +/* Copyright (C) 2022 iXsystems, Inc. */
>  #ifndef __NDCTL_MSFT_H__
>  #define __NDCTL_MSFT_H__
>  
>  enum {
> -	NDN_MSFT_CMD_QUERY = 0,
> -
> -	/* non-root commands */
> -	NDN_MSFT_CMD_SMART = 11,
> +	NDN_MSFT_CMD_CHEALTH = 10,
> +	NDN_MSFT_CMD_NHEALTH = 11,
> +	NDN_MSFT_CMD_EHEALTH = 12,
>  };
>  
> -/* NDN_MSFT_CMD_SMART */
> -#define NDN_MSFT_SMART_HEALTH_VALID	ND_SMART_HEALTH_VALID
> -#define NDN_MSFT_SMART_TEMP_VALID	ND_SMART_TEMP_VALID
> -#define NDN_MSFT_SMART_USED_VALID	ND_SMART_USED_VALID
> -
>  /*
>   * This is actually function 11 data,
>   * This is the closest I can find to match smart
> -- 
> 2.30.2
> 
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [ndctl PATCH] libndctl/msft: Improve "smart" state reporting
  2022-11-17 22:07 ` Alison Schofield
@ 2022-11-17 22:41   ` Alexander Motin
  2022-12-02 15:59     ` Alexander Motin
  0 siblings, 1 reply; 4+ messages in thread
From: Alexander Motin @ 2022-11-17 22:41 UTC (permalink / raw
  To: Alison Schofield; +Cc: nvdimm

On 17.11.2022 17:07, Alison Schofield wrote:
> On Thu, Nov 17, 2022 at 04:09:36PM -0500, Alexander Motin wrote:
>> Previous code reported "smart" state based on number of bits
>> set in the module health field.  But actually any single bit
>> set there already means critical failure.  Rework the logic
>> according to specification, properly reporting non-critical
>> state in case of warning threshold reached, critical in case
>> of any module health bit set or error threshold reached and
>> fatal if NVDIMM exhausted its life time.  In attempt to
>> report the cause of failure in absence of better methods,
>> report reached thresholds as more or less matching alarms.
>>
>> While there clean up the code, making it more uniform with
>> others and allowing more methods to be implemented later.
> 
> Hi Alexander,
> 
> Perhaps this would be better presented in 2 patches:
> 1)the cleanup and then 2) improvement of smart state reporting.

Done.  Trying to convince myself it is not a waste of time...

> Alison
> 
>>
>> Signed-off-by: Alexander Motin <mav@ixsystems.com>
>> ---
>>   ndctl/lib/msft.c | 111 ++++++++++++++++++++++++++++++++---------------
>>   ndctl/lib/msft.h |  13 ++----
>>   2 files changed, 79 insertions(+), 45 deletions(-)
>>
>> diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
>> index 3112799..8f66c97 100644
>> --- a/ndctl/lib/msft.c
>> +++ b/ndctl/lib/msft.c
>> @@ -2,6 +2,7 @@
>>   // Copyright (C) 2016-2017 Dell, Inc.
>>   // Copyright (C) 2016 Hewlett Packard Enterprise Development LP
>>   // Copyright (C) 2016-2020, Intel Corporation.
>> +/* Copyright (C) 2022 iXsystems, Inc. */
>>   #include <stdlib.h>
>>   #include <limits.h>
>>   #include <util/log.h>
>> @@ -12,12 +13,39 @@
>>   #define CMD_MSFT(_c) ((_c)->msft)
>>   #define CMD_MSFT_SMART(_c) (CMD_MSFT(_c)->u.smart.data)
>>   
>> +static const char *msft_cmd_desc(int fn)
>> +{
>> +	static const char * const descs[] = {
>> +		[NDN_MSFT_CMD_CHEALTH] = "critical_health",
>> +		[NDN_MSFT_CMD_NHEALTH] = "nvdimm_health",
>> +		[NDN_MSFT_CMD_EHEALTH] = "es_health",
>> +	};
>> +	const char *desc;
>> +
>> +	if (fn >= (int) ARRAY_SIZE(descs))
>> +		return "unknown";
>> +	desc = descs[fn];
>> +	if (!desc)
>> +		return "unknown";
>> +	return desc;
>> +}
>> +
>> +static bool msft_cmd_is_supported(struct ndctl_dimm *dimm, int cmd)
>> +{
>> +	/* Handle this separately to support monitor mode */
>> +	if (cmd == ND_CMD_SMART)
>> +		return true;
>> +
>> +	return !!(dimm->cmd_mask & (1ULL << cmd));
>> +}
>> +
>>   static u32 msft_get_firmware_status(struct ndctl_cmd *cmd)
>>   {
>>   	return cmd->msft->u.smart.status;
>>   }
>>   
>> -static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>> +static struct ndctl_cmd *alloc_msft_cmd(struct ndctl_dimm *dimm,
>> +		unsigned int func, size_t in_size, size_t out_size)
>>   {
>>   	struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
>>   	struct ndctl_ctx *ctx = ndctl_bus_get_ctx(bus);
>> @@ -30,12 +58,12 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>>   		return NULL;
>>   	}
>>   
>> -	if (test_dimm_dsm(dimm, NDN_MSFT_CMD_SMART) == DIMM_DSM_UNSUPPORTED) {
>> +	if (test_dimm_dsm(dimm, func) == DIMM_DSM_UNSUPPORTED) {
>>   		dbg(ctx, "unsupported function\n");
>>   		return NULL;
>>   	}
>>   
>> -	size = sizeof(*cmd) + sizeof(struct ndn_pkg_msft);
>> +	size = sizeof(*cmd) + sizeof(struct nd_cmd_pkg) + in_size + out_size;
>>   	cmd = calloc(1, size);
>>   	if (!cmd)
>>   		return NULL;
>> @@ -45,25 +73,30 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>>   	cmd->type = ND_CMD_CALL;
>>   	cmd->size = size;
>>   	cmd->status = 1;
>> +	cmd->get_firmware_status = msft_get_firmware_status;
>>   
>>   	msft = CMD_MSFT(cmd);
>>   	msft->gen.nd_family = NVDIMM_FAMILY_MSFT;
>> -	msft->gen.nd_command = NDN_MSFT_CMD_SMART;
>> +	msft->gen.nd_command = func;
>>   	msft->gen.nd_fw_size = 0;
>> -	msft->gen.nd_size_in = offsetof(struct ndn_msft_smart, status);
>> -	msft->gen.nd_size_out = sizeof(msft->u.smart);
>> +	msft->gen.nd_size_in = in_size;
>> +	msft->gen.nd_size_out = out_size;
>>   	msft->u.smart.status = 0;
>> -	cmd->get_firmware_status = msft_get_firmware_status;
>>   
>>   	return cmd;
>>   }
>>   
>> +static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
>> +{
>> +	return (alloc_msft_cmd(dimm, NDN_MSFT_CMD_NHEALTH,
>> +			0, sizeof(struct ndn_msft_smart)));
>> +}
>> +
>>   static int msft_smart_valid(struct ndctl_cmd *cmd)
>>   {
>>   	if (cmd->type != ND_CMD_CALL ||
>> -	    cmd->size != sizeof(*cmd) + sizeof(struct ndn_pkg_msft) ||
>>   	    CMD_MSFT(cmd)->gen.nd_family != NVDIMM_FAMILY_MSFT ||
>> -	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_SMART ||
>> +	    CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_NHEALTH ||
>>   	    cmd->status != 0)
>>   		return cmd->status < 0 ? cmd->status : -EINVAL;
>>   	return 0;
>> @@ -80,28 +113,33 @@ static unsigned int msft_cmd_smart_get_flags(struct ndctl_cmd *cmd)
>>   	}
>>   
>>   	/* below health data can be retrieved via MSFT _DSM function 11 */
>> -	return NDN_MSFT_SMART_HEALTH_VALID |
>> -		NDN_MSFT_SMART_TEMP_VALID |
>> -		NDN_MSFT_SMART_USED_VALID;
>> +	return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
>> +	    ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
>>   }
>>   
>> -static unsigned int num_set_bit_health(__u16 num)
>> +static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>>   {
>> -	int i;
>> -	__u16 n = num & 0x7FFF;
>> -	unsigned int count = 0;
>> +	unsigned int health = 0;
>> +	int rc;
>>   
>> -	for (i = 0; i < 15; i++)
>> -		if (!!(n & (1 << i)))
>> -			count++;
>> +	rc = msft_smart_valid(cmd);
>> +	if (rc < 0) {
>> +		errno = -rc;
>> +		return UINT_MAX;
>> +	}
>>   
>> -	return count;
>> +	if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
>> +		health |= ND_SMART_FATAL_HEALTH;
>> +	if (CMD_MSFT_SMART(cmd)->health != 0 ||
>> +	    CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
>> +		health |= ND_SMART_CRITICAL_HEALTH;
>> +	if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
>> +		health |= ND_SMART_NON_CRITICAL_HEALTH;
>> +	return health;
>>   }
>>   
>> -static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>> +static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>>   {
>> -	unsigned int health;
>> -	unsigned int num;
>>   	int rc;
>>   
>>   	rc = msft_smart_valid(cmd);
>> @@ -110,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>>   		return UINT_MAX;
>>   	}
>>   
>> -	num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
>> -	if (num == 0)
>> -		health = 0;
>> -	else if (num < 2)
>> -		health = ND_SMART_NON_CRITICAL_HEALTH;
>> -	else if (num < 3)
>> -		health = ND_SMART_CRITICAL_HEALTH;
>> -	else
>> -		health = ND_SMART_FATAL_HEALTH;
>> -
>> -	return health;
>> +	return CMD_MSFT_SMART(cmd)->temp * 16;
>>   }
>>   
>> -static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>> +static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
>>   {
>> +	__u8 stat;
>> +	unsigned int flags = 0;
>>   	int rc;
>>   
>>   	rc = msft_smart_valid(cmd);
>> @@ -133,7 +163,13 @@ static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>>   		return UINT_MAX;
>>   	}
>>   
>> -	return CMD_MSFT_SMART(cmd)->temp * 16;
>> +	stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
>> +	    CMD_MSFT_SMART(cmd)->warn_thresh_stat;
>> +	if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
>> +		flags |= ND_SMART_SPARE_TRIP;
>> +	if (stat & 4) /* ES_TEMP */
>> +		flags |= ND_SMART_CTEMP_TRIP;
>> +	return flags;
>>   }
>>   
>>   static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
>> @@ -171,10 +207,13 @@ static int msft_cmd_xlat_firmware_status(struct ndctl_cmd *cmd)
>>   }
>>   
>>   struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
>> +	.cmd_desc = msft_cmd_desc,
>> +	.cmd_is_supported = msft_cmd_is_supported,
>>   	.new_smart = msft_dimm_cmd_new_smart,
>>   	.smart_get_flags = msft_cmd_smart_get_flags,
>>   	.smart_get_health = msft_cmd_smart_get_health,
>>   	.smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
>> +	.smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
>>   	.smart_get_life_used = msft_cmd_smart_get_life_used,
>>   	.xlat_firmware_status = msft_cmd_xlat_firmware_status,
>>   };
>> diff --git a/ndctl/lib/msft.h b/ndctl/lib/msft.h
>> index 978cc11..8d246a5 100644
>> --- a/ndctl/lib/msft.h
>> +++ b/ndctl/lib/msft.h
>> @@ -2,21 +2,16 @@
>>   /* Copyright (C) 2016-2017 Dell, Inc. */
>>   /* Copyright (C) 2016 Hewlett Packard Enterprise Development LP */
>>   /* Copyright (C) 2014-2020, Intel Corporation. */
>> +/* Copyright (C) 2022 iXsystems, Inc. */
>>   #ifndef __NDCTL_MSFT_H__
>>   #define __NDCTL_MSFT_H__
>>   
>>   enum {
>> -	NDN_MSFT_CMD_QUERY = 0,
>> -
>> -	/* non-root commands */
>> -	NDN_MSFT_CMD_SMART = 11,
>> +	NDN_MSFT_CMD_CHEALTH = 10,
>> +	NDN_MSFT_CMD_NHEALTH = 11,
>> +	NDN_MSFT_CMD_EHEALTH = 12,
>>   };
>>   
>> -/* NDN_MSFT_CMD_SMART */
>> -#define NDN_MSFT_SMART_HEALTH_VALID	ND_SMART_HEALTH_VALID
>> -#define NDN_MSFT_SMART_TEMP_VALID	ND_SMART_TEMP_VALID
>> -#define NDN_MSFT_SMART_USED_VALID	ND_SMART_USED_VALID
>> -
>>   /*
>>    * This is actually function 11 data,
>>    * This is the closest I can find to match smart
>> -- 
>> 2.30.2
>>
>>

-- 
Alexander Motin

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [ndctl PATCH] libndctl/msft: Improve "smart" state reporting
  2022-11-17 22:41   ` Alexander Motin
@ 2022-12-02 15:59     ` Alexander Motin
  0 siblings, 0 replies; 4+ messages in thread
From: Alexander Motin @ 2022-12-02 15:59 UTC (permalink / raw
  To: Alison Schofield; +Cc: nvdimm

On 17.11.2022 17:41, Alexander Motin wrote:
> On 17.11.2022 17:07, Alison Schofield wrote:
>> On Thu, Nov 17, 2022 at 04:09:36PM -0500, Alexander Motin wrote:
>>> Previous code reported "smart" state based on number of bits
>>> set in the module health field.  But actually any single bit
>>> set there already means critical failure.  Rework the logic
>>> according to specification, properly reporting non-critical
>>> state in case of warning threshold reached, critical in case
>>> of any module health bit set or error threshold reached and
>>> fatal if NVDIMM exhausted its life time.  In attempt to
>>> report the cause of failure in absence of better methods,
>>> report reached thresholds as more or less matching alarms.
>>>
>>> While there clean up the code, making it more uniform with
>>> others and allowing more methods to be implemented later.
>>
>> Hi Alexander,
>>
>> Perhaps this would be better presented in 2 patches:
>> 1)the cleanup and then 2) improvement of smart state reporting.
> 
> Done.
Pardon my ignorance about the processes here, but what's about review on 
on the merits after two weeks?  Any more thoughts after I fixed the 
"procedural issue":
https://lore.kernel.org/nvdimm/20221117223749.6783-1-mav@ixsystems.com/T/#t

Thanks.

-- 
Alexander Motin

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-12-02 15:59 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-11-17 21:09 [ndctl PATCH] libndctl/msft: Improve "smart" state reporting Alexander Motin
2022-11-17 22:07 ` Alison Schofield
2022-11-17 22:41   ` Alexander Motin
2022-12-02 15:59     ` Alexander Motin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).