New feature: support memory row CE threshold policy

- Introduction: Identify memory row faults among memory CE faults and isolate the physical memory pages where the row faults occur. This can effectively prevent CE storms or memory UCE faults caused by memory row failures. - Implementation: The system counts the number of CE faults in the same memory row within a specified period. If the count exceeds the configured threshold, the system considers that the memory row may be failing and isolates all physical pages recorded for that memory row. Notes: 1. This function is disabled by default. You can enable it by configuring the 'ROW_CE_ACTION' field in the '/etc/sysconfig/rasdaemon' configuration file. 2. If both row isolation and page isolation are enabled, page isolation is automatically disabled. 3. If the error count in the DIMM CE fault information received by rasdaemon is 0, the BIOS did not correctly parse the error count when parsing the fault information. In that case rasdaemon treats the error count as 1, which is the same as what the kernel does. Signed-off-by: zhuofeng <[email protected]>
mchehab · Jun 18, 2024 · 7ebc551 · 7ebc551
1 parent f9cb13b
commit 7ebc551
Show file tree

Hide file tree

Showing 7 changed files with 529 additions and 5 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -64,9 +64,7 @@ endif
 if WITH_HISI_NS_DECODE
  rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
 endif
-if WITH_MEMORY_CE_PFA
- rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
-endif
+rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
 if WITH_AMP_NS_DECODE
  rasdaemon_SOURCES += non-standard-ampere.c
 endif
@@ -120,6 +118,5 @@ upload:
 # custom target
 install-data-local:
  $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
-if WITH_MEMORY_CE_PFA
  $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
-endif
+
diff --git a/configure.ac b/configure.ac
@@ -167,6 +167,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"],
 AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all = xyes])
 AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"])
 
+AC_ARG_ENABLE([memory_row_ce_pfa],
+ AS_HELP_STRING([--enable-memory-row-ce-pfa], [enable memory row Corrected Error predictive failure analysis]))
+
+AS_IF([test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"], [
+ AC_DEFINE(HAVE_MEMORY_ROW_CE_PFA,1,"have memory row corrected error predictive failure analysis")
+ AC_SUBST([WITH_MEMORY_ROW_CE_PFA])
+])
+AM_CONDITIONAL([WITH_MEMORY_ROW_CE_PFA], [test x$enable_memory_row_ce_pfa = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_MEMORY_ROW_CE_PFA], [USE_MEMORY_ROW_CE_PFA="yes"], [USE_MEMORY_ROW_CE_PFA="no"])
+
 AC_ARG_ENABLE([amp_ns_decode],
  AS_HELP_STRING([--enable-amp-ns-decode], [enable AMP_NS_DECODE events (currently experimental)]))
 
@@ -247,6 +257,7 @@ compile time options summary
  Memory Failure : $USE_MEMORY_FAILURE
  CXL events : $USE_CXL
  Memory CE PFA : $USE_MEMORY_CE_PFA
+ Memory ROW CE PFA : $USE_MEMORY_ROW_CE_PFA
  AMP RAS errors : $USE_AMP_NS_DECODE
  CPU fault isolation : $USE_CPU_FAULT_ISOLATION
  YITIAN RAS errors : $USE_YITIAN_NS_DECODE

diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
@@ -16,6 +16,32 @@
 PAGE_CE_REFRESH_CYCLE="24h"
 PAGE_CE_THRESHOLD="50"
 
+# Specify the threshold of isolating buggy memory rows.
+#
+# Format:
+# [0-9]+[unit]
+# Notice: please make sure the value matches this format; rasdaemon will use the default value for invalid input.
+#
+# Supported units:
+# ROW_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour
+# ROW_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none
+#
+# These two settings have no effect when ROW_CE_ACTION is "off".
+ROW_CE_REFRESH_CYCLE="24h"
+ROW_CE_THRESHOLD="50"
+
+# Specify the internal action in rasdaemon to exceeding a row error threshold.
+#
+# off no action
+# account only account errors
+# soft try to soft-offline row without killing any processes
+# This requires an up-to-date kernel. Might not be successful.
+# hard try to hard-offline row by killing processes
+# Requires an up-to-date kernel. Might not be successful.
+# soft-then-hard First try to soft offline, then try hard offlining.
+# Note: default offline choice is "off".
+ROW_CE_ACTION="off"
+
 # Specify the internal action in rasdaemon to exceeding a page error threshold.
 #
 # off no action

diff --git a/ras-events.c b/ras-events.c
@@ -919,6 +919,10 @@ int handle_ras_events(int record_events)
  ras->page_size = page_size;
  ras->record_events = record_events;
 
+#ifdef HAVE_MEMORY_ROW_CE_PFA
+ ras_row_account_init();
+#endif
+
 #ifdef HAVE_MEMORY_CE_PFA
  /* FIXME: enable memory isolation unconditionally */
  ras_page_account_init();
@@ -1181,5 +1185,9 @@ int handle_ras_events(int record_events)
 #ifdef HAVE_CPU_FAULT_ISOLATION
  cpu_infos_free();
 #endif
+
+#ifdef HAVE_MEMORY_ROW_CE_PFA
+ row_record_infos_free();
+#endif
  return rc;
 }
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <errno.h>
 #include <traceevent/kbuffer.h>
 #include "ras-mc-handler.h"
 #include "ras-record.h"
@@ -189,6 +190,21 @@ int ras_mc_event_handler(struct trace_seq *s,
  ras_record_page_error(ev.address, ev.error_count, now);
 #endif
 
+#ifdef HAVE_MEMORY_ROW_CE_PFA
+ /* Account row corrected errors */
+ struct timespec ts;
+ clockid_t clk_id = CLOCK_MONOTONIC;
+ // The BIOS sometimes reports an error_count of 0 even though a fault
+ // occurred; this is a BIOS bug.
+ // Treat such an event as a single error by setting
+ // error_count to 1.
+ if (ev.error_count == 0)
+ ev.error_count = 1;
+ if (clock_gettime(clk_id, &ts) == 0 && !strcmp(ev.error_type, "Corrected")) {
+ ras_record_row_error(ev.driver_detail, ev.error_count, ts.tv_sec, ev.address);
+ }
+#endif
+
 #ifdef HAVE_ABRT_REPORT
  /* Report event to ABRT */
  ras_report_mc_event(ras, &ev);