Skip to content

Commit

Permalink
New feature: support memory row CE threshold policy
Browse files Browse the repository at this point in the history
- Introduction: Identify memory row faults in memory CE faults and
isolate the physical memory pages where row faults occur. This method
can effectively prevent CE storms or memory UCE faults caused by memory
row failures.

- Implementation: The system counts the number of CE faults in the same
memory row within a specified period. If the number of CE faults exceeds
the configured threshold, the system considers that the memory row may
fail and isolates all physical pages recorded in the memory row.

Notes:
1. This function is disabled by default. You can enable it by
configuring the 'ROW_CE_ACTION' field in the '/etc/sysconfig/rasdaemon' configuration file.
2. If both row isolation and page isolation are enabled, page isolation is automatically
disabled by default.
3. If the error count in the DIMM CE fault information received by rasdaemon
is 0, the BIOS did not report the error count correctly when parsing the fault information.
Because a fault did occur, rasdaemon treats the error count as 1 in this case,
which is the same as what the kernel does.

Signed-off-by: zhuofeng <[email protected]>
  • Loading branch information
zhuofeng committed Jun 18, 2024
1 parent f9cb13b commit 7ebc551
Show file tree
Hide file tree
Showing 7 changed files with 529 additions and 5 deletions.
7 changes: 2 additions & 5 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
endif
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
if WITH_AMP_NS_DECODE
rasdaemon_SOURCES += non-standard-ampere.c
endif
Expand Down Expand Up @@ -120,6 +118,5 @@ upload:
# custom target
install-data-local:
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
if WITH_MEMORY_CE_PFA
$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
endif

11 changes: 11 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"],
AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"])

AC_ARG_ENABLE([memory_row_ce_pfa],
AS_HELP_STRING([--enable-memory-row-ce-pfa], [enable memory row Corrected Error predictive failure analysis]))

AS_IF([test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_MEMORY_ROW_CE_PFA,1,"have memory row corrected error predictive failure analysis")
AC_SUBST([WITH_MEMORY_ROW_CE_PFA])
])
AM_CONDITIONAL([WITH_MEMORY_ROW_CE_PFA], [test x$enable_memory_row_ce_pfa = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_MEMORY_ROW_CE_PFA], [USE_MEMORY_ROW_CE_PFA="yes"], [USE_MEMORY_ROW_CE_PFA="no"])

AC_ARG_ENABLE([amp_ns_decode],
AS_HELP_STRING([--enable-amp-ns-decode], [enable AMP_NS_DECODE events (currently experimental)]))

Expand Down Expand Up @@ -247,6 +257,7 @@ compile time options summary
Memory Failure : $USE_MEMORY_FAILURE
CXL events : $USE_CXL
Memory CE PFA : $USE_MEMORY_CE_PFA
Memory ROW CE PFA : $USE_MEMORY_ROW_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
YITIAN RAS errors : $USE_YITIAN_NS_DECODE
Expand Down
26 changes: 26 additions & 0 deletions misc/rasdaemon.env
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,32 @@
PAGE_CE_REFRESH_CYCLE="24h"
PAGE_CE_THRESHOLD="50"

# Specify the threshold of isolating buggy memory rows.
#
# Format:
# [0-9]+[unit]
# Note: make sure the value matches this format; rasdaemon uses the default value for malformed input.
#
# Supported units:
# ROW_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour
# ROW_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none
#
# The two configs above have no effect when ROW_CE_ACTION is "off".
ROW_CE_REFRESH_CYCLE="24h"
ROW_CE_THRESHOLD="50"

# Specify the action rasdaemon takes internally when a row error threshold is exceeded.
#
# off no action
# account only account errors
# soft try to soft-offline row without killing any processes
# This requires an up-to-date kernel. Might not be successful.
# hard try to hard-offline row by killing processes
# Requires an up-to-date kernel. Might not be successful.
# soft-then-hard First try to soft offline, then try hard offlining.
# Note: default offline choice is "off".
ROW_CE_ACTION="off"

# Specify the internal action in rasdaemon to exceeding a page error threshold.
#
# off no action
Expand Down
8 changes: 8 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,10 @@ int handle_ras_events(int record_events)
ras->page_size = page_size;
ras->record_events = record_events;

#ifdef HAVE_MEMORY_ROW_CE_PFA
ras_row_account_init();
#endif

#ifdef HAVE_MEMORY_CE_PFA
/* FIXME: enable memory isolation unconditionally */
ras_page_account_init();
Expand Down Expand Up @@ -1181,5 +1185,9 @@ int handle_ras_events(int record_events)
#ifdef HAVE_CPU_FAULT_ISOLATION
cpu_infos_free();
#endif

#ifdef HAVE_MEMORY_ROW_CE_PFA
row_record_infos_free();
#endif
return rc;
}
16 changes: 16 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <traceevent/kbuffer.h>
#include "ras-mc-handler.h"
#include "ras-record.h"
Expand Down Expand Up @@ -189,6 +190,21 @@ int ras_mc_event_handler(struct trace_seq *s,
ras_record_page_error(ev.address, ev.error_count, now);
#endif

#ifdef HAVE_MEMORY_ROW_CE_PFA
/* Account row corrected errors */
struct timespec ts;
clockid_t clk_id = CLOCK_MONOTONIC;
// A fault occurs, but the fault error_count BIOS reports sometimes is 0.
// This is a bug in the BIOS.
// We set the value to 1
// even if the error_count is reported 0.
if (ev.error_count == 0)
ev.error_count = 1;
if (clock_gettime(clk_id, &ts) == 0 && !strcmp(ev.error_type, "Corrected")) {
ras_record_row_error(ev.driver_detail, ev.error_count, ts.tv_sec, ev.address);
}
#endif

#ifdef HAVE_ABRT_REPORT
/* Report event to ABRT */
ras_report_mc_event(ras, &ev);
Expand Down
Loading

0 comments on commit 7ebc551

Please sign in to comment.