Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-15-SP5:Update
rasdaemon.35133
rasdaemon-Add-support-for-post-processing-MCA-e...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File rasdaemon-Add-support-for-post-processing-MCA-errors.patch of Package rasdaemon.35133
From: Avadhut Naik <avadhut.naik@amd.com> Subject: rasdaemon: Add support for post-processing MCA errors References: jsc#PED-7633 MI300A: rasdaemon: MI300A support patches Patch-Mainline: Git-commit: 932118b04a04104dfac6b8536419803f236e6118 Git-repo: git://git.infradead.org/users/mchehab/rasdaemon.git.git Currently, the rasdaemon performs detailed error decoding of received MCA errors on the system only while it is running, either as a daemon or in the foreground. As such, error decoding cannot be undertaken for any MCA errors received while the rasdaemon wasn't running. Additionally, if the error decoding modules like edac_mce_amd have not been loaded either, error records in the dmesg buffer might correspond to raw values in associated MSRs, compelling users to undertake decoding manually. The scenario seems more plausible on AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA Extended Error Descriptions from the edac_mce_amd module in an effort to offload SMCA Error Decoding to the rasdaemon. As such, add support to post-process and decode MCA Errors received on AMD SMCA systems from raw MSR values. Support for post-processing and decoding of MCA Errors received on CPUs of other vendors can be added in the future, as needed. 
Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Avadhut Naik <avadhut.naik@amd.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org> Signed-off-by: <trenn@suse.com> Index: rasdaemon-0.6.7.18.git+7ccf12f/README =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/README +++ rasdaemon-0.6.7.18.git+7ccf12f/README @@ -156,6 +156,16 @@ or, if you also want to record errors at required): # rasdaemon -f -r +To post-process and decode received MCA errors on AMD SMCA systems, run: + +``` + # rasdaemon -p --status <STATUS_reg> --ipid <IPID_reg> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM> +``` + +Status and IPID Register values (in hex) are mandatory. The `smca` flag +with `family` and `model` are required if not decoding locally. `Bank` +parameter is optional. + You may also start it via systemd: # systemctl start rasdaemon Index: rasdaemon-0.6.7.18.git+7ccf12f/mce-amd-smca.c =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/mce-amd-smca.c +++ rasdaemon-0.6.7.18.git+7ccf12f/mce-amd-smca.c @@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, }; -static void amd_decode_errcode(struct mce_event *e) +void amd_decode_errcode(struct mce_event *e) { decode_amd_errcode(e); @@ -782,7 +782,7 @@ static inline void fixup_hwid(struct mce } /* Decode extended errors according to Scalable MCA specification */ -static void decode_smca_error(struct mce_event *e, struct mce_priv* m) +void decode_smca_error(struct mce_event *e, struct mce_priv *m) { enum smca_bank_types bank_type; const char *ip_name; @@ -827,7 +827,9 @@ static void decode_smca_error(struct mce /* Only print the descriptor of valid extended error code */ if (xec < smca_mce_descs[bank_type].num_descs) mce_snprintf(e->mcastatus_msg, - " %s.\n", smca_mce_descs[bank_type].descs[xec]); + "%s. 
Ext Err Code: %d", + smca_mce_descs[bank_type].descs[xec], + xec); if (bank_type == SMCA_UMC && xec == 0) { channel = find_umc_channel(e); Index: rasdaemon-0.6.7.18.git+7ccf12f/ras-events.h =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/ras-events.h +++ rasdaemon-0.6.7.18.git+7ccf12f/ras-events.h @@ -100,6 +100,7 @@ enum ghes_severity { /* Function prototypes */ int toggle_ras_mc_event(int enable); +int ras_offline_mce_event(struct ras_mc_offline_event *event); int handle_ras_events(int record_events); #endif Index: rasdaemon-0.6.7.18.git+7ccf12f/ras-mce-handler.c =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/ras-mce-handler.c +++ rasdaemon-0.6.7.18.git+7ccf12f/ras-mce-handler.c @@ -63,10 +63,8 @@ static char *cputype_name[] = { [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server", }; -static enum cputype select_intel_cputype(struct ras_events *ras) +static enum cputype select_intel_cputype(struct mce_priv *mce) { - struct mce_priv *mce = ras->mce_priv; - if (mce->family == 15) { if (mce->model == 6) return CPU_TULSA; @@ -140,9 +138,8 @@ static enum cputype select_intel_cputype return mce->family == 6 ? 
CPU_P6OLD : CPU_GENERIC; } -static int detect_cpu(struct ras_events *ras) +static int detect_cpu(struct mce_priv *mce) { - struct mce_priv *mce = ras->mce_priv; FILE *f; int ret = 0; char *line = NULL; @@ -221,7 +218,7 @@ static int detect_cpu(struct ras_events } goto ret; } else if (!strcmp(mce->vendor,"GenuineIntel")) { - mce->cputype = select_intel_cputype(ras); + mce->cputype = select_intel_cputype(mce); } else { ret = EINVAL; } @@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even mce = ras->mce_priv; - rc = detect_cpu(ras); + rc = detect_cpu(mce); if (rc) { if (mce->processor_flags) free (mce->processor_flags); @@ -383,6 +380,105 @@ static void report_mce_event(struct ras_ */ } +static int report_mce_offline(struct trace_seq *s, + struct mce_event *mce, + struct mce_priv *priv) +{ + time_t now; + struct tm *tm; + + time(&now); + tm = localtime(&now); + + if (tm) + strftime(mce->timestamp, sizeof(mce->timestamp), + "%Y-%m-%d %H:%M:%S %z", tm); + trace_seq_printf(s, "%s,", mce->timestamp); + + if (*mce->bank_name) + trace_seq_printf(s, " %s,", mce->bank_name); + else + trace_seq_printf(s, " bank=%x,", mce->bank); + + if (*mce->mcastatus_msg) + trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg); + + if (*mce->mcistatus_msg) + trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg); + + if (*mce->mc_location) + trace_seq_printf(s, " Locn: %s,", mce->mc_location); + + if (*mce->error_msg) + trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg); + + return 0; +} + +int ras_offline_mce_event(struct ras_mc_offline_event *event) +{ + int rc = 0; + struct trace_seq s; + struct mce_event *mce = NULL; + struct mce_priv *priv = NULL; + + mce = (struct mce_event *)calloc(1, sizeof(struct mce_event)); + if (!mce) { + log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n"); + return errno; + } + + priv = (struct mce_priv *)calloc(1, sizeof(struct mce_priv)); + if (!priv) { + log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n"); + free(mce); + return 
errno; + } + + if (event->smca) { + priv->cputype = CPU_AMD_SMCA; + priv->family = event->family; + priv->model = event->model; + } else { + rc = detect_cpu(priv); + if (rc) { + log(TERM, LOG_ERR, "Failed to detect CPU\n"); + goto free_mce; + } + } + + mce->status = event->status; + mce->bank = event->bank; + + switch (priv->cputype) { + case CPU_AMD_SMCA: + mce->synd = event->synd; + mce->ipid = event->ipid; + if (!mce->ipid || !mce->status) { + log(TERM, LOG_ERR, "%s MSR required.\n", + mce->ipid ? "Status" : "Ipid"); + rc = -EINVAL; + goto free_mce; + } + decode_smca_error(mce, priv); + amd_decode_errcode(mce); + break; + default: + break; + } + + trace_seq_init(&s); + report_mce_offline(&s, mce, priv); + trace_seq_do_printf(&s); + fflush(stdout); + trace_seq_destroy(&s); + +free_mce: + free(priv); + free(mce); + return rc; +} + int ras_mce_event_handler(struct trace_seq *s, struct pevent_record *record, struct event_format *event, void *context) Index: rasdaemon-0.6.7.18.git+7ccf12f/ras-mce-handler.h =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/ras-mce-handler.h +++ rasdaemon-0.6.7.18.git+7ccf12f/ras-mce-handler.h @@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s /* enables intel iMC logs */ int set_intel_imc_log(enum cputype cputype, unsigned ncpus); +/* Undertake AMD SMCA Error Decoding */ +void decode_smca_error(struct mce_event *e, struct mce_priv *m); +void amd_decode_errcode(struct mce_event *e); + /* Per-CPU-type decoders for Intel CPUs */ void p4_decode_model(struct mce_event *e); void core2_decode_model(struct mce_event *e); Index: rasdaemon-0.6.7.18.git+7ccf12f/ras-record.h =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/ras-record.h +++ rasdaemon-0.6.7.18.git+7ccf12f/ras-record.h @@ -21,6 +21,7 @@ #define __RAS_RECORD_H #include <stdint.h> +#include <stdbool.h> #include "config.h" #define ARRAY_SIZE(x) 
(sizeof(x)/sizeof(*(x))) @@ -39,6 +40,15 @@ struct ras_mc_event { const char *driver_detail; }; +struct ras_mc_offline_event { + unsigned int family, model; + bool smca; + uint8_t bank; + uint64_t ipid; + uint64_t synd; + uint64_t status; +}; + struct ras_aer_event { char timestamp[64]; const char *error_type; Index: rasdaemon-0.6.7.18.git+7ccf12f/rasdaemon.c =================================================================== --- rasdaemon-0.6.7.18.git+7ccf12f.orig/rasdaemon.c +++ rasdaemon-0.6.7.18.git+7ccf12f/rasdaemon.c @@ -41,8 +41,21 @@ struct arguments { int record_events; int enable_ras; int foreground; + int offline; }; +enum OFFLINE_ARG_KEYS { + SMCA = 0x100, + MODEL, + FAMILY, + BANK_NUM, + IPID_REG, + STATUS_REG, + SYNDROME_REG +}; + +struct ras_mc_offline_event event; + static error_t parse_opt(int k, char *arg, struct argp_state *state) { struct arguments *args = state->input; @@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar case 'f': args->foreground++; break; +#ifdef HAVE_MCE + case 'p': + if (state->argc < 4) + argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR); + args->offline++; + break; +#endif default: return ARGP_ERR_UNKNOWN; } return 0; } +#ifdef HAVE_MCE +static error_t parse_opt_offline(int key, char *arg, + struct argp_state *state) +{ + switch (key) { + case SMCA: + event.smca = true; + break; + case MODEL: + event.model = strtoul(state->argv[state->next], NULL, 0); + break; + case FAMILY: + event.family = strtoul(state->argv[state->next], NULL, 0); + break; + case BANK_NUM: + event.bank = atoi(state->argv[state->next]); + break; + case IPID_REG: + event.ipid = strtoull(state->argv[state->next], NULL, 0); + break; + case STATUS_REG: + event.status = strtoull(state->argv[state->next], NULL, 0); + break; + case SYNDROME_REG: + event.synd = strtoull(state->argv[state->next], NULL, 0); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} +#endif + long user_hz; int main(int argc, char *argv[]) { struct 
arguments args; int idx = -1; + +#ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, + {"model", MODEL, 0, 0, "CPU Model"}, + {"family", FAMILY, 0, 0, "CPU Family"}, + {"bank", BANK_NUM, 0, 0, "Bank Number"}, + {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"}, + {"status", STATUS_REG, 0, 0, "Status Register"}, + {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"}, + {0, 0, 0, 0, 0, 0}, + }; + + struct argp offline_argp = { + .options = offline_options, + .parser = parse_opt_offline, + .doc = TOOL_DESCRIPTION, + .args_doc = ARGS_DOC, + }; + + struct argp_child offline_parser[] = { + {&offline_argp, 0, "Post-Processing Options:", 0}, + {0, 0, 0, 0}, + }; +#endif + const struct argp_option options[] = { {"enable", 'e', 0, 0, "enable RAS events and exit", 0}, {"disable", 'd', 0, 0, "disable RAS events and exit", 0}, @@ -81,6 +160,10 @@ int main(int argc, char *argv[]) {"record", 'r', 0, 0, "record events via sqlite3", 0}, #endif {"foreground", 'f', 0, 0, "run foreground, not daemonize"}, +#ifdef HAVE_MCE + {"post-processing", 'p', 0, 0, + "Post-processing MCE's with raw register values"}, +#endif { 0, 0, 0, 0, 0, 0 } }; @@ -89,7 +172,9 @@ int main(int argc, char *argv[]) .parser = parse_opt, .doc = TOOL_DESCRIPTION, .args_doc = ARGS_DOC, - +#ifdef HAVE_MCE + .children = offline_parser, +#endif }; memset (&args, 0, sizeof(args)); @@ -111,6 +196,13 @@ int main(int argc, char *argv[]) return 0; } +#ifdef HAVE_MCE + if (args.offline) { + ras_offline_mce_event(&event); + return 0; + } +#endif + openlog(TOOL_NAME, 0, LOG_DAEMON); if (!args.foreground) if (daemon(0,0))
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor