Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
openSUSE:Evergreen:11.1
mcelog
mcelog-0.7-newcpus-1.diff
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File mcelog-0.7-newcpus-1.diff of Package mcelog
From: Andi Kleen <ak@linux.intel.com> Subject: mcelog decoding support for Intel Tigerton Backport of the changes for Tigerton/Dunnington/Nehalem changes from mcelog git git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git The Tigerton support required adding Core2 support, they are all lumped together. I also added "P6OLD" because that was in the mainline mcelog git changes and would have been difficult to separate. The differences to core2 are very minimal (just a few different events). The actual decoder is all table driven. In the original git this was done as individual changes, but I lumped it all together in the backport. While it adds quite a lot of new code there's not many changes to generic code. Most of the new code is only used on the new CPUs. diff -x '*~' -urpN mcelog-0.7/bitfield.c mcelog-0.7-newcpus//bitfield.c --- mcelog-0.7/bitfield.c 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//bitfield.c 2008-09-26 20:28:29.000000000 +0200 @@ -0,0 +1,61 @@ +#include <string.h> +#include <stdio.h> +#include "mcelog.h" +#include "bitfield.h" + +char *reserved_3bits[8]; +char *reserved_1bit[2]; +char *reserved_2bits[4]; + +static u64 bitmask(u64 i) +{ + u64 mask = 1; + while (mask < i) + mask = (mask << 1) | 1; + return mask; +} + +void decode_bitfield(u64 status, struct field *fields) +{ + struct field *f; + int linelen = 0; + char *delim = ""; + + for (f = fields; f->str; f++) { + u64 v = (status >> f->start_bit) & bitmask(f->stringlen - 1); + char *s = NULL; + if (v < f->stringlen) + s = f->str[v]; + if (!s) { + if (v == 0) + continue; + char buf[60]; + s = buf; + snprintf(buf, sizeof buf, "<%u:%Lx>", f->start_bit, v); + } + int len = strlen(s); + if (linelen + len > 75) { + delim = "\n"; + linelen = 0; + } + Wprintf("%s%s", delim, s); + delim = " "; + linelen += len + 1; + } + if (linelen > 0) + Wprintf("\n"); +} + +void decode_numfield(u64 status, struct numfield *fields) +{ + struct numfield *f; + for (f = fields; f->name; f++) { + u64 mask = 
(1ULL << (f->end - f->start + 1)) - 1; /* [start, end] is an inclusive bit range, so the field is end - start + 1 bits wide */ + u64 v = (status >> f->start) & mask; + if (v > 0) { + char fmt[30]; + snprintf(fmt, 30, "%%s: %s\n", f->fmt ? f->fmt : "%Lu"); + Wprintf(fmt, f->name, v); + } + } +} diff -x '*~' -urpN mcelog-0.7/bitfield.h mcelog-0.7-newcpus//bitfield.h --- mcelog-0.7/bitfield.h 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//bitfield.h 2008-09-26 20:28:29.000000000 +0200 @@ -0,0 +1,27 @@ +/* Generic bitfield decoder */ + +struct field { + int start_bit; + char **str; + int stringlen; +}; + +struct numfield { + int start, end; + char *name; + char *fmt; +}; + +#define FIELD(start_bit, name) { start_bit, name, NELE(name) } +#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } + +#define NUMBER(start, end, name) { start, end, name, "%Lu" } +#define HEXNUMBER(start, end, name) { start, end, name, "%Lx" } + +void decode_bitfield(u64 status, struct field *fields); +void decode_numfield(u64 status, struct numfield *fields); + +extern char *reserved_3bits[8]; +extern char *reserved_1bit[2]; +extern char *reserved_2bits[4]; + diff -x '*~' -urpN mcelog-0.7/core2.c mcelog-0.7-newcpus//core2.c --- mcelog-0.7/core2.c 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//core2.c 2008-09-26 20:21:18.000000000 +0200 @@ -0,0 +1,105 @@ +#include <string.h> +#include <stdio.h> +#include <assert.h> +#include "mcelog.h" +#include "core2.h" +#include "bitfield.h" + +/* Decode P6 family (Core2) model specific errors. 
+ The generic errors are decoded in p4.c */ + +/* [19..24] */ +static char *bus_queue_req_type[] = { + [0] = "BQ_DCU_READ_TYPE", + [2] = "BQ_IFU_DEMAND_TYPE", + [3] = "BQ_IFU_DEMAND_NC_TYPE", + [4] = "BQ_DCU_RFO_TYPE", + [5] = "BQ_DCU_RFO_LOCK_TYPE", + [6] = "BQ_DCU_ITOM_TYPE", + [8] = "BQ_DCU_WB_TYPE", + [10] = "BC_DCU_WCEVICT_TYPE", + [11] = "BQ_DCU_WCLINE_TYPE", + [12] = "BQ_DCU_BTM_TYPE", + [13] = "BQ_DCU_INTACK_TYPE", + [14] = "BQ_DCU_INVALL2_TYPE", + [15] = "BQ_DCU_FLUSHL2_TYPE", + [16] = "BQ_DCU_PART_RD_TYPE", + [18] = "BQ_DCU_PART_WR_TYPE", + [20] = "BQ_DCU_SPEC_CYC_TYPE", + [24] = "BQ_DCU_IO_RD_TYPE", + [25] = "BQ_DCU_IO_WR_TYPE", + [28] = "BQ_DCU_LOCK_RD_TYPE", + [30] = "BQ_DCU_SPLOCK_RD_TYPE", + [29] = "BQ_DCU_LOCK_WR_TYPE", +}; + +/* [25..27] */ +static char *bus_queue_error_type[] = { + [0] = "BQ_ERR_HARD_TYPE", + [1] = "BQ_ERR_DOUBLE_TYPE", + [2] = "BQ_ERR_AERR2_TYPE", + [4] = "BQ_ERR_SINGLE_TYPE", + [5] = "BQ_ERR_AERR1_TYPE", +}; + +static struct field p6_shared_status[] = { + FIELD(16, reserved_3bits), + FIELD(19, bus_queue_req_type), + FIELD(25, bus_queue_error_type), + FIELD(25, bus_queue_error_type), + SBITFIELD(30, "internal BINIT"), + SBITFIELD(36, "received parity error on response transaction"), + SBITFIELD(38, "timeout BINIT (ROB timeout)." + " No micro-instruction retired for some time"), + FIELD(39, reserved_3bits), + SBITFIELD(42, "bus transaction received hard error response"), + SBITFIELD(43, "failure that caused IERR"), + /* The following are reserved for Core in the SDM. 
Let's keep them here anyways*/ + SBITFIELD(44, "two failing bus transactions with address parity error (AERR)"), + SBITFIELD(45, "uncorrectable ECC error"), + SBITFIELD(46, "correctable ECC error"), + /* [47..54]: ECC syndrome */ + FIELD(55, reserved_2bits), + {}, +}; + +static struct field p6old_status[] = { + SBITFIELD(28, "FRC error"), + SBITFIELD(29, "BERR on this CPU"), + FIELD(31, reserved_1bit), + FIELD(32, reserved_3bits), + SBITFIELD(35, "BINIT received from external bus"), + SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), + {} +}; + +static struct field core2_status[] = { + SBITFIELD(28, "MCE driven"), + SBITFIELD(29, "MCE is observed"), + SBITFIELD(31, "BINIT observed"), + FIELD(32, reserved_2bits), + SBITFIELD(34, "PIC or FSB data parity error"), + FIELD(35, reserved_1bit), + SBITFIELD(37, "FSB address parity error detected"), + {} +}; + +static struct numfield p6old_status_numbers[] = { + HEXNUMBER(47, 54, "ECC syndrome"), + {} +}; + +void core2_decode_model(u64 status) +{ + decode_bitfield(status, p6_shared_status); + decode_bitfield(status, core2_status); + /* Normally reserved, but let's parse anyways: */ + decode_numfield(status, p6old_status_numbers); +} + +void p6old_decode_model(u64 status) +{ + decode_bitfield(status, p6_shared_status); + decode_bitfield(status, p6old_status); + decode_numfield(status, p6old_status_numbers); +} diff -x '*~' -urpN mcelog-0.7/core2.h mcelog-0.7-newcpus//core2.h --- mcelog-0.7/core2.h 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//core2.h 2008-09-26 20:21:18.000000000 +0200 @@ -0,0 +1,2 @@ +void core2_decode_model(u64 status); +void p6old_decode_model(u64 status); diff -x '*~' -urpN mcelog-0.7/dunnington.c mcelog-0.7-newcpus//dunnington.c --- mcelog-0.7/dunnington.c 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//dunnington.c 2008-09-26 20:24:24.000000000 +0200 @@ -0,0 +1,123 @@ +/* Copyright (c) 2008 by Intel Corp. 
+ Decode Intel Xeon Processor 7400 Model (Dunnington) specific MCEs + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; version + 2. + + mcelog is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should find a copy of v2 of the GNU General Public License somewhere + on your Linux system; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + Author: + Andi Kleen +*/ + +/* other files + +mcelog.h CPU_DUNNINGTON +mcelog.c: cputype name +intel.h CASE_INTEL_CPUS +intel.c model == 0x1d CPU_DUNNINGTON +p4.c: if (cpu == CPU_DUNNINGTON) dunnington_decode_model(log->status); + add to CORE2 cases + +*/ + +#include <stddef.h> +#include "mcelog.h" +#include "bitfield.h" +#include "dunnington.h" + +/* Follows Intel IA32 SDM 3b Appendix E.2.1 ++ */ + +static struct field dunnington_bus_status[] = { + SBITFIELD(16, "Parity error detected during FSB request phase"), + FIELD(17, reserved_3bits), + SBITFIELD(20, "Hard Failure response received for a local transaction"), + SBITFIELD(21, "Parity error on FSB response field detected"), + SBITFIELD(22, "Parity data error on inbound data detected"), + FIELD(23, reserved_3bits), + FIELD(25, reserved_3bits), + FIELD(28, reserved_3bits), + FIELD(31, reserved_1bit), + {} +}; + +static char *dnt_front_error[0xf] = { + [0x1] = "Inclusion error from core 0", + [0x2] = "Inclusion error from core 1", + [0x3] = "Write Exclusive error from core 0", + [0x4] = "Write Exclusive error from core 1", + [0x5] = "Inclusion error from FSB", + [0x6] = "SNP stall error from FSB", + [0x7] = "Write stall error from FSB", + [0x8] = "FSB Arbiter Timeout error", + [0xA] = "Inclusion error from core 2", + [0xB] = 
"Write exclusive error from core 2", +}; + +static char *dnt_int_error[0xf] = { + [0x2] = "Internal timeout error", + [0x3] = "Internal timeout error", + [0x4] = "Intel Cache Safe Technology Queue full error\n" + "or disabled ways in a set overflow", + [0x5] = "Quiet cycle timeout error (correctable)", +}; + +struct field dnt_int_status[] = { + FIELD(8, dnt_int_error), + {} +}; + +struct field dnt_front_status[] = { + FIELD(0, dnt_front_error), + {} +}; + +struct field dnt_cecc[] = { + SBITFIELD(1, "Correctable ECC event on outgoing core 0 data"), + SBITFIELD(2, "Correctable ECC event on outgoing core 1 data"), + SBITFIELD(3, "Correctable ECC event on outgoing core 3 data"), + {} +}; + +struct field dnt_uecc[] = { + SBITFIELD(1, "Uncorrectable ECC event on outgoing core 0 data"), + SBITFIELD(2, "Uncorrectable ECC event on outgoing core 1 data"), + SBITFIELD(3, "Uncorrectable ECC event on outgoing core 3 data"), + {} +}; + +static void dunnington_decode_bus(u64 status) +{ + decode_bitfield(status, dunnington_bus_status); +} + +static void dunnington_decode_internal(u64 status) +{ + u32 mca = (status >> 16) & 0xffff; + if ((mca & 0xfff0) == 0) + decode_bitfield(status, dnt_front_status); + else if ((mca & 0xf0ff) == 0) + decode_bitfield(status, dnt_int_status); + else if ((mca & 0xfff0) == 0xc000) + decode_bitfield(status, dnt_cecc); + else if ((mca & 0xfff0) == 0xe000) + decode_bitfield(status, dnt_uecc); +} + +void dunnington_decode_model(u64 status) +{ + if ((status & 0xffff) == 0xe0f) + dunnington_decode_bus(status); + else if ((status & 0xffff) == (1 << 10)) + dunnington_decode_internal(status); +} + diff -x '*~' -urpN mcelog-0.7/dunnington.h mcelog-0.7-newcpus//dunnington.h --- mcelog-0.7/dunnington.h 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//dunnington.h 2008-09-26 20:24:24.000000000 +0200 @@ -0,0 +1,2 @@ +void dunnington_decode_model(u64 status); + diff -x '*~' -urpN mcelog-0.7/intel.c mcelog-0.7-newcpus//intel.c --- mcelog-0.7/intel.c 
1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//intel.c 2008-09-26 20:32:52.000000000 +0200 @@ -0,0 +1,22 @@ +#include "mcelog.h" +#include "intel.h" +#include <stdio.h> + +enum cputype select_intel_cputype(int family, int model) +{ + if (family == 15) { + return CPU_P4; + } + if (family == 6) { + if (model < 0xf) + return CPU_P6OLD; + else if (model == 0xf || model == 0x17) /* Merom/Penryn */ + return CPU_CORE2; + else if (model == 0x1d) + return CPU_DUNNINGTON; + else if (model == 0x1a) + return CPU_NEHALEM; + } + fprintf(stderr, "Unknown Intel CPU type family %x model %x\n", family, model); + return family == 6 ? CPU_P6OLD : CPU_GENERIC; +} diff -x '*~' -urpN mcelog-0.7/intel.h mcelog-0.7-newcpus//intel.h --- mcelog-0.7/intel.h 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//intel.h 2008-09-26 20:32:00.000000000 +0200 @@ -0,0 +1,9 @@ +enum cputype select_intel_cputype(int family, int model); + +#define CASE_INTEL_CPUS \ + case CPU_P6OLD: \ + case CPU_CORE2: \ + case CPU_NEHALEM: \ + case CPU_DUNNINGTON: \ + case CPU_P4 + diff -x '*~' -urpN mcelog-0.7/Makefile mcelog-0.7-newcpus//Makefile --- mcelog-0.7/Makefile 2006-05-03 08:55:54.000000000 +0200 +++ mcelog-0.7-newcpus//Makefile 2008-09-26 21:07:21.000000000 +0200 @@ -5,7 +5,8 @@ all: mcelog .PHONY: install clean -mcelog: p4.o k8.o mcelog.o dmi.o +mcelog: p4.o k8.o mcelog.o dmi.o core2.o dunnington.o nehalem.o \ + bitfield.o intel.o p4.o: p4.c mcelog.h p4.h k8.o: k8.c mcelog.h k8.h @@ -18,7 +19,8 @@ install: mcelog.c echo "call mcelog regularly from your crontab" clean: - rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi + rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi core2.o dunnington.o \ + nehalem.o bitfield.o intel.o dmi: dmi.c gcc -o dmi ${CFLAGS} -DSTANDALONE dmi.c ${LDFLAGS} diff -x '*~' -urpN mcelog-0.7/mcelog.8 mcelog-0.7-newcpus//mcelog.8 --- mcelog-0.7/mcelog.8 2006-05-03 08:55:54.000000000 +0200 +++ mcelog-0.7-newcpus//mcelog.8 2008-09-26 20:42:44.000000000 +0200 @@ -2,9 +2,9 @@ .SH NAME 
mcelog \- Print machine check log from x86-64 kernel. .SH SYNOPSIS -mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device] +mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic|...] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device] .br -mcelog [\-\-k8|\-\-p4|\-\-generic] \-\-ascii +mcelog [\-\-k8|\-\-p4|\-\-generic|...] \-\-ascii .SH DESCRIPTION Linux x86-64 kernels since 2.6.4 don't print recoverable machine check errors to the kernel log anymore. Instead they are saved into a special @@ -18,13 +18,21 @@ When the .B \-\-syslog option is specified redirect output to system log. + When .B \-\-k8 is specified assume the events are for a AMD Opteron or Athlon 64 or Athlon FX CPU. With .B \-\-p4 -is specified assume the events are for a Intel Pentium 4 or Intel Xeon. +is specified assume the events are for a Intel Pentium 4 or Intel (older) Xeon. +With +.B \-\-core2 +assume the events are for a Intel Core2 CPU or Intel Xeon 3000, 3200, 5100, 5300, 7300 +series. When +.B \-\-intel-cpu=family,model +are specified then the family number and model number of the Intel CPU +to be decoded should be specified (can be found in /proc/cpuinfo). When .B \-\-generic all the fields are dumped without CPU specific decoding. 
diff -x '*~' -urpN mcelog-0.7/mcelog.c mcelog-0.7-newcpus//mcelog.c --- mcelog-0.7/mcelog.c 2006-05-03 08:55:54.000000000 +0200 +++ mcelog-0.7-newcpus//mcelog.c 2008-09-26 20:45:50.000000000 +0200 @@ -31,12 +31,10 @@ #include "k8.h" #include "p4.h" #include "dmi.h" +#include "intel.h" -enum { - CPU_GENERIC, - CPU_K8, - CPU_P4 -} cpu = CPU_GENERIC; + +enum cputype cpu = CPU_GENERIC; char *logfn = "/dev/mcelog"; @@ -62,8 +60,8 @@ char *bankname(unsigned bank) switch (cpu) { case CPU_K8: return k8_bank_name(bank); - case CPU_P4: - return p4_bank_name(bank); + CASE_INTEL_CPUS: + return intel_bank_name(bank); /* add banks of other cpu types here */ default: sprintf(numeric, "BANK %d", bank); @@ -98,7 +96,7 @@ int mce_filter(struct mce *m) case CPU_K8: return mce_filter_k8(m); /* add more buggy CPUs here */ - case CPU_P4: + CASE_INTEL_CPUS: /* No bugs known */ return 1; default: @@ -134,8 +132,8 @@ void dump_mce(struct mce *m) case CPU_K8: decode_k8_mc(m); break; - case CPU_P4: - decode_p4_mc(m); + CASE_INTEL_CPUS: + decode_intel_mc(m, cpu); break; /* add handlers for other CPUs here */ default: @@ -153,23 +151,27 @@ void check_cpu(void) if (f != NULL) { int found = 0; int family; + int model; char vendor[64]; char *line = NULL; size_t linelen = 0; - while (getdelim(&line, &linelen, '\n', f) > 0 && found < 2) { + while (getdelim(&line, &linelen, '\n', f) > 0 && found < 3) { if (sscanf(line, "vendor_id : %63[^\n]", vendor) == 1) found++; if (sscanf(line, "cpu family : %d", &family) == 1) found++; + if (sscanf(line, "model : %d", &model) == 1) + found++; } - if (found == 2) { + if (found == 3) { if (!strcmp(vendor,"AuthenticAMD") && family == 15) cpu = CPU_K8; - if (!strcmp(vendor,"GenuineIntel") && family == 15) - cpu = CPU_P4; + if (!strcmp(vendor,"GenuineIntel")) + cpu = select_intel_cputype(family, model); /* Add checks for other CPUs here */ } else { - fprintf(stderr, "mcelog: warning: Cannot parse /proc/cpuinfo\n"); + fprintf(stderr, + "mcelog: warning: Cannot parse 
/proc/cpuinfo\n"); } fclose(f); free(line); @@ -303,9 +305,11 @@ void usage(void) { fprintf(stderr, "Usage:\n" - " mcelog [--k8|--p4|--generic] [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n" + " mcelog options [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n" "Decode machine check error records from kernel\n" - " mcelog [--k8|--p4|--generic] [--dmi] --ascii < log\n" + " mcelog options [--dmi] --ascii < log\n" + "Options:\n" + "--p4|--k8|--core2|--generic|--intel-cpu=family,model Set CPU type to decode\n" "Decode machine check ASCII output from kernel logs\n"); exit(1); } @@ -318,6 +322,17 @@ int modifier(char *s) cpu = CPU_P4; } else if (!strcmp(s, "--generic")) { cpu = CPU_GENERIC; + } else if (!strcmp(s, "--core2")) { + cpu = CPU_CORE2; + } else if (!strncmp(s, "--intel-cpu=", 12)) { + unsigned fam, mod; + if (sscanf(s + 12, "%i,%i", &fam, &mod) != 2) + usage(); + cpu = select_intel_cputype(fam, mod); + if (cpu == CPU_GENERIC) { + fprintf(stderr, "Unknown Intel CPU\n"); + usage(); + } } else if (!strcmp(s, "--ignorenodev")) { ignore_nodev = 1; } else if (!strcmp(s,"--filter")) { diff -x '*~' -urpN mcelog-0.7/mcelog.h mcelog-0.7-newcpus//mcelog.h --- mcelog-0.7/mcelog.h 2006-05-03 08:55:54.000000000 +0200 +++ mcelog-0.7-newcpus//mcelog.h 2008-09-26 20:28:19.000000000 +0200 @@ -61,3 +61,13 @@ struct mce { #endif void Wprintf(char *fmt, ...) PRINTFLIKE; + +enum cputype { + CPU_GENERIC, + CPU_K8, + CPU_P4, + CPU_NEHALEM, + CPU_DUNNINGTON, + CPU_P6OLD, + CPU_CORE2, +}; diff -x '*~' -urpN mcelog-0.7/nehalem.c mcelog-0.7-newcpus//nehalem.c --- mcelog-0.7/nehalem.c 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//nehalem.c 2008-09-26 20:24:20.000000000 +0200 @@ -0,0 +1,163 @@ +/* Copyright (C) 2008 Intel Corporation + Decode Intel Nehalem specific machine check errors. 
+ + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; version + 2. + + mcelog is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should find a copy of v2 of the GNU General Public License somewhere + on your Linux system; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + Author: Andi Kleen +*/ + +/* other files + +mcelog.h CPU_NEHALEM +intel.h CASE_INTEL_CPUS +intel.c model == 0x1a CPU_NEHALEM +p4.c: if (cpu == CPU_NEHALEM) nehalem_decode_model(log->status, log->misc); + if (test_prefix(status, 7)) decode_memory_controller(log->status); +mcelog.c/p4.c: syslog/trigger for memory controller + cputype_name +*/ + +#include <string.h> +#include <stdio.h> +#include "mcelog.h" +#include "nehalem.h" +#include "core2.h" +#include "bitfield.h" + +/* See IA32 SDM Vol3B Appendix E.3.2 ff */ + +/* MC1_STATUS error */ +static struct field qpi_status[] = { + SBITFIELD(16, "QPI header had bad parity"), + SBITFIELD(17, "QPI Data packet had bad parity"), + SBITFIELD(18, "Number of QPI retries exceeded"), + SBITFIELD(19, "Received QPI data packet that was poisoned by sender"), + SBITFIELD(20, "QPI reserved 20"), + SBITFIELD(21, "QPI reserved 21"), + SBITFIELD(22, "QPI received unsupported message encoding"), + SBITFIELD(23, "QPI credit type is not supported"), + SBITFIELD(24, "Sender sent too many QPI flits to the receiver"), + SBITFIELD(25, "QPI Sender sent a failed response to receiver"), + SBITFIELD(26, "Clock jitter detected in internal QPI clocking"), + {} +}; + +static struct field qpi_misc[] = { + SBITFIELD(14, "QPI misc reserved 14"), + SBITFIELD(15, "QPI misc reserved 15"), + SBITFIELD(24, "QPI Interleave/Head 
Indication Bit (IIB)"), + {} +}; + +static struct numfield qpi_numbers[] = { + HEXNUMBER(0, 7, "QPI class and opcode of packet with error"), + HEXNUMBER(8, 13, "QPI Request Transaction ID"), + NUMBER(16, 18, "QPI Requestor/Home Node ID (RHNID)"), + HEXNUMBER(19, 23, "QPI miscreserved 19-23"), {} /* sentinel: decode_numfield() stops at the first entry whose name is NULL */ +}; + +static struct field memory_controller_status[] = { + SBITFIELD(16, "Memory read ECC error"), + SBITFIELD(17, "Memory ECC error occurred during scrub"), + SBITFIELD(18, "Memory write parity error"), + SBITFIELD(19, "Memory error in half of redundant memory"), + SBITFIELD(20, "Memory reserved 20"), + SBITFIELD(21, "Memory access out of range"), + SBITFIELD(22, "Memory internal RTID invalid"), + SBITFIELD(23, "Memory address parity error"), + SBITFIELD(24, "Memory byte enable parity error"), + {} +}; + +static struct numfield memory_controller_numbers[] = { + HEXNUMBER(0, 7, "Memory transaction Tracker ID (RTId)"), + HEXNUMBER(8, 15, "Memory MISC reserved 8..15"), + NUMBER(16, 17, "Memory DIMM ID of error"), + NUMBER(18, 19, "Memory channel ID of error"), + HEXNUMBER(32, 63, "Memory ECC syndrome"), + HEXNUMBER(25, 37, "Memory MISC reserved 25..37"), + NUMBER(38, 52, "Memory corrected error count (CORE_ERR_CNT)"), + HEXNUMBER(53, 56, "Memory MISC reserved 53..56"), + {} +}; + +static char *internal_errors[] = { + [0x0] = "No Error", + [0x3] = "Reset firmware did not complete", + [0x8] = "Received an invalid CMPD", + [0xa] = "Invalid Power Management Request", + [0xd] = "Invalid S-state transition", + [0x11] = "VID controller does not match POC controller selected", + [0x1a] = "MSID from POC does not match CPU MSID", +}; + +static struct field internal_error_status[] = { + FIELD(24, internal_errors), + {} +}; + +static struct numfield internal_error_numbers[] = { + HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"), + HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"), + {}, +}; + +/* Generic architectural memory controller encoding */ + 
+static char *mmm_mnemonic[] = { + "GEN", "RD", "WR", "AC", "MS", "RES5", "RES6", "RES7" +}; +static char *mmm_desc[] = { + "Generic undefined request", + "Memory read error", + "Memory write error", + "Address/Command error", + "Memory scrubbing error", + "Reserved 5", + "Reserved 6", + "Reserved 7" +}; + +void decode_memory_controller(u32 status) +{ + char channel[30]; + if ((status & 0xf) == 0xf) + strcpy(channel, "unspecified"); + else + sprintf(channel, "%u", status & 0xf); + Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n", + mmm_mnemonic[(status >> 4) & 7], + channel); + Wprintf("Transaction: %s\n", mmm_desc[(status >> 4) & 7]); + Wprintf("Channel: %s\n", channel); +} + +void nehalem_decode_model(u64 status, u64 misc) +{ + u32 mca = status & 0xffff; + core2_decode_model(status); + if ((mca >> 11) == 1) { /* bus and interconnect QPI */ + decode_bitfield(status, qpi_status); + decode_numfield(status, qpi_numbers); + decode_bitfield(misc, qpi_misc); + } else if (mca == 0x0001) { /* internal unspecified */ + decode_bitfield(status, internal_error_status); + decode_numfield(status, internal_error_numbers); + } else if ((mca >> 8) == 1) { /* memory controller */ + decode_bitfield(status, memory_controller_status); + decode_numfield(status, memory_controller_numbers); + } +} + diff -x '*~' -urpN mcelog-0.7/nehalem.h mcelog-0.7-newcpus//nehalem.h --- mcelog-0.7/nehalem.h 1970-01-01 01:00:00.000000000 +0100 +++ mcelog-0.7-newcpus//nehalem.h 2008-09-26 20:24:20.000000000 +0200 @@ -0,0 +1,2 @@ +void nehalem_decode_model(u64 status, u64 misc); +void decode_memory_controller(u32 status); diff -x '*~' -urpN mcelog-0.7/p4.c mcelog-0.7-newcpus//p4.c --- mcelog-0.7/p4.c 2006-05-03 08:55:54.000000000 +0200 +++ mcelog-0.7-newcpus//p4.c 2008-09-26 20:34:41.000000000 +0200 @@ -1,7 +1,6 @@ /* Copyright (c) 2005 by Intel Corp. - Decode IA32/x86-64 machine check for Pentium 4, Intel Xeon - or EM64T. 
+ Decode Intel machine check (generic and P4 specific) mcelog is free software; you can redistribute it and/or modify it under the terms of the GNU General Public @@ -19,12 +18,17 @@ Authors: Racing Guo <racing.guo@intel.com> + Andi Kleen */ - + #include <stdio.h> #include "mcelog.h" +#include "p4.h" +#include "core2.h" +#include "nehalem.h" +#include "dunnington.h" -/* decode mce for P4/Xeon family */ +/* decode mce for P4/Xeon and Core2 family */ static inline int test_prefix(int nr, __u32 value) { @@ -73,13 +77,12 @@ static char* get_RRRR_str(__u8 rrrr) } return "UNKNOWN"; - } static char* get_PP_str(__u8 pp) { static char* PP[] = { - "Originated-request", + "Local-CPU-originated-request", "Responed-to-request", "Observed-error-as-third-party", "Generic" @@ -112,7 +115,7 @@ static char* get_II_str(__u8 i) return II[i]; } -static int decode_mca(__u32 mca, char *buf, int len) +static void decode_mca(__u32 mca) { #define TLB_LL_MASK 0x3 /*bit 0, bit 1*/ #define TLB_LL_SHIFT 0x0 @@ -137,64 +140,59 @@ static int decode_mca(__u32 mca, char *b #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ #define BUS_PP_SHIFT 0x9 - mca = mca & 0xFFFF; + static char *msg[] = { + [0] = "No Error", + [1] = "Unclassified", + [2] = "Microcode ROM parity error", + [3] = "External error", + [4] = "FRC error", + }; + + if (mca & (1UL << 12)) { + Wprintf("corrected filtering (some unreported errors in same region)\n"); + mca &= ~(1UL << 12); + } - switch(mca) { - case 0x0: - return snprintf(buf, len, "%s", "No Error"); - break; - case 0x1: - return snprintf(buf, len, "%s", "Unclassified"); - break; - case 0x2: - return snprintf(buf, len, "%s", "Microcode ROM Parity Error"); - break; - case 0x3: - return snprintf(buf, len, "%s", "External Error"); - break; - case 0x4: - return snprintf(buf, len, "%s", "FRC Error"); - break; - default: - break; + if (mca < NELE(msg)) { + Wprintf("%s\n", msg[mca]); + return; } - if (test_prefix(4, mca)) { - return snprintf(buf, len, "%s TLB %s Error", + if ((mca >> 
2) == 3) { + Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3)); + } else if (test_prefix(4, mca)) { + Wprintf("%s TLB %s Error\n", get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT), get_LL_str((mca & TLB_LL_MASK) >> TLB_LL_SHIFT)); - } - if (test_prefix(8, mca)) { - return snprintf(buf, len, "%s CACHE %s %s Error", + } else if (test_prefix(8, mca)) { + Wprintf("%s CACHE %s %s Error\n", get_TT_str((mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT), get_LL_str((mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT), get_RRRR_str((mca & CACHE_RRRR_MASK) >> CACHE_RRRR_SHIFT)); - } - if (test_prefix(10, mca)) { + } else if (test_prefix(10, mca)) { if (mca == 0x400) - return snprintf(buf, len, "Internal Timer error"); + Wprintf("Internal Timer error\n"); else - return snprintf(buf, len, - "Internal unclassified errors"); - } - if (test_prefix(11, mca)) { - - return snprintf(buf, len, "BUS %s %s %s %s %s Error", + Wprintf("Internal unclassified error: %x\n", mca & 0xffff); + } else if (test_prefix(11, mca)) { + Wprintf("BUS %s %s %s %s %s Error\n", get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT), get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT), get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT), get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT), get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT)); - } - return snprintf(buf, len, "Unknown Error"); + } else if (test_prefix(7, mca)) { + decode_memory_controller(mca); + } else + Wprintf("Unknown Error %x\n", mca); } -static void decode_model(__u32 model) +static void p4_decode_model(__u32 model) { static struct { int value; @@ -219,17 +217,27 @@ static void decode_model(__u32 model) Wprintf("\n"); } -static void decode_mci(__u64 status) +static void decode_tracking(u64 track, int cpu) { -#define BUF_LEN 200 - char buf[BUF_LEN]; - __u32 mca; + static char *msg[] = { + [1] = "green", + [2] = "yellow\n" +"Large number of corrected errors. 
System operating, but you should\n" +"schedule it for service within a few weeks", + [3] ="res3" }; + if (track) { + Wprintf("Threshold based error status: %s\n", msg[track]); + if (track == 2) + Wprintf( + "CPU %d has large number of corrected errors. Consider replacement", cpu); + } +} +static void decode_mci(__u64 status, int cpu) +{ Wprintf("MCi status:\n"); - if (!(status & MCI_STATUS_VAL)) { - Wprintf("Invalid log\n"); - return; - } + if (!(status & MCI_STATUS_VAL)) + Wprintf("Machine check not valid\n"); if (status & MCI_STATUS_OVER) Wprintf("Error overflow\n"); @@ -249,15 +257,9 @@ static void decode_mci(__u64 status) if (status & MCI_STATUS_PCC) Wprintf("Processor context corrupt\n"); - mca = status & 0xFFFFL; - decode_mca(mca, buf, BUF_LEN); - Wprintf("MCA:%s\n", buf); - - if (test_prefix(11, mca)) { - __u32 model; - model = (status & 0xFFFF0000L); - decode_model(model); - } + decode_tracking((status >> 54) & 3, cpu); + Wprintf("MCA: "); + decode_mca(status & 0xffffL); } static void decode_mcg(__u64 mcgstatus) @@ -272,13 +274,36 @@ static void decode_mcg(__u64 mcgstatus) Wprintf("\n"); } -void decode_p4_mc(struct mce *log) +void decode_intel_mc(struct mce *log, int cputype) { + int cpu = log->cpu; + decode_mcg(log->mcgstatus); - decode_mci(log->status); + decode_mci(log->status, cpu); + + if (test_prefix(11, (log->status & 0xffffL))) { + switch (cputype) { + case CPU_P6OLD: + p6old_decode_model(log->status); + break; + case CPU_DUNNINGTON: + case CPU_CORE2: + core2_decode_model(log->status); + break; + case CPU_P4: + p4_decode_model(log->status & 0xffff0000L); + break; + case CPU_NEHALEM: + nehalem_decode_model(log->status, log->misc); + break; + } + } + + if (cputype == CPU_DUNNINGTON) + dunnington_decode_model(log->status); } -char *p4_bank_name(int num) +char *intel_bank_name(int num) { static char bname[64]; sprintf(bname, "BANK %d", num); diff -x '*~' -urpN mcelog-0.7/p4.h mcelog-0.7-newcpus//p4.h --- mcelog-0.7/p4.h 2006-05-03 08:55:54.000000000 
+0200 +++ mcelog-0.7-newcpus//p4.h 2008-09-26 20:35:46.000000000 +0200 @@ -1,2 +1,2 @@ -char *p4_bank_name(int num); -void decode_p4_mc(struct mce* mce); +char *intel_bank_name(int num); +void decode_intel_mc(struct mce *log, int cpu);
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor