1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decoder callbacks; allocated and populated by mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask passed to XEC() when extracting the extended error code from a status word. */
8 static u8 xec_mask = 0xf;
/* Defaults to 0xf; mce_amd_init() lowers it to 0x3 for one of the family cases. */
9 static u8 nb_err_cpumask = 0xf;
/* Consulted by amd_filter_mce() for bank 4 errors with extended error code 0x5. */
11 static bool report_gart_errors;
/* Callback invoked from decode_mc4_mce(); reset to NULL by amd_unregister_ecc_decoder(). */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
/*
 * Store v in the file-wide report_gart_errors flag. amd_filter_mce() tests
 * that flag for bank 4 errors whose extended error code is 0x5.
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Register f as the ECC decoder callback.
 * NOTE(review): presumably stores f in nb_bus_decoder, mirroring
 * amd_unregister_ecc_decoder() - confirm against the function body.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Unregister the ECC decoder callback f: emits a WARN_ON() if f is not the
 * callback currently held in nb_bus_decoder, and resets nb_bus_decoder to
 * NULL.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
/* Short labels for the individual fields of an MCA error code. */
41 /* transaction type */
42 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
/* cache level */
45 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 /* memory transaction type */
48 static const char * const rrrr_msgs[] = {
49 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
52 /* participating processor (not static: the table is exported below) */
53 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
54 EXPORT_SYMBOL_GPL(pp_msgs);
/* timeout status */
57 static const char * const to_msgs[] = { "no timeout", "timed out" };
/* memory or I/O */
60 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
62 /* internal error type */
63 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
/*
 * Descriptions printed by f15h_mc1_mce(). That function indexes the table
 * with xec, xec-2 or xec-4 depending on the case, so the table position of
 * an entry is not always equal to its xec value (see the markers below).
 */
65 static const char * const f15h_mc1_mce_desc[] = {
66 "UC during a demand linefill from L2",
67 "Parity error during data load from IC",
68 "Parity error for IC valid bit",
69 "Main tag parity error",
70 "Parity error in prediction queue",
71 "PFB data/address parity error",
72 "Parity error in the branch status reg",
73 "PFB promotion address error",
74 "Tag error during probe/victimization",
75 "Parity error for IC probe tag valid bit",
76 "PFB non-cacheable bit parity error",
77 "PFB valid bit parity error", /* xec = 0xd */
78 "Microcode Patch Buffer", /* xec = 0x10 */
/*
 * Descriptions printed by f15h_mc2_mce(), which indexes the table with
 * xec - 0x4 or xec - 0x7 depending on the case.
 */
85 static const char * const f15h_mc2_mce_desc[] = {
86 "Fill ECC error on data fills", /* xec = 0x4 */
87 "Fill parity error on insn fills",
88 "Prefetcher request FIFO parity error",
89 "PRQ address parity error",
90 "PRQ data parity error",
93 "WCB Data parity error",
94 "VB Data ECC or parity error",
95 "L2 Tag ECC error", /* xec = 0x10 */
96 "Hard L2 Tag ECC error",
97 "Multiple hits on L2 tag",
99 "PRB address parity error"
/*
 * Descriptions printed by decode_mc4_mce(), which indexes the table either
 * directly with xec or with xec minus an offset.
 */
102 static const char * const mc4_mce_desc[] = {
103 "DRAM ECC error detected on the NB",
104 "CRC error detected on HT link",
105 "Link-defined sync error packets detected on HT link",
108 "Invalid GART PTE entry during GART table walk",
109 "Unsupported atomic RMW received from an IO link",
110 "Watchdog timeout due to lack of progress",
111 "DRAM ECC error detected on the NB",
112 "SVM DMA Exclusion Vector error",
113 "HT data error detected on link",
114 "Protocol error (link, L3, probe filter)",
115 "NB internal arrays parity error",
116 "DRAM addr/ctl signals parity error",
117 "IO link transmission error",
118 "L3 data cache ECC error", /* xec = 0x1c */
119 "L3 cache tag error",
120 "L3 LRU parity bits error",
121 "ECC Error in the Probe Filter directory"
/*
 * Descriptions printed by decode_mc5_mce(), indexed by xec. That function
 * prints entries 0x0 and 0xc as they are and appends " parity error" on its
 * other visible print path.
 */
124 static const char * const mc5_mce_desc[] = {
125 "CPU Watchdog timer expire",
126 "Wakeup array dest tag",
130 "Retire dispatch queue",
131 "Mapper checkpoint array",
132 "Physical register file EX0 port",
133 "Physical register file EX1 port",
134 "Physical register file AG0 port",
135 "Physical register file AG1 port",
136 "Flag register file",
138 "Retire status queue"
/*
 * MC0 decoder: one of the mc0_mce callbacks assigned in mce_amd_init(), and
 * the function f10h_mc0_mce() returns through. Messages are appended to the
 * current log line with pr_cont().
 * NOTE(review): decode_mc0_mce() tests the bool result; presumably true
 * means the signature was recognised - confirm.
 */
141 static bool f12h_mc0_mce(u16 ec, u8 xec)
150 pr_cont("during L1 linefill from L2.\n");
151 else if (ll == LL_L1)
152 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * MC0 decoder assigned as fam_ops->mc0_mce in mce_amd_init() and called from
 * k8_mc0_mce(). Reports a data scrub error when R4(ec) is R4_GEN and LL(ec)
 * is LL_L1; the other visible path returns the result of f12h_mc0_mce().
 */
159 static bool f10h_mc0_mce(u16 ec, u8 xec)
161 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
162 pr_cont("during data scrub.\n");
165 return f12h_mc0_mce(ec, xec);
/*
 * MC0 decoder assigned as fam_ops->mc0_mce in mce_amd_init(). Prints a
 * system linefill message for one signature and has a return path that
 * yields the result of f10h_mc0_mce().
 */
168 static bool k8_mc0_mce(u16 ec, u8 xec)
171 pr_cont("during system linefill.\n");
175 return f10h_mc0_mce(ec, xec);
/*
 * MC0 decoder assigned as fam_ops->mc0_mce for two family cases in
 * mce_amd_init(). Visible guards: TT(ec)/LL(ec) are checked against
 * TT_DATA/LL_L1 ahead of the parity messages, and the BUS_ERROR() branch
 * checks that II(ec) is II_MEM or II_IO and that LL(ec) is LL_LG.
 */
178 static bool cat_mc0_mce(u16 ec, u8 xec)
185 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
/* R4_DRD is reported as a load/hw prefetch, anything else reaching here as a store */
191 pr_cont("Data/Tag parity error due to %s.\n",
192 (r4 == R4_DRD ? "load/hw prf" : "store"));
195 pr_cont("Copyback parity error on a tag miss.\n");
198 pr_cont("Tag parity error during snoop.\n");
203 } else if (BUS_ERROR(ec)) {
205 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
/* no newline: the line is completed by a later pr_cont() */
208 pr_cont("System read data error on a ");
212 pr_cont("TLB reload.\n");
/*
 * MC0 decoder assigned as fam_ops->mc0_mce in mce_amd_init() together with
 * f15h_mc1_mce() and f15h_mc2_mce(). Prints one fixed message per recognised
 * signature; the BUS_ERROR() branch prints a system read data error message
 * or the raw xec value as an internal error condition type.
 */
230 static bool f15h_mc0_mce(u16 ec, u8 xec)
238 pr_cont("Data Array access error.\n");
242 pr_cont("UC error during a linefill from L2/NB.\n");
247 pr_cont("STQ access error.\n");
251 pr_cont("SCB access error.\n");
255 pr_cont("Tag error.\n");
259 pr_cont("LDQ access error.\n");
265 } else if (BUS_ERROR(ec)) {
268 pr_cont("System Read Data Error.\n");
270 pr_cont(" Internal error condition type %d.\n", xec);
/*
 * Decode an MC0 machine check. Extracts the error code and the extended
 * error code from m->status, opens the log line with "MC0 Error: ", handles
 * TLB errors here and passes the remaining cases to the per-family
 * fam_ops->mc0_mce() callback.
 */
277 static void decode_mc0_mce(struct mce *m)
279 u16 ec = EC(m->status);
/* extended error code, masked with the file-wide xec_mask */
280 u8 xec = XEC(m->status, xec_mask);
282 pr_emerg(HW_ERR "MC0 Error: ");
284 /* TLB error signatures are the same across families */
286 if (TT(ec) == TT_DATA) {
/* xec 2 -> "locked miss", any other non-zero xec -> "multimatch", 0 -> "parity" */
287 pr_cont("%s TLB %s.\n", LL_MSG(ec),
288 ((xec == 2) ? "locked miss"
289 : (xec ? "multimatch" : "parity")));
292 } else if (fam_ops->mc0_mce(ec, xec))
295 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
/*
 * MC1 decoder; mce_amd_init() assigns it as fam_ops->mc1_mce for several
 * family cases. Prints a linefill message or, when ll is 0x1, one of the
 * parity/copyback/snoop messages below.
 */
298 static bool k8_mc1_mce(u16 ec, u8 xec)
307 pr_cont("during a linefill from L2.\n");
308 else if (ll == 0x1) {
311 pr_cont("Parity error during data load.\n");
315 pr_cont("Copyback Parity/Victim error.\n");
319 pr_cont("Tag Snoop error.\n");
/*
 * MC1 decoder assigned as fam_ops->mc1_mce for two family cases in
 * mce_amd_init(). TT(ec) is checked against TT_INSTR before any of the
 * messages below is printed.
 */
332 static bool cat_mc1_mce(u16 ec, u8 xec)
340 if (TT(ec) != TT_INSTR)
344 pr_cont("Data/tag array parity error for a tag hit.\n");
345 else if (r4 == R4_SNOOP)
346 pr_cont("Tag error during snoop/victimization.\n");
348 pr_cont("Tag parity error from victim castout.\n");
350 pr_cont("Microcode patch RAM parity error.\n");
/*
 * MC1 decoder assigned as fam_ops->mc1_mce in mce_amd_init(). Looks the
 * message up in f15h_mc1_mce_desc[], subtracting 0, 2 or 4 from xec
 * depending on the case.
 * NOTE(review): the offsets presumably skip xec values that have no table
 * entry - confirm against the case labels.
 */
357 static bool f15h_mc1_mce(u16 ec, u8 xec)
366 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
370 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
374 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
/* same xec-4 offset, but the entry is wrapped as "Decoder <entry> parity error." */
378 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
/*
 * Decode an MC1 machine check. Opens the log line with "MC1 Error: ",
 * handles the TLB and bus error messages here and passes the remaining cases
 * to the per-family fam_ops->mc1_mce() callback.
 */
387 static void decode_mc1_mce(struct mce *m)
389 u16 ec = EC(m->status);
390 u8 xec = XEC(m->status, xec_mask);
392 pr_emerg(HW_ERR "MC1 Error: ");
395 pr_cont("%s TLB %s.\n", LL_MSG(ec),
396 (xec ? "multimatch" : "parity error"));
397 else if (BUS_ERROR(ec)) {
/* true only on family 0xf with bit 58 of the status word set */
398 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
400 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
401 } else if (fam_ops->mc1_mce(ec, xec))
404 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
/*
 * MC2 decoder; mce_amd_init() assigns it as fam_ops->mc2_mce for most family
 * cases. The message is selected from xec together with the error class
 * reported by MEM_ERROR()/BUS_ERROR().
 */
407 static bool k8_mc2_mce(u16 ec, u8 xec)
412 pr_cont(" in the write data buffers.\n");
414 pr_cont(" in the victim data buffers.\n");
415 else if (xec == 0x2 && MEM_ERROR(ec))
416 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
417 else if (xec == 0x0) {
419 pr_cont(": %s error in a Page Descriptor Cache or "
420 "Guest TLB.\n", TT_MSG(ec));
421 else if (BUS_ERROR(ec))
422 pr_cont(": %s/ECC error in data read from NB: %s.\n",
423 R4_MSG(ec), PP_MSG(ec));
424 else if (MEM_ERROR(ec)) {
428 pr_cont(": %s error during data copyback.\n",
431 pr_cont(": %s parity/ECC error during data "
432 "access from L2.\n", R4_MSG(ec));
/*
 * MC2 decoder assigned as fam_ops->mc2_mce in mce_amd_init(). Prints fixed
 * TLB and bus error messages; in the MEM_ERROR() branch the text comes from
 * f15h_mc2_mce_desc[], indexed with xec - 0x4 or xec - 0x7.
 */
443 static bool f15h_mc2_mce(u16 ec, u8 xec)
449 pr_cont("Data parity TLB read error.\n");
451 pr_cont("Poison data provided for TLB fill.\n");
454 } else if (BUS_ERROR(ec)) {
458 pr_cont("Error during attempted NB data read.\n");
459 } else if (MEM_ERROR(ec)) {
462 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
466 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
/*
 * MC2 decoder assigned as fam_ops->mc2_mce in mce_amd_init(). Each message
 * carries a short qualifier chosen from the memory transaction type r4.
 */
477 static bool f16h_mc2_mce(u16 ec, u8 xec)
/* prints "IBUFF" for R4_RD, "OBUFF" otherwise */
486 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
491 pr_cont("ECC error in L2 tag (%s).\n",
492 ((r4 == R4_GEN) ? "BankReq" :
493 ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
/* "Hit" additionally requires the two low bits of xec to be clear */
498 pr_cont("ECC error in L2 data array (%s).\n",
499 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
500 ((r4 == R4_GEN) ? "Attr" :
501 ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
506 pr_cont("Parity error in L2 attribute bits (%s).\n",
507 ((r4 == R4_RD) ? "Hit" :
508 ((r4 == R4_GEN) ? "Attr" : "Fill")));
/*
 * Decode an MC2 machine check: opens the log line with "MC2 Error: " and
 * lets the per-family fam_ops->mc2_mce() callback print the rest. If the
 * callback returns false a "Corrupted MC2 MCE info?" notice is printed.
 */
518 static void decode_mc2_mce(struct mce *m)
520 u16 ec = EC(m->status);
521 u8 xec = XEC(m->status, xec_mask);
523 pr_emerg(HW_ERR "MC2 Error: ");
525 if (!fam_ops->mc2_mce(ec, xec))
/*
 * NOTE(review): the MC0/MC1/MC3 decoders print their "Corrupted" notice with
 * pr_emerg(); here HW_ERR is passed to pr_cont(), so the prefix lands in the
 * middle of the "MC2 Error: " line - confirm whether that is intended.
 */
526 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
/*
 * Decode an MC3 machine check. On CPU families 0x14 and above a "please
 * report" notice is printed; otherwise the log line is opened with
 * "MC3 Error" and completed with the memory transaction type.
 */
529 static void decode_mc3_mce(struct mce *m)
531 u16 ec = EC(m->status);
532 u8 xec = XEC(m->status, xec_mask);
534 if (boot_cpu_data.x86 >= 0x14) {
535 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
536 " please report on LKML.\n");
540 pr_emerg(HW_ERR "MC3 Error");
/* guard: true unless this is a bus error whose r4 is R4_DRD or R4_DWR */
545 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
548 pr_cont(" during %s.\n", R4_MSG(ec));
555 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
/*
 * Decode an MC4 machine check. The node id is derived from m->extcpu via
 * amd_get_nb_id() and printed in the line header. Messages come from
 * mc4_mce_desc[] or from the fixed strings below, and the registered
 * nb_bus_decoder callback is invoked with the node id and the MCE record.
 */
558 static void decode_mc4_mce(struct mce *m)
560 struct cpuinfo_x86 *c = &boot_cpu_data;
561 int node_id = amd_get_nb_id(m->extcpu);
562 u16 ec = EC(m->status);
/* fixed 0x1f mask here instead of the file-wide xec_mask */
563 u8 xec = XEC(m->status, 0x1f);
566 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
571 /* special handling for DRAM ECCs */
572 if (xec == 0x0 || xec == 0x8) {
573 /* no ECCs on F11h */
577 pr_cont("%s.\n", mc4_mce_desc[xec]);
580 nb_bus_decoder(node_id, m);
587 pr_cont("GART Table Walk data error.\n");
588 else if (BUS_ERROR(ec))
589 pr_cont("DMA Exclusion Vector Table Walk error.\n");
595 if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
596 pr_cont("Compute Unit Data Error.\n");
/* table lookup with xec shifted down by offset */
609 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
613 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
/*
 * Decode an MC5 machine check using mc5_mce_desc[], indexed by the extended
 * error code. CPU families 0xf and 0x11 are tested for before the header is
 * printed.
 */
616 static void decode_mc5_mce(struct mce *m)
618 struct cpuinfo_x86 *c = &boot_cpu_data;
619 u8 xec = XEC(m->status, xec_mask);
621 if (c->x86 == 0xf || c->x86 == 0x11)
624 pr_emerg(HW_ERR "MC5 Error: ");
/* entries 0x0 and 0xc are printed without the " parity error" suffix */
626 if (xec == 0x0 || xec == 0xc)
627 pr_cont("%s.\n", mc5_mce_desc[xec]);
629 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
636 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
/*
 * Decode an MC6 machine check: opens the log line with "MC6 Error: ", prints
 * the name of the affected structure and then the shared " parity error."
 * suffix.
 */
639 static void decode_mc6_mce(struct mce *m)
641 u8 xec = XEC(m->status, xec_mask);
643 pr_emerg(HW_ERR "MC6 Error: ");
647 pr_cont("Free List");
651 pr_cont("Physical Register File");
655 pr_cont("Retire Queue");
659 pr_cont("Scheduler table");
663 pr_cont("Status Register File");
/* common tail for the structure names printed above */
671 pr_cont(" parity error.\n");
676 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
/*
 * Print the individual fields of the 16-bit error code ec in readable form:
 * internal error type, cache level, mem/io, transaction type, memory
 * transaction type, participating processor and timeout status.
 */
679 static inline void amd_decode_err_code(u16 ec)
682 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
/* starts a new line; the fields below are appended to it with pr_cont() */
686 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
689 pr_cont(", mem/io: %s", II_MSG(ec));
691 pr_cont(", tx: %s", TT_MSG(ec));
693 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
694 pr_cont(", mem-tx: %s", R4_MSG(ec));
697 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
704 * Filter out unwanted MCE signatures here.
/*
 * Tell amd_decode_mce() whether an MCE record matches a filtered signature.
 * The only visible signature is bank 4 with extended error code 0x5 while
 * report_gart_errors is false.
 */
706 static bool amd_filter_mce(struct mce *m)
/* extended error code: bits [20:16] of the status word */
708 u8 xec = (m->status >> 16) & 0x1f;
711 * NB GART TLB error reporting is disabled by default.
713 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * Map the severity bits of an MCE record to a human-readable sentence.
 * Uncorrected errors (MCI_STATUS_UC) are split by MCI_STATUS_PCC and by
 * MCG_STATUS_RIPV in m->mcgstatus; deferred and corrected errors have their
 * own strings. Returns a pointer to a string literal.
 */
719 static const char *decode_error_status(struct mce *m)
721 if (m->status & MCI_STATUS_UC) {
722 if (m->status & MCI_STATUS_PCC)
723 return "System Fatal error.";
724 if (m->mcgstatus & MCG_STATUS_RIPV)
725 return "Uncorrected, software restartable error.";
726 return "Uncorrected, software containable error.";
729 if (m->status & MCI_STATUS_DEFERRED)
730 return "Deferred error.";
732 return "Corrected error, no action required.";
/*
 * Notifier callback (see amd_mce_dec_nb) that decodes one MCE record.
 * data points to the struct mce to decode. The record is first run through
 * amd_filter_mce(); the function then prints the error status sentence, the
 * decoded MCi_STATUS flags, the address when MCI_STATUS_ADDRV is set, and
 * the low 16-bit error code fields.
 */
735 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
737 struct mce *m = (struct mce *)data;
/* CPU data of the CPU recorded in m->extcpu */
738 struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
741 if (amd_filter_mce(m))
777 pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m));
779 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
781 c->x86, c->x86_model, c->x86_mask,
783 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
784 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
785 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
786 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
787 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
/* the Deferred/Poison flags are only decoded on families 0x15 and 0x16 */
789 if (c->x86 == 0x15 || c->x86 == 0x16)
791 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
792 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
794 /* do the two bits[14:13] of the upper status half, i.e. bits [46:45], together */
795 ecc = (m->status >> 45) & 0x3;
/* value 2 is printed as "CECC", the other values reaching here as "UECC" */
797 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
799 pr_cont("]: 0x%016llx\n", m->status);
801 if (m->status & MCI_STATUS_ADDRV)
802 pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
/* only the low 16 bits of the status word form the error code */
804 amd_decode_err_code(m->status & 0xffff);
808 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that routes MCE records to amd_decode_mce(). Registered in
 * mce_amd_init() and unregistered in mce_amd_exit().
 */
810 static struct notifier_block amd_mce_dec_nb = {
811 .notifier_call = amd_decode_mce,
/*
 * Initialise the decoder: checks the boot CPU's vendor and family, allocates
 * fam_ops, fills in the per-family mc0/mc1/mc2 callbacks and registers
 * amd_mce_dec_nb on the MCE decode chain. Run as an early initcall.
 */
814 static int __init mce_amd_init(void)
816 struct cpuinfo_x86 *c = &boot_cpu_data;
/* vendor check: AMD only */
818 if (c->x86_vendor != X86_VENDOR_AMD)
/* family check: anything outside 0xf..0x16 is rejected by this test */
821 if (c->x86 < 0xf || c->x86 > 0x16)
/* zero-initialised callback table */
824 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
/* Per-family callback assignments follow, one group per family case. */
830 fam_ops->mc0_mce = k8_mc0_mce;
831 fam_ops->mc1_mce = k8_mc1_mce;
832 fam_ops->mc2_mce = k8_mc2_mce;
836 fam_ops->mc0_mce = f10h_mc0_mce;
837 fam_ops->mc1_mce = k8_mc1_mce;
838 fam_ops->mc2_mce = k8_mc2_mce;
842 fam_ops->mc0_mce = k8_mc0_mce;
843 fam_ops->mc1_mce = k8_mc1_mce;
844 fam_ops->mc2_mce = k8_mc2_mce;
848 fam_ops->mc0_mce = f12h_mc0_mce;
849 fam_ops->mc1_mce = k8_mc1_mce;
850 fam_ops->mc2_mce = k8_mc2_mce;
/* the only group that also changes nb_err_cpumask */
854 nb_err_cpumask = 0x3;
855 fam_ops->mc0_mce = cat_mc0_mce;
856 fam_ops->mc1_mce = cat_mc1_mce;
857 fam_ops->mc2_mce = k8_mc2_mce;
862 fam_ops->mc0_mce = f15h_mc0_mce;
863 fam_ops->mc1_mce = f15h_mc1_mce;
864 fam_ops->mc2_mce = f15h_mc2_mce;
869 fam_ops->mc0_mce = cat_mc0_mce;
870 fam_ops->mc1_mce = cat_mc1_mce;
871 fam_ops->mc2_mce = f16h_mc2_mce;
/* warning printed for a family value with no callback group above */
875 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
880 pr_info("MCE: In-kernel MCE decoding enabled.\n");
/* from here on amd_decode_mce() is called through amd_mce_dec_nb */
882 mce_register_decode_chain(&amd_mce_dec_nb);
886 early_initcall(mce_amd_init);
/*
 * Module exit handler: removes amd_mce_dec_nb from the MCE decode chain.
 * NOTE(review): fam_ops is allocated in mce_amd_init(); no matching free is
 * visible here - confirm whether it is released on unload.
 */
889 static void __exit mce_amd_exit(void)
891 mce_unregister_decode_chain(&amd_mce_dec_nb);
/* module metadata and exit hook */
895 MODULE_DESCRIPTION("AMD MCE decoder");
896 MODULE_ALIAS("edac-mce-amd");
897 MODULE_LICENSE("GPL");
898 module_exit(mce_amd_exit);