#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>

#include "mce_amd.h"

static struct amd_decoder_ops *fam_ops;

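/*
 * Mask applied when extracting the extended error code (XEC) from
 * MCi_STATUS; adjusted per CPU family in mce_amd_init() below.
 */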
static u8 xec_mask = 0xf;

static bool report_gart_errors;
static void (*decode_dram_ecc)(int node_id, struct mce *m);

void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
79 "Microcode Patch Buffer", /* xec = 010 */
80 "uop queue",
81 "insn buffer",
82 "predecode buffer",
83 "fetch address FIFO",
84 "dispatch uop queue"
85 };
86
87 static const char * const f15h_mc2_mce_desc[] = {
88 "Fill ECC error on data fills", /* xec = 0x4 */
89 "Fill parity error on insn fills",
90 "Prefetcher request FIFO parity error",
91 "PRQ address parity error",
92 "PRQ data parity error",
93 "WCC Tag ECC error",
94 "WCC Data ECC error",
95 "WCB Data parity error",
96 "VB Data ECC or parity error",
97 "L2 Tag ECC error", /* xec = 0x10 */
98 "Hard L2 Tag ECC error",
99 "Multiple hits on L2 tag",
100 "XAB parity error",
101 "PRB address parity error"
102 };
103
104 static const char * const mc4_mce_desc[] = {
105 "DRAM ECC error detected on the NB",
106 "CRC error detected on HT link",
107 "Link-defined sync error packets detected on HT link",
108 "HT Master abort",
109 "HT Target abort",
110 "Invalid GART PTE entry during GART table walk",
111 "Unsupported atomic RMW received from an IO link",
112 "Watchdog timeout due to lack of progress",
113 "DRAM ECC error detected on the NB",
114 "SVM DMA Exclusion Vector error",
115 "HT data error detected on link",
116 "Protocol error (link, L3, probe filter)",
117 "NB internal arrays parity error",
118 "DRAM addr/ctl signals parity error",
119 "IO link transmission error",
120 "L3 data cache ECC error", /* xec = 0x1c */
121 "L3 cache tag error",
122 "L3 LRU parity bits error",
123 "ECC Error in the Probe Filter directory"
124 };
125
126 static const char * const mc5_mce_desc[] = {
127 "CPU Watchdog timer expire",
128 "Wakeup array dest tag",
129 "AG payload array",
130 "EX payload array",
131 "IDRF array",
132 "Retire dispatch queue",
133 "Mapper checkpoint array",
134 "Physical register file EX0 port",
135 "Physical register file EX1 port",
136 "Physical register file AG0 port",
137 "Physical register file AG1 port",
138 "Flag register file",
139 "DE error occurred",
140 "Retire status queue"
141 };
142
143 static const char * const mc6_mce_desc[] = {
144 "Hardware Assertion",
145 "Free List",
146 "Physical Register File",
147 "Retire Queue",
148 "Scheduler table",
149 "Status Register File",
150 };
151
152 /* Scalable MCA error strings */
153 static const char * const smca_ls_mce_desc[] = {
154 "Load queue parity",
155 "Store queue parity",
156 "Miss address buffer payload parity",
157 "L1 TLB parity",
158 "Reserved",
159 "DC tag error type 6",
160 "DC tag error type 1",
161 "Internal error type 1",
162 "Internal error type 2",
163 "Sys Read data error thread 0",
164 "Sys read data error thread 1",
165 "DC tag error type 2",
166 "DC data error type 1 (poison consumption)",
167 "DC data error type 2",
168 "DC data error type 3",
169 "DC tag error type 4",
170 "L2 TLB parity",
171 "PDC parity error",
172 "DC tag error type 3",
173 "DC tag error type 5",
174 "L2 fill data error",
175 };
176
177 static const char * const smca_if_mce_desc[] = {
178 "microtag probe port parity error",
179 "IC microtag or full tag multi-hit error",
180 "IC full tag parity",
181 "IC data array parity",
182 "Decoupling queue phys addr parity error",
183 "L0 ITLB parity error",
184 "L1 ITLB parity error",
185 "L2 ITLB parity error",
186 "BPQ snoop parity on Thread 0",
187 "BPQ snoop parity on Thread 1",
188 "L1 BTB multi-match error",
189 "L2 BTB multi-match error",
190 "L2 Cache Response Poison error",
191 "System Read Data error",
192 };
193
194 static const char * const smca_l2_mce_desc[] = {
195 "L2M tag multi-way-hit error",
196 "L2M tag ECC error",
197 "L2M data ECC error",
198 "HW assert",
199 };
200
201 static const char * const smca_de_mce_desc[] = {
202 "uop cache tag parity error",
203 "uop cache data parity error",
204 "Insn buffer parity error",
205 "uop queue parity error",
206 "Insn dispatch queue parity error",
207 "Fetch address FIFO parity",
208 "Patch RAM data parity",
209 "Patch RAM sequencer parity",
210 "uop buffer parity"
211 };
212
213 static const char * const smca_ex_mce_desc[] = {
214 "Watchdog timeout error",
215 "Phy register file parity",
216 "Flag register file parity",
217 "Immediate displacement register file parity",
218 "Address generator payload parity",
219 "EX payload parity",
220 "Checkpoint queue parity",
221 "Retire dispatch queue parity",
222 "Retire status queue parity error",
223 "Scheduling queue parity error",
224 "Branch buffer queue parity error",
225 };
226
227 static const char * const smca_fp_mce_desc[] = {
228 "Physical register file parity",
229 "Freelist parity error",
230 "Schedule queue parity",
231 "NSQ parity error",
232 "Retire queue parity",
233 "Status register file parity",
234 "Hardware assertion",
235 };
236
237 static const char * const smca_l3_mce_desc[] = {
238 "Shadow tag macro ECC error",
239 "Shadow tag macro multi-way-hit error",
240 "L3M tag ECC error",
241 "L3M tag multi-way-hit error",
242 "L3M data ECC error",
243 "XI parity, L3 fill done channel error",
244 "L3 victim queue parity",
245 "L3 HW assert",
246 };
247
248 static const char * const smca_cs_mce_desc[] = {
249 "Illegal request from transport layer",
250 "Address violation",
251 "Security violation",
252 "Illegal response from transport layer",
253 "Unexpected response",
254 "Parity error on incoming request or probe response data",
255 "Parity error on incoming read response data",
256 "Atomic request parity",
257 "ECC error on probe filter access",
258 };
259
260 static const char * const smca_pie_mce_desc[] = {
261 "HW assert",
262 "Internal PIE register security violation",
263 "Error on GMI link",
264 "Poison data written to internal PIE register",
265 };
266
267 static const char * const smca_umc_mce_desc[] = {
268 "DRAM ECC error",
269 "Data poison error on DRAM",
270 "SDP parity error",
271 "Advanced peripheral bus error",
272 "Command/address parity error",
273 "Write data CRC error",
274 };
275
276 static const char * const smca_pb_mce_desc[] = {
277 "Parameter Block RAM ECC error",
278 };
279
280 static const char * const smca_psp_mce_desc[] = {
281 "PSP RAM ECC or parity error",
282 };
283
284 static const char * const smca_smu_mce_desc[] = {
285 "SMU RAM ECC or parity error",
286 };
287
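/* Pair each SMCA bank type with its table of extended error code strings. */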
struct smca_mce_desc {
	const char * const *descs;
	unsigned int num_descs;
};

static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};

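/*
 * MC0 (data cache) decoders: k8_mc0_mce() falls back to f10h_mc0_mce(),
 * which in turn falls back to f12h_mc0_mce() for the cases they share.
 */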
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

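/* The cat_* decoders are wired up for families 0x14 and 0x16 in mce_amd_init(). */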
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops->mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

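	/*
	 * f15h_mc1_mce_desc[] has "holes": xec 0xd maps to entry 11 and
	 * xec 0x10-0x15 map to entries 12-17, hence the offsets below.
	 */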
	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

 wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
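		/*
		 * f15h_mc2_mce_desc[] entries 0-8 cover xec 0x4-0xc and
		 * entries 9-13 cover xec 0x10-0x14, hence the two offsets.
		 */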
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN) ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb" : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
			((r4 == R4_GEN) ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD) ? "Hit" :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
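		/*
		 * xec 0x1c-0x1f map to the last four mc4_mce_desc[] entries
		 * (indices 15-18), hence the offset of 13.
		 */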
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	struct smca_hwid *hwid;
	enum smca_bank_types bank_type;
	const char *ip_name;
	u8 xec = XEC(m->status, xec_mask);

	if (m->bank >= ARRAY_SIZE(smca_banks))
		return;

	hwid = smca_banks[m->bank].hwid;
	if (!hwid)
		return;

	bank_type = hwid->bank_type;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	ip_name = smca_get_long_name(bank_type);

	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);

	/* Only print the decode of valid error codes */
	if (xec < smca_mce_descs[bank_type].num_descs &&
	    (hwid->xec_bitmap & BIT_ULL(xec))) {
		pr_emerg(HW_ERR "%s Error: ", ip_name);
		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
	}

	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
		decode_dram_ecc(cpu_to_node(m->extcpu), m);
}

static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/*
 * Filter out unwanted MCE signatures here.
 */
static bool amd_filter_mce(struct mce *m)
{
	/*
	 * NB GART TLB error reporting is disabled by default.
	 */
	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
		return true;

	return false;
}

static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		 m->extcpu,
		 fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		 m->bank,
		 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
		 ((m->status & MCI_STATUS_UC) ? "UE" :
		  (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
		 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
		 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
		 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/* Do the two ECC status bits (MCi_STATUS[46:45]: UECC/CECC) together. */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
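		/*
		 * Family 0x17 uses Scalable MCA: the per-bank fam_ops
		 * callbacks stay unset because amd_decode_mce() goes through
		 * decode_smca_error() instead.
		 */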
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif