#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>

#include "mce_amd.h"

static struct amd_decoder_ops *fam_ops;

static u8 xec_mask	 = 0xf;

static bool report_gart_errors;
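/*
 * Optional hook into an EDAC driver (e.g. amd64_edac) which can map a DRAM
 * ECC error reported through MCA to the node/memory controller that saw it.
 */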
static void (*decode_dram_ecc)(int node_id, struct mce *m);

void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
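/*
 * The TT(), LL(), R4(), PP(), TO(), II() and UU() field accessors (and their
 * *_MSG() wrappers) which index the tables above are expected to come from
 * mce_amd.h, included above.
 */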

static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/* Scalable MCA error strings */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};

struct smca_mce_desc {
	const char * const *descs;
	unsigned int num_descs;
};

static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};

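/*
 * Family-specific decoders for the legacy (pre-SMCA) MCA banks 0-6. The proper
 * set is selected in mce_amd_init() based on boot_cpu_data.x86 and hooked up
 * via fam_ops.
 */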
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4	 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops->mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll	 = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4    = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	struct smca_hwid *hwid;
	enum smca_bank_types bank_type;
	const char *ip_name;
	u8 xec = XEC(m->status, xec_mask);

	if (m->bank >= ARRAY_SIZE(smca_banks))
		return;

	hwid = smca_banks[m->bank].hwid;
	if (!hwid)
		return;

	bank_type = hwid->bank_type;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	ip_name = smca_get_long_name(bank_type);

	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);

	/* Only print the decode of valid error codes */
	if (xec < smca_mce_descs[bank_type].num_descs &&
			(hwid->xec_bitmap & BIT_ULL(xec))) {
		pr_emerg(HW_ERR "%s Error: ", ip_name);
		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
	}

	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
		decode_dram_ecc(cpu_to_node(m->extcpu), m);
}

static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/*
 * Filter out unwanted MCE signatures here.
 */
static bool amd_filter_mce(struct mce *m)
{
	/*
	 * NB GART TLB error reporting is disabled by default.
	 */
	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
		return true;

	return false;
}

static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

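/*
 * Main MCE decode chain callback: print a human readable summary of
 * MCx_STATUS, then hand off to the per-bank (or SMCA) decoders above.
 */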
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}
	/* Bits 46:45 of MCx_STATUS (bits 14:13 of the high dword) are the CECC/UECC flags; handle them together. */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif