1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * This file implements the error recovery as a core part of PCIe error
4 * reporting. When a PCIe error is delivered, an error message will be
5 * collected and printed to console, then, an error recovery procedure
6 * will be executed by following the PCI error recovery rules.
7 *
8 * Copyright (C) 2006 Intel Corp.
9 * Tom Long Nguyen (tom.l.nguyen@intel.com)
10 * Zhang Yanmin (yanmin.zhang@intel.com)
11 */
12
13 #include <linux/pci.h>
14 #include <linux/module.h>
15 #include <linux/pci.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/aer.h>
19 #include "portdrv.h"
20 #include "../pci.h"
21
22 struct aer_broadcast_data {
23 enum pci_channel_state state;
24 enum pci_ers_result result;
25 };
26
merge_result(enum pci_ers_result orig,enum pci_ers_result new)27 static pci_ers_result_t merge_result(enum pci_ers_result orig,
28 enum pci_ers_result new)
29 {
30 if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
31 return PCI_ERS_RESULT_NO_AER_DRIVER;
32
33 if (new == PCI_ERS_RESULT_NONE)
34 return orig;
35
36 switch (orig) {
37 case PCI_ERS_RESULT_CAN_RECOVER:
38 case PCI_ERS_RESULT_RECOVERED:
39 orig = new;
40 break;
41 case PCI_ERS_RESULT_DISCONNECT:
42 if (new == PCI_ERS_RESULT_NEED_RESET)
43 orig = PCI_ERS_RESULT_NEED_RESET;
44 break;
45 default:
46 break;
47 }
48
49 return orig;
50 }
51
report_error_detected(struct pci_dev * dev,void * data)52 static int report_error_detected(struct pci_dev *dev, void *data)
53 {
54 pci_ers_result_t vote;
55 const struct pci_error_handlers *err_handler;
56 struct aer_broadcast_data *result_data;
57
58 result_data = (struct aer_broadcast_data *) data;
59
60 device_lock(&dev->dev);
61 dev->error_state = result_data->state;
62
63 if (!dev->driver ||
64 !dev->driver->err_handler ||
65 !dev->driver->err_handler->error_detected) {
66 /*
67 * If any device in the subtree does not have an error_detected
68 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
69 * error callbacks of "any" device in the subtree, and will
70 * exit in the disconnected error state.
71 */
72 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
73 vote = PCI_ERS_RESULT_NO_AER_DRIVER;
74 else
75 vote = PCI_ERS_RESULT_NONE;
76 } else {
77 err_handler = dev->driver->err_handler;
78 vote = err_handler->error_detected(dev, result_data->state);
79 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
80 }
81
82 result_data->result = merge_result(result_data->result, vote);
83 device_unlock(&dev->dev);
84 return 0;
85 }
86
report_mmio_enabled(struct pci_dev * dev,void * data)87 static int report_mmio_enabled(struct pci_dev *dev, void *data)
88 {
89 pci_ers_result_t vote;
90 const struct pci_error_handlers *err_handler;
91 struct aer_broadcast_data *result_data;
92
93 result_data = (struct aer_broadcast_data *) data;
94
95 device_lock(&dev->dev);
96 if (!dev->driver ||
97 !dev->driver->err_handler ||
98 !dev->driver->err_handler->mmio_enabled)
99 goto out;
100
101 err_handler = dev->driver->err_handler;
102 vote = err_handler->mmio_enabled(dev);
103 result_data->result = merge_result(result_data->result, vote);
104 out:
105 device_unlock(&dev->dev);
106 return 0;
107 }
108
report_slot_reset(struct pci_dev * dev,void * data)109 static int report_slot_reset(struct pci_dev *dev, void *data)
110 {
111 pci_ers_result_t vote;
112 const struct pci_error_handlers *err_handler;
113 struct aer_broadcast_data *result_data;
114
115 result_data = (struct aer_broadcast_data *) data;
116
117 device_lock(&dev->dev);
118 if (!dev->driver ||
119 !dev->driver->err_handler ||
120 !dev->driver->err_handler->slot_reset)
121 goto out;
122
123 err_handler = dev->driver->err_handler;
124 vote = err_handler->slot_reset(dev);
125 result_data->result = merge_result(result_data->result, vote);
126 out:
127 device_unlock(&dev->dev);
128 return 0;
129 }
130
report_resume(struct pci_dev * dev,void * data)131 static int report_resume(struct pci_dev *dev, void *data)
132 {
133 const struct pci_error_handlers *err_handler;
134
135 device_lock(&dev->dev);
136 dev->error_state = pci_channel_io_normal;
137
138 if (!dev->driver ||
139 !dev->driver->err_handler ||
140 !dev->driver->err_handler->resume)
141 goto out;
142
143 err_handler = dev->driver->err_handler;
144 err_handler->resume(dev);
145 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
146 out:
147 device_unlock(&dev->dev);
148 return 0;
149 }
150
151 /**
152 * default_reset_link - default reset function
153 * @dev: pointer to pci_dev data structure
154 *
155 * Invoked when performing link reset on a Downstream Port or a
156 * Root Port with no aer driver.
157 */
default_reset_link(struct pci_dev * dev)158 static pci_ers_result_t default_reset_link(struct pci_dev *dev)
159 {
160 int rc;
161
162 rc = pci_bus_error_reset(dev);
163 pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
164 return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
165 }
166
reset_link(struct pci_dev * dev,u32 service)167 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
168 {
169 pci_ers_result_t status;
170 struct pcie_port_service_driver *driver = NULL;
171
172 driver = pcie_port_find_service(dev, service);
173 if (driver && driver->reset_link) {
174 status = driver->reset_link(dev);
175 } else if (dev->has_secondary_link) {
176 status = default_reset_link(dev);
177 } else {
178 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
179 pci_name(dev));
180 return PCI_ERS_RESULT_DISCONNECT;
181 }
182
183 if (status != PCI_ERS_RESULT_RECOVERED) {
184 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
185 pci_name(dev));
186 return PCI_ERS_RESULT_DISCONNECT;
187 }
188
189 return status;
190 }
191
192 /**
193 * broadcast_error_message - handle message broadcast to downstream drivers
194 * @dev: pointer to from where in a hierarchy message is broadcasted down
195 * @state: error state
196 * @error_mesg: message to print
197 * @cb: callback to be broadcasted
198 *
199 * Invoked during error recovery process. Once being invoked, the content
200 * of error severity will be broadcasted to all downstream drivers in a
201 * hierarchy in question.
202 */
broadcast_error_message(struct pci_dev * dev,enum pci_channel_state state,char * error_mesg,int (* cb)(struct pci_dev *,void *))203 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
204 enum pci_channel_state state,
205 char *error_mesg,
206 int (*cb)(struct pci_dev *, void *))
207 {
208 struct aer_broadcast_data result_data;
209
210 pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
211 result_data.state = state;
212 if (cb == report_error_detected)
213 result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
214 else
215 result_data.result = PCI_ERS_RESULT_RECOVERED;
216
217 pci_walk_bus(dev->subordinate, cb, &result_data);
218 return result_data.result;
219 }
220
221 /**
222 * pcie_do_fatal_recovery - handle fatal error recovery process
223 * @dev: pointer to a pci_dev data structure of agent detecting an error
224 *
225 * Invoked when an error is fatal. Once being invoked, removes the devices
226 * beneath this AER agent, followed by reset link e.g. secondary bus reset
227 * followed by re-enumeration of devices.
228 */
pcie_do_fatal_recovery(struct pci_dev * dev,u32 service)229 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
230 {
231 struct pci_dev *udev;
232 struct pci_bus *parent;
233 struct pci_dev *pdev, *temp;
234 pci_ers_result_t result;
235
236 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
237 udev = dev;
238 else
239 udev = dev->bus->self;
240
241 parent = udev->subordinate;
242 pci_lock_rescan_remove();
243 pci_dev_get(dev);
244 list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
245 bus_list) {
246 pci_dev_get(pdev);
247 pci_dev_set_disconnected(pdev, NULL);
248 if (pci_has_subordinate(pdev))
249 pci_walk_bus(pdev->subordinate,
250 pci_dev_set_disconnected, NULL);
251 pci_stop_and_remove_bus_device(pdev);
252 pci_dev_put(pdev);
253 }
254
255 result = reset_link(udev, service);
256
257 if ((service == PCIE_PORT_SERVICE_AER) &&
258 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
259 /*
260 * If the error is reported by a bridge, we think this error
261 * is related to the downstream link of the bridge, so we
262 * do error recovery on all subordinates of the bridge instead
263 * of the bridge and clear the error status of the bridge.
264 */
265 pci_aer_clear_fatal_status(dev);
266 pci_aer_clear_device_status(dev);
267 }
268
269 if (result == PCI_ERS_RESULT_RECOVERED) {
270 if (pcie_wait_for_link(udev, true))
271 pci_rescan_bus(udev->bus);
272 pci_info(dev, "Device recovery from fatal error successful\n");
273 } else {
274 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
275 pci_info(dev, "Device recovery from fatal error failed\n");
276 }
277
278 pci_dev_put(dev);
279 pci_unlock_rescan_remove();
280 }
281
282 /**
283 * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
284 * @dev: pointer to a pci_dev data structure of agent detecting an error
285 *
286 * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
287 * error detected message to all downstream drivers within a hierarchy in
288 * question and return the returned code.
289 */
pcie_do_nonfatal_recovery(struct pci_dev * dev)290 void pcie_do_nonfatal_recovery(struct pci_dev *dev)
291 {
292 pci_ers_result_t status;
293 enum pci_channel_state state;
294
295 state = pci_channel_io_normal;
296
297 /*
298 * Error recovery runs on all subordinates of the first downstream port.
299 * If the downstream port detected the error, it is cleared at the end.
300 */
301 if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
302 pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
303 dev = dev->bus->self;
304
305 status = broadcast_error_message(dev,
306 state,
307 "error_detected",
308 report_error_detected);
309
310 if (status == PCI_ERS_RESULT_CAN_RECOVER)
311 status = broadcast_error_message(dev,
312 state,
313 "mmio_enabled",
314 report_mmio_enabled);
315
316 if (status == PCI_ERS_RESULT_NEED_RESET) {
317 /*
318 * TODO: Should call platform-specific
319 * functions to reset slot before calling
320 * drivers' slot_reset callbacks?
321 */
322 status = broadcast_error_message(dev,
323 state,
324 "slot_reset",
325 report_slot_reset);
326 }
327
328 if (status != PCI_ERS_RESULT_RECOVERED)
329 goto failed;
330
331 broadcast_error_message(dev,
332 state,
333 "resume",
334 report_resume);
335
336 pci_aer_clear_device_status(dev);
337 pci_cleanup_aer_uncorrect_error_status(dev);
338 pci_info(dev, "AER: Device recovery successful\n");
339 return;
340
341 failed:
342 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
343
344 /* TODO: Should kernel panic here? */
345 pci_info(dev, "AER: Device recovery failed\n");
346 }
347