*
* CPU caches (L1 and L2)
* DMA engines
- * Core CPU swithces
+ * Core CPU switches
* Fabric switch units
* PCIe interface controllers
* other EDAC/ECC type devices that can be monitored for
* errors, etc.
*
- * It allows for a 2 level set of hiearchry. For example:
+ * It allows for a 2 level set of hierarchy. For example:
*
* cache could be composed of L1, L2 and L3 levels of cache.
* Each CPU core would have its own L1 cache, while sharing
#endif /* CONFIG_PCI */
- extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
- unsigned nr_chans, int edac_index);
+ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+ unsigned n_layers,
+ struct edac_mc_layer *layers,
+ unsigned sz_pvt);
extern int edac_mc_add_mc(struct mem_ctl_info *mci);
extern void edac_mc_free(struct mem_ctl_info *mci);
extern struct mem_ctl_info *edac_mc_find(int idx);
extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
unsigned long page);
-
- /*
- * The no info errors are used when error overflows are reported.
- * There are a limited number of error logging registers that can
- * be exausted. When all registers are exhausted and an additional
- * error occurs then an error overflow register records that an
- * error occurred and the type of error, but doesn't have any
- * further information. The ce/ue versions make for cleaner
- * reporting logic and function interface - reduces conditional
- * statement clutter and extra function arguments.
- */
- extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
- unsigned long page_frame_number,
- unsigned long offset_in_page,
- unsigned long syndrome, int row, int channel,
- const char *msg);
- extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
- const char *msg);
- extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
- unsigned long page_frame_number,
- unsigned long offset_in_page, int row,
- const char *msg);
- extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
- const char *msg);
- extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow,
- unsigned int channel0, unsigned int channel1,
- char *msg);
- extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow,
- unsigned int channel, char *msg);
+ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+ struct mem_ctl_info *mci,
+ const unsigned long page_frame_number,
+ const unsigned long offset_in_page,
+ const unsigned long syndrome,
+ const int layer0,
+ const int layer1,
+ const int layer2,
+ const char *msg,
+ const char *other_detail,
+ const void *mcelog);
/*
* edac_device APIs
extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);
extern int edac_device_alloc_index(void);
+ extern const char *edac_layer_name[];
/*
* edac_pci APIs
*
* The control structure is allocated in complete chunk
* from the OS. It is in turn sub allocated to the
- * various objects that compose the struture
+ * various objects that compose the structure
*
* The structure has a 'nr_instance' array within itself.
* Each instance represents a major component
unsigned total_size;
unsigned count;
unsigned instance, block, attr;
- void *pvt;
+ void *pvt, *p;
int err;
debugf4("%s() instances=%d blocks=%d\n",
* to be at least as stringent as what the compiler would
* provide if we could simply hardcode everything into a single struct.
*/
- dev_ctl = (struct edac_device_ctl_info *)NULL;
+ p = NULL;
+ dev_ctl = edac_align_ptr(&p, sizeof(*dev_ctl), 1);
/* Calc the 'end' offset past end of ONE ctl_info structure
* which will become the start of the 'instance' array
*/
- dev_inst = edac_align_ptr(&dev_ctl[1], sizeof(*dev_inst));
+ dev_inst = edac_align_ptr(&p, sizeof(*dev_inst), nr_instances);
/* Calc the 'end' offset past the instance array within the ctl_info
* which will become the start of the block array
*/
- dev_blk = edac_align_ptr(&dev_inst[nr_instances], sizeof(*dev_blk));
+ count = nr_instances * nr_blocks;
+ dev_blk = edac_align_ptr(&p, sizeof(*dev_blk), count);
/* Calc the 'end' offset past the dev_blk array
* which will become the start of the attrib array, if any.
*/
- count = nr_instances * nr_blocks;
- dev_attrib = edac_align_ptr(&dev_blk[count], sizeof(*dev_attrib));
-
- /* Check for case of when an attribute array is specified */
- if (nr_attrib > 0) {
- /* calc how many nr_attrib we need */
+ /* calc how many nr_attrib we need */
+ if (nr_attrib > 0)
count *= nr_attrib;
+ dev_attrib = edac_align_ptr(&p, sizeof(*dev_attrib), count);
- /* Calc the 'end' offset past the attributes array */
- pvt = edac_align_ptr(&dev_attrib[count], sz_private);
- } else {
- /* no attribute array specified */
- pvt = edac_align_ptr(dev_attrib, sz_private);
- }
+ /* Calc the 'end' offset past the attributes array */
+ pvt = edac_align_ptr(&p, sz_private, 1);
/* 'pvt' now points to where the private data area is.
* At this point 'pvt' (like dev_inst,dev_blk and dev_attrib)
/* Reschedule the workq for the next time period to start again
* if the number of msec is for 1 sec, then adjust to the next
- * whole one second to save timers fireing all over the period
+ * whole one second to save timers firing all over the period
* between integral seconds
*/
if (edac_dev->poll_msec == 1000)
* Remove sysfs entries for specified edac_device structure and
* then remove edac_device structure from global list
*
- * @pdev:
+ * @dev:
* Pointer to 'struct device' representing edac_device
* structure to remove.
*
#define MC_MAX_DOD 0x64
/*
- * OFFSETS for Device 3 Function 4, as inicated on Xeon 5500 datasheet:
+ * OFFSETS for Device 3 Function 4, as indicated on Xeon 5500 datasheet:
* http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
*/
#define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff)
#define DIMM0_COR_ERR(r) ((r) & 0x7fff)
-/* OFFSETS for Device 3 Function 2, as inicated on Xeon 5500 datasheet */
+/* OFFSETS for Device 3 Function 2, as indicated on Xeon 5500 datasheet */
#define MC_SSRCONTROL 0x48
#define SSR_MODE_DISABLE 0x00
#define SSR_MODE_ENABLE 0x01
};
struct i7core_channel {
- u32 ranks;
+ bool is_3dimms_present;
+ bool is_single_4rank;
+ bool has_4rank;
u32 dimms;
};
struct i7core_channel channel[NUM_CHANS];
int ce_count_available;
- int csrow_map[NUM_CHANS][MAX_DIMMS];
/* ECC corrected errors counts per udimm */
unsigned long udimm_ce_count[MAX_DIMMS];
};
/****************************************************************************
- Anciliary status routines
+ Ancillary status routines
****************************************************************************/
/* MC_CONTROL bits */
/****************************************************************************
Memory check routines
****************************************************************************/
- static struct pci_dev *get_pdev_slot_func(u8 socket, unsigned slot,
- unsigned func)
- {
- struct i7core_dev *i7core_dev = get_i7core_dev(socket);
- int i;
-
- if (!i7core_dev)
- return NULL;
-
- for (i = 0; i < i7core_dev->n_devs; i++) {
- if (!i7core_dev->pdev[i])
- continue;
-
- if (PCI_SLOT(i7core_dev->pdev[i]->devfn) == slot &&
- PCI_FUNC(i7core_dev->pdev[i]->devfn) == func) {
- return i7core_dev->pdev[i];
- }
- }
-
- return NULL;
- }
-
- /**
- * i7core_get_active_channels() - gets the number of channels and csrows
- * @socket: Quick Path Interconnect socket
- * @channels: Number of channels that will be returned
- * @csrows: Number of csrows found
- *
- * Since EDAC core needs to know in advance the number of available channels
- * and csrows, in order to allocate memory for csrows/channels, it is needed
- * to run two similar steps. At the first step, implemented on this function,
- * it checks the number of csrows/channels present at one socket.
- * this is used in order to properly allocate the size of mci components.
- *
- * It should be noticed that none of the current available datasheets explain
- * or even mention how csrows are seen by the memory controller. So, we need
- * to add a fake description for csrows.
- * So, this driver is attributing one DIMM memory for one csrow.
- */
- static int i7core_get_active_channels(const u8 socket, unsigned *channels,
- unsigned *csrows)
- {
- struct pci_dev *pdev = NULL;
- int i, j;
- u32 status, control;
-
- *channels = 0;
- *csrows = 0;
-
- pdev = get_pdev_slot_func(socket, 3, 0);
- if (!pdev) {
- i7core_printk(KERN_ERR, "Couldn't find socket %d fn 3.0!!!\n",
- socket);
- return -ENODEV;
- }
-
- /* Device 3 function 0 reads */
- pci_read_config_dword(pdev, MC_STATUS, &status);
- pci_read_config_dword(pdev, MC_CONTROL, &control);
-
- for (i = 0; i < NUM_CHANS; i++) {
- u32 dimm_dod[3];
- /* Check if the channel is active */
- if (!(control & (1 << (8 + i))))
- continue;
-
- /* Check if the channel is disabled */
- if (status & (1 << i))
- continue;
-
- pdev = get_pdev_slot_func(socket, i + 4, 1);
- if (!pdev) {
- i7core_printk(KERN_ERR, "Couldn't find socket %d "
- "fn %d.%d!!!\n",
- socket, i + 4, 1);
- return -ENODEV;
- }
- /* Devices 4-6 function 1 */
- pci_read_config_dword(pdev,
- MC_DOD_CH_DIMM0, &dimm_dod[0]);
- pci_read_config_dword(pdev,
- MC_DOD_CH_DIMM1, &dimm_dod[1]);
- pci_read_config_dword(pdev,
- MC_DOD_CH_DIMM2, &dimm_dod[2]);
- (*channels)++;
-
- for (j = 0; j < 3; j++) {
- if (!DIMM_PRESENT(dimm_dod[j]))
- continue;
- (*csrows)++;
- }
- }
-
- debugf0("Number of active channels on socket %d: %d\n",
- socket, *channels);
-
- return 0;
- }
-
- static int get_dimm_config(const struct mem_ctl_info *mci)
+ static int get_dimm_config(struct mem_ctl_info *mci)
{
struct i7core_pvt *pvt = mci->pvt_info;
- struct csrow_info *csr;
struct pci_dev *pdev;
int i, j;
- int csrow = 0;
- unsigned long last_page = 0;
enum edac_type mode;
enum mem_type mtype;
+ struct dimm_info *dimm;
/* Get data from the MC register, function 0 */
pdev = pvt->pci_mcr[0];
pci_read_config_dword(pvt->pci_ch[i][0],
MC_CHANNEL_DIMM_INIT_PARAMS, &data);
- pvt->channel[i].ranks = (data & QUAD_RANK_PRESENT) ?
- 4 : 2;
+
+ if (data & THREE_DIMMS_PRESENT)
+ pvt->channel[i].is_3dimms_present = true;
+
+ if (data & SINGLE_QUAD_RANK_PRESENT)
+ pvt->channel[i].is_single_4rank = true;
+
+ if (data & QUAD_RANK_PRESENT)
+ pvt->channel[i].has_4rank = true;
if (data & REGISTERED_DIMM)
mtype = MEM_RDDR3;
else
mtype = MEM_DDR3;
- #if 0
- if (data & THREE_DIMMS_PRESENT)
- pvt->channel[i].dimms = 3;
- else if (data & SINGLE_QUAD_RANK_PRESENT)
- pvt->channel[i].dimms = 1;
- else
- pvt->channel[i].dimms = 2;
- #endif
/* Devices 4-6 function 1 */
pci_read_config_dword(pvt->pci_ch[i][1],
MC_DOD_CH_DIMM2, &dimm_dod[2]);
debugf0("Ch%d phy rd%d, wr%d (0x%08x): "
- "%d ranks, %cDIMMs\n",
+ "%s%s%s%cDIMMs\n",
i,
RDLCH(pvt->info.ch_map, i), WRLCH(pvt->info.ch_map, i),
data,
- pvt->channel[i].ranks,
+ pvt->channel[i].is_3dimms_present ? "3DIMMS " : "",
+ pvt->channel[i].is_3dimms_present ? "SINGLE_4R " : "",
+ pvt->channel[i].has_4rank ? "HAS_4R " : "",
(data & REGISTERED_DIMM) ? 'R' : 'U');
for (j = 0; j < 3; j++) {
if (!DIMM_PRESENT(dimm_dod[j]))
continue;
+ dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+ i, j, 0);
banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
/* DDR3 has 8 I/O banks */
size = (rows * cols * banks * ranks) >> (20 - 3);
- pvt->channel[i].dimms++;
-
debugf0("\tdimm %d %d Mb offset: %x, "
"bank: %d, rank: %d, row: %#x, col: %#x\n",
j, size,
npages = MiB_TO_PAGES(size);
- csr = &mci->csrows[csrow];
- csr->first_page = last_page + 1;
- last_page += npages;
- csr->last_page = last_page;
- csr->nr_pages = npages;
-
- csr->page_mask = 0;
- csr->grain = 8;
- csr->csrow_idx = csrow;
- csr->nr_channels = 1;
-
- csr->channels[0].chan_idx = i;
- csr->channels[0].ce_count = 0;
-
- pvt->csrow_map[i][j] = csrow;
+ dimm->nr_pages = npages;
switch (banks) {
case 4:
- csr->dtype = DEV_X4;
+ dimm->dtype = DEV_X4;
break;
case 8:
- csr->dtype = DEV_X8;
+ dimm->dtype = DEV_X8;
break;
case 16:
- csr->dtype = DEV_X16;
+ dimm->dtype = DEV_X16;
break;
default:
- csr->dtype = DEV_UNKNOWN;
+ dimm->dtype = DEV_UNKNOWN;
}
- csr->edac_mode = mode;
- csr->mtype = mtype;
- snprintf(csr->channels[0].label,
- sizeof(csr->channels[0].label),
- "CPU#%uChannel#%u_DIMM#%u",
- pvt->i7core_dev->socket, i, j);
-
- csrow++;
+ snprintf(dimm->label, sizeof(dimm->label),
+ "CPU#%uChannel#%u_DIMM#%u",
+ pvt->i7core_dev->socket, i, j);
+ dimm->grain = 8;
+ dimm->edac_mode = mode;
+ dimm->mtype = mtype;
}
pci_read_config_dword(pdev, MC_SAG_CH_0, &value[0]);
dev_descr->dev_id, *prev);
/*
- * On Xeon 55xx, the Intel Quckpath Arch Generic Non-core regs
+ * On Xeon 55xx, the Intel QuickPath Arch Generic Non-core regs
* is at addr 8086:2c40, instead of 8086:2c41. So, we need
* to probe for the alternate address in case of failure
*/
/****************************************************************************
Error check routines
****************************************************************************/
- static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci,
+ static void i7core_rdimm_update_errcount(struct mem_ctl_info *mci,
const int chan,
const int dimm,
const int add)
{
- char *msg;
- struct i7core_pvt *pvt = mci->pvt_info;
- int row = pvt->csrow_map[chan][dimm], i;
+ int i;
for (i = 0; i < add; i++) {
- msg = kasprintf(GFP_KERNEL, "Corrected error "
- "(Socket=%d channel=%d dimm=%d)",
- pvt->i7core_dev->socket, chan, dimm);
-
- edac_mc_handle_fbd_ce(mci, row, 0, msg);
- kfree (msg);
+ edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0,
+ chan, dimm, -1, "error", "", NULL);
}
}
/*updated the edac core */
if (add0 != 0)
- i7core_rdimm_update_csrow(mci, chan, 0, add0);
+ i7core_rdimm_update_errcount(mci, chan, 0, add0);
if (add1 != 0)
- i7core_rdimm_update_csrow(mci, chan, 1, add1);
+ i7core_rdimm_update_errcount(mci, chan, 1, add1);
if (add2 != 0)
- i7core_rdimm_update_csrow(mci, chan, 2, add2);
+ i7core_rdimm_update_errcount(mci, chan, 2, add2);
}
const struct mce *m)
{
struct i7core_pvt *pvt = mci->pvt_info;
- char *type, *optype, *err, *msg;
+ char *type, *optype, *err, msg[80];
+ enum hw_event_mc_err_type tp_event;
unsigned long error = m->status & 0x1ff0000l;
+ bool uncorrected_error = m->mcgstatus & 1ll << 61;
+ bool ripv = m->mcgstatus & 1;
u32 optypenum = (m->status >> 4) & 0x07;
u32 core_err_cnt = (m->status >> 38) & 0x7fff;
u32 dimm = (m->misc >> 16) & 0x3;
u32 channel = (m->misc >> 18) & 0x3;
u32 syndrome = m->misc >> 32;
u32 errnum = find_first_bit(&error, 32);
- int csrow;
- if (m->mcgstatus & 1)
- type = "FATAL";
- else
- type = "NON_FATAL";
+ if (uncorrected_error) {
+ if (ripv) {
+ type = "FATAL";
+ tp_event = HW_EVENT_ERR_FATAL;
+ } else {
+ type = "NON_FATAL";
+ tp_event = HW_EVENT_ERR_UNCORRECTED;
+ }
+ } else {
+ type = "CORRECTED";
+ tp_event = HW_EVENT_ERR_CORRECTED;
+ }
switch (optypenum) {
case 0:
err = "unknown";
}
- /* FIXME: should convert addr into bank and rank information */
- msg = kasprintf(GFP_ATOMIC,
- "%s (addr = 0x%08llx, cpu=%d, Dimm=%d, Channel=%d, "
- "syndrome=0x%08x, count=%d, Err=%08llx:%08llx (%s: %s))\n",
- type, (long long) m->addr, m->cpu, dimm, channel,
- syndrome, core_err_cnt, (long long)m->status,
- (long long)m->misc, optype, err);
-
- debugf0("%s", msg);
-
- csrow = pvt->csrow_map[channel][dimm];
+ snprintf(msg, sizeof(msg), "count=%d %s", core_err_cnt, optype);
- /* Call the helper to output message */
- if (m->mcgstatus & 1)
- edac_mc_handle_fbd_ue(mci, csrow, 0,
- 0 /* FIXME: should be channel here */, msg);
- else if (!pvt->is_registered)
- edac_mc_handle_fbd_ce(mci, csrow,
- 0 /* FIXME: should be channel here */, msg);
-
- kfree(msg);
+ /*
+ * Call the helper to output message
+ * FIXME: what to do if core_err_cnt > 1? Currently, it generates
+ * only one event
+ */
+ if (uncorrected_error || !pvt->is_registered)
+ edac_mc_handle_error(tp_event, mci,
+ m->addr >> PAGE_SHIFT,
+ m->addr & ~PAGE_MASK,
+ syndrome,
+ channel, dimm, -1,
+ err, msg, m);
}
/*
/*
* get_sdram_scrub_rate This routine convert current scrub rate value
- * into byte/sec bandwidth accourding to
+ * into byte/sec bandwidth according to
* SCRUBINTERVAL formula found in datasheet.
*/
static int get_sdram_scrub_rate(struct mem_ctl_info *mci)
{
struct mem_ctl_info *mci;
struct i7core_pvt *pvt;
- int rc, channels, csrows;
-
- /* Check the number of active and not disabled channels */
- rc = i7core_get_active_channels(i7core_dev->socket, &channels, &csrows);
- if (unlikely(rc < 0))
- return rc;
+ int rc;
+ struct edac_mc_layer layers[2];
/* allocate a new MC control structure */
- mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, i7core_dev->socket);
+
+ layers[0].type = EDAC_MC_LAYER_CHANNEL;
+ layers[0].size = NUM_CHANS;
+ layers[0].is_virt_csrow = false;
+ layers[1].type = EDAC_MC_LAYER_SLOT;
+ layers[1].size = MAX_DIMMS;
+ layers[1].is_virt_csrow = true;
+ mci = edac_mc_alloc(i7core_dev->socket, ARRAY_SIZE(layers), layers,
+ sizeof(*pvt));
if (unlikely(!mci))
return -ENOMEM;
/*
* FIXME: For now, let's order by device function, as it makes
- * easier for driver's development proccess. This table should be
+ * easier for driver's development process. This table should be
* moved to pci_id.h when submitted upstream
*/
#define PCI_DEVICE_ID_INTEL_SBRIDGE_SAD0 0x3cf4 /* 12.6 */
struct sbridge_info info;
struct sbridge_channel channel[NUM_CHANNELS];
- int csrow_map[NUM_CHANNELS][MAX_DIMMS];
-
/* Memory type detection */
bool is_mirrored, is_lockstep, is_close_pg;
/****************************************************************************
- Anciliary status routines
+ Ancillary status routines
****************************************************************************/
static inline int numrank(u32 mtr)
}
/**
- * sbridge_get_active_channels() - gets the number of channels and csrows
+ * check_if_ecc_is_active() - Checks if ECC is active
* bus: Device bus
- * @channels: Number of channels that will be returned
- * @csrows: Number of csrows found
- *
- * Since EDAC core needs to know in advance the number of available channels
- * and csrows, in order to allocate memory for csrows/channels, it is needed
- * to run two similar steps. At the first step, implemented on this function,
- * it checks the number of csrows/channels present at one socket, identified
- * by the associated PCI bus.
- * this is used in order to properly allocate the size of mci components.
- * Note: one csrow is one dimm.
*/
- static int sbridge_get_active_channels(const u8 bus, unsigned *channels,
- unsigned *csrows)
+ static int check_if_ecc_is_active(const u8 bus)
{
struct pci_dev *pdev = NULL;
- int i, j;
u32 mcmtr;
- *channels = 0;
- *csrows = 0;
-
pdev = get_pdev_slot_func(bus, 15, 0);
if (!pdev) {
sbridge_printk(KERN_ERR, "Couldn't find PCI device "
sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n");
return -ENODEV;
}
-
- for (i = 0; i < NUM_CHANNELS; i++) {
- u32 mtr;
-
- /* Device 15 functions 2 - 5 */
- pdev = get_pdev_slot_func(bus, 15, 2 + i);
- if (!pdev) {
- sbridge_printk(KERN_ERR, "Couldn't find PCI device "
- "%2x.%02d.%d!!!\n",
- bus, 15, 2 + i);
- return -ENODEV;
- }
- (*channels)++;
-
- for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
- pci_read_config_dword(pdev, mtr_regs[j], &mtr);
- debugf1("Bus#%02x channel #%d MTR%d = %x\n", bus, i, j, mtr);
- if (IS_DIMM_PRESENT(mtr))
- (*csrows)++;
- }
- }
-
- debugf0("Number of active channels: %d, number of active dimms: %d\n",
- *channels, *csrows);
-
return 0;
}
- static int get_dimm_config(const struct mem_ctl_info *mci)
+ static int get_dimm_config(struct mem_ctl_info *mci)
{
struct sbridge_pvt *pvt = mci->pvt_info;
- struct csrow_info *csr;
+ struct dimm_info *dimm;
int i, j, banks, ranks, rows, cols, size, npages;
- int csrow = 0;
- unsigned long last_page = 0;
u32 reg;
enum edac_type mode;
enum mem_type mtype;
u32 mtr;
for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
+ dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+ i, j, 0);
pci_read_config_dword(pvt->pci_tad[i],
mtr_regs[j], &mtr);
debugf4("Channel #%d MTR%d = %x\n", i, j, mtr);
pvt->sbridge_dev->mc, i, j,
size, npages,
banks, ranks, rows, cols);
- csr = &mci->csrows[csrow];
-
- csr->first_page = last_page;
- csr->last_page = last_page + npages - 1;
- csr->page_mask = 0UL; /* Unused */
- csr->nr_pages = npages;
- csr->grain = 32;
- csr->csrow_idx = csrow;
- csr->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
- csr->ce_count = 0;
- csr->ue_count = 0;
- csr->mtype = mtype;
- csr->edac_mode = mode;
- csr->nr_channels = 1;
- csr->channels[0].chan_idx = i;
- csr->channels[0].ce_count = 0;
- pvt->csrow_map[i][j] = csrow;
- snprintf(csr->channels[0].label,
- sizeof(csr->channels[0].label),
+
+ dimm->nr_pages = npages;
+ dimm->grain = 32;
+ dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
+ dimm->mtype = mtype;
+ dimm->edac_mode = mode;
+ snprintf(dimm->label, sizeof(dimm->label),
"CPU_SrcID#%u_Channel#%u_DIMM#%u",
pvt->sbridge_dev->source_id, i, j);
- last_page += npages;
- csrow++;
}
}
}
u8 *socket,
long *channel_mask,
u8 *rank,
- char *area_type)
+ char **area_type, char *msg)
{
struct mem_ctl_info *new_mci;
struct sbridge_pvt *pvt = mci->pvt_info;
- char msg[256];
int n_rir, n_sads, n_tads, sad_way, sck_xch;
int sad_interl, idx, base_ch;
int interleave_mode;
*/
if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) {
sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr);
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
if (addr >= (u64)pvt->tohm) {
sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr);
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
limit = SAD_LIMIT(reg);
if (limit <= prv) {
sprintf(msg, "Can't discover the memory socket");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
if (addr <= limit)
}
if (n_sads == MAX_SAD) {
sprintf(msg, "Can't discover the memory socket");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
- area_type = get_dram_attr(reg);
+ *area_type = get_dram_attr(reg);
interleave_mode = INTERLEAVE_MODE(reg);
pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads],
break;
default:
sprintf(msg, "Can't discover socket interleave");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
*socket = sad_interleave[idx];
if (!new_mci) {
sprintf(msg, "Struct for socket #%u wasn't initialized",
*socket);
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
mci = new_mci;
limit = TAD_LIMIT(reg);
if (limit <= prv) {
sprintf(msg, "Can't discover the memory channel");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
if (addr <= limit)
break;
default:
sprintf(msg, "Can't discover the TAD target");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
*channel_mask = 1 << base_ch;
break;
default:
sprintf(msg, "Invalid mirror set. Can't decode addr");
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
} else
if (offset > addr) {
sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!",
offset, addr);
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
addr -= offset;
if (n_rir == MAX_RIR_RANGES) {
sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx",
ch_addr);
- edac_mc_handle_ce_no_info(mci, msg);
return -EINVAL;
}
rir_way = RIR_WAY(reg);
{
struct mem_ctl_info *new_mci;
struct sbridge_pvt *pvt = mci->pvt_info;
- char *type, *optype, *msg, *recoverable_msg;
+ enum hw_event_mc_err_type tp_event;
+ char *type, *optype, msg[256];
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
long channel_mask, first_channel;
u8 rank, socket;
- int csrow, rc, dimm;
- char *area_type = "Unknown";
-
- if (ripv)
- type = "NON_FATAL";
- else
- type = "FATAL";
+ int rc, dimm;
+ char *area_type = NULL;
+
+ if (uncorrected_error) {
+ if (ripv) {
+ type = "FATAL";
+ tp_event = HW_EVENT_ERR_FATAL;
+ } else {
+ type = "NON_FATAL";
+ tp_event = HW_EVENT_ERR_UNCORRECTED;
+ }
+ } else {
+ type = "CORRECTED";
+ tp_event = HW_EVENT_ERR_CORRECTED;
+ }
/*
- * According with Table 15-9 of the Intel Archictecture spec vol 3A,
+ * According with Table 15-9 of the Intel Architecture spec vol 3A,
* memory errors should fit in this mask:
* 000f 0000 1mmm cccc (binary)
* where:
} else {
switch (optypenum) {
case 0:
- optype = "generic undef request";
+ optype = "generic undef request error";
break;
case 1:
- optype = "memory read";
+ optype = "memory read error";
break;
case 2:
- optype = "memory write";
+ optype = "memory write error";
break;
case 3:
- optype = "addr/cmd";
+ optype = "addr/cmd error";
break;
case 4:
- optype = "memory scrubbing";
+ optype = "memory scrubbing error";
break;
default:
optype = "reserved";
}
rc = get_memory_error_data(mci, m->addr, &socket,
- &channel_mask, &rank, area_type);
+ &channel_mask, &rank, &area_type, msg);
if (rc < 0)
- return;
+ goto err_parsing;
new_mci = get_mci_for_node_id(socket);
if (!new_mci) {
- edac_mc_handle_ce_no_info(mci, "Error: socket got corrupted!");
- return;
+ strcpy(msg, "Error: socket got corrupted!");
+ goto err_parsing;
}
mci = new_mci;
pvt = mci->pvt_info;
else
dimm = 2;
- csrow = pvt->csrow_map[first_channel][dimm];
-
- if (uncorrected_error && recoverable)
- recoverable_msg = " recoverable";
- else
- recoverable_msg = "";
/*
- * FIXME: What should we do with "channel" information on mcelog?
- * Probably, we can just discard it, as the channel information
- * comes from the get_memory_error_data() address decoding
+ * FIXME: On some memory configurations (mirror, lockstep), the
+ * Memory Controller can't point the error to a single DIMM. The
+ * EDAC core should be handling the channel mask, in order to point
+ * to the group of dimm's where the error may be happening.
*/
- msg = kasprintf(GFP_ATOMIC,
- "%d %s error(s): %s on %s area %s%s: cpu=%d Err=%04x:%04x (ch=%d), "
- "addr = 0x%08llx => socket=%d, Channel=%ld(mask=%ld), rank=%d\n",
- core_err_cnt,
- area_type,
- optype,
- type,
- recoverable_msg,
- overflow ? "OVERFLOW" : "",
- m->cpu,
- mscod, errcode,
- channel, /* 1111b means not specified */
- (long long) m->addr,
- socket,
- first_channel, /* This is the real channel on SB */
- channel_mask,
- rank);
+ snprintf(msg, sizeof(msg),
+ "count:%d%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
+ core_err_cnt,
+ overflow ? " OVERFLOW" : "",
+ (uncorrected_error && recoverable) ? " recoverable" : "",
+ area_type,
+ mscod, errcode,
+ socket,
+ channel_mask,
+ rank);
debugf0("%s", msg);
+ /* FIXME: need support for channel mask */
+
/* Call the helper to output message */
- if (uncorrected_error)
- edac_mc_handle_fbd_ue(mci, csrow, 0, 0, msg);
- else
- edac_mc_handle_fbd_ce(mci, csrow, 0, msg);
+ edac_mc_handle_error(tp_event, mci,
+ m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
+ channel, dimm, -1,
+ optype, msg, m);
+ return;
+ err_parsing:
+ edac_mc_handle_error(tp_event, mci, 0, 0, 0,
+ -1, -1, -1,
+ msg, "", m);
- kfree(msg);
}
/*
static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
{
struct mem_ctl_info *mci;
+ struct edac_mc_layer layers[2];
struct sbridge_pvt *pvt;
- int rc, channels, csrows;
+ int rc;
/* Check the number of active and not disabled channels */
- rc = sbridge_get_active_channels(sbridge_dev->bus, &channels, &csrows);
+ rc = check_if_ecc_is_active(sbridge_dev->bus);
if (unlikely(rc < 0))
return rc;
/* allocate a new MC control structure */
- mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, sbridge_dev->mc);
+ layers[0].type = EDAC_MC_LAYER_CHANNEL;
+ layers[0].size = NUM_CHANNELS;
+ layers[0].is_virt_csrow = false;
+ layers[1].type = EDAC_MC_LAYER_SLOT;
+ layers[1].size = MAX_DIMMS;
+ layers[1].is_virt_csrow = true;
+ mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers,
+ sizeof(*pvt));
+
if (unlikely(!mci))
return -ENOMEM;