Skip to content
This repository has been archived by the owner on Aug 27, 2022. It is now read-only.

Commit

Permalink
x86/mce: Provide boot argument to honour bios-set CMCI threshold
Browse files Browse the repository at this point in the history
The ACPI spec doesn't provide for a way for the bios to pass down
recommended thresholds to the OS on a _per-bank_ basis. This patch adds
a new boot option, which if passed, tells Linux to use CMCI thresholds
set by the bios.

As fail-safe, we initialize threshold to 1 if some banks have not been
initialized by the bios and warn the user.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
  • Loading branch information
rnav authored and aegl committed Sep 27, 2012
1 parent 961ebea commit 450cc20
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 3 deletions.
7 changes: 7 additions & 0 deletions Documentation/x86/x86_64/boot-options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ Machine check
monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0
to disable.
mce=bios_cmci_threshold
Don't overwrite the bios-set CMCI threshold. This boot option
prevents Linux from overwriting the CMCI threshold set by the
bios. Without this option, Linux always sets the CMCI
threshold to 1. Enabling this may make memory predictive failure
analysis less effective if the bios sets thresholds for memory
errors since we will not see details for all errors.

nomce (for compatibility with i386): same as mce=off

Expand Down
1 change: 1 addition & 0 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device);
#ifdef CONFIG_X86_MCE_INTEL
extern int mce_cmci_disabled;
extern int mce_ignore_ce;
extern int mce_bios_cmci_threshold;
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
Expand Down
10 changes: 10 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
int mce_bios_cmci_threshold __read_mostly;

struct mce_bank *mce_banks __read_mostly;

Expand Down Expand Up @@ -1946,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
Expand All @@ -1965,6 +1967,8 @@ static int __init mcheck_enable(char *str)
mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
else if (!strcmp(str, "bios_cmci_threshold"))
mce_bios_cmci_threshold = 1;
else if (isdigit(str[0])) {
get_option(&str, &tolerant);
if (*str == ',') {
Expand Down Expand Up @@ -2205,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
&mce_cmci_disabled
};

static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
&mce_bios_cmci_threshold
};

static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
Expand All @@ -2213,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
&dev_attr_bios_cmci_threshold.attr,
NULL
};

Expand Down
35 changes: 32 additions & 3 deletions arch/x86/kernel/cpu/mcheck/mce_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,12 @@ static void cmci_discover(int banks)
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags;
int i;
int bios_wrong_thresh = 0;

raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;

if (test_bit(i, owned))
continue;
Expand All @@ -198,20 +200,47 @@ static void cmci_discover(int banks)
continue;
}

val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
if (!mce_bios_cmci_threshold) {
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= CMCI_THRESHOLD;
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
* If bios_cmci_threshold boot option was specified
* but the threshold is zero, we'll try to initialize
* it to 1.
*/
bios_zero_thresh = 1;
val |= CMCI_THRESHOLD;
}

val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);

/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
* set the thresholds properly or does not work with
* this boot option. Note down now and report later.
*/
if (mce_bios_cmci_threshold && bios_zero_thresh &&
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mce_bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
pr_info_once(
"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
}
}

/*
Expand Down Expand Up @@ -249,7 +278,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
Expand Down

0 comments on commit 450cc20

Please sign in to comment.