From 6ffac1e90a17ea0aded5c581204397421eec91b6 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 24 Jul 2008 18:07:56 -0700
Subject: [PATCH 01/32] x64, fpu: fix possible FPU leakage in error conditions

On Thu, Jul 24, 2008 at 03:43:44PM -0700, Linus Torvalds wrote:
> So how about this patch as a starting point? This is the RightThing(tm) to
> do regardless, and if it then makes it easier to do some other cleanups,
> we should do it first. What do you think?

restore_fpu_checking() calls init_fpu() in error conditions.

While this is wrong(as our main intention is to clear the fpu state of
the thread), this was benign before commit 92d140e21f1 ("x86: fix taking
DNA during 64bit sigreturn").

Post commit 92d140e21f1, live FPU registers may not belong to this
process at this error scenario.

In the error condition for restore_fpu_checking() (especially during the
64bit signal return), we are doing init_fpu(), which saves the live FPU
register state (possibly belonging to some other process context) into
the thread struct (through unlazy_fpu() in init_fpu()). This is wrong
and can leak the FPU data.

For the signal handler restore error condition in restore_i387(), clear
the fpu state present in the thread struct(before ultimately sending a
SIGSEGV for badframe).

For the paranoid error condition check in math_state_restore(), send a
SIGSEGV, if we fail to restore the state.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: <stable@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/signal_64.c | 11 ++++++++++-
 arch/x86/kernel/traps_64.c  |  9 ++++++++-
 include/asm-x86/i387.h      |  2 --
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index b45ef8ddd651..ca316b5b742c 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -104,7 +104,16 @@ static inline int restore_i387(struct _fpstate __user *buf)
 		clts();
 		task_thread_info(current)->status |= TS_USEDFPU;
 	}
-	return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
+	err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
+	if (unlikely(err)) {
+		/*
+		 * Encountered an error while doing the restore from the
+		 * user buffer, clear the fpu state.
+		 */
+		clear_fpu(tsk);
+		clear_used_math();
+	}
+	return err;
 }
 
 /*
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 3f18d73f420c..513caaca7115 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -1131,7 +1131,14 @@ asmlinkage void math_state_restore(void)
 	}
 
 	clts();				/* Allow maths ops (or we recurse) */
-	restore_fpu_checking(&me->thread.xstate->fxsave);
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+		stts();
+		force_sig(SIGSEGV, me);
+		return;
+	}
 	task_thread_info(me)->status |= TS_USEDFPU;
 	me->fpu_counter++;
 }
diff --git a/include/asm-x86/i387.h b/include/asm-x86/i387.h
index 96fa8449ff11..0048fb77afc4 100644
--- a/include/asm-x86/i387.h
+++ b/include/asm-x86/i387.h
@@ -62,8 +62,6 @@ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
 #else
 		     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
 #endif
-	if (unlikely(err))
-		init_fpu(current);
 	return err;
 }
 

From 0ed89b06e49c326bff81d81f24b9ba955eb912d5 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Wed, 13 Aug 2008 10:17:24 +0200
Subject: [PATCH 02/32] x86: propagate new nonpanic bootmem macros to
 CONFIG_HAVE_ARCH_BOOTMEM_NODE

Commit 74768ed833344b "page allocator: use no-panic variant of
alloc_bootmem() in alloc_large_system_hash()" introduced two new
_nopanic macros which are undefined for CONFIG_HAVE_ARCH_BOOTMEM_NODE.

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Acked-by: "Jan Beulich" <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/mmzone_32.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h
index b2298a227567..5862e6460658 100644
--- a/include/asm-x86/mmzone_32.h
+++ b/include/asm-x86/mmzone_32.h
@@ -97,10 +97,16 @@ static inline int pfn_valid(int pfn)
 	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
 #define alloc_bootmem(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_nopanic(x) \
+	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
+				__pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_nopanic(x) \
+	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
+				__pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x)					\

From 7b27718bdb1b70166383dec91391df5534d449ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joro@8bytes.org>
Date: Wed, 13 Aug 2008 10:07:05 +0200
Subject: [PATCH 03/32] x86: fix setup code crashes on my old 486 box

yesterday I tried to reactivate my old 486 box and wanted to install a
current Linux with latest kernel on it. But it turned out that the
latest kernel does not boot because the machine crashes early in the
setup code.

After some debugging it turned out that the problem is the query_ist()
function. If this interrupt with that function is called the machine
simply locks up. It looks like a BIOS bug. Looking for a workaround for
this problem I wrote the attached patch. It checks for the CPUID
instruction and if it is not implemented it does not call the speedstep
BIOS function. As far as I know speedstep should be available since some
Pentium earliest.

Alan Cox observed that it's available since the Pentium II, so cpuid
levels 4 and 5 can be excluded altogether.

H. Peter Anvin cleaned up the code some more:

> Right in concept, but I dislike the implementation (duplication of the
> CPU detect code we already have).  Could you try this patch and see if
> it works for you?

which, with a small modification to fix a build error with it the
resulting kernel boots on my machine.

Signed-off-by: Joerg Roedel <joro@8bytes.org>
Signed-off-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/boot.h     | 8 ++++++++
 arch/x86/boot/cpucheck.c | 8 +-------
 arch/x86/boot/main.c     | 4 ++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7cb..9d4b4b43d97a 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -25,6 +25,8 @@
 #include <asm/boot.h>
 #include <asm/setup.h>
 
+#define NCAPINTS   8
+
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
@@ -242,6 +244,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
 
 /* cpu.c, cpucheck.c */
+struct cpu_features {
+	int level;		/* Family, or 64 for x86-64 */
+	int model;
+	u32 flags[NCAPINTS];
+};
+extern struct cpu_features cpu;
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
 int validate_cpu(void);
 
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee005..c1ce0303d994 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -30,13 +30,7 @@
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
 
-struct cpu_features {
-	int level;		/* Family, or 64 for x86-64 */
-	int model;
-	u32 flags[NCAPINTS];
-};
-
-static struct cpu_features cpu;
+struct cpu_features cpu;
 static u32 cpu_vendor[3];
 static u32 err_flags[NCAPINTS];
 
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 2296164b54d2..01aa64b5575b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,10 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+	/* Some 486 BIOSes apparently crash on this call */
+	if (cpu.level < 6)
+		return;
+
 	asm("int $0x15"
 	    : "=a" (boot_params.ist_info.signature),
 	      "=b" (boot_params.ist_info.command),

From c9d08f0860d47ed6a3fe91d0f19335086179be7b Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Tue, 12 Aug 2008 23:23:03 +0200
Subject: [PATCH 04/32] x86: fix 2 section mismatch warnings - map_high()

WARNING: vmlinux.o(.text+0x14cf8): Section mismatch in reference from the function map_high() to the function .init.text:init_extra_mapping_uc()
The function map_high() references
the function __init init_extra_mapping_uc().
This is often because map_high lacks a __init
annotation or the annotation of init_extra_mapping_uc is wrong.

WARNING: vmlinux.o(.text+0x14d05): Section mismatch in reference from the function map_high() to the function .init.text:init_extra_mapping_wb()
The function map_high() references
the function __init init_extra_mapping_wb().
This is often because map_high lacks a __init
annotation or the annotation of init_extra_mapping_wb is wrong.

map_high is called only from __init functions (map_*_high)
and calls 2 __init_functions (init_extra_mapping_*)

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/genx2apic_uv_x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2cfcbded888a..2d7e307c7779 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -222,7 +222,7 @@ static __init void map_low_mmrs(void)
 
 enum map_type {map_wb, map_uc};
 
-static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
 {
 	unsigned long bytes, paddr;
 

From 6b3560229d3b6be7443fa9f9c6502e660bcfef5f Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Tue, 12 Aug 2008 23:23:05 +0200
Subject: [PATCH 05/32] x86: fix 2 section mismatch warnings -
 find_and_reserve_crashkernel

WARNING: vmlinux.o(.text+0xcd1f): Section mismatch in reference from the function find_and_reserve_crashkernel() to the function .init.text:find_e820_area()
The function find_and_reserve_crashkernel() references
the function __init find_e820_area().
This is often because find_and_reserve_crashkernel lacks a __init
annotation or the annotation of find_e820_area is wrong.

WARNING: vmlinux.o(.text+0xcd38): Section mismatch in reference from the function find_and_reserve_crashkernel() to the function .init.text:reserve_bootmem_generic()
The function find_and_reserve_crashkernel() references
the function __init reserve_bootmem_generic().
This is often because find_and_reserve_crashkernel lacks a __init
annotation or the annotation of reserve_bootmem_generic is wrong.

find_and_reserve_crashkernel is called from __init function (reserve_crashkernel)
and calls 2 __init functions (find_e820_area, reserve_bootmem_generic),
so mark it __init

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 68b48e3fbcbd..a4656adab53b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -445,7 +445,7 @@ static void __init reserve_early_setup_data(void)
  * @size: Size of the crashkernel memory to reserve.
  * Returns the base address on success, and -1ULL on failure.
  */
-unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
 {
 	const unsigned long long alignment = 16<<20; 	/* 16M */
 	unsigned long long start = 0LL;

From a726c6009e6eba4acfccf8b683854866eeabb184 Mon Sep 17 00:00:00 2001
From: John Keller <jpk@sgi.com>
Date: Tue, 29 Jul 2008 14:34:16 -0500
Subject: [PATCH 06/32] x86: allow MMCONFIG above 4GB on x86_64

SGI UV will have MMCFG base addresses that are greater than 4GB (32 bits).

v2: Use CONFIG_RESOURCES_64BIT instead of CONFIG_X86_64.
v3: Create a flag, that is set by platform specific code,
    to disable the > 4GB check.

Signed-off-by: John Keller <jpk@sgi.com>
Cc: jpk@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fa88a1d71290..bfd10fd211cd 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -97,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 #warning ACPI uses CMPXCHG, i486 and later hardware
 #endif
 
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
 /* --------------------------------------------------------------------------
                               Boot-time Configuration
    -------------------------------------------------------------------------- */
@@ -158,6 +160,14 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
 
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+	if (!strcmp(mcfg->header.oem_id, "SGI"))
+		acpi_mcfg_64bit_base_addr = TRUE;
+
+	return 0;
+}
+
 int __init acpi_parse_mcfg(struct acpi_table_header *header)
 {
 	struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
 	}
 
 	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+	acpi_mcfg_oem_check(mcfg);
+
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
+		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+		    !acpi_mcfg_64bit_base_addr) {
 			printk(KERN_ERR PREFIX
 			       "MMCONFIG not in low 4GB of memory\n");
 			kfree(pci_mmcfg_config);

From 875e40b97571e1f06d1184ad6cbb2acf9cb31a23 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 30 Jul 2008 12:26:26 -0700
Subject: [PATCH 07/32] x86: use WARN() in arch/x86/mm/pageattr.c

Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes
part of the warning section for better reporting/collection.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: akpm@linux-foundation.org
Cc: arjan@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr-test.c | 3 +--
 arch/x86/mm/pageattr.c      | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e6..d4aa503caaa2 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -221,8 +221,7 @@ static int pageattr_test(void)
 	failed += print_split(&sc);
 
 	if (failed) {
-		printk(KERN_ERR "NOT PASSED. Please report.\n");
-		WARN_ON(1);
+		WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
 		return -EINVAL;
 	} else {
 		if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf059..ba24537d69dd 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -592,10 +592,9 @@ repeat:
 	if (!pte_val(old_pte)) {
 		if (!primary)
 			return 0;
-		printk(KERN_WARNING "CPA: called for zero pte. "
+		WARN(1, KERN_WARNING "CPA: called for zero pte. "
 		       "vaddr = %lx cpa->vaddr = %lx\n", address,
 		       cpa->vaddr);
-		WARN_ON(1);
 		return -EINVAL;
 	}
 

From c2dcfde8274883e1f6050784dcbd34b01e824b91 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 13 Aug 2008 13:14:22 -0700
Subject: [PATCH 08/32] x86: cleanup for setup code crashes during IST probe

Clean up the code for crashes during SpeedStep probing on older
machines.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/boot.h     | 4 ++--
 arch/x86/boot/cpu.c      | 3 ---
 arch/x86/boot/cpucheck.c | 2 --
 arch/x86/boot/main.c     | 3 ++-
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 9d4b4b43d97a..616b804a2295 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,8 +24,8 @@
 #include <linux/edd.h>
 #include <asm/boot.h>
 #include <asm/setup.h>
-
-#define NCAPINTS   8
+#include "bitops.h"
+#include <asm/cpufeature.h>
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 92d6fd73dc7d..75298fe2edca 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
  */
 
 #include "boot.h"
-#include "bitops.h"
-#include <asm/cpufeature.h>
-
 #include "cpustr.h"
 
 static char *cpu_name(int level)
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index c1ce0303d994..4b9ae7c56748 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,10 +22,8 @@
 
 #ifdef _SETUP
 # include "boot.h"
-# include "bitops.h"
 #endif
 #include <linux/types.h>
-#include <asm/cpufeature.h>
 #include <asm/processor-flags.h>
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 01aa64b5575b..197421db1af1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,7 +73,8 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
-	/* Some 486 BIOSes apparently crash on this call */
+	/* Some older BIOSes apparently crash on this call, so filter
+	   it from machines too old to have SpeedStep at all. */
 	if (cpu.level < 6)
 		return;
 

From 23b49c19f6946cc33392a1fc75dd788dd4a90fb7 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Mon, 11 Aug 2008 14:55:31 -0700
Subject: [PATCH 09/32] x86: resurrect proper handling of maxcpus= kernel
 option (v2)

For some reason we had two parsers registered for maxcpus=. One in init/main.c
and another in arch/x86/smpboot.c. So I nuked the one in arch/x86.

Also 64-bit kernels used to handle maxcpus= as documented in
Documentation/cpu-hotplug.txt. CPUs with 'id > maxcpus' are initialized
but not booted. 32-bit version for some reason ignored them even though
all the infrastructure for booting them later is there.

In the current mainline both 64 and 32 bit versions are broken.
This patch restores the correct behaviour. I've tested x86_64 version on
4- and 8- way Core2 and 2-way Opteron based machines. Various config
combinations SMP, !SMP, CPU_HOTPLUG, !CPU_HOTPLUG.
Booted with maxcpus=1 and maxcpus=4, etc. Everything is working as expected.

So far we've received two reports from different people confirming that 32-bit
version also works fine, both on dual core laptops and 16way server machines.

[v2: This version fixes visws breakage pointed out by Ingo.]

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: lizf@cn.fujitsu.com
Cc: jeff.chua.linux@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c      |  8 --------
 arch/x86/kernel/apic_64.c      |  7 -------
 arch/x86/kernel/smpboot.c      | 14 --------------
 arch/x86/kernel/visws_quirks.c |  6 ++----
 4 files changed, 2 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 039a8d4aaf62..f88bd0d982b0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1454,8 +1454,6 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 	}
 }
 
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
-
 void __cpuinit generic_processor_info(int apicid, int version)
 {
 	int cpu;
@@ -1482,12 +1480,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		return;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
-		return;
-	}
-
 	num_processors++;
 	cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 7f1f030da7ee..446c062e831c 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -90,7 +90,6 @@ static unsigned long apic_phys;
 
 unsigned long mp_lapic_addr;
 
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
 /*
  * Get the LAPIC version
  */
@@ -1062,12 +1061,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		return;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-		       " Processor ignored.\n", maxcpus);
-		return;
-	}
-
 	num_processors++;
 	cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 91055d7fc1b0..e25287e4a85f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1386,17 +1386,3 @@ void __cpu_die(unsigned int cpu)
 	BUG();
 }
 #endif
-
-/*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
-static int __init parse_maxcpus(char *arg)
-{
-	extern unsigned int maxcpus;
-
-	if (arg)
-		maxcpus = simple_strtoul(arg, NULL, 0);
-	return 0;
-}
-early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 41e01b145c48..594ef47f0a63 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -184,8 +184,6 @@ static int __init visws_get_smp_config(unsigned int early)
 	return 1;
 }
 
-extern unsigned int __cpuinitdata maxcpus;
-
 /*
  * The Visual Workstation is Intel MP compliant in the hardware
  * sense, but it doesn't have a BIOS(-configuration table).
@@ -244,8 +242,8 @@ static int __init visws_find_smp_config(unsigned int reserve)
 		ncpus = CO_CPU_MAX;
 	}
 
-	if (ncpus > maxcpus)
-		ncpus = maxcpus;
+	if (ncpus > setup_max_cpus)
+		ncpus = setup_max_cpus;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	smp_found_config = 1;

From 858f774733b72609acb28104475f131abb912c08 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 14 Aug 2008 02:16:29 -0700
Subject: [PATCH 10/32] x86: don't call e820_regiter_active_regions if out of
 range on node

so we don't get warning on 32bit system with 64g RAM or more

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_32.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301c..16ae70fc57e7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
  * start of the node, and that the current "end" address is after
  * the previous one.
  */
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
 {
 	/*
 	 * Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 	if (memory_chunk->start_pfn >= max_pfn) {
 		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
 			memory_chunk->start_pfn, memory_chunk->end_pfn);
-		return;
+		return -1;
 	}
 	if (memory_chunk->nid != nid)
-		return;
+		return -1;
 
 	if (!node_has_online_mem(nid))
 		node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 
 	if (node_end_pfn[nid] < memory_chunk->end_pfn)
 		node_end_pfn[nid] = memory_chunk->end_pfn;
+
+	return 0;
 }
 
 int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
 		printk(KERN_DEBUG
 			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
-		node_read_chunk(chunk->nid, chunk);
+		if (node_read_chunk(chunk->nid, chunk))
+			continue;
+
 		e820_register_active_regions(chunk->nid, chunk->start_pfn,
 					     min(chunk->end_pfn, max_pfn));
 	}

From a58f03b07539f6575adaa011712fa139c9343742 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 14 Aug 2008 02:16:30 -0700
Subject: [PATCH 11/32] x86: check bigsmp in smp_sanity_check instead of cpu_up

clear bits for cpu nr > 8.

This allows us to boot the full range of possible CPUs that the
supported APIC model will allow. Previously we'd hang or boot up
with less than 8 CPUs.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Tested-by: Jeff Chua <jeff.chua.linux@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e25287e4a85f..a8fb8a980fae 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -994,17 +994,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 	flush_tlb_all();
 	low_mappings = 1;
 
-#ifdef CONFIG_X86_PC
-	if (def_to_bigsmp && apicid > 8) {
-		printk(KERN_WARNING
-			"More than 8 CPUs detected - skipping them.\n"
-			"Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
-		err = -1;
-	} else
-		err = do_boot_cpu(apicid, cpu);
-#else
 	err = do_boot_cpu(apicid, cpu);
-#endif
 
 	zap_low_mappings();
 	low_mappings = 0;
@@ -1058,6 +1048,34 @@ static __init void disable_smp(void)
 static int __init smp_sanity_check(unsigned max_cpus)
 {
 	preempt_disable();
+
+#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+	if (def_to_bigsmp && nr_cpu_ids > 8) {
+		unsigned int cpu;
+		unsigned nr;
+
+		printk(KERN_WARNING
+		       "More than 8 CPUs detected - skipping them.\n"
+		       "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
+
+		nr = 0;
+		for_each_present_cpu(cpu) {
+			if (nr >= 8)
+				cpu_clear(cpu, cpu_present_map);
+			nr++;
+		}
+
+		nr = 0;
+		for_each_possible_cpu(cpu) {
+			if (nr >= 8)
+				cpu_clear(cpu, cpu_possible_map);
+			nr++;
+		}
+
+		nr_cpu_ids = 8;
+	}
+#endif
+
 	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 		printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
 				    "by the BIOS.\n", hard_smp_processor_id());

From a6825f1c1fa83b1e92b6715ee5771a4d6524d3b9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Aug 2008 12:17:06 +0200
Subject: [PATCH 12/32] x86: hpet: workaround SB700 BIOS

AMD SB700 based systems with spread spectrum enabled use a SMM based
HPET emulation to provide proper frequency setting. The SMM code is
initialized with the first HPET register access and takes some time to
complete. During this time the config register reads 0xffffffff. We
check for max. 1000 loops whether the config register reads a non
0xffffffff value to make sure that HPET is up and running before we go
further. A counting loop is safe, as the HPET access takes thousands
of CPU cycles. On non SB700 based machines this check is only done
once and has no side effects.

Based on a quirk patch from: crane cai <crane.cai@amd.com>

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad2b15a1334d..59fd3b6b1303 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -359,6 +359,7 @@ static int hpet_clocksource_register(void)
 int __init hpet_enable(void)
 {
 	unsigned long id;
+	int i;
 
 	if (!is_hpet_capable())
 		return 0;
@@ -369,6 +370,29 @@ int __init hpet_enable(void)
 	 * Read the period and check for a sane value:
 	 */
 	hpet_period = hpet_readl(HPET_PERIOD);
+
+	/*
+	 * AMD SB700 based systems with spread spectrum enabled use a
+	 * SMM based HPET emulation to provide proper frequency
+	 * setting. The SMM code is initialized with the first HPET
+	 * register access and takes some time to complete. During
+	 * this time the config register reads 0xffffffff. We check
+	 * for max. 1000 loops whether the config register reads a non
+	 * 0xffffffff value to make sure that HPET is up and running
+	 * before we go further. A counting loop is safe, as the HPET
+	 * access takes thousands of CPU cycles. On non SB700 based
+	 * machines this check is only done once and has no side
+	 * effects.
+	 */
+	for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+		if (i == 1000) {
+			printk(KERN_WARNING
+			       "HPET config register value = 0xFFFFFFFF. "
+			       "Disabling HPET\n");
+			goto out_nohpet;
+		}
+	}
+
 	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
 		goto out_nohpet;
 

From 967060d00d7ab8e992963a966cd3d18156c02d55 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Thu, 14 Aug 2008 15:43:33 -0700
Subject: [PATCH 13/32] x86, msr: fix NULL pointer deref due to msr_open on
 nonexistent CPUs

msr_open tests for someone trying to open a device for a nonexistent CPU.
However, the function always returns 0, not ret like it should, hence
userspace can BUG the kernel trivially.  This bug was introduced by the
cdev lock_kernel pushdown patch last May.

The BUG can be reproduced with these commands:

# mknod fubar c 202 8 <-- pick a number less than NR_CPUS that is not
                          the number of an online CPU
# cat fubar

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/msr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9fd809552447..e43938086885 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -131,7 +131,7 @@ static int msr_open(struct inode *inode, struct file *file)
 		ret = -EIO;	/* MSR not supported */
 out:
 	unlock_kernel();
-	return 0;
+	return ret;
 }
 
 /*

From ef31023743e66de7184e9aad432291c842a6384b Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 14 Aug 2008 15:07:03 -0400
Subject: [PATCH 14/32] x86: silence mmconfig printk

There's so much broken mmconfig hardware/bios'es out there,
that classing this as an error seems a little extreme.
Lower its priority to KERN_INFO so that it isn't so noisy
when booting with 'quiet'

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/pci/mmconfig-shared.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 23faaa890ffc..2bd5c53f6386 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -365,7 +365,7 @@ static void __init pci_mmcfg_reject_broken(int early)
 	return;
 
 reject:
-	printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+	printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
 	pci_mmcfg_arch_free();
 	kfree(pci_mmcfg_config);
 	pci_mmcfg_config = NULL;

From 519c31bacf78a969efa8d2e55ed8862848f28590 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 14 Aug 2008 19:55:15 +0200
Subject: [PATCH 15/32] x86, AMD IOMMU: use status bit instead of memory
 write-back for completion wait

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c       | 17 ++++++++++-------
 include/asm-x86/amd_iommu_types.h |  4 ++++
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 22d7d050905d..028e945c68ad 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -101,16 +101,13 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
-	int ret;
+	int ret, ready = 0;
+	unsigned status = 0;
 	struct iommu_cmd cmd;
-	volatile u64 ready = 0;
-	unsigned long ready_phys = virt_to_phys(&ready);
 	unsigned long i = 0;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd.data[1] = upper_32_bits(ready_phys);
-	cmd.data[2] = 1; /* value written to 'ready' */
+	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
 	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 
 	iommu->need_sync = 0;
@@ -122,9 +119,15 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 
 	while (!ready && (i < EXIT_LOOP_COUNT)) {
 		++i;
-		cpu_relax();
+		/* wait for the bit to become one */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
 	}
 
+	/* set bit back to zero */
+	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
 	if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
 		printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
 
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index 22aa58ca1991..32543229db76 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -69,6 +69,9 @@
 #define MMIO_EVT_TAIL_OFFSET	0x2018
 #define MMIO_STATUS_OFFSET	0x2020
 
+/* MMIO status bits */
+#define MMIO_STATUS_COM_WAIT_INT_MASK	0x04
+
 /* feature control bits */
 #define CONTROL_IOMMU_EN        0x00ULL
 #define CONTROL_HT_TUN_EN       0x01ULL
@@ -89,6 +92,7 @@
 #define CMD_INV_IOMMU_PAGES     0x03
 
 #define CMD_COMPL_WAIT_STORE_MASK	0x01
+#define CMD_COMPL_WAIT_INT_MASK		0x02
 #define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
 #define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
 

From 9f5f5fb35d2934fe7dc0cb019854a030efd10cd7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 14 Aug 2008 19:55:16 +0200
Subject: [PATCH 16/32] x86, AMD IOMMU: initialize device table properly

This patch adds device table initializations which forbids memory accesses
for devices per default and disables all page faults.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c  | 18 ++++++++++++++++++
 include/asm-x86/amd_iommu_types.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index d9a9da597e79..ceba33811537 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -800,6 +800,21 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
 	return 0;
 }
 
+/*
+ * Init the device table to not allow DMA access for devices and
+ * suppress all page faults
+ */
+static void init_device_table(void)
+{
+	u16 devid;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+		set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
+	}
+}
+
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -931,6 +946,9 @@ int __init amd_iommu_init(void)
 	if (amd_iommu_pd_alloc_bitmap == NULL)
 		goto free;
 
+	/* init the device table */
+	init_device_table();
+
 	/*
 	 * let all alias entries point to itself
 	 */
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index 32543229db76..f0beca73e364 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -103,6 +103,7 @@
 #define DEV_ENTRY_TRANSLATION   0x01
 #define DEV_ENTRY_IR            0x3d
 #define DEV_ENTRY_IW            0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT	0x62
 #define DEV_ENTRY_EX            0x67
 #define DEV_ENTRY_SYSMGT1       0x68
 #define DEV_ENTRY_SYSMGT2       0x69

From 8a456695c5020d6317f9c7af190999e9414b0d3e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 14 Aug 2008 19:55:17 +0200
Subject: [PATCH 17/32] x86m AMD IOMMU: cleanup: replace LOW_U32 macro with
 generic lower_32_bits

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c       | 2 +-
 include/asm-x86/amd_iommu_types.h | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 028e945c68ad..de39e1f2ede5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -164,7 +164,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 	address &= PAGE_MASK;
 	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
 	cmd.data[1] |= domid;
-	cmd.data[2] = LOW_U32(address);
+	cmd.data[2] = lower_32_bits(address);
 	cmd.data[3] = upper_32_bits(address);
 	if (s) /* size bit - we flush more than one 4kb page */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index f0beca73e364..dcc812067394 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -31,9 +31,6 @@
 #define ALIAS_TABLE_ENTRY_SIZE		2
 #define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
 
-/* helper macros */
-#define LOW_U32(x) ((x) & ((1ULL << 32)-1))
-
 /* Length of the MMIO region for the AMD IOMMU */
 #define MMIO_REGION_LENGTH       0x4000
 

From 129d6aba444d1e99d4cbfb9866a4652912426b65 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 14 Aug 2008 19:55:18 +0200
Subject: [PATCH 18/32] x86, AMD IOMMU: initialize dma_ops after sysfs
 registration

If sysfs registration fails all memory used by IOMMU is freed. This
happens after dma_ops initialization and the functions will access the
freed memory then.

Fix this by initializing dma_ops after the sysfs registration.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ceba33811537..a69cc0f52042 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -972,10 +972,6 @@ int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;
 
-	ret = amd_iommu_init_dma_ops();
-	if (ret)
-		goto free;
-
 	ret = sysdev_class_register(&amd_iommu_sysdev_class);
 	if (ret)
 		goto free;
@@ -984,6 +980,10 @@ int __init amd_iommu_init(void)
 	if (ret)
 		goto free;
 
+	ret = amd_iommu_init_dma_ops();
+	if (ret)
+		goto free;
+
 	enable_iommus();
 
 	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",

From dcc984166870150709f0c645b521a47becd9a047 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu, 14 Aug 2008 16:32:15 -0400
Subject: [PATCH 19/32] x86, perfctr: don't use CCCR_OVF_PMI1 on Pentium 4Ds

Currently, setup_p4_watchdog() use CCCR_OVF_PMI1 to enable the counter
overflow interrupts to the second logical core. But this bit doesn't work
on Pentium 4 Ds (model 4, stepping 4) and this patch avoids its use on
these processors. Tested on 4 different machines that have this
specific model with success.

Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Cc: jvillalovos@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perfctr-watchdog.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index de7439f82b92..05cc22dbd4ff 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -478,7 +478,13 @@ static int setup_p4_watchdog(unsigned nmi_hz)
 		perfctr_msr = MSR_P4_IQ_PERFCTR1;
 		evntsel_msr = MSR_P4_CRU_ESCR0;
 		cccr_msr = MSR_P4_IQ_CCCR1;
-		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+
+		/* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
+		if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
+			cccr_val = P4_CCCR_OVF_PMI0;
+		else
+			cccr_val = P4_CCCR_OVF_PMI1;
+		cccr_val |= P4_CCCR_ESCR_SELECT(4);
 	}
 
 	evntsel = P4_ESCR_EVENT_SELECT(0x3F)

From 394a15051c33f2b18e72f42283b36a9388fa414b Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 14 Aug 2008 09:11:26 -0500
Subject: [PATCH 20/32] x86: invalidate caches before going into suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a CPU core is shut down, all of its caches need to be flushed
to prevent stale data from causing errors if the core is resumed.
Current Linux suspend code performs an assignment after the flush,
which can add dirty data back to the cache.  On some AMD platforms,
additional speculative reads have caused crashes on resume because
of this dirty data.

Relocate the cache flush to be the very last thing done before
halting.  Tie into an assembly line so the compile will not
reorder it.  Add some documentation explaining what is going
on and why we're doing this.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Acked-by: Mark Borden <mark.borden@amd.com>
Acked-by: Michael Hohmuth <michael.hohmuth@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c |  5 ++---
 arch/x86/kernel/process_64.c |  5 ++---
 include/asm-x86/processor.h  | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 53bc653ed5ca..3b7a1ddcc0bc 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -95,7 +95,6 @@ static inline void play_dead(void)
 {
 	/* This must be done before dead CPU ack */
 	cpu_exit_clear();
-	wbinvd();
 	mb();
 	/* Ack it */
 	__get_cpu_var(cpu_state) = CPU_DEAD;
@@ -104,8 +103,8 @@ static inline void play_dead(void)
 	 * With physical CPU hotplug, we should halt the cpu
 	 */
 	local_irq_disable();
-	while (1)
-		halt();
+	/* mask all interrupts, flush any and all caches, and halt */
+	wbinvd_halt();
 }
 #else
 static inline void play_dead(void)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3fb62a7d9a16..71553b664e2a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,14 +93,13 @@ DECLARE_PER_CPU(int, cpu_state);
 static inline void play_dead(void)
 {
 	idle_task_exit();
-	wbinvd();
 	mb();
 	/* Ack it */
 	__get_cpu_var(cpu_state) = CPU_DEAD;
 
 	local_irq_disable();
-	while (1)
-		halt();
+	/* mask all interrupts, flush any and all caches, and halt */
+	wbinvd_halt();
 }
 #else
 static inline void play_dead(void)
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 5f58da401b43..4df3e2f6fb56 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -728,6 +728,29 @@ extern unsigned long		boot_option_idle_override;
 extern unsigned long		idle_halt;
 extern unsigned long		idle_nomwait;
 
+/*
+ * on systems with caches, caches must be flashed as the absolute
+ * last instruction before going into a suspended halt.  Otherwise,
+ * dirty data can linger in the cache and become stale on resume,
+ * leading to strange errors.
+ *
+ * perform a variety of operations to guarantee that the compiler
+ * will not reorder instructions.  wbinvd itself is serializing
+ * so the processor will not reorder.
+ *
+ * Systems without cache can just go into halt.
+ */
+static inline void wbinvd_halt(void)
+{
+	mb();
+	/* check for clflush to determine if wbinvd is legal */
+	if (cpu_has_clflush)
+		asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
+	else
+		while (1)
+			halt();
+}
+
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 

From 04b69447f79eade34e92f3117a39e8fa6ecb519b Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Thu, 14 Aug 2008 17:16:50 +0200
Subject: [PATCH 21/32] arch/x86/Kconfig: clean up, experimental adjustement

Adjust experimental tags in Kconfig, update config to notice that
i386/x86_64 is now single architecture.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac2fb0641a04..68d91c8233f4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -951,9 +951,9 @@ config NUMA
 	  local memory controller of the CPU and add some more
 	  NUMA awareness to the kernel.
 
-	  For i386 this is currently highly experimental and should be only
+	  For 32-bit this is currently highly experimental and should be only
 	  used for kernel development. It might also cause boot failures.
-	  For x86_64 this is recommended on all multiprocessor Opteron systems.
+	  For 64-bit this is recommended on all multiprocessor Opteron systems.
 	  If the system is EM64T, you should say N unless your system is
 	  EM64T NUMA.
 
@@ -1263,7 +1263,7 @@ config KEXEC
 	  strongly in flux, so no good recommendation can be made.
 
 config CRASH_DUMP
-	bool "kernel crash dumps (EXPERIMENTAL)"
+	bool "kernel crash dumps"
 	depends on X86_64 || (X86_32 && HIGHMEM)
 	help
 	  Generate crash dump after being started by kexec.

From 1c5b0eb66d74683e2be5da0c53e33c1f4ca982fd Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Wed, 13 Aug 2008 21:07:07 +0200
Subject: [PATCH 22/32] x86: fix readb() et al compile error with gcc-3.2.3

Building 2.6.27-rc1 on x86 with gcc-3.2.3 fails with:

In file included from include/asm/dma.h:12,
                 from include/linux/bootmem.h:8,
                 from init/main.c:26:
include/asm/io.h: In function `readb':
include/asm/io.h:32: syntax error before string constant
include/asm/io.h: In function `readw':
include/asm/io.h:33: syntax error before string constant
include/asm/io.h: In function `readl':
include/asm/io.h:34: syntax error before string constant
include/asm/io.h: In function `__readb':
include/asm/io.h:36: syntax error before string constant
include/asm/io.h: In function `__readw':
include/asm/io.h:37: syntax error before string constant
include/asm/io.h: In function `__readl':
include/asm/io.h:38: syntax error before string constant
make[1]: *** [init/main.o] Error 1
make: *** [init] Error 2

Starting with 2.6.27-rc1 readb() et al are generated by a
build_mmio_read() macro, which generates asm() statements with
output register constraints like "=" "q", i.e. as two adjacent
string literals. This doesn't work with gcc-3.2.3.

Fixed by moving the "=" part into the callers' reg parameter
(as suggested by Ingo).

Build and boot-tested with gcc-3.2.3 on 32 and 64-bit x86.

Fixes <http://bugzilla.kernel.org/show_bug.cgi?id=11205>.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/io.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h
index bf5d629b3a39..0f954dc89cb3 100644
--- a/include/asm-x86/io.h
+++ b/include/asm-x86/io.h
@@ -21,7 +21,7 @@ extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
-{ type ret; asm volatile("mov" size " %1,%0":"=" reg (ret) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
 :"m" (*(volatile type __force *)addr) barrier); return ret; }
 
 #define build_mmio_write(name, size, type, reg, barrier) \
@@ -29,13 +29,13 @@ static inline void name(type val, volatile void __iomem *addr) \
 { asm volatile("mov" size " %0,%1": :reg (val), \
 "m" (*(volatile type __force *)addr) barrier); }
 
-build_mmio_read(readb, "b", unsigned char, "q", :"memory")
-build_mmio_read(readw, "w", unsigned short, "r", :"memory")
-build_mmio_read(readl, "l", unsigned int, "r", :"memory")
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
 
-build_mmio_read(__readb, "b", unsigned char, "q", )
-build_mmio_read(__readw, "w", unsigned short, "r", )
-build_mmio_read(__readl, "l", unsigned int, "r", )
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
 
 build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
 build_mmio_write(writew, "w", unsigned short, "r", :"memory")
@@ -59,8 +59,8 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 #define mmiowb() barrier()
 
 #ifdef CONFIG_X86_64
-build_mmio_read(readq, "q", unsigned long, "r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "r", )
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
 build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
 build_mmio_write(__writeq, "q", unsigned long, "r", )
 

From a06de63000b95e1ed1c6373a72376876c952608e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 15 Aug 2008 13:58:32 +0100
Subject: [PATCH 23/32] x86: fix /proc/meminfo DirectMap

Do we actually want these DirectMap lines in the x86 /proc/meminfo?
I can see they're interesting to CPA developers and TLB optimizers,
but they don't fit its usual "where has all my memory gone?" usage.
If they are to stay, here are some fixes.

1. On x86_32 without PAE, they're not 2M but 4M pages: no need to
   mess with the internal enum, but show the right name to users.

2. Many machines can never show anything but 0 for DirectMap1G,
   so suppress that line unless direct_gbpages are really enabled.

3. The unit in /proc/meminfo is kB not number of pages: HugePages
   messed that up, but they're an example to regret not to follow.

4. Once we use kB, it's easy to see that 1GB has gone missing (which
   explains why CONFIG_CPA_DEBUG=y soon wraps DirectMap2M negative):
   because head_64.S's level2_ident_pgt entries were not counted.
   My fix is not ideal, but works for more and for less than 1G,
   and avoids interfering with early bootup pagetable contortions.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c  |  6 +++++-
 arch/x86/mm/pageattr.c | 18 ++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 129618ca0ea2..b3e6c3075acc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages __meminitdata
+int direct_gbpages
 #ifdef CONFIG_DIRECT_GBPAGES
 				= 1
 #endif
@@ -314,6 +314,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
+	unsigned long start = address;
 
 	int i = pmd_index(address);
 
@@ -334,6 +335,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			if (!pmd_large(*pmd))
 				last_map_addr = phys_pte_update(pmd, address,
 								 end);
+			/* Count entries we're using from level2_ident_pgt */
+			if (start == 0)
+				pages++;
 			continue;
 		}
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index ba24537d69dd..f5f5154ea11e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -55,13 +55,19 @@ static void split_page_count(int level)
 
 int arch_report_meminfo(char *page)
 {
-	int n = sprintf(page, "DirectMap4k:  %8lu\n"
-			"DirectMap2M:  %8lu\n",
-			direct_pages_count[PG_LEVEL_4K],
-			direct_pages_count[PG_LEVEL_2M]);
+	int n = sprintf(page, "DirectMap4k:  %8lu kB\n",
+			direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	n += sprintf(page + n, "DirectMap2M:  %8lu kB\n",
+			direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+	n += sprintf(page + n, "DirectMap4M:  %8lu kB\n",
+			direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
 #ifdef CONFIG_X86_64
-	n += sprintf(page + n, "DirectMap1G:  %8lu\n",
-		     direct_pages_count[PG_LEVEL_1G]);
+	if (direct_gbpages)
+		n += sprintf(page + n, "DirectMap1G:  %8lu kB\n",
+			direct_pages_count[PG_LEVEL_1G] << 20);
 #endif
 	return n;
 }

From 15636668449d4135ac77a79715ba430a81aed16d Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <arozansk@redhat.com>
Date: Fri, 15 Aug 2008 08:36:14 -0400
Subject: [PATCH 24/32] x86, NMI: fix watchdog failure message

> it just won't work at boot time - the second logic unit will be stuck:
>
> Booting processor 1/2 APIC 0x1
> Initializing CPU#1
> Calibrating delay using timer specific routine.. 5586.12 BogoMIPS (lpj=2793063)
> CPU: Trace cache: 12K uops, L1 D cache: 16K
> CPU: L2 cache: 1024K
> CPU: Physical Processor ID: 0
> CPU: Processor Core ID: 1
> CPU1: Thermal monitoring enabled (TM1)
>               Intel(R) Pentium(R) D CPU 2.80GHz stepping 04
> Brought up 2 CPUs
> testing NMI watchdog ... <4>WARNING: CPU#1: NMI appears to be stuck (0->0)!

while at it... - fix that newline

Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Cc: jvillalo@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ac6d51222e7d..919473ad4a29 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -142,11 +142,15 @@ int __init check_nmi_watchdog(void)
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
 		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+			printk("\n");
 			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
 				"appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
 				get_nmi_count(cpu));
+			printk(KERN_WARNING "Please report this to "
+			       "linux-kernel@vger.kernel.org and attach "
+			       "the output of 'dmesg' command.\n");
 			per_cpu(wd_enabled, cpu) = 0;
 			atomic_dec(&nmi_active);
 		}

From 8bb851900f5d0a79d3fddac808cc670d9894ef67 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 Aug 2008 15:34:32 +0200
Subject: [PATCH 25/32] x86, nmi: clean UP NMI watchdog failure message

clean up the failure message - and redirect people to bugzilla
instead of lkml.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 919473ad4a29..abb78a2cc4ad 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
+static void report_broken_nmi(int cpu, int *prev_nmi_count)
+{
+	printk(KERN_CONT "\n");
+
+	printk(KERN_WARNING
+		"WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
+
+	printk(KERN_WARNING
+		"Please report this to bugzilla.kernel.org,\n");
+	printk(KERN_WARNING
+		"and attach the output of the 'dmesg' command.\n");
+
+	per_cpu(wd_enabled, cpu) = 0;
+	atomic_dec(&nmi_active);
+}
+
 int __init check_nmi_watchdog(void)
 {
 	unsigned int *prev_nmi_count;
@@ -141,19 +158,8 @@ int __init check_nmi_watchdog(void)
 	for_each_online_cpu(cpu) {
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
-		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			printk("\n");
-			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
-				"appears to be stuck (%d->%d)!\n",
-				cpu,
-				prev_nmi_count[cpu],
-				get_nmi_count(cpu));
-			printk(KERN_WARNING "Please report this to "
-			       "linux-kernel@vger.kernel.org and attach "
-			       "the output of 'dmesg' command.\n");
-			per_cpu(wd_enabled, cpu) = 0;
-			atomic_dec(&nmi_active);
-		}
+		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
+			report_broken_nmi(cpu, prev_nmi_count);
 	}
 	endflag = 1;
 	if (!atomic_read(&nmi_active)) {

From 7bc069c6bc4ede519a7116be1b9e149a1dbf787a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 4 Aug 2008 14:38:54 +0100
Subject: [PATCH 26/32] x86: fix spin_is_contended()

The masked difference is what needs to be compared against 1, rather
than the difference of masked values (which can be negative).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/spinlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index 4f9a9861799a..e39c790dbfd2 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -65,7 +65,7 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
-	return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
+	return (((tmp >> 8) - tmp) & 0xff) > 1;
 }
 
 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
@@ -127,7 +127,7 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
 
-	return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
+	return (((tmp >> 16) - tmp) & 0xffff) > 1;
 }
 
 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)

From 9744f5a32853642f8ed0749a1c9ed8cf9c9c9dc4 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sun, 3 Aug 2008 19:25:48 +0200
Subject: [PATCH 27/32] x86, acpi: cleanup, temp_stack is used only when
 CONFIG_SMP is set

fix:

  arch/x86/kernel/acpi/sleep.c:24: warning: 'temp_stack' defined but not used

[ Sven Wegener <sven.wegener@stealer.net>: fix build bug ]

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/sleep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fa2161d5003b..81e5ab6542d8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags;
 /* address in low memory of the wakeup routine. */
 static unsigned long acpi_realmode;
 
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
 static char temp_stack[10240];
 #endif
 

From 0d5cdc97e242a5589e5dca23277675f4b4482490 Mon Sep 17 00:00:00 2001
From: Jens Rottmann <JRottmann@LiPPERTEmbedded.de>
Date: Mon, 4 Aug 2008 14:40:16 +0200
Subject: [PATCH 28/32] x86, geode-mfgpt: check IRQ before using MFGPT as
 clocksource

Adds a simple IRQ autodetection to the AMD Geode MFGPT driver, and more
importantly, adds some checks, if IRQs can actually be received on the
chosen line.  This fixes cases where MFGPT is selected as clocksource
though not producing any ticks, so the kernel simply starves during
boot.

Signed-off-by: Jens Rottmann <JRottmann@LiPPERTEmbedded.de>
Cc: Andres Salomon <dilinger@debian.org>
Cc: linux-geode@bombadil.infradead.org
Cc: Jordan Crouse <jordan.crouse@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mfgpt_32.c | 52 +++++++++++++++++++++++++++-----------
 include/asm-x86/geode.h    |  3 ++-
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
 #include <linux/module.h>
 #include <asm/geode.h>
 
+#define MFGPT_DEFAULT_IRQ	7
+
 static struct mfgpt_timer_t {
 	unsigned int avail:1;
 } mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
 }
 EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
 
-int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
+int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
 {
-	u32 val, dummy;
-	int offset;
+	u32 zsel, lpc, dummy;
+	int shift;
 
 	if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
 		return -EIO;
 
-	if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+	/*
+	 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
+	 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
+	 * 2, and we mustn't use nor change it.
+	 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
+	 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
+	 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
+	 */
+	rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
+	shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
+	if (((zsel >> shift) & 0xF) == 2)
 		return -EIO;
 
-	rdmsr(MSR_PIC_ZSEL_LOW, val, dummy);
+	/* Choose IRQ: if none supplied, keep IRQ already set or use default */
+	if (!*irq)
+		*irq = (zsel >> shift) & 0xF;
+	if (!*irq)
+		*irq = MFGPT_DEFAULT_IRQ;
 
-	offset = (timer % 4) * 4;
-
-	val &= ~((0xF << offset) | (0xF << (offset + 16)));
+	/* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
+	if (*irq < 1 || *irq == 2 || *irq > 15)
+		return -EIO;
+	rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
+	if (lpc & (1 << *irq))
+		return -EIO;
 
+	/* All chosen and checked - go for it */
+	if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+		return -EIO;
 	if (enable) {
-		val |= (irq & 0x0F) << (offset);
-		val |= (irq & 0x0F) << (offset + 16);
+		zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
+		wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
 	}
 
-	wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
 	return 0;
 }
 
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
 static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
 static u16 mfgpt_event_clock;
 
-static int irq = 7;
+static int irq;
 static int __init mfgpt_setup(char *str)
 {
 	get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
 	mfgpt_event_clock = timer;
 
 	/* Set up the IRQ on the MFGPT side */
-	if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) {
+	if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
 		printk(KERN_ERR "mfgpt-timer:  Could not set up IRQ %d\n", irq);
 		return -EIO;
 	}
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
 			&mfgpt_clockevent);
 
 	printk(KERN_INFO
-	       "mfgpt-timer:  registering the MFGPT timer as a clock event.\n");
+	       "mfgpt-timer:  Registering MFGPT timer %d as a clock event, using IRQ %d\n",
+	       timer, irq);
 	clockevents_register_device(&mfgpt_clockevent);
 
 	return 0;
 
 err:
-	geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq);
+	geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
 	printk(KERN_ERR
 	       "mfgpt-timer:  Unable to set up the MFGPT clock source\n");
 	return -EIO;
diff --git a/include/asm-x86/geode.h b/include/asm-x86/geode.h
index bb06027fc83e..2c1cda0b8a86 100644
--- a/include/asm-x86/geode.h
+++ b/include/asm-x86/geode.h
@@ -50,6 +50,7 @@ extern int geode_get_dev_base(unsigned int dev);
 #define MSR_PIC_YSEL_HIGH	0x51400021
 #define MSR_PIC_ZSEL_LOW	0x51400022
 #define MSR_PIC_ZSEL_HIGH	0x51400023
+#define MSR_PIC_IRQM_LPC	0x51400025
 
 #define MSR_MFGPT_IRQ		0x51400028
 #define MSR_MFGPT_NR		0x51400029
@@ -237,7 +238,7 @@ static inline u16 geode_mfgpt_read(int timer, u16 reg)
 }
 
 extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
-extern int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable);
+extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
 extern int geode_mfgpt_alloc_timer(int timer, int domain);
 
 #define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)

From 66d4bdf22b8652cda215e2653c8bbec7a767ed57 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 31 Jul 2008 16:48:31 +0100
Subject: [PATCH 29/32] x86-64: fix overlap of modules and fixmap areas

Plus add a build time check so this doesn't go unnoticed again.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head64.c     | 1 +
 include/asm-x86/pgtable_64.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1b318e903bf6..9bfc4d72fb2e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -88,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
 	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 				(__START_KERNEL & PGDIR_MASK)));
+	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index ac5fff4cc58a..549144d03d99 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -151,7 +151,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
 #define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
 #define VMEMMAP_START	 _AC(0xffffe20000000000, UL)
 #define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
-#define MODULES_END      _AC(0xfffffffffff00000, UL)
+#define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
 #ifndef __ASSEMBLY__

From fc0091b3c86396afc8e6c273aff21671cf882ee1 Mon Sep 17 00:00:00 2001
From: Alex Nixon <alex.nixon@citrix.com>
Date: Fri, 15 Aug 2008 17:21:14 +0100
Subject: [PATCH 30/32] x86: change init_gdt to update the gdt via write_gdt,
 rather than a direct write.

By writing directly, a memory access violation can occur whilst
hotplugging a CPU if the entry was previously marked read-only.

Signed-off-by: Alex Nixon <alex.nixon@citrix.com>
Cc: Jeremy Fitzhardinge <Jeremy.Fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpcommon.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
 DEFINE_PER_CPU(unsigned long, this_cpu_off);
 EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
-   (still using the master per-cpu area), or a CPU doing it for a
-   secondary which will soon come up. */
+/*
+ * Initialize the CPU's GDT.  This is either the boot CPU doing itself
+ * (still using the master per-cpu area), or a CPU doing it for a
+ * secondary which will soon come up.
+ */
 __cpuinit void init_gdt(int cpu)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	struct desc_struct gdt;
 
-	pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
-			__per_cpu_offset[cpu], 0xFFFFF,
+	pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
 			0x2 | DESCTYPE_S, 0x8);
+	gdt.s = 1;
 
-	gdt[GDT_ENTRY_PERCPU].s = 1;
+	write_gdt_entry(get_cpu_gdt_table(cpu),
+			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
 
 	per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
 	per_cpu(cpu_number, cpu) = cpu;

From 8d6ea9674cb12b90c800dc572214bf06f6ce8340 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Fri, 15 Aug 2008 18:32:24 +0200
Subject: [PATCH 31/32] x86: fix section mismatch warning - spp_getpage()

WARNING: vmlinux.o(.text+0x17a3e): Section mismatch in reference from the function set_pte_vaddr_pud() to the function .init.text:spp_getpage()
The function set_pte_vaddr_pud() references
the function __init spp_getpage().
This is often because set_pte_vaddr_pud lacks a __init
annotation or the annotation of spp_getpage is wrong.

spp_getpage is called from __init (__init_extra_mapping) and
non __init (set_pte_vaddr_pud) functions, so it can't be __init.
Unfortunately it calls alloc_bootmem_pages which is __init,
but does it only when bootmem allocator is available (after_bootmem == 0).

So annotate it accordingly.

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/init_64.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b3e6c3075acc..a87ea0e4b3dc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,7 +88,11 @@ early_param("gbpages", parse_direct_gbpages_on);
 
 int after_bootmem;
 
-static __init void *spp_getpage(void)
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
 {
 	void *ptr;
 

From 2fdc86901d2ab30a12402b46238951d2a7891590 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 Aug 2008 18:02:18 +0200
Subject: [PATCH 32/32] x86: add MAP_STACK mmap flag

as per this discussion:

   http://lkml.org/lkml/2008/8/12/423

Pardo reported that 64-bit threaded apps, if their stacks exceed the
combined size of ~4GB, slow down drastically in pthread_create() - because
glibc uses MAP_32BIT to allocate the stacks. The use of MAP_32BIT is
a legacy hack - to speed up context switching on certain early model
64-bit P4 CPUs.

So introduce a new flag to be used by glibc instead, to not constrain
64-bit apps like this.

glibc can switch to this new flag straight away - it will be ignored
by the kernel. If those old CPUs ever matter to anyone, support for
it can be implemented.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Ulrich Drepper <drepper@gmail.com>
---
 include/asm-x86/mman.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/asm-x86/mman.h b/include/asm-x86/mman.h
index c1682b542daf..90bc4108a4fd 100644
--- a/include/asm-x86/mman.h
+++ b/include/asm-x86/mman.h
@@ -12,6 +12,7 @@
 #define MAP_NORESERVE	0x4000		/* don't check for reservations */
 #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
+#define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 
 #define MCL_CURRENT	1		/* lock all current mappings */
 #define MCL_FUTURE	2		/* lock all future mappings */