acpi, memory-hotplug: parse SRAT before memblock is ready

On Linux, pages used by the kernel cannot be migrated.  As a result,
if a memory range is used by the kernel, it cannot be hot-removed.  So if
we want to hot-remove memory, we must prevent the kernel from using it.

The current way to prevent this is to specify a memory range with the
movablemem_map boot option and set it as ZONE_MOVABLE.

But while the system is booting, memblock allocates memory and reserves
it for the kernel.  memblock is already in use before we parse SRAT and
learn the node memory ranges, so it may allocate memory in ranges that
are to be set as ZONE_MOVABLE.  Such memory can be used by the kernel
and never be freed.

So, let's parse SRAT before memblock is first called, which is early
enough.

The first call of memblock_find_in_range_node() is in:

  setup_arch()
    |-->setup_real_mode()

so this patch adds a function early_parse_srat() to parse SRAT, and calls
it before setup_real_mode() is called.

NOTE:

1) early_parse_srat() is called before numa_init(), and has initialized
   numa_meminfo.  So DO NOT clear numa_nodes_parsed in numa_init() and DO
   NOT zero numa_meminfo in numa_init(), otherwise we will lose memory
   numa info.

2) I don't know why the original acpi_numa_init() uses the count of
   memory affinities parsed from SRAT as its return value.  So I added a
   static variable srat_mem_cnt to remember this count and use it as the
   return value of the new acpi_numa_init().

[mhocko@suse.cz: parse SRAT before memblock is ready fix]
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Wu Jianguo <wujianguo@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Tang Chen 2013-02-22 16:33:44 -08:00 committed by Linus Torvalds
parent fb06bc8e5f
commit e8d1955258
4 changed files with 34 additions and 16 deletions

View File

@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check(); setup_bios_corruption_check();
#endif #endif
/*
* In the memory hotplug case, the kernel needs info from SRAT to
* determine which memory is hotpluggable before allocating memory
* using memblock.
*/
acpi_boot_table_init();
early_acpi_boot_init();
early_parse_srat();
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
(max_pfn_mapped<<PAGE_SHIFT) - 1); (max_pfn_mapped<<PAGE_SHIFT) - 1);
@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
/* /*
* Parse the ACPI tables for possible boot-time SMP configuration. * Parse the ACPI tables for possible boot-time SMP configuration.
*/ */
acpi_boot_table_init();
early_acpi_boot_init();
initmem_init(); initmem_init();
memblock_find_dma_reserve(); memblock_find_dma_reserve();

View File

@ -560,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++) for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE); set_apicid_to_node(i, NUMA_NO_NODE);
nodes_clear(numa_nodes_parsed); /*
* Do not clear numa_nodes_parsed or zero numa_meminfo here, because
* SRAT was parsed earlier in early_parse_srat().
*/
nodes_clear(node_possible_map); nodes_clear(node_possible_map);
nodes_clear(node_online_map); nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance(); numa_reset_distance();

View File

@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
handler, max_entries); handler, max_entries);
} }
int __init acpi_numa_init(void) static int srat_mem_cnt;
{
int cnt = 0;
void __init early_parse_srat(void)
{
/* /*
* Should not limit number with cpu num that is from NR_CPUS or nr_cpus= * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
* SRAT cpu entries could have different order with that in MADT. * SRAT cpu entries could have different order with that in MADT.
@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
/* SRAT: Static Resource Affinity Table */ /* SRAT: Static Resource Affinity Table */
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
acpi_parse_x2apic_affinity, 0); acpi_parse_x2apic_affinity, 0);
acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
acpi_parse_processor_affinity, 0); acpi_parse_processor_affinity, 0);
cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity, acpi_parse_memory_affinity,
NR_NODE_MEMBLKS); NR_NODE_MEMBLKS);
} }
}
int __init acpi_numa_init(void)
{
/* SLIT: System Locality Information Table */ /* SLIT: System Locality Information Table */
acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
acpi_numa_arch_fixup(); acpi_numa_arch_fixup();
if (cnt < 0) if (srat_mem_cnt < 0)
return cnt; return srat_mem_cnt;
else if (!parsed_numa_memblks) else if (!parsed_numa_memblks)
return -ENOENT; return -ENOENT;
return 0; return 0;

View File

@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
#endif /* !CONFIG_ACPI */ #endif /* !CONFIG_ACPI */
/*
 * early_parse_srat() parses the ACPI SRAT during early boot; setup_arch()
 * calls it before memory is first allocated through memblock.  Without
 * CONFIG_ACPI_NUMA an empty inline stub is provided so that the caller
 * does not need its own #ifdef.
 */
#ifdef CONFIG_ACPI_NUMA
void __init early_parse_srat(void);
#else
static inline void early_parse_srat(void)
{
}
#endif
#ifdef CONFIG_ACPI #ifdef CONFIG_ACPI
void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
u32 pm1a_ctrl, u32 pm1b_ctrl)); u32 pm1a_ctrl, u32 pm1b_ctrl));