From c729de8fcea37a1c444e81857eace12494c804a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:45 -0700 Subject: [PATCH 1/4] x86, kdump: Set crashkernel_low automatically Chao said that kdump does does work well on his system on 3.8 without extra parameter, even iommu does not work with kdump. And now have to append crashkernel_low=Y in first kernel to make kdump work. We have now modified crashkernel=X to allocate memory beyong 4G (if available) and do not allocate low range for crashkernel if the user does not specify that with crashkernel_low=Y. This causes regression if iommu is not enabled. Without iommu, swiotlb needs to be setup in first 4G and there is no low memory available to second kernel. Set crashkernel_low automatically if the user does not specify that. For system that does support IOMMU with kdump properly, user could specify crashkernel_low=0 to save that 72M low ram. -v3: add swiotlb_size() according to Konrad. -v4: add comments what 8M is for according to hpa. also update more crashkernel_low= in kernel-parameters.txt -v5: update changelog according to Vivek. -v6: Change description about swiotlb referring according to HATAYAMA. Reported-by: WANG Chao Tested-by: WANG Chao Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-2-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 14 +++++++++++--- arch/x86/kernel/setup.c | 21 ++++++++++++++++++--- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 19 +++++++++++++++---- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..cff672da2486 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -596,9 +596,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is selected automatically. Check Documentation/kdump/kdump.txt for further details. - crashkernel_low=size[KMG] - [KNL, x86] parts under 4G. - crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is @@ -606,6 +603,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_low=size[KMG] + [KNL, x86_64] range under 4G. When crashkernel= is + passed, kernel allocate physical memory region + above 4G, that cause second kernel crash on system + that require some amount of low memory, e.g. swiotlb + requires at least 64M+32K low memory. Kernel would + try to allocate 72M below 4G automatically. + This one let user to specify own low range under 4G + for second kernel instead. + 0: to disable low allocation. + cs89x0_dma= [HW,NET] Format: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 90d8cc930f5e..12349202cae7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -521,19 +521,34 @@ static void __init reserve_crashkernel_low(void) unsigned long long low_base = 0, low_size = 0; unsigned long total_low_mem; unsigned long long base; + bool auto_set = false; int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret != 0 || low_size <= 0) - return; + if (ret != 0) { + /* + * two parts from lib/swiotlb.c: + * swiotlb size: user specified with swiotlb= or default. + * swiotlb overflow buffer: now is hardcoded to 32k. + * We round it to 8M for other buffers that + * may need to stay low too. + */ + low_size = swiotlb_size_or_default() + (8UL<<20); + auto_set = true; + } else { + /* passed with crashkernel_low=0 ? */ + if (!low_size) + return; + } low_base = memblock_find_in_range(low_size, (1ULL<<32), low_size, alignment); if (!low_base) { - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + if (!auto_set) + pr_info("crashkernel low reservation failed - No suitable area found.\n"); return; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 2de42f9401d2..a5ffd32642fd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -25,6 +25,7 @@ extern int swiotlb_force; extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); +unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); /* diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bfe02b8fc55b..d23762e6652c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -105,9 +105,9 @@ setup_io_tlb_npages(char *str) if (!strcmp(str, "force")) swiotlb_force = 1; - return 1; + return 0; } -__setup("swiotlb=", setup_io_tlb_npages); +early_param("swiotlb", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) @@ -115,6 +115,18 @@ unsigned long swiotlb_nr_tbl(void) return io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? size : (IO_TLB_DEFAULT_SIZE); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -188,8 +200,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - /* default to 64MB */ - size_t default_size = 64UL<<20; + size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; From 55a20ee7804ab64ac90bcdd4e2868a42829e2784 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:47 -0700 Subject: [PATCH 2/4] x86, kdump: Retore crashkernel= to allocate under 896M Vivek found old kexec-tools does not work new kernel anymore. So change back crashkernel= back to old behavoir, and add crashkernel_high= to let user decide if buffer could be above 4G, and also new kexec-tools will be needed. -v2: let crashkernel=X override crashkernel_high= update description about _high will be ignored by crashkernel=X -v3: update description about kernel-parameters.txt according to Vivek. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-4-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 13 +++++++++++-- arch/x86/kernel/setup.c | 24 +++++++++++++++++++----- include/linux/kexec.h | 2 ++ kernel/kexec.c | 9 +++++++++ 4 files changed, 41 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cff672da2486..709eb3edc6b2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,9 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_high=size[KMG] + [KNL, x86_64] range could be above 4G. Allow kernel + to allocate physical memory region from top, so could + be above 4G if system have more than 4G ram installed. + Otherwise memory region will be allocated below 4G, if + available. + It will be ignored if crashkernel=X is specified. crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel= is - passed, kernel allocate physical memory region + [KNL, x86_64] range under 4G. When crashkernel_high= is + passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -613,6 +620,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. + It will be ignored when crashkernel_high=X is not used + or memory reserved is below 4G. cs89x0_dma= [HW,NET] Format: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 12349202cae7..a85a144f2052 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -507,11 +507,14 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX MAXMEM +# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) +# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM #endif static void __init reserve_crashkernel_low(void) @@ -525,6 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + /* crashkernel_low=YM */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -569,14 +573,22 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; + bool high = false; int ret; total_mem = memblock_phys_mem_size(); + /* crashkernel=XM */ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; + if (ret != 0 || crash_size <= 0) { + /* crashkernel_high=XM */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } /* 0 means: find the address automatically */ if (crash_base <= 0) { @@ -584,7 +596,9 @@ static void __init reserve_crashkernel(void) * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - CRASH_KERNEL_ADDR_MAX, crash_size, alignment); + high ? CRASH_KERNEL_ADDR_HIGH_MAX : + CRASH_KERNEL_ADDR_LOW_MAX, + crash_size, alignment); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d2e6927bbaae..d78d28a733b1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -200,6 +200,8 @@ extern size_t vmcoreinfo_max_size; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b6..1b2f73f5f9b9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1422,6 +1422,15 @@ int __init parse_crashkernel(char *cmdline, "crashkernel="); } +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_high="); +} + int __init parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, From adbc742bf78695bb98c79d18c558b61571748b99 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:48 -0700 Subject: [PATCH 3/4] x86, kdump: Change crashkernel_high/low= to crashkernel=,high/low Per hpa, use crashkernel=X,high crashkernel=Y,low instead of crashkernel_hign=X crashkernel_low=Y. As that could be extensible. -v2: according to Vivek, change delimiter to ; -v3: let hign and low only handle simple form and it conforms to description in kernel-parameters.txt still keep crashkernel=X override any crashkernel=X,high crashkernel=Y,low -v4: update get_last_crashkernel returning and add more strict checking in parse_crashkernel_simple() found by HATAYAMA. -v5: Change delimiter back to , according to HPA. also separate parse_suffix from parse_simper according to vivek. so we can avoid @pos in that path. -v6: Tight the checking about crashkernel=X,highblahblah,high found by HTYAYAMA. Cc: HATAYAMA Daisuke Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-5-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 10 +-- arch/x86/kernel/setup.c | 6 +- kernel/kexec.c | 109 ++++++++++++++++++++++++---- 3 files changed, 104 insertions(+), 21 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 709eb3edc6b2..a1ac1f1d6d34 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,16 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. - crashkernel_high=size[KMG] + crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel to allocate physical memory region from top, so could be above 4G if system have more than 4G ram installed. Otherwise memory region will be allocated below 4G, if available. It will be ignored if crashkernel=X is specified. - crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel_high= is - passed, kernel could allocate physical memory region + crashkernel=size[KMG],low + [KNL, x86_64] range under 4G. When crashkernel=X,high + is passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -620,7 +620,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. - It will be ignored when crashkernel_high=X is not used + It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. cs89x0_dma= [HW,NET] diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a85a144f2052..fae9134a2de9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); - /* crashkernel_low=YM */ + /* crashkernel=Y,low */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void) low_size = swiotlb_size_or_default() + (8UL<<20); auto_set = true; } else { - /* passed with crashkernel_low=0 ? */ + /* passed with crashkernel=0,low ? */ if (!low_size) return; } @@ -582,7 +582,7 @@ static void __init reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { - /* crashkernel_high=XM */ + /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b2f73f5f9b9..401fdb041f35 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline, return 0; } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name) + const char *name, + const char *suffix) { - char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; + char *ck_cmdline; BUG_ON(!crash_size || !crash_base); *crash_size = 0; *crash_base = 0; - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - ck_cmdline = p; - p = strstr(p+1, name); - } + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); if (!ck_cmdline) return -EINVAL; ck_cmdline += strlen(name); + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + crash_base, suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1413,13 +1492,17 @@ static int __init __parse_crashkernel(char *cmdline, return 0; } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel="); + "crashkernel=", NULL); } int __init parse_crashkernel_high(char *cmdline, @@ -1428,7 +1511,7 @@ int __init parse_crashkernel_high(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_high="); + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); } int __init parse_crashkernel_low(char *cmdline, @@ -1437,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_low="); + "crashkernel=", suffix_tbl[SUFFIX_LOW]); } static void update_vmcoreinfo_note(void) From 157752d84f5df47e01577970f9c5f61a0b9f4546 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:46 -0700 Subject: [PATCH 4/4] kexec: use Crash kernel for Crash kernel low We can extend kexec-tools to support multiple "Crash kernel" in /proc/iomem instead. So we can use "Crash kernel" instead of "Crash kernel low" in /proc/iomem. Suggested-by: Vivek Goyal Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-3-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 401fdb041f35..ffd4e111fd67 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; struct resource crashk_low_res = { - .name = "Crash kernel low", + .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM