From 886cc68943ebe8cf7e5f970be33459f95068a441 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?=
Date: Fri, 14 Feb 2020 14:49:52 +0000
Subject: [PATCH 1/8] accel/tcg: fix race in cpu_exec_step_atomic (bug 1863025)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug describes a race whereby cpu_exec_step_atomic can acquire a TB
which is invalidated by a tb_flush before we execute it. This doesn't
affect the other cpu_exec modes, as a tb_flush by its nature can only
occur on a quiescent system. The race was described as:

  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc obtains a new TB

  C3. TB obtained with tb_lookup__cpu_state or tb_gen_code
      (same TB as B2)

  A3. start_exclusive critical section entered
  A4. do_tb_flush is called, TB memory freed/re-allocated
  A5. end_exclusive exits critical section

  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc reallocates TB from B2

  C4. start_exclusive critical section entered
  C5. cpu_tb_exec executes the TB code that was freed in A4

The simplest fix is to widen the exclusive period to include the TB
lookup. As a result we can drop the complication of checking we are in
the exclusive region before we end it.

Cc: Yifan
Buglink: https://bugs.launchpad.net/qemu/+bug/1863025
Reviewed-by: Paolo Bonzini
Reviewed-by: Richard Henderson
Signed-off-by: Alex Bennée
Message-Id: <20200214144952.15502-1-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/cpu-exec.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 2560c90eec..d95c4848a4 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -240,6 +240,8 @@ void cpu_exec_step_atomic(CPUState *cpu)
     uint32_t cf_mask = cflags & CF_HASH_MASK;
 
     if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+        start_exclusive();
+
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
@@ -247,8 +249,6 @@ void cpu_exec_step_atomic(CPUState *cpu)
             mmap_unlock();
         }
 
-        start_exclusive();
-
         /* Since we got here, we know that parallel_cpus must be true.  */
         parallel_cpus = false;
         cc->cpu_exec_enter(cpu);
@@ -271,14 +271,15 @@ void cpu_exec_step_atomic(CPUState *cpu)
         qemu_plugin_disable_mem_helpers(cpu);
     }
 
-    if (cpu_in_exclusive_context(cpu)) {
-        /* We might longjump out of either the codegen or the
-         * execution, so must make sure we only end the exclusive
-         * region if we started it.
-         */
-        parallel_cpus = true;
-        end_exclusive();
-    }
+
+    /*
+     * As we start the exclusive region before codegen we must still
+     * be in the region if we longjump out of either the codegen or
+     * the execution.
+     */
+    g_assert(cpu_in_exclusive_context(cpu));
+    parallel_cpus = true;
+    end_exclusive();
 }
 
 struct tb_desc {
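
Note: the essence of the fix is ordering rather than new locking. Once
start_exclusive() precedes the TB lookup, no other vCPU can run
do_tb_flush() between lookup and execution. A condensed sketch of the
patched flow (simplified from the diff above; the cpu_exec_enter/exit
hooks, tracing and plugin handling are omitted):

    if (sigsetjmp(cpu->jmp_env, 0) == 0) {
        start_exclusive();            /* all other vCPUs are parked */

        tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
        if (tb == NULL) {
            mmap_lock();
            tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
            mmap_unlock();
        }

        parallel_cpus = false;
        cpu_tb_exec(cpu, tb);         /* the TB cannot have been flushed */
    }

    /* Reached normally or via longjmp; either way we hold the region. */
    g_assert(cpu_in_exclusive_context(cpu));
    parallel_cpus = true;
    end_exclusive();
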
From b59ea3640cc03f1f609fc5155605be71f43bcd2e Mon Sep 17 00:00:00 2001
From: Zenghui Yu
Date: Wed, 5 Feb 2020 22:15:45 +0800
Subject: [PATCH 2/8] compiler.h: Don't use compile-time assert when
 __NO_INLINE__ is defined

Our robot reported the following compile-time error while building
QEMU with -fno-inline cflags:

  In function 'load_memop',
      inlined from 'load_helper' at /qemu/accel/tcg/cputlb.c:1578:20,
      inlined from 'full_ldub_mmu' at /qemu/accel/tcg/cputlb.c:1624:12:
  /qemu/accel/tcg/cputlb.c:1502:9: error: call to 'qemu_build_not_reached' declared with attribute error: code path is reachable
          qemu_build_not_reached();
          ^~~~~~~~~~~~~~~~~~~~~~~~
  [...]

It looks like a false positive, because only (MO_UB ^ MO_BSWAP) can
hit the default case in load_memop(), while need_swap (size > 1) has
already ensured that MO_UB is not involved.

The problem is that with -fno-inline the compiler can no longer
evaluate memop_size(op) at compile time, so qemu_build_not_reached()
is wrongly triggered by (MO_UB ^ MO_BSWAP). Avoid the compile-time
assert when no functions will be inlined into their callers.

Reported-by: Euler Robot
Suggested-by: Richard Henderson
Signed-off-by: Zenghui Yu
Message-Id: <20200205141545.180-1-yuzenghui@huawei.com>
Signed-off-by: Richard Henderson
---
 include/qemu/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 85c02c16d3..c76281f354 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -236,7 +236,7 @@
  * supports QEMU_ERROR, this will be reported at compile time; otherwise
  * this will be reported at link time due to the missing symbol.
  */
-#ifdef __OPTIMIZE__
+#if defined(__OPTIMIZE__) && !defined(__NO_INLINE__)
 extern void QEMU_NORETURN QEMU_ERROR("code path is reachable")
     qemu_build_not_reached(void);
 #else
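
Note: qemu_build_not_reached() only works because the optimizer deletes
provably dead calls before the error attribute is checked. A standalone
file that reproduces both the pattern and the -fno-inline false positive
(all names here are invented for the illustration, not QEMU code):

    /* demo.c
     * gcc -O2 -c demo.c             -> compiles; dead call is eliminated
     * gcc -O2 -fno-inline -c demo.c -> error: call to 'build_not_reached'
     *                                  declared with attribute error
     */
    extern void __attribute__((error("code path is reachable")))
        build_not_reached(void);

    static inline unsigned memop_bytes(unsigned op)
    {
        switch (op) {
        case 0:
            return 1;
        case 1:
            return 2;
        default:
            /* Dead for the constant arguments below, but only an
               inlined, constant-folded body lets GCC prove that. */
            build_not_reached();
            return 0;
        }
    }

    unsigned ldub(void) { return memop_bytes(0); }
    unsigned lduw(void) { return memop_bytes(1); }

With inlining, both call sites constant-fold past the default case and
the unused out-of-line copy is discarded; with -fno-inline the copy is
emitted, the call survives, and the attribute fires.
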
From d6fa50f536b2e4422f3065b6a11c4f8e42676f47 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sat, 15 Feb 2020 21:40:01 -0800
Subject: [PATCH 3/8] tcg/arm: Split out tcg_out_epilogue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We will shortly use this function from tcg_out_op as well.

Reviewed-by: Philippe Mathieu-Daudé
Signed-off-by: Richard Henderson
---
 tcg/arm/tcg-target.inc.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index fffb6611e2..e1aa740ba4 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1746,6 +1746,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 }
 
 static tcg_insn_unit *tb_ret_addr;
+static void tcg_out_epilogue(TCGContext *s);
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                 const TCGArg *args, const int *const_args)
@@ -2284,19 +2285,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
                      + TCG_TARGET_STACK_ALIGN - 1) \
                     & -TCG_TARGET_STACK_ALIGN)
 
+#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE)
+
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
-    int stack_addend;
-
     /* Calling convention requires us to save r4-r11 and lr. */
     /* stmdb sp!, { r4 - r11, lr } */
     tcg_out32(s, (COND_AL << 28) | 0x092d4ff0);
 
     /* Reserve callee argument and tcg temp space.  */
-    stack_addend = FRAME_SIZE - PUSH_SIZE;
-
     tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
@@ -2310,11 +2309,15 @@ static void tcg_target_qemu_prologue(TCGContext *s)
      */
     s->code_gen_epilogue = s->code_ptr;
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
-
-    /* TB epilogue */
     tb_ret_addr = s->code_ptr;
+    tcg_out_epilogue(s);
+}
+
+static void tcg_out_epilogue(TCGContext *s)
+{
+    /* Release local stack frame.  */
     tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
 
     /* ldmia sp!, { r4 - r11, pc } */
     tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);

From fc1bfdfd0cc49cbe084b1da11a6fadf80b9c95c8 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Sun, 16 Feb 2020 01:36:56 -0800
Subject: [PATCH 4/8] tcg/arm: Expand epilogue inline

It is, after all, just two instructions.

Profiling on a cortex-a15, using -d nochain to increase the number of
exit_tb that are executed, shows a minor improvement of 0.5%.

Signed-off-by: Richard Henderson
---
 tcg/arm/tcg-target.inc.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index e1aa740ba4..6aa7757aac 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1745,7 +1745,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #endif
 }
 
-static tcg_insn_unit *tb_ret_addr;
 static void tcg_out_epilogue(TCGContext *s);
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -1756,14 +1755,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        /* Reuse the zeroing that exists for goto_ptr.  */
-        a0 = args[0];
-        if (a0 == 0) {
-            tcg_out_goto(s, COND_AL, s->code_gen_epilogue);
-        } else {
-            tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
-            tcg_out_goto(s, COND_AL, tb_ret_addr);
-        }
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
+        tcg_out_epilogue(s);
         break;
     case INDEX_op_goto_tb:
         {
@@ -2309,7 +2302,6 @@ static void tcg_target_qemu_prologue(TCGContext *s)
      */
     s->code_gen_epilogue = s->code_ptr;
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
-    tb_ret_addr = s->code_ptr;
     tcg_out_epilogue(s);
 }
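
Note: "just two instructions" is visible in the code above. After this
patch, every exit_tb expands to the return-value move followed by an
inline copy of the epilogue; per tcg_out_op and tcg_out_epilogue above,
that is:

    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);  /* return value */
    tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
                   TCG_REG_CALL_STACK, STACK_ADDEND, 1); /* release TCG frame */
    tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);  /* ldmia sp!, { r4 - r11, pc } */

The trade-off is slightly more code per TB in exchange for dropping the
branch to the shared epilogue at tb_ret_addr, which is why tb_ret_addr
itself can be deleted.
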
From a2fa63a8f57b6294009f3db198fa841139051c56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?=
Date: Fri, 28 Feb 2020 19:24:12 +0000
Subject: [PATCH 5/8] accel/tcg: use units.h for defining code gen buffer sizes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's easier to read.

Signed-off-by: Alex Bennée
Reviewed-by: Niek Linnenbank
Reviewed-by: Richard Henderson
Reviewed-by: Philippe Mathieu-Daudé
Tested-by: Philippe Mathieu-Daudé
Message-Id: <20200228192415.19867-2-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index a08ab11f65..238b0e575b 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "qemu-common.h"
 
 #define NO_CPU_IO_DEFS
@@ -901,33 +902,33 @@ static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
 
 /* Minimum size of the code gen buffer.  This number is randomly chosen,
    but not so small that we can't have a fair number of TB's live.  */
-#define MIN_CODE_GEN_BUFFER_SIZE     (1024u * 1024)
+#define MIN_CODE_GEN_BUFFER_SIZE     (1 * MiB)
 
 /* Maximum size of the code gen buffer we'd like to use.  Unless otherwise
    indicated, this is constrained by the range of direct branches on the
    host cpu, as used by the TCG implementation of goto_tb.  */
 #if defined(__x86_64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__sparc__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__powerpc64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__powerpc__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (32u * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (32 * MiB)
 #elif defined(__aarch64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
-# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (3 * GiB)
 #elif defined(__mips__)
   /* We have a 256MB branch region, but leave room to make sure the
      main executable is also within that region.  */
-# define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (128 * MiB)
 #else
 # define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 #endif
 
-#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32u * 1024 * 1024)
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32 * MiB)
 
 #define DEFAULT_CODE_GEN_BUFFER_SIZE \
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \

From 47a2def4533a2807e48954abd50b32ecb1aaf29a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?=
Date: Fri, 28 Feb 2020 19:24:13 +0000
Subject: [PATCH 6/8] accel/tcg: remove link between guest ram and TCG cache
 size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Basing the TB cache size on the ram_size was always somewhat heuristic,
and it was broken by a1b18df9a4, which caused ram_size not to be fully
realised at the time we initialise the TCG translation cache.

The current DEFAULT_CODE_GEN_BUFFER_SIZE may still be a little small,
but follow-up patches will address that.

Fixes: a1b18df9a4
Cc: Igor Mammedov
Signed-off-by: Alex Bennée
Reviewed-by: Richard Henderson
Tested-by: Philippe Mathieu-Daudé
Reviewed-by: Niek Linnenbank
Message-Id: <20200228192415.19867-3-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 238b0e575b..5b66af783b 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -938,15 +938,7 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
     if (tb_size == 0) {
-#ifdef USE_STATIC_CODE_GEN_BUFFER
         tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
-#else
-        /* ??? Needs adjustments.  */
-        /* ??? If we relax the requirement that CONFIG_USER_ONLY use the
-           static buffer, we could size this on RESERVED_VA, on the text
-           segment size of the executable, or continue to use the default.  */
-        tb_size = (unsigned long)(ram_size / 4);
-#endif
     }
     if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
         tb_size = MIN_CODE_GEN_BUFFER_SIZE;
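
Note: the MiB and GiB symbols come from include/qemu/units.h and are the
usual IEC binary multipliers, approximately as follows (illustrative;
see the header for the exact definitions):

    #define KiB     (INT64_C(1) << 10)
    #define MiB     (INT64_C(1) << 20)
    #define GiB     (INT64_C(1) << 30)

So "(2 * GiB)" is exactly 2^31, matching the "(2ul * 1024 * 1024 * 1024)"
spellings it replaces.
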
From 21f2f447ad28d099d3ac5adc8ffe8eeaa56604f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?=
Date: Fri, 28 Feb 2020 19:24:14 +0000
Subject: [PATCH 7/8] accel/tcg: only USE_STATIC_CODE_GEN_BUFFER on 32 bit
 hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no particular reason to use a static codegen buffer on 64 bit
hosts, as we have address space to burn. Allow the common CONFIG_USER
case to use the mmap'ed buffers like SoftMMU.

Signed-off-by: Alex Bennée
Reviewed-by: Richard Henderson
Reviewed-by: Philippe Mathieu-Daudé
Tested-by: Philippe Mathieu-Daudé
Reviewed-by: Niek Linnenbank
Message-Id: <20200228192415.19867-4-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 5b66af783b..4ce5d1b393 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -892,11 +892,12 @@ static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
     }
 }
 
-#if defined(CONFIG_USER_ONLY)
-/* Currently it is not recommended to allocate big chunks of data in
-   user mode. It will change when a dedicated libc will be used. */
-/* ??? 64-bit hosts ought to have no problem mmaping data outside the
-   region in which the guest needs to run.  Revisit this.  */
+#if defined(CONFIG_USER_ONLY) && TCG_TARGET_REG_BITS == 32
+/*
+ * For user mode on smaller 32 bit systems we may run into trouble
+ * allocating big chunks of data in the right place. On these systems
+ * we utilise a static code generation buffer directly in the binary.
+ */
 #define USE_STATIC_CODE_GEN_BUFFER
 #endif
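
Note: in outline, what USE_STATIC_CODE_GEN_BUFFER selects between in
translate-all.c looks like this (a simplified sketch, not the complete
allocation logic):

    #ifdef USE_STATIC_CODE_GEN_BUFFER
    /* Lives in .bss, so always within branch range of the binary. */
    static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
        __attribute__((aligned(CODE_GEN_ALIGN)));
    #else
    /* Otherwise the buffer is mmap'ed (user mode) or allocated at
       startup (softmmu) and made executable, wherever the OS has room. */
    #endif
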
From 600e17b261555c56a048781b8dd5ba3985650013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?=
Date: Fri, 28 Feb 2020 19:24:15 +0000
Subject: [PATCH 8/8] accel/tcg: increase default code gen buffer size for
 64 bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While 32 MiB is certainly usable, a full system boot ends up flushing
the codegen buffer nearly 100 times. Increase the default on 64 bit
hosts to take advantage of all that spare memory. After this change I
can boot my test system without any TB flushes.

As we usually run multiple CONFIG_USER binaries at a time in typical
usage, we aren't quite as profligate for user-mode code generation.

We also bring the static code gen defines to the same place to keep
all the reasoning in the comments together.

Signed-off-by: Alex Bennée
Tested-by: Niek Linnenbank
Reviewed-by: Niek Linnenbank
Message-Id: <20200228192415.19867-5-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 4ce5d1b393..78914154bf 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -892,15 +892,6 @@ static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
     }
 }
 
-#if defined(CONFIG_USER_ONLY) && TCG_TARGET_REG_BITS == 32
-/*
- * For user mode on smaller 32 bit systems we may run into trouble
- * allocating big chunks of data in the right place. On these systems
- * we utilise a static code generation buffer directly in the binary.
- */
-#define USE_STATIC_CODE_GEN_BUFFER
-#endif
-
 /* Minimum size of the code gen buffer.  This number is randomly chosen,
    but not so small that we can't have a fair number of TB's live.  */
 #define MIN_CODE_GEN_BUFFER_SIZE     (1 * MiB)
@@ -929,7 +920,33 @@ static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
 # define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 #endif
 
+#if TCG_TARGET_REG_BITS == 32
 #define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32 * MiB)
+#ifdef CONFIG_USER_ONLY
+/*
+ * For user mode on smaller 32 bit systems we may run into trouble
+ * allocating big chunks of data in the right place. On these systems
+ * we utilise a static code generation buffer directly in the binary.
+ */
+#define USE_STATIC_CODE_GEN_BUFFER
+#endif
+#else /* TCG_TARGET_REG_BITS == 64 */
+#ifdef CONFIG_USER_ONLY
+/*
+ * As user-mode emulation typically means running multiple instances
+ * of the translator don't go too nuts with our default code gen
+ * buffer lest we make things too hard for the OS.
+ */
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (128 * MiB)
+#else
+/*
+ * We expect most system emulation to run one or two guests per host.
+ * Users running large scale system emulation may want to tweak their
+ * runtime setup via the tb-size control on the command line.
+ */
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (1 * GiB)
+#endif
+#endif
 
 #define DEFAULT_CODE_GEN_BUFFER_SIZE \
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
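
Note: the DEFAULT_CODE_GEN_BUFFER_SIZE macro shown in the trailing
context selects the smaller of DEFAULT_CODE_GEN_BUFFER_SIZE_1 and
MAX_CODE_GEN_BUFFER_SIZE. A functional restatement (equivalent logic,
not the literal macro text):

    static inline size_t default_code_gen_buffer_size(void)
    {
        /* The tunable default, clamped to the host's direct-branch range. */
        return DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE
               ? DEFAULT_CODE_GEN_BUFFER_SIZE_1
               : MAX_CODE_GEN_BUFFER_SIZE;
    }

After the series, the defaults are therefore 1 GiB for 64 bit softmmu,
128 MiB for 64 bit user mode, and 32 MiB on 32 bit hosts, in each case
clamped by MAX_CODE_GEN_BUFFER_SIZE.
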