diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 2560c90eec..d95c4848a4 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -240,6 +240,8 @@ void cpu_exec_step_atomic(CPUState *cpu)
     uint32_t cf_mask = cflags & CF_HASH_MASK;
 
     if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+        start_exclusive();
+
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
@@ -247,8 +249,6 @@ void cpu_exec_step_atomic(CPUState *cpu)
             mmap_unlock();
         }
 
-        start_exclusive();
-
         /* Since we got here, we know that parallel_cpus must be true.  */
         parallel_cpus = false;
         cc->cpu_exec_enter(cpu);
@@ -271,14 +271,15 @@ void cpu_exec_step_atomic(CPUState *cpu)
         qemu_plugin_disable_mem_helpers(cpu);
     }
 
-    if (cpu_in_exclusive_context(cpu)) {
-        /* We might longjump out of either the codegen or the
-         * execution, so must make sure we only end the exclusive
-         * region if we started it.
-         */
-        parallel_cpus = true;
-        end_exclusive();
-    }
+
+    /*
+     * As we start the exclusive region before codegen we must still
+     * be in the region if we longjump out of either the codegen or
+     * the execution.
+     */
+    g_assert(cpu_in_exclusive_context(cpu));
+    parallel_cpus = true;
+    end_exclusive();
 }
 
 struct tb_desc {
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index a08ab11f65..78914154bf 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "qemu-common.h"
 
 #define NO_CPU_IO_DEFS
@@ -891,43 +892,61 @@ static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
     }
 }
 
-#if defined(CONFIG_USER_ONLY)
-/* Currently it is not recommended to allocate big chunks of data in
-   user mode. It will change when a dedicated libc will be used. */
-/* ??? 64-bit hosts ought to have no problem mmaping data outside the
-   region in which the guest needs to run.  Revisit this.  */
-#define USE_STATIC_CODE_GEN_BUFFER
-#endif
-
 /* Minimum size of the code gen buffer.  This number is randomly chosen,
    but not so small that we can't have a fair number of TB's live.  */
-#define MIN_CODE_GEN_BUFFER_SIZE     (1024u * 1024)
+#define MIN_CODE_GEN_BUFFER_SIZE     (1 * MiB)
 
 /* Maximum size of the code gen buffer we'd like to use.  Unless otherwise
    indicated, this is constrained by the range of direct branches on the
   host cpu, as used by the TCG implementation of goto_tb.  */
 #if defined(__x86_64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__sparc__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__powerpc64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__powerpc__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (32u * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (32 * MiB)
 #elif defined(__aarch64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
-# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (3 * GiB)
 #elif defined(__mips__)
   /* We have a 256MB branch region, but leave room to make sure the
      main executable is also within that region.  */
-# define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (128 * MiB)
 #else
 # define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 #endif
 
-#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32u * 1024 * 1024)
+#if TCG_TARGET_REG_BITS == 32
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32 * MiB)
+#ifdef CONFIG_USER_ONLY
+/*
+ * For user mode on smaller 32 bit systems we may run into trouble
+ * allocating big chunks of data in the right place. On these systems
+ * we utilise a static code generation buffer directly in the binary.
+ */
+#define USE_STATIC_CODE_GEN_BUFFER
+#endif
+#else /* TCG_TARGET_REG_BITS == 64 */
+#ifdef CONFIG_USER_ONLY
+/*
+ * As user-mode emulation typically means running multiple instances
+ * of the translator don't go too nuts with our default code gen
+ * buffer lest we make things too hard for the OS.
+ */
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (128 * MiB)
+#else
+/*
+ * We expect most system emulation to run one or two guests per host.
+ * Users running large scale system emulation may want to tweak their
+ * runtime setup via the tb-size control on the command line.
+ */
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (1 * GiB)
+#endif
+#endif
 
 #define DEFAULT_CODE_GEN_BUFFER_SIZE \
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
@@ -937,15 +956,7 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
     if (tb_size == 0) {
-#ifdef USE_STATIC_CODE_GEN_BUFFER
         tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
-#else
-        /* ??? Needs adjustments.  */
-        /* ??? If we relax the requirement that CONFIG_USER_ONLY use the
-           static buffer, we could size this on RESERVED_VA, on the text
-           segment size of the executable, or continue to use the default.  */
-        tb_size = (unsigned long)(ram_size / 4);
-#endif
     }
     if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
         tb_size = MIN_CODE_GEN_BUFFER_SIZE;
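Review note: the cpu_exec_step_atomic() hunks fix an ordering problem. start_exclusive() used to be taken only after the TB lookup and (possibly) code generation, so a siglongjmp() out of the translator left the CPU outside the exclusive region, and the cleanup path had to test cpu_in_exclusive_context() before releasing it. Taking the region before anything that can longjmp means every exit path, normal or not, is provably inside it, which is why the conditional collapses to a g_assert(). A minimal stand-alone sketch of that invariant follows; step_atomic(), generate_code() and in_exclusive are illustrative stand-ins, not the QEMU APIs:

    #include <assert.h>
    #include <setjmp.h>
    #include <stdbool.h>
    #include <stdio.h>

    static sigjmp_buf jmp_env;
    static bool in_exclusive;   /* stands in for cpu_in_exclusive_context() */

    static void start_exclusive(void) { in_exclusive = true; }
    static void end_exclusive(void) { in_exclusive = false; }

    /* Stand-in for the translator: may bail out with a longjmp, e.g. when
     * the code generation buffer is full. */
    static void generate_code(bool fail)
    {
        if (fail) {
            siglongjmp(jmp_env, 1);
        }
    }

    static void step_atomic(bool codegen_fails)
    {
        if (sigsetjmp(jmp_env, 0) == 0) {
            start_exclusive();  /* enter before anything that can longjmp */
            generate_code(codegen_fails);
            /* ... execute the generated block ... */
        }
        /* Both the fall-through and the longjmp path arrive here with the
         * region held, so the cleanup no longer needs to be conditional. */
        assert(in_exclusive);
        end_exclusive();
    }

    int main(void)
    {
        step_atomic(false);
        step_atomic(true);
        printf("both paths released the exclusive region\n");
        return 0;
    }

The translate-all.c hunks switch the size macros to the MiB/GiB constants from units.h and rework the sizing policy: the static code generation buffer is now used only on 32-bit user-mode hosts, and the old ram_size / 4 heuristic tying the TCG cache to guest RAM is gone. 64-bit hosts instead default to 128 MiB (user mode, where many translator instances may run at once) or 1 GiB (system emulation), still clamped between the MIN and MAX limits. A rough sketch of the resulting policy, assuming a 64-bit host; the upper clamp sits in the unchanged tail of size_code_gen_buffer(), outside the hunk shown:

    #include <stddef.h>
    #include <stdio.h>

    #define MiB ((size_t)1 << 20)
    #define GiB ((size_t)1 << 30)

    #define MIN_CODE_GEN_BUFFER_SIZE (1 * MiB)
    #define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)      /* e.g. x86-64 direct-branch range */
    #define DEFAULT_CODE_GEN_BUFFER_SIZE (1 * GiB)  /* 64-bit system emulation default */

    /* Mirrors size_code_gen_buffer(): 0 means "use the default", then clamp. */
    static size_t size_buffer(size_t tb_size)
    {
        if (tb_size == 0) {
            tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
        }
        if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
            tb_size = MIN_CODE_GEN_BUFFER_SIZE;
        }
        if (tb_size > MAX_CODE_GEN_BUFFER_SIZE) {
            tb_size = MAX_CODE_GEN_BUFFER_SIZE;
        }
        return tb_size;
    }

    int main(void)
    {
        printf("default: %zu MiB\n", size_buffer(0) / MiB);        /* 1024 */
        printf("clamped: %zu MiB\n", size_buffer(16 * GiB) / MiB); /* 2048 */
        return 0;
    }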
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 85c02c16d3..c76281f354 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -236,7 +236,7 @@
  * supports QEMU_ERROR, this will be reported at compile time; otherwise
  * this will be reported at link time due to the missing symbol.
  */
-#ifdef __OPTIMIZE__
+#if defined(__OPTIMIZE__) && !defined(__NO_INLINE__)
 extern void QEMU_NORETURN QEMU_ERROR("code path is reachable")
     qemu_build_not_reached(void);
 #else
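Review note: qemu_build_not_reached() is a build-time assertion that relies on the optimizer deleting provably dead calls; if one survives, the QEMU_ERROR attribute (or the missing symbol at link time) fails the build. GCC's -fno-inline predefines __NO_INLINE__ while leaving __OPTIMIZE__ set, and without inlining a call buried in a small helper can outlive dead-code elimination, so such builds must take the fallback branch. A condensed sketch of the pattern, assuming GCC; the _demo name is illustrative, and the abort() fallback here is a stand-in for QEMU's actual definition:

    #include <stdlib.h>

    #if defined(__OPTIMIZE__) && !defined(__NO_INLINE__)
    /* Optimizing build with inlining: a surviving call trips the error
     * attribute at compile time, or the missing symbol at link time. */
    extern void __attribute__((noreturn, error("code path is reachable")))
    build_not_reached_demo(void);
    #else
    /* -O0, or -fno-inline (which predefines __NO_INLINE__): dead calls can
     * survive, so degrade to a runtime trap instead. */
    #define build_not_reached_demo() abort()
    #endif

    int demo(void)
    {
        if (sizeof(long) < sizeof(int)) {
            /* Constant-false, so dead-code elimination removes the call and
             * the extern symbol is never referenced. */
            build_not_reached_demo();
        }
        return 0;
    }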
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index fffb6611e2..6aa7757aac 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1745,7 +1745,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #endif
 }
 
-static tcg_insn_unit *tb_ret_addr;
+static void tcg_out_epilogue(TCGContext *s);
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                 const TCGArg *args, const int *const_args)
@@ -1755,14 +1755,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        /* Reuse the zeroing that exists for goto_ptr.  */
-        a0 = args[0];
-        if (a0 == 0) {
-            tcg_out_goto(s, COND_AL, s->code_gen_epilogue);
-        } else {
-            tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
-            tcg_out_goto(s, COND_AL, tb_ret_addr);
-        }
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
+        tcg_out_epilogue(s);
         break;
     case INDEX_op_goto_tb:
         {
@@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
                          + TCG_TARGET_STACK_ALIGN - 1) \
                         & -TCG_TARGET_STACK_ALIGN)
 
+#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE)
+
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
-    int stack_addend;
-
     /* Calling convention requires us to save r4-r11 and lr.  */
     /* stmdb sp!, { r4 - r11, lr } */
     tcg_out32(s, (COND_AL << 28) | 0x092d4ff0);
 
     /* Reserve callee argument and tcg temp space.  */
-    stack_addend = FRAME_SIZE - PUSH_SIZE;
-
     tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
@@ -2310,11 +2302,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
      */
     s->code_gen_epilogue = s->code_ptr;
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
+    tcg_out_epilogue(s);
+}
 
-    /* TB epilogue */
-    tb_ret_addr = s->code_ptr;
+static void tcg_out_epilogue(TCGContext *s)
+{
+    /* Release local stack frame.  */
     tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
 
     /* ldmia sp!, { r4 - r11, pc } */
     tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);
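Review note: on arm, INDEX_op_exit_tb previously returned through a single shared epilogue, branching to code_gen_epilogue when the return value was zero (to reuse its zeroing of R0) and to the remembered tb_ret_addr otherwise. It now loads R0 and expands the two-instruction epilogue inline at every exit_tb, which lets tb_ret_addr disappear; STACK_ADDEND becomes a compile-time macro because tcg_out_epilogue() needs the frame adjustment outside tcg_target_qemu_prologue(). A rough sketch of the resulting emitter structure; Codegen, the emit_*() names and the fixed frame sizes are illustrative, not TCG internals:

    #include <stdint.h>
    #include <stdio.h>

    #define FRAME_SIZE   64   /* assumed values; TCG derives these from the ABI */
    #define PUSH_SIZE    36
    #define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE)   /* compile-time, usable anywhere */

    typedef struct { uint32_t buf[64]; int n; } Codegen;

    static void emit(Codegen *s, uint32_t insn) { s->buf[s->n++] = insn; }

    /* Shared two-instruction tail: drop the frame, pop { r4 - r11, pc }. */
    static void emit_epilogue(Codegen *s)
    {
        emit(s, 0xe28dd000 | STACK_ADDEND);    /* add sp, sp, #STACK_ADDEND */
        emit(s, 0xe8bd8ff0);                   /* ldmia sp!, { r4 - r11, pc } */
    }

    /* exit_tb: load the return value, then expand the epilogue inline. */
    static void emit_exit_tb(Codegen *s, uint32_t retval)
    {
        emit(s, 0xe3a00000 | (retval & 0xff)); /* mov r0, #retval (8-bit imm only) */
        emit_epilogue(s);
    }

    int main(void)
    {
        Codegen s = { .n = 0 };
        emit_exit_tb(&s, 0);
        printf("exit_tb expands to %d insns\n", s.n);   /* 3 */
        return 0;
    }

The trade is a couple of extra instructions per TB exit in exchange for dropping the cross-buffer branch to tb_ret_addr, which no longer has to stay within direct-branch range of every translated block.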