mirror of
https://github.com/FEX-Emu/linux.git
synced 2025-01-10 11:30:49 +00:00
a7159a87a3
For many sun4v processor types, reading or writing a privileged register has a latency of 40 to 70 cycles. Use a combination of the low-latency allclean, otherw, normalw, and nop instructions in etrap and rtrap to replace 2 rdpr and 5 wrpr instructions and improve etrap/rtrap performance. allclean, otherw, and normalw are available on NG2 and later processors. The average ticks to execute the flush windows trap ("ta 0x3") with and without this patch on select platforms: CPU Not patched Patched % Latency Reduction NG2 1762 1558 -11.58 NG4 3619 3204 -11.47 M7 3015 2624 -12.97 SPARC64-X 829 770 -7.12 Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
349 lines
9.0 KiB
ArmAsm
349 lines
9.0 KiB
ArmAsm
/*
|
|
* rtrap.S: Preparing for return from trap on Sparc V9.
|
|
*
|
|
* Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
|
|
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
|
|
*/
|
|
|
|
|
|
#include <asm/asi.h>
|
|
#include <asm/pstate.h>
|
|
#include <asm/ptrace.h>
|
|
#include <asm/spitfire.h>
|
|
#include <asm/head.h>
|
|
#include <asm/visasm.h>
|
|
#include <asm/processor.h>
|
|
|
|
#ifdef CONFIG_CONTEXT_TRACKING
|
|
# define SCHEDULE_USER schedule_user
|
|
#else
|
|
# define SCHEDULE_USER schedule
|
|
#endif
|
|
|
|
.text
|
|
.align 32
|
|
__handle_preemption:
|
|
call SCHEDULE_USER
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
__handle_user_windows:
|
|
call fault_in_user_windows
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
__handle_userfpu:
|
|
rd %fprs, %l5
|
|
andcc %l5, FPRS_FEF, %g0
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
be,a,pn %icc, __handle_userfpu_continue
|
|
andn %l1, %o0, %l1
|
|
ba,a,pt %xcc, __handle_userfpu_continue
|
|
|
|
__handle_signal:
|
|
mov %l5, %o1
|
|
add %sp, PTREGS_OFF, %o0
|
|
mov %l0, %o2
|
|
call do_notify_resume
|
|
wrpr %g0, RTRAP_PSTATE, %pstate
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
|
|
/* Signal delivery can modify pt_regs tstate, so we must
|
|
* reload it.
|
|
*/
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
ba,pt %xcc, __handle_preemption_continue
|
|
andn %l1, %l4, %l1
|
|
|
|
/* When returning from a NMI (%pil==15) interrupt we want to
|
|
* avoid running softirqs, doing IRQ tracing, preempting, etc.
|
|
*/
|
|
.globl rtrap_nmi
|
|
rtrap_nmi: ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
andn %l1, %l4, %l1
|
|
srl %l4, 20, %l4
|
|
ba,pt %xcc, rtrap_no_irq_enable
|
|
nop
|
|
/* Do not actually set the %pil here. We will do that
|
|
* below after we clear PSTATE_IE in the %pstate register.
|
|
* If we re-enable interrupts here, we can recurse down
|
|
* the hardirq stack potentially endlessly, causing a
|
|
* stack overflow.
|
|
*/
|
|
|
|
.align 64
|
|
.globl rtrap_irq, rtrap, irqsz_patchme, rtrap_xcall
|
|
rtrap_irq:
|
|
rtrap:
|
|
/* mm/ultra.S:xcall_report_regs KNOWS about this load. */
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
|
|
rtrap_xcall:
|
|
sethi %hi(0xf << 20), %l4
|
|
and %l1, %l4, %l4
|
|
andn %l1, %l4, %l1
|
|
srl %l4, 20, %l4
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
brnz,pn %l4, rtrap_no_irq_enable
|
|
nop
|
|
call trace_hardirqs_on
|
|
nop
|
|
/* Do not actually set the %pil here. We will do that
|
|
* below after we clear PSTATE_IE in the %pstate register.
|
|
* If we re-enable interrupts here, we can recurse down
|
|
* the hardirq stack potentially endlessly, causing a
|
|
* stack overflow.
|
|
*
|
|
* It is tempting to put this test and trace_hardirqs_on
|
|
* call at the 'rt_continue' label, but that will not work
|
|
* as that path hits unconditionally and we do not want to
|
|
* execute this in NMI return paths, for example.
|
|
*/
|
|
#endif
|
|
rtrap_no_irq_enable:
|
|
andcc %l1, TSTATE_PRIV, %l3
|
|
bne,pn %icc, to_kernel
|
|
nop
|
|
|
|
/* We must hold IRQs off and atomically test schedule+signal
|
|
* state, then hold them off all the way back to userspace.
|
|
* If we are returning to kernel, none of this matters. Note
|
|
* that we are disabling interrupts via PSTATE_IE, not using
|
|
* %pil.
|
|
*
|
|
* If we do not do this, there is a window where we would do
|
|
* the tests, later the signal/resched event arrives but we do
|
|
* not process it since we are still in kernel mode. It would
|
|
* take until the next local IRQ before the signal/resched
|
|
* event would be handled.
|
|
*
|
|
* This also means that if we have to deal with user
|
|
* windows, we have to redo all of these sched+signal checks
|
|
* with IRQs disabled.
|
|
*/
|
|
to_user: wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
wrpr 0, %pil
|
|
__handle_preemption_continue:
|
|
ldx [%g6 + TI_FLAGS], %l0
|
|
sethi %hi(_TIF_USER_WORK_MASK), %o0
|
|
or %o0, %lo(_TIF_USER_WORK_MASK), %o0
|
|
andcc %l0, %o0, %g0
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
be,pt %xcc, user_nowork
|
|
andcc %l1, %o0, %g0
|
|
andcc %l0, _TIF_NEED_RESCHED, %g0
|
|
bne,pn %xcc, __handle_preemption
|
|
andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0
|
|
bne,pn %xcc, __handle_signal
|
|
ldub [%g6 + TI_WSAVED], %o2
|
|
brnz,pn %o2, __handle_user_windows
|
|
nop
|
|
sethi %hi(TSTATE_PEF), %o0
|
|
andcc %l1, %o0, %g0
|
|
|
|
/* This fpdepth clear is necessary for non-syscall rtraps only */
|
|
user_nowork:
|
|
bne,pn %xcc, __handle_userfpu
|
|
stb %g0, [%g6 + TI_FPDEPTH]
|
|
__handle_userfpu_continue:
|
|
|
|
rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G2], %g2
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G3], %g3
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G4], %g4
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5
|
|
brz,pt %l3, 1f
|
|
mov %g6, %l2
|
|
|
|
/* Must do this before thread reg is clobbered below. */
|
|
LOAD_PER_CPU_BASE(%g5, %g6, %i0, %i1, %i2)
|
|
1:
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6
|
|
ldx [%sp + PTREGS_OFF + PT_V9_G7], %g7
|
|
|
|
/* Normal globals are restored, go to trap globals. */
|
|
661: wrpr %g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
|
|
nop
|
|
.section .sun4v_2insn_patch, "ax"
|
|
.word 661b
|
|
wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
|
|
SET_GL(1)
|
|
.previous
|
|
|
|
mov %l2, %g6
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I1], %i1
|
|
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I2], %i2
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I3], %i3
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I4], %i4
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I5], %i5
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I6], %i6
|
|
ldx [%sp + PTREGS_OFF + PT_V9_I7], %i7
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TPC], %l2
|
|
ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %o2
|
|
|
|
ld [%sp + PTREGS_OFF + PT_V9_Y], %o3
|
|
wr %o3, %g0, %y
|
|
wrpr %l4, 0x0, %pil
|
|
wrpr %g0, 0x1, %tl
|
|
andn %l1, TSTATE_SYSCALL, %l1
|
|
wrpr %l1, %g0, %tstate
|
|
wrpr %l2, %g0, %tpc
|
|
wrpr %o2, %g0, %tnpc
|
|
|
|
brnz,pn %l3, kern_rtt
|
|
mov PRIMARY_CONTEXT, %l7
|
|
|
|
661: ldxa [%l7 + %l7] ASI_DMMU, %l0
|
|
.section .sun4v_1insn_patch, "ax"
|
|
.word 661b
|
|
ldxa [%l7 + %l7] ASI_MMU, %l0
|
|
.previous
|
|
|
|
sethi %hi(sparc64_kern_pri_nuc_bits), %l1
|
|
ldx [%l1 + %lo(sparc64_kern_pri_nuc_bits)], %l1
|
|
or %l0, %l1, %l0
|
|
|
|
661: stxa %l0, [%l7] ASI_DMMU
|
|
.section .sun4v_1insn_patch, "ax"
|
|
.word 661b
|
|
stxa %l0, [%l7] ASI_MMU
|
|
.previous
|
|
|
|
sethi %hi(KERNBASE), %l7
|
|
flush %l7
|
|
rdpr %wstate, %l1
|
|
rdpr %otherwin, %l2
|
|
srl %l1, 3, %l1
|
|
|
|
661: wrpr %l2, %g0, %canrestore
|
|
.section .fast_win_ctrl_1insn_patch, "ax"
|
|
.word 661b
|
|
.word 0x89880000 ! normalw
|
|
.previous
|
|
|
|
wrpr %l1, %g0, %wstate
|
|
brnz,pt %l2, user_rtt_restore
|
|
661: wrpr %g0, %g0, %otherwin
|
|
.section .fast_win_ctrl_1insn_patch, "ax"
|
|
.word 661b
|
|
nop
|
|
.previous
|
|
|
|
ldx [%g6 + TI_FLAGS], %g3
|
|
wr %g0, ASI_AIUP, %asi
|
|
rdpr %cwp, %g1
|
|
andcc %g3, _TIF_32BIT, %g0
|
|
sub %g1, 1, %g1
|
|
bne,pt %xcc, user_rtt_fill_32bit
|
|
wrpr %g1, %cwp
|
|
ba,a,pt %xcc, user_rtt_fill_64bit
|
|
nop
|
|
|
|
user_rtt_fill_fixup_dax:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
mov 1, %g3
|
|
|
|
user_rtt_fill_fixup_mna:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
mov 2, %g3
|
|
|
|
user_rtt_fill_fixup:
|
|
ba,pt %xcc, user_rtt_fill_fixup_common
|
|
clr %g3
|
|
|
|
user_rtt_pre_restore:
|
|
add %g1, 1, %g1
|
|
wrpr %g1, 0x0, %cwp
|
|
|
|
user_rtt_restore:
|
|
restore
|
|
rdpr %canrestore, %g1
|
|
wrpr %g1, 0x0, %cleanwin
|
|
retry
|
|
nop
|
|
|
|
kern_rtt: rdpr %canrestore, %g1
|
|
brz,pn %g1, kern_rtt_fill
|
|
nop
|
|
kern_rtt_restore:
|
|
stw %g0, [%sp + PTREGS_OFF + PT_V9_MAGIC]
|
|
restore
|
|
retry
|
|
|
|
to_kernel:
|
|
#ifdef CONFIG_PREEMPT
|
|
ldsw [%g6 + TI_PRE_COUNT], %l5
|
|
brnz %l5, kern_fpucheck
|
|
ldx [%g6 + TI_FLAGS], %l5
|
|
andcc %l5, _TIF_NEED_RESCHED, %g0
|
|
be,pt %xcc, kern_fpucheck
|
|
nop
|
|
cmp %l4, 0
|
|
bne,pn %xcc, kern_fpucheck
|
|
nop
|
|
call preempt_schedule_irq
|
|
nop
|
|
ba,pt %xcc, rtrap
|
|
#endif
|
|
kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5
|
|
brz,pt %l5, rt_continue
|
|
srl %l5, 1, %o0
|
|
add %g6, TI_FPSAVED, %l6
|
|
ldub [%l6 + %o0], %l2
|
|
sub %l5, 2, %l5
|
|
|
|
add %g6, TI_GSR, %o1
|
|
andcc %l2, (FPRS_FEF|FPRS_DU), %g0
|
|
be,pt %icc, 2f
|
|
and %l2, FPRS_DL, %l6
|
|
andcc %l2, FPRS_FEF, %g0
|
|
be,pn %icc, 5f
|
|
sll %o0, 3, %o5
|
|
rd %fprs, %g1
|
|
|
|
wr %g1, FPRS_FEF, %fprs
|
|
ldx [%o1 + %o5], %g1
|
|
add %g6, TI_XFSR, %o1
|
|
sll %o0, 8, %o2
|
|
add %g6, TI_FPREGS, %o3
|
|
brz,pn %l6, 1f
|
|
add %g6, TI_FPREGS+0x40, %o4
|
|
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f0
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f16
|
|
membar #Sync
|
|
1: andcc %l2, FPRS_DU, %g0
|
|
be,pn %icc, 1f
|
|
wr %g1, 0, %gsr
|
|
add %o2, 0x80, %o2
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f32
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f48
|
|
1: membar #Sync
|
|
ldx [%o1 + %o5], %fsr
|
|
2: stb %l5, [%g6 + TI_FPDEPTH]
|
|
ba,pt %xcc, rt_continue
|
|
nop
|
|
5: wr %g0, FPRS_FEF, %fprs
|
|
sll %o0, 8, %o2
|
|
|
|
add %g6, TI_FPREGS+0x80, %o3
|
|
add %g6, TI_FPREGS+0xc0, %o4
|
|
membar #Sync
|
|
ldda [%o3 + %o2] ASI_BLK_P, %f32
|
|
ldda [%o4 + %o2] ASI_BLK_P, %f48
|
|
membar #Sync
|
|
wr %g0, FPRS_DU, %fprs
|
|
ba,pt %xcc, rt_continue
|
|
stb %l5, [%g6 + TI_FPDEPTH]
|