Rewrite ppc32 dispatch loop to avoid profiling overhead, as per
today's x86 and amd64 rewrites.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5352
parent a75ddd7aaa
commit 02a7e5b5d0
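The point of the rewrite, as with the x86 and amd64 versions it mirrors, is to hoist the profiling decision out of the hot loop: the new do_profiling argument is tested once on entry, and control then falls into one of two specialised dispatch loops, so the common no-profiling case pays nothing per block dispatched. A minimal C sketch of the control structure (the helper names are illustrative stand-ins, not Valgrind's real API):

    #include <stdint.h>

    typedef uintptr_t UWord;

    /* Hypothetical stand-ins for the real dispatcher machinery. */
    extern UWord dispatch_one_block(void *guest_state);   /* run one translation */
    extern void  bump_profile_counter(void *guest_state); /* --profile-flags work */
    extern int   keep_going(UWord trc);

    /* Before: one loop, paying for the profiling test on every iteration. */
    UWord run_innerloop_old(void *guest_state, UWord do_profiling)
    {
        UWord trc;
        do {
            if (do_profiling)                 /* tested per block dispatched */
                bump_profile_counter(guest_state);
            trc = dispatch_one_block(guest_state);
        } while (keep_going(trc));
        return trc;
    }

    /* After: branch once, then run a loop specialised for each case. */
    UWord run_innerloop_new(void *guest_state, UWord do_profiling)
    {
        UWord trc;
        if (do_profiling) {
            do {
                bump_profile_counter(guest_state);
                trc = dispatch_one_block(guest_state);
            } while (keep_going(trc));
        } else {
            do {                              /* the common, profiling-free path */
                trc = dispatch_one_block(guest_state);
            } while (keep_going(trc));
        }
        return trc;
    }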
--- dispatch-ppc32.S
+++ dispatch-ppc32.S
@@ -1,8 +1,8 @@
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address.      ---##
-##---                                            dispatch-ppc32.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address.      ---*/
+/*---                                            dispatch-ppc32.S ---*/
+/*--------------------------------------------------------------------*/
 
 /*
    This file is part of Valgrind, a dynamic binary instrumentation
@@ -38,12 +38,20 @@
 /*--- The dispatch loop.                                    ---*/
 /*------------------------------------------------------------*/
 
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up)                 ---*/
+/*----------------------------------------------------*/
 
-.globl VG_(run_innerloop)
+/* signature:
+   UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
+.globl VG_(run_innerloop)
 VG_(run_innerloop):
-        /* ----- entry point to VG_(run_innerloop) ----- */
+        /* r3 holds guest_state */
+        /* r4 holds do_profiling */
+
+        /* ----- entry point to VG_(run_innerloop) ----- */
         /* For Linux/ppc32 we need the SysV ABI, which uses
            LR->4(parent_sp), CR->anywhere.
            (The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@@ -58,10 +66,10 @@ VG_(run_innerloop):
         stwu    1,-496(1)   /* sp should maintain 16-byte alignment */
 
         /* Save callee-saved registers... */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_FP)@ha
-        lwz     4,VG_(machine_ppc32_has_FP)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here, so use r5 */
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP1
 
         /* Floating-point reg save area : 144 bytes */
@@ -111,43 +119,43 @@ LafterFP1:
         /* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
            The Linux kernel might not actually use VRSAVE for its intended
            purpose, but it should be harmless to preserve anyway. */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_VMX)@ha
-        lwz     4,VG_(machine_ppc32_has_VMX)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here (guest state ptr), so use r5 */
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX1
 
         /* VRSAVE save word : 32 bytes */
-        mfspr   4,256       /* vrsave reg is spr number 256 */
-        stw     4,244(1)
+        mfspr   5,256       /* vrsave reg is spr number 256 */
+        stw     5,244(1)
 
         /* Alignment padding : 4 bytes */
 
         /* Vector reg save area (quadword aligned) : 192 bytes */
-        li      4,224
-        stvx    31,4,1
-        li      4,208
-        stvx    30,4,1
-        li      4,192
-        stvx    29,4,1
-        li      4,176
-        stvx    28,4,1
-        li      4,160
-        stvx    27,4,1
-        li      4,144
-        stvx    26,4,1
-        li      4,128
-        stvx    25,4,1
-        li      4,112
-        stvx    24,4,1
-        li      4,96
-        stvx    23,4,1
-        li      4,80
-        stvx    22,4,1
-        li      4,64
-        stvx    21,4,1
-        li      4,48
-        stvx    20,4,1
+        li      5,224
+        stvx    31,5,1
+        li      5,208
+        stvx    30,5,1
+        li      5,192
+        stvx    29,5,1
+        li      5,176
+        stvx    28,5,1
+        li      5,160
+        stvx    27,5,1
+        li      5,144
+        stvx    26,5,1
+        li      5,128
+        stvx    25,5,1
+        li      5,112
+        stvx    24,5,1
+        li      5,96
+        stvx    23,5,1
+        li      5,80
+        stvx    22,5,1
+        li      5,64
+        stvx    21,5,1
+        li      5,48
+        stvx    20,5,1
 LafterVMX1:
 
         /* Save cr */
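The save-area offsets in those li/stvx pairs are not arbitrary: the callee-saved vector registers v20..v31 each get one quadword-aligned 16-byte slot, so offset(vN) = 48 + 16*(N - 20), which produces the descending 224, 208, ..., 48 sequence and pairs li 5,112 with stvx of v24. A throwaway C check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        /* v31 lands at 224, v30 at 208, ..., v20 at 48: one 16-byte slot each */
        for (int vr = 31; vr >= 20; vr--)
            printf("li 5,%d ; stvx %d,5,1\n", 48 + 16 * (vr - 20), vr);
        return 0;
    }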
@@ -159,8 +167,9 @@ LafterVMX1:
         /* 32(sp) used later to check FPSCR[RM] */
 
         /* r3 holds guest_state */
-        mr      31,3
-        stw     3,28(1)     /* spill orig guest_state ptr */
+        /* r4 holds do_profiling */
+        mr      31,3        /* r31 (generated code gsp) = r3 */
+        stw     3,28(1)     /* spill orig guest_state ptr */
 
         /* 24(sp) used later to stop ctr reg being clobbered */
         /* 20(sp) used later to load fpscr with zero */
@@ -171,40 +180,37 @@ LafterVMX1:
            0(sp) : back-chain
         */
 
-        // CAB TODO: Use a caller-saved reg for orig guest_state ptr
-        // - rem to set non-allocateable in isel.c
+        /* CAB TODO: Use a caller-saved reg for orig guest_state ptr
+           - rem to set non-allocateable in isel.c */
 
         /* hold dispatch_ctr in ctr reg */
-        lis     17,VG_(dispatch_ctr)@ha
-        lwz     17,VG_(dispatch_ctr)@l(17)
-        mtctr   17
-
-        /* fetch %CIA into r30 */
-        lwz     30,OFFSET_ppc32_CIA(31)
+        lis     5,VG_(dispatch_ctr)@ha
+        lwz     5,VG_(dispatch_ctr)@l(5)
+        mtctr   5
 
         /* set host FPU control word to the default mode expected
            by VEX-generated code.  See comments in libvex.h for
            more info. */
-        lis     3,VG_(machine_ppc32_has_FP)@ha
-        lwz     3,VG_(machine_ppc32_has_FP)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP2
 
         /* get zero into f3 (tedious) */
         /* note: fsub 3,3,3 is not a reliable way to do this,
            since if f3 holds a NaN or similar then we don't necessarily
            wind up with zero. */
-        li      3,0
-        stw     3,20(1)
+        li      5,0
+        stw     5,20(1)
         lfs     3,20(1)
         mtfsf   0xFF,3      /* fpscr = f3 */
 LafterFP2:
 
         /* set host AltiVec control word to the default mode expected
            by VEX-generated code. */
-        lis     3,VG_(machine_ppc32_has_VMX)@ha
-        lwz     3,VG_(machine_ppc32_has_VMX)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX2
 
         vspltisw 3,0x0      /* generate zero */
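The "tedious" zeroing of f3 is worth a second look: as the comment says, fsub 3,3,3 would propagate a NaN rather than produce zero, which is why the code stores an integer 0 to the stack and reloads it through the FP unit. A small C demonstration of the hazard and of the bit-pattern workaround:

    #include <math.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        float f = NAN;              /* suppose f3 happens to contain a NaN */
        float sub = f - f;          /* NaN - NaN is still NaN, not 0.0 */

        int   bits = 0;             /* what the asm does: li 5,0 ; stw 5,20(1) */
        float zero;
        memcpy(&zero, &bits, sizeof zero);   /* ... then lfs 3,20(1) */

        printf("f - f        = %f\n", sub);  /* prints nan */
        printf("bitwise zero = %f\n", zero); /* prints 0.000000 */
        return 0;
    }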
@@ -214,36 +220,108 @@ LafterVMX2:
         /* make a stack frame for the code we are calling */
         stwu    1,-16(1)
 
-        /* fall into main loop */
-
-        /* Live regs:
-                r1  (=sp)
-                r30 (=CIA = jump address)
-                r31 (=guest_state)
-                ctr (=dispatch_ctr)
-           Stack state:
-                44(r1) (=orig guest_state)
-        */
-
-dispatch_boring:
+        /* fetch %CIA into r3 */
+        lwz     3,OFFSET_ppc32_CIA(31)
+
+        /* fall into main loop (the right one) */
+        /* r4 = do_profiling.  It's probably trashed after here,
+           but that's OK: we don't need it after here. */
+        cmplwi  4,0
+        beq     VG_(run_innerloop__dispatch_unprofiled)
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher           ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+        /* At entry: Live regs:
+                r1  (=sp)
+                r3  (=CIA = next guest address)
+                r31 (=guest_state)
+                ctr (=dispatch_ctr)
+           Stack state:
+                44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)     /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
+
         /* save the jump address in the guest state */
-        stw     30,OFFSET_ppc32_CIA(31)
+        stw     3,OFFSET_ppc32_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
         bdz     counter_is_zero    /* decrements ctr reg */
 
         /* try a fast lookup in the translation cache */
         /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
-        rlwinm  4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
-        // CAB:  use a caller-saved reg for this ?
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
         addis   5,4,VG_(tt_fast)@ha
         lwz     5,VG_(tt_fast)@l(5)
         lwz     6,4(5)      /* big-endian, so comparing 2nd 32bit word */
-        cmpw    30,6
+        cmpw    3,6
         bne     fast_lookup_failed
 
+        /* Found a match.  Call tce[1], which is 8 bytes along, since
+           each tce element is a 64-bit int. */
+        addi    8,5,8
+        mtlr    8
+
+        /* stop ctr being clobbered */
+        mfctr   5
+        stw     5,40(1)     /* => 40-16 = 24(1) on our parent stack */
+
+        /* run the translation */
+        blrl
+
+        /* reinstate clobbered ctr */
+        lwz     5,40(1)
+        mtctr   5
+
+        /* start over */
+        b       VG_(run_innerloop__dispatch_unprofiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower)    ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+        /* At entry: Live regs:
+                r1  (=sp)
+                r3  (=CIA = next guest address)
+                r31 (=guest_state)
+                ctr (=dispatch_ctr)
+           Stack state:
+                44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)     /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
+
+        /* save the jump address in the guest state */
+        stw     3,OFFSET_ppc32_CIA(31)
+
+        /* Are we out of timeslice?  If yes, defer to scheduler. */
+        bdz     counter_is_zero    /* decrements ctr reg */
+
+        /* try a fast lookup in the translation cache */
+        /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+        addis   5,4,VG_(tt_fast)@ha
+        lwz     5,VG_(tt_fast)@l(5)
+        lwz     6,4(5)      /* big-endian, so comparing 2nd 32bit word */
+        cmpw    3,6
+        bne     fast_lookup_failed
+
         /* increment bb profile counter */
         // CAB:  use a caller-saved reg for this ?
         addis   6,4,VG_(tt_fastN)@ha
         lwz     7,VG_(tt_fastN)@l(6)
         lwz     8,0(7)
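The fast-lookup sequence is a direct-mapped cache probe: rlwinm turns the low VG_TT_FAST_BITS of the guest address into a word-scaled index into VG_(tt_fast), whose entry points at a translation whose first 64-bit word records the guest address it was made for (hence comparing the second 32-bit word on this big-endian target), and whose host code begins 8 bytes in, at tce[1]. A C sketch of the layout this assumes (table size and names are illustrative, not the real definitions):

    #include <stdint.h>

    #define TT_FAST_BITS 15                   /* illustrative; really VG_TT_FAST_BITS */
    #define TT_FAST_SIZE (1u << TT_FAST_BITS)
    #define TT_FAST_MASK (TT_FAST_SIZE - 1)

    typedef uint64_t TCEntry;                 /* tce[0] = guest address        */
    static  TCEntry *tt_fast[TT_FAST_SIZE];   /* the direct-mapped fast table  */

    typedef void (*HostCode)(void);

    /* Returns the host code for guest address `cia`, or NULL on a fast miss
       (the asm's fast_lookup_failed path, reporting VG_TRC_INNER_FASTMISS). */
    HostCode fast_lookup(uint32_t cia)
    {
        TCEntry *tce = tt_fast[cia & TT_FAST_MASK];  /* the rlwinm + lwz pair */
        if (tce == 0 || (uint32_t)tce[0] != cia)     /* cmpw 3,6 */
            return 0;
        return (HostCode)(tce + 1);                  /* addi 8,5,8 ; mtlr 8 */
    }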
@@ -256,37 +334,57 @@ dispatch_boring:
         mtlr    8
 
         /* stop ctr being clobbered */
-        // CAB:  use a caller-saved reg for this ?
-        //  but then (bdz) => (decr, cmp, bc)... still better than a stw?
-        mfctr   9
-        stw     9,40(1)     /* => 40-16 = 24(1) on our parent stack */
+        mfctr   5
+        stw     5,40(1)     /* => 40-16 = 24(1) on our parent stack */
 
         /* run the translation */
         blrl
 
-        /* On return from guest code:
-
-           r3 holds destination (original) address.
-
-           r31 may be unchanged (guest_state), or may indicate further
-           details of the control transfer requested to *r3.
-
-           If r31 is unchanged (== 44(r1)), just jump next to r3.
-
-           Otherwise fall out, back to the scheduler, and let it
-           figure out what to do next.
-        */
-
         /* reinstate clobbered ctr */
-        lwz     9,40(1)
-        mtctr   9
+        lwz     5,40(1)
+        mtctr   5
 
-        mr      30,3        /* put CIA (=r3) in r30 */
-        lwz     16,44(1)    /* original guest_state ptr */
-        cmpw    16,31
-        beq     dispatch_boring    /* r31 unchanged... */
-
-        mr      3,31        /* put return val (=r31) in r3 */
-        b       dispatch_exceptional
+        /* start over */
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points                                  ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+        /* Someone messed with the gsp (in r31).  Have to
+           defer to scheduler to resolve this.  dispatch ctr
+           is not yet decremented, so no need to increment. */
+        /* %CIA is NOT up to date here.  First, need to write
+           %r3 back to %CIA, but without trashing %r31 since
+           that holds the value we want to return to the scheduler.
+           Hence use %r5 transiently for the guest state pointer. */
+        lwz     5,44(1)     /* original guest_state ptr */
+        stw     3,OFFSET_ppc32_CIA(5)
+        mr      3,31        /* r3 = new gsp value */
+        b       run_innerloop_exit
+        /*NOTREACHED*/
+
+counter_is_zero:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_COUNTERZERO
+        b       run_innerloop_exit
+
+fast_lookup_failed:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_FASTMISS
+        b       run_innerloop_exit
 
 /* All exits from the dispatcher go through here.
    r3 holds the return value.
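Note the pattern shared by counter_is_zero and fast_lookup_failed: bdz already decremented ctr, so both paths re-increment it before bailing out, and each hands back a VG_TRC_* code in r3 telling the scheduler why the inner loop stopped. A sketch of what the caller plausibly does with those codes (the constants and helper names below are stand-ins, not the real scheduler):

    #include <stdint.h>

    typedef uintptr_t UWord;

    enum {                            /* stand-ins for the real VG_TRC_* values */
        TRC_INNER_FASTMISS    = 1,    /* no translation found in tt_fast   */
        TRC_INNER_COUNTERZERO = 2     /* dispatch_ctr ran down to zero     */
    };

    extern UWord run_innerloop(void *gsp, UWord profiling); /* VG_(run_innerloop) */
    extern void  make_translation(void *gsp);               /* hypothetical */
    extern void  end_of_timeslice(void *gsp);               /* hypothetical */

    void scheduler(void *gsp, UWord profiling)
    {
        for (;;) {
            UWord trc = run_innerloop(gsp, profiling);
            if (trc == TRC_INNER_FASTMISS)
                make_translation(gsp);     /* then re-enter and hit the cache */
            else if (trc == TRC_INNER_COUNTERZERO)
                end_of_timeslice(gsp);     /* reschedule, reset dispatch_ctr */
            else
                break;                     /* e.g. gsp_changed: thread event */
        }
    }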
@@ -301,8 +399,9 @@ run_innerloop_exit:
         cmplwi  10,0
         beq     LafterFP8
 
-        /* This check avoidance may be removable if stfiwx is implemented. */
-#if !defined(ENABLE_INNER)
+        /* This check avoidance may be removable if stfiwx is
+           implemented. */
+#       if !defined(ENABLE_INNER)
         /* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
         mffs    4           /* fpscr -> fpr */
         li      5,48
@@ -311,7 +410,7 @@ run_innerloop_exit:
         andi.   6,6,0xFF    /* mask wanted bits */
         cmplwi  6,0x0       /* cmp with zero */
         bne     invariant_violation    /* branch if not zero */
-#endif
+#       endif
 LafterFP8:
 
         /* Using r11 - value used again further on, so don't trash! */
@@ -445,36 +544,9 @@ LafterVMX9:
         addi    1,1,496     /* stack_size */
         blr
 
-
-/* Other ways of getting out of the inner loop.  Placed out-of-line to
-   make it look cleaner.
-*/
-dispatch_exceptional:
-        /* this is jumped to only, not fallen-through from above */
-        /* save r30 in %CIA and defer to sched */
-        lwz     16,44(1)
-        stw     30,OFFSET_ppc32_CIA(16)
-        b       run_innerloop_exit
-
-fast_lookup_failed:
-        /* %CIA is up to date here since dispatch_boring dominates */
-        mfctr   17
-        addi    17,17,1
-        mtctr   17
-        li      3,VG_TRC_INNER_FASTMISS
-        b       run_innerloop_exit
-
-counter_is_zero:
-        /* %CIA is up to date here since dispatch_boring dominates */
-        mfctr   17
-        addi    17,17,1
-        mtctr   17
-        li      3,VG_TRC_INNER_COUNTERZERO
-        b       run_innerloop_exit
-
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
-##--------------------------------------------------------------------##
-##--- end                                                          ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end                                                          ---*/
+/*--------------------------------------------------------------------*/
--- NEWS
+++ NEWS
@@ -14,11 +14,12 @@ Post 3.1.0:
 - Nick improved vg_SP_update_pass() to identify more small constant
   increments/decrements of SP, so the fast cases can be used more often.
   Saved 1--3% on a few programs.
-- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
-  jumps instead of call/return for calling translations, and also removed
-  the --profile-flags profiling from the dispatcher unless --profile-flags
-  is being used.  Improved Nulgrind performance typically by 10--20%,
-  and Memcheck performance typically by 2--20%.
+- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
+  AMD64 use jumps instead of call/return for calling translations.
+  Also, on x86, amd64 and ppc32, --profile-flags style profiling was
+  removed from the despatch loop unless --profile-flags is being used.
+  Improved Nulgrind performance typically by 10--20%, and Memcheck
+  performance typically by 2--20%.
 
 COMPVBITS branch:
 - Nick converted to compress V bits, initial version saved 0--5% on most