Rewrite ppc32 dispatch loop to avoid profiling overhead, as per today's x86 and amd64 rewrites.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5352
Julian Seward 2005-12-15 21:40:34 +00:00
parent a75ddd7aaa
commit 02a7e5b5d0
2 changed files with 214 additions and 141 deletions

View File

@ -1,8 +1,8 @@
##--------------------------------------------------------------------##
##--- The core dispatch loop, for jumping to a code address. ---##
##--- dispatch-ppc32.S ---##
##--------------------------------------------------------------------##
/*--------------------------------------------------------------------*/
/*--- The core dispatch loop, for jumping to a code address. ---*/
/*--- dispatch-ppc32.S ---*/
/*--------------------------------------------------------------------*/
/*
This file is part of Valgrind, a dynamic binary instrumentation
@ -38,12 +38,20 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
/*----------------------------------------------------*/
/*--- Preamble (set everything up) ---*/
/*----------------------------------------------------*/
.globl VG_(run_innerloop)
/* signature:
UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
*/
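The second argument is new: it selects one of the two dispatch loops defined below, so the profiling decision is made once per call instead of once per loop iteration. A minimal C sketch of how a caller might use it; the run_thread_once wrapper and the VG_(clo_profile_flags) test are illustrative assumptions, not the exact scheduler code:

   /* Sketch only: UWord, Int and the VG_() name mangling come from the
      Valgrind headers; VG_(clo_profile_flags) is assumed to be the
      variable behind --profile-flags. */
   extern Int   VG_(clo_profile_flags);
   extern UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );

   static UWord run_thread_once ( void* guest_state )
   {
      /* Decide once, outside the hot loop, whether profiling is wanted. */
      UWord do_profiling = (VG_(clo_profile_flags) > 0) ? 1 : 0;
      return VG_(run_innerloop)( guest_state, do_profiling );
   }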
.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* ----- entry point to VG_(run_innerloop) ----- */
/* r3 holds guest_state */
/* r4 holds do_profiling */
/* ----- entry point to VG_(run_innerloop) ----- */
/* For Linux/ppc32 we need the SysV ABI, which uses
LR->4(parent_sp), CR->anywhere.
(The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@ -58,10 +66,10 @@ VG_(run_innerloop):
stwu 1,-496(1) /* sp should maintain 16-byte alignment */
/* Save callee-saved registers... */
/* r3 is live here (guest state ptr), so use r4 */
lis 4,VG_(machine_ppc32_has_FP)@ha
lwz 4,VG_(machine_ppc32_has_FP)@l(4)
cmplwi 4,0
/* r3, r4 are live here, so use r5 */
lis 5,VG_(machine_ppc32_has_FP)@ha
lwz 5,VG_(machine_ppc32_has_FP)@l(5)
cmplwi 5,0
beq LafterFP1
/* Floating-point reg save area : 144 bytes */
@ -111,43 +119,43 @@ LafterFP1:
/* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
The Linux kernel might not actually use VRSAVE for its intended
purpose, but it should be harmless to preserve anyway. */
/* r3 is live here (guest state ptr), so use r4 */
lis 4,VG_(machine_ppc32_has_VMX)@ha
lwz 4,VG_(machine_ppc32_has_VMX)@l(4)
cmplwi 4,0
/* r3, r4 are live here (guest state ptr), so use r5 */
lis 5,VG_(machine_ppc32_has_VMX)@ha
lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
cmplwi 5,0
beq LafterVMX1
/* VRSAVE save word : 32 bytes */
mfspr 4,256 /* vrsave reg is spr number 256 */
stw 4,244(1)
mfspr 5,256 /* vrsave reg is spr number 256 */
stw 5,244(1)
/* Alignment padding : 4 bytes */
/* Vector reg save area (quadword aligned) : 192 bytes */
li 4,224
stvx 31,4,1
li 4,208
stvx 30,4,1
li 4,192
stvx 29,4,1
li 4,176
stvx 28,4,1
li 4,160
stvx 27,4,1
li 4,144
stvx 26,4,1
li 4,128
stvx 25,4,1
li 4,112
stvx 24,4,1
li 4,96
stvx 23,4,1
li 4,80
stvx 22,4,1
li 4,64
stvx 21,4,1
li 4,48
stvx 20,4,1
li 5,224
stvx 31,5,1
li 5,208
stvx 30,5,1
li 5,192
stvx 29,5,1
li 5,176
stvx 28,5,1
li 5,160
stvx 27,5,1
li 5,144
stvx 26,5,1
li 5,128
stvx 25,5,1
li 5,112
stvx 24,5,1
li 5,96
stvx 23,5,1
li 5,80
stvx 22,5,1
li 5,64
stvx 21,5,1
li 5,48
stvx 20,5,1
LafterVMX1:
/* Save cr */
@ -159,8 +167,9 @@ LafterVMX1:
/* 32(sp) used later to check FPSCR[RM] */
/* r3 holds guest_state */
mr 31,3
stw 3,28(1) /* spill orig guest_state ptr */
/* r4 holds do_profiling */
mr 31,3 /* r31 (generated code gsp) = r3 */
stw 3,28(1) /* spill orig guest_state ptr */
/* 24(sp) used later to stop ctr reg being clobbered */
/* 20(sp) used later to load fpscr with zero */
@ -171,40 +180,37 @@ LafterVMX1:
0(sp) : back-chain
*/
// CAB TODO: Use a caller-saved reg for orig guest_state ptr
// - rem to set non-allocateable in isel.c
/* CAB TODO: Use a caller-saved reg for orig guest_state ptr
- rem to set non-allocateable in isel.c */
/* hold dispatch_ctr in ctr reg */
lis 17,VG_(dispatch_ctr)@ha
lwz 17,VG_(dispatch_ctr)@l(17)
mtctr 17
/* fetch %CIA into r30 */
lwz 30,OFFSET_ppc32_CIA(31)
lis 5,VG_(dispatch_ctr)@ha
lwz 5,VG_(dispatch_ctr)@l(5)
mtctr 5
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
more info. */
lis 3,VG_(machine_ppc32_has_FP)@ha
lwz 3,VG_(machine_ppc32_has_FP)@l(3)
cmplwi 3,0
lis 5,VG_(machine_ppc32_has_FP)@ha
lwz 5,VG_(machine_ppc32_has_FP)@l(5)
cmplwi 5,0
beq LafterFP2
/* get zero into f3 (tedious) */
/* note: fsub 3,3,3 is not a reliable way to do this,
since if f3 holds a NaN or similar then we don't necessarily
wind up with zero. */
li 3,0
stw 3,20(1)
/* get zero into f3 (tedious) */
/* note: fsub 3,3,3 is not a reliable way to do this,
since if f3 holds a NaN or similar then we don't necessarily
wind up with zero. */
li 5,0
stw 5,20(1)
lfs 3,20(1)
mtfsf 0xFF,3 /* fpscr = f3 */
LafterFP2:
/* set host AltiVec control word to the default mode expected
by VEX-generated code. */
lis 3,VG_(machine_ppc32_has_VMX)@ha
lwz 3,VG_(machine_ppc32_has_VMX)@l(3)
cmplwi 3,0
lis 5,VG_(machine_ppc32_has_VMX)@ha
lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
cmplwi 5,0
beq LafterVMX2
vspltisw 3,0x0 /* generate zero */
@ -214,36 +220,108 @@ LafterVMX2:
/* make a stack frame for the code we are calling */
stwu 1,-16(1)
/* fall into main loop */
/* fetch %CIA into r3 */
lwz 3,OFFSET_ppc32_CIA(31)
/* Live regs:
r1 (=sp)
r30 (=CIA = jump address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* fall into main loop (the right one) */
/* r4 = do_profiling. It's probably trashed after here,
but that's OK: we don't need it after here. */
cmplwi 4,0
beq VG_(run_innerloop__dispatch_unprofiled)
b VG_(run_innerloop__dispatch_profiled)
/*NOTREACHED*/
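This branch is the point of the rewrite: rather than testing a profiling flag on every iteration, the loop is duplicated and the choice is made once on entry. As C-like pseudocode, with the two function names purely illustrative:

   if (do_profiling)
      return dispatch_profiled(guest_state);    /* bumps per-bb counters  */
   else
      return dispatch_unprofiled(guest_state);  /* same loop, no counters */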
/*----------------------------------------------------*/
/*--- NO-PROFILING (standard) dispatcher ---*/
/*----------------------------------------------------*/
.global VG_(run_innerloop__dispatch_unprofiled)
VG_(run_innerloop__dispatch_unprofiled):
/* At entry: Live regs:
r1 (=sp)
r3 (=CIA = next guest address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* Has the guest state pointer been messed with? If yes, exit. */
lwz 5,44(1) /* original guest_state ptr */
cmpw 5,31
bne gsp_changed
dispatch_boring:
/* save the jump address in the guest state */
stw 30,OFFSET_ppc32_CIA(31)
stw 3,OFFSET_ppc32_CIA(31)
/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */
/* try a fast lookup in the translation cache */
/* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
rlwinm 4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
// CAB: use a caller-saved reg for this ?
rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
cmpw 30,6
cmpw 3,6
bne fast_lookup_failed
/* Found a match. Call tce[1], which is 8 bytes along, since
each tce element is a 64-bit int. */
addi 8,5,8
mtlr 8
/* stop ctr being clobbered */
mfctr 5
stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */
/* run the translation */
blrl
/* reinstate clobbered ctr */
lwz 5,40(1)
mtctr 5
/* start over */
b VG_(run_innerloop__dispatch_unprofiled)
/*NOTREACHED*/
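The rlwinm/lwz/cmpw sequence in the loop above is a single-probe, direct-mapped lookup in VG_(tt_fast). A C sketch of what it computes; the declarations of VG_(tt_fast) and the translation-cache entry are approximations inferred from the assembly (entries are 64-bit words, element 0 holding the guest address, element 1 onwards being the translated code), and VG_TT_FAST_MASK is the mask already used above:

   typedef unsigned int       UInt;    /* ppc32 guest word            */
   typedef unsigned long long ULong;   /* one 64-bit tt entry element */

   extern ULong* VG_(tt_fast)[];       /* VG_TT_FAST_MASK + 1 entries */

   /* Returns the host address to jump to for guest address 'cia', or
      NULL on a fast-cache miss, in which case the dispatcher exits
      with VG_TRC_INNER_FASTMISS and lets the scheduler do a full
      lookup. */
   static void* fast_lookup ( UInt cia )
   {
      ULong* tce = VG_(tt_fast)[ cia & VG_TT_FAST_MASK ];
      /* big-endian: the low 32 bits of tce[0] sit at byte offset 4,
         which is what the 'lwz 6,4(5)' above compares against. */
      if ( (UInt)tce[0] != cia )
         return 0;
      return (void*)&tce[1];           /* the code starts 8 bytes in  */
   }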
/*----------------------------------------------------*/
/*--- PROFILING dispatcher (can be much slower) ---*/
/*----------------------------------------------------*/
.global VG_(run_innerloop__dispatch_profiled)
VG_(run_innerloop__dispatch_profiled):
/* At entry: Live regs:
r1 (=sp)
r3 (=CIA = next guest address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* Has the guest state pointer been messed with? If yes, exit. */
lwz 5,44(1) /* original guest_state ptr */
cmpw 5,31
bne gsp_changed
/* save the jump address in the guest state */
stw 3,OFFSET_ppc32_CIA(31)
/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */
/* try a fast lookup in the translation cache */
/* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
cmpw 3,6
bne fast_lookup_failed
/* increment bb profile counter */
// CAB: use a caller-saved reg for this ?
addis 6,4,VG_(tt_fastN)@ha
lwz 7,VG_(tt_fastN)@l(6)
lwz 8,0(7)
@ -256,37 +334,57 @@ dispatch_boring:
mtlr 8
/* stop ctr being clobbered */
// CAB: use a caller-saved reg for this ?
// but then (bdz) => (decr, cmp, bc)... still better than a stw?
mfctr 9
stw 9,40(1) /* => 40-16 = 24(1) on our parent stack */
mfctr 5
stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */
/* run the translation */
blrl
/* On return from guest code:
r3 holds destination (original) address.
r31 may be unchanged (guest_state), or may indicate further
details of the control transfer requested to *r3.
If r31 is unchanged (== 44(r1)), just jump next to r3.
Otherwise fall out, back to the scheduler, and let it
figure out what to do next.
*/
/* reinstate clobbered ctr */
lwz 9,40(1)
mtctr 9
lwz 5,40(1)
mtctr 5
/* start over */
b VG_(run_innerloop__dispatch_profiled)
/*NOTREACHED*/
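Taken together with the exit paths below, the contract described in the "On return from guest code" comment gives each dispatcher the following shape, shown as C-like pseudocode (a sketch only: fast_lookup is the lookup sketched earlier, and run_translation stands for the blrl into the translation):

   for (;;) {
      if (gsp != orig_gsp) {            /* translation moved the guest   */
         orig_gsp->guest_CIA = cia;     /* state pointer: record CIA and */
         return (UWord)gsp;             /* let the scheduler sort it out */
      }
      gsp->guest_CIA = cia;             /* keep %CIA up to date          */
      if (--dispatch_ctr == 0) {
         dispatch_ctr++;                /* back out the bdz decrement    */
         return VG_TRC_INNER_COUNTERZERO;
      }
      host = fast_lookup(cia);
      if (host == NULL) {
         dispatch_ctr++;                /* back out the bdz decrement    */
         return VG_TRC_INNER_FASTMISS;
      }
      cia = run_translation(host);      /* blrl; next guest address comes
                                           back in r3, a changed gsp in r31 */
   }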
/*----------------------------------------------------*/
/*--- exit points ---*/
/*----------------------------------------------------*/
gsp_changed:
/* Someone messed with the gsp (in r31). Have to
defer to scheduler to resolve this. dispatch ctr
is not yet decremented, so no need to increment. */
/* %CIA is NOT up to date here. First, need to write
%r3 back to %CIA, but without trashing %r31 since
that holds the value we want to return to the scheduler.
Hence use %r5 transiently for the guest state pointer. */
lwz 5,44(1) /* original guest_state ptr */
stw 3,OFFSET_ppc32_CIA(5)
mr 3,31 /* r3 = new gsp value */
b run_innerloop_exit
/*NOTREACHED*/
counter_is_zero:
/* %CIA is up to date */
/* back out decrement of the dispatch counter */
mfctr 5
addi 5,5,1
mtctr 5
li 3,VG_TRC_INNER_COUNTERZERO
b run_innerloop_exit
fast_lookup_failed:
/* %CIA is up to date */
/* back out decrement of the dispatch counter */
mfctr 5
addi 5,5,1
mtctr 5
li 3,VG_TRC_INNER_FASTMISS
b run_innerloop_exit
mr 30,3 /* put CIA (=r3) in r30 */
lwz 16,44(1) /* original guest_state ptr */
cmpw 16,31
beq dispatch_boring /* r31 unchanged... */
mr 3,31 /* put return val (=r31) in r3 */
b dispatch_exceptional
/* All exits from the dispatcher go through here.
r3 holds the return value.
@ -301,8 +399,9 @@ run_innerloop_exit:
cmplwi 10,0
beq LafterFP8
/* This check avoidance may be removable if stfiwx is implemented. */
#if !defined(ENABLE_INNER)
/* This check avoidance may be removable if stfiwx is
implemented. */
# if !defined(ENABLE_INNER)
/* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
mffs 4 /* fpscr -> fpr */
li 5,48
@ -311,7 +410,7 @@ run_innerloop_exit:
andi. 6,6,0xFF /* mask wanted bits */
cmplwi 6,0x0 /* cmp with zero */
bne invariant_violation /* branch if not zero */
#endif
# endif
LafterFP8:
/* Using r11 - value used again further on, so don't trash! */
@ -445,36 +544,9 @@ LafterVMX9:
addi 1,1,496 /* stack_size */
blr
/* Other ways of getting out of the inner loop. Placed out-of-line to
make it look cleaner.
*/
dispatch_exceptional:
/* this is jumped to only, not fallen-through from above */
/* save r30 in %CIA and defer to sched */
lwz 16,44(1)
stw 30,OFFSET_ppc32_CIA(16)
b run_innerloop_exit
fast_lookup_failed:
/* %CIA is up to date here since dispatch_boring dominates */
mfctr 17
addi 17,17,1
mtctr 17
li 3,VG_TRC_INNER_FASTMISS
b run_innerloop_exit
counter_is_zero:
/* %CIA is up to date here since dispatch_boring dominates */
mfctr 17
addi 17,17,1
mtctr 17
li 3,VG_TRC_INNER_COUNTERZERO
b run_innerloop_exit
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
##--------------------------------------------------------------------##
##--- end ---##
##--------------------------------------------------------------------##
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/

View File

@ -14,11 +14,12 @@ Post 3.1.0:
- Nick improved vg_SP_update_pass() to identify more small constant
increments/decrements of SP, so the fast cases can be used more often.
Saved 1--3% on a few programs.
- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
jumps instead of call/return for calling translations, and also removed
the --profile-flags profiling from the dispatcher unless --profile-flags
is being used. Improved Nulgrind performance typically by 10--20%,
and Memcheck performance typically by 2--20%.
- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
AMD64 use jumps instead of call/return for calling translations.
Also, on x86, amd64 and ppc32, --profile-flags style profiling was
removed from the dispatch loop unless --profile-flags is being used.
Improved Nulgrind performance typically by 10--20%, and Memcheck
performance typically by 2--20%.
COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most