From 02a7e5b5d0ef37979fd57d62654ff2f40b854511 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Thu, 15 Dec 2005 21:40:34 +0000
Subject: [PATCH] Rewrite ppc32 dispatch loop to avoid profiling overhead, as
 per today's x86 and amd64 rewrites.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5352
---
 coregrind/m_dispatch/dispatch-ppc32-linux.S | 344 ++++++++++++--------
 docs/internals/performance.txt              |  11 +-
 2 files changed, 214 insertions(+), 141 deletions(-)

diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S
index 2220daa22..cd53ab53d 100644
--- a/coregrind/m_dispatch/dispatch-ppc32-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S
@@ -1,8 +1,8 @@
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address.      ---##
-##---                                            dispatch-ppc32.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address.      ---*/
+/*---                                            dispatch-ppc32.S ---*/
+/*--------------------------------------------------------------------*/
 
 /*
    This file is part of Valgrind, a dynamic binary instrumentation
@@ -38,12 +38,20 @@
 /*--- The dispatch loop.                                   ---*/
 /*------------------------------------------------------------*/
 
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up)                 ---*/
+/*----------------------------------------------------*/
 
-        .globl VG_(run_innerloop)
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
+.globl VG_(run_innerloop)
 VG_(run_innerloop):
-        /* ----- entry point to VG_(run_innerloop) ----- */
+        /* r3 holds guest_state */
+        /* r4 holds do_profiling */
+        /* ----- entry point to VG_(run_innerloop) ----- */
 
         /* For Linux/ppc32 we need the SysV ABI, which uses
            LR->4(parent_sp), CR->anywhere.
           (The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@@ -58,10 +66,10 @@ VG_(run_innerloop):
         stwu    1,-496(1)  /* sp should maintain 16-byte alignment */
 
         /* Save callee-saved registers... */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_FP)@ha
-        lwz     4,VG_(machine_ppc32_has_FP)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here, so use r5 */
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP1
 
         /* Floating-point reg save area : 144 bytes */
@@ -111,43 +119,43 @@ LafterFP1:
         /* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
            The Linux kernel might not actually use VRSAVE for its
            intended purpose, but it should be harmless to preserve anyway.
         */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_VMX)@ha
-        lwz     4,VG_(machine_ppc32_has_VMX)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here (guest state ptr), so use r5 */
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX1
 
         /* VRSAVE save word : 32 bytes */
-        mfspr   4,256      /* vrsave reg is spr number 256 */
-        stw     4,244(1)
+        mfspr   5,256      /* vrsave reg is spr number 256 */
+        stw     5,244(1)
 
         /* Alignment padding : 4 bytes */
 
         /* Vector reg save area (quadword aligned) : 192 bytes */
-        li      4,224
-        stvx    31,4,1
-        li      4,208
-        stvx    30,4,1
-        li      4,192
-        stvx    29,4,1
-        li      4,176
-        stvx    28,4,1
-        li      4,160
-        stvx    27,4,1
-        li      4,144
-        stvx    26,4,1
-        li      4,128
-        stvx    25,4,1
-        li      4,112
-        stvx    24,4,1
-        li      4,96
-        stvx    23,4,1
-        li      4,80
-        stvx    22,4,1
-        li      4,64
-        stvx    21,4,1
-        li      4,48
-        stvx    20,4,1
+        li      5,224
+        stvx    31,5,1
+        li      5,208
+        stvx    30,5,1
+        li      5,192
+        stvx    29,5,1
+        li      5,176
+        stvx    28,5,1
+        li      5,160
+        stvx    27,5,1
+        li      5,144
+        stvx    26,5,1
+        li      5,128
+        stvx    25,5,1
+        li      5,112
+        stvx    24,5,1
+        li      5,96
+        stvx    23,5,1
+        li      5,80
+        stvx    22,5,1
+        li      5,64
+        stvx    21,5,1
+        li      5,48
+        stvx    20,5,1
 LafterVMX1:
 
         /* Save cr */
@@ -159,8 +167,9 @@ LafterVMX1:
         /* 32(sp) used later to check FPSCR[RM] */
 
         /* r3 holds guest_state */
-        mr      31,3
-        stw     3,28(1)    /* spill orig guest_state ptr */
+        /* r4 holds do_profiling */
+        mr      31,3       /* r31 (generated code gsp) = r3 */
+        stw     3,28(1)    /* spill orig guest_state ptr */
 
         /* 24(sp) used later to stop ctr reg being clobbered */
         /* 20(sp) used later to load fpscr with zero */
@@ -171,40 +180,37 @@ LafterVMX1:
            0(sp) : back-chain
         */
 
-// CAB TODO: Use a caller-saved reg for orig guest_state ptr
-// - rem to set non-allocateable in isel.c
+        /* CAB TODO: Use a caller-saved reg for orig guest_state ptr
+           - rem to set non-allocateable in isel.c */
 
         /* hold dispatch_ctr in ctr reg */
-        lis     17,VG_(dispatch_ctr)@ha
-        lwz     17,VG_(dispatch_ctr)@l(17)
-        mtctr   17
-
-        /* fetch %CIA into r30 */
-        lwz     30,OFFSET_ppc32_CIA(31)
+        lis     5,VG_(dispatch_ctr)@ha
+        lwz     5,VG_(dispatch_ctr)@l(5)
+        mtctr   5
 
         /* set host FPU control word to the default mode expected
            by VEX-generated code.  See comments in libvex.h for more info. */
-        lis     3,VG_(machine_ppc32_has_FP)@ha
-        lwz     3,VG_(machine_ppc32_has_FP)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP2
 
-        /* get zero into f3 (tedious) */
-        /* note: fsub 3,3,3 is not a reliable way to do this,
-           since if f3 holds a NaN or similar then we don't necessarily
-           wind up with zero. */
-        li      3,0
-        stw     3,20(1)
+        /* get zero into f3 (tedious) */
+        /* note: fsub 3,3,3 is not a reliable way to do this,
+           since if f3 holds a NaN or similar then we don't necessarily
+           wind up with zero. */
+        li      5,0
+        stw     5,20(1)
         lfs     3,20(1)
         mtfsf   0xFF,3   /* fpscr = f3 */
 LafterFP2:
 
         /* set host AltiVec control word to the default mode expected
           by VEX-generated code.
         */
-        lis     3,VG_(machine_ppc32_has_VMX)@ha
-        lwz     3,VG_(machine_ppc32_has_VMX)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX2
 
         vspltisw 3,0x0   /* generate zero */
@@ -214,36 +220,108 @@ LafterVMX2:
         /* make a stack frame for the code we are calling */
         stwu    1,-16(1)
 
-        /* fall into main loop */
+        /* fetch %CIA into r3 */
+        lwz     3,OFFSET_ppc32_CIA(31)
 
-/* Live regs:
-   r1  (=sp)
-   r30 (=CIA = jump address)
-   r31 (=guest_state)
-   ctr (=dispatch_ctr)
-   Stack state:
-   44(r1) (=orig guest_state)
-*/
+        /* fall into main loop (the right one) */
+        /* r4 = do_profiling.  It's probably trashed after here,
+           but that's OK: we don't need it after here. */
+        cmplwi  4,0
+        beq     VG_(run_innerloop__dispatch_unprofiled)
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher           ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+        /* At entry: Live regs:
+           r1  (=sp)
+           r3  (=CIA = next guest address)
+           r31 (=guest_state)
+           ctr (=dispatch_ctr)
+           Stack state:
+           44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
 
-dispatch_boring:
         /* save the jump address in the guest state */
-        stw     30,OFFSET_ppc32_CIA(31)
+        stw     3,OFFSET_ppc32_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
         bdz     counter_is_zero   /* decrements ctr reg */
 
         /* try a fast lookup in the translation cache */
         /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
-        rlwinm  4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
-// CAB: use a caller-saved reg for this ?
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
         addis   5,4,VG_(tt_fast)@ha
         lwz     5,VG_(tt_fast)@l(5)
         lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
-        cmpw    30,6
+        cmpw    3,6
+        bne     fast_lookup_failed
+
+        /* Found a match.  Call tce[1], which is 8 bytes along, since
+           each tce element is a 64-bit int. */
+        addi    8,5,8
+        mtlr    8
+
+        /* stop ctr being clobbered */
+        mfctr   5
+        stw     5,40(1)   /* => 40-16 = 24(1) on our parent stack */
+
+        /* run the translation */
+        blrl
+
+        /* reinstate clobbered ctr */
+        lwz     5,40(1)
+        mtctr   5
+
+        /* start over */
+        b       VG_(run_innerloop__dispatch_unprofiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower)    ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+        /* At entry: Live regs:
+           r1  (=sp)
+           r3  (=CIA = next guest address)
+           r31 (=guest_state)
+           ctr (=dispatch_ctr)
+           Stack state:
+           44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
+
+        /* save the jump address in the guest state */
+        stw     3,OFFSET_ppc32_CIA(31)
+
+        /* Are we out of timeslice?  If yes, defer to scheduler. */
+        bdz     counter_is_zero   /* decrements ctr reg */
+
+        /* try a fast lookup in the translation cache */
+        /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+        addis   5,4,VG_(tt_fast)@ha
+        lwz     5,VG_(tt_fast)@l(5)
+        lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
+        cmpw    3,6
         bne     fast_lookup_failed
 
         /* increment bb profile counter */
-// CAB: use a caller-saved reg for this ?
         addis   6,4,VG_(tt_fastN)@ha
         lwz     7,VG_(tt_fastN)@l(6)
         lwz     8,0(7)
         addi    8,8,1
         stw     8,0(7)
@@ -256,37 +334,57 @@ dispatch_boring:
         mtlr    8
 
         /* stop ctr being clobbered */
-// CAB: use a caller-saved reg for this ?
-//      but then (bdz) => (decr, cmp, bc)... still better than a stw?
-        mfctr   9
-        stw     9,40(1)   /* => 40-16 = 24(1) on our parent stack */
+        mfctr   5
+        stw     5,40(1)   /* => 40-16 = 24(1) on our parent stack */
 
+        /* run the translation */
         blrl
-
-        /* On return from guest code:
-           r3 holds destination (original) address.
-
-           r31 may be unchanged (guest_state), or may indicate further
-           details of the control transfer requested to *r3.
-
-           If r31 is unchanged (== 44(r1)), just jump next to r3.
-
-           Otherwise fall out, back to the scheduler, and let it
-           figure out what to do next.
-        */
 
         /* reinstate clobbered ctr */
-        lwz     9,40(1)
-        mtctr   9
+        lwz     5,40(1)
+        mtctr   5
+
+        /* start over */
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points                                  ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+        /* Someone messed with the gsp (in r31).  Have to
+           defer to scheduler to resolve this.  dispatch ctr
+           is not yet decremented, so no need to increment. */
+        /* %CIA is NOT up to date here.  First, need to write
+           %r3 back to %CIA, but without trashing %r31 since
+           that holds the value we want to return to the scheduler.
+           Hence use %r5 transiently for the guest state pointer. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        stw     3,OFFSET_ppc32_CIA(5)
+        mr      3,31           /* r3 = new gsp value */
+        b       run_innerloop_exit
+        /*NOTREACHED*/
+
+counter_is_zero:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_COUNTERZERO
+        b       run_innerloop_exit
+
+fast_lookup_failed:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_FASTMISS
+        b       run_innerloop_exit
 
-        mr      30,3              /* put CIA (=r3) in r30 */
-        lwz     16,44(1)          /* original guest_state ptr */
-        cmpw    16,31
-        beq     dispatch_boring   /* r31 unchanged... */
-        mr      3,31              /* put return val (=r31) in r3 */
-        b       dispatch_exceptional
 
 /* All exits from the dispatcher go through here.
    r3 holds the return value.
@@ -301,8 +399,9 @@ run_innerloop_exit:
         cmplwi  10,0
         beq     LafterFP8
 
-/* This check avoidance may be removable if stfiwx is implemented. */
-#if !defined(ENABLE_INNER)
+        /* This check avoidance may be removable if stfiwx is
+           implemented. */
+# if !defined(ENABLE_INNER)
         /* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
         mffs    4                    /* fpscr -> fpr */
         li      5,48
@@ -311,7 +410,7 @@ run_innerloop_exit:
         andi.   6,6,0xFF             /* mask wanted bits */
         cmplwi  6,0x0                /* cmp with zero */
         bne     invariant_violation  /* branch if not zero */
-#endif
+# endif
 LafterFP8:
 
         /* Using r11 - value used again further on, so don't trash! */
@@ -445,36 +544,9 @@ LafterVMX9:
         addi    1,1,496   /* stack_size */
         blr
-
-/* Other ways of getting out of the inner loop.  Placed out-of-line to
-   make it look cleaner.
-*/ -dispatch_exceptional: - /* this is jumped to only, not fallen-through from above */ - /* save r30 in %CIA and defer to sched */ - lwz 16,44(1) - stw 30,OFFSET_ppc32_CIA(16) - b run_innerloop_exit - -fast_lookup_failed: - /* %CIA is up to date here since dispatch_boring dominates */ - mfctr 17 - addi 17,17,1 - mtctr 17 - li 3,VG_TRC_INNER_FASTMISS - b run_innerloop_exit - -counter_is_zero: - /* %CIA is up to date here since dispatch_boring dominates */ - mfctr 17 - addi 17,17,1 - mtctr 17 - li 3,VG_TRC_INNER_COUNTERZERO - b run_innerloop_exit - /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits -##--------------------------------------------------------------------## -##--- end ---## -##--------------------------------------------------------------------## +/*--------------------------------------------------------------------*/ +/*--- end ---*/ +/*--------------------------------------------------------------------*/ diff --git a/docs/internals/performance.txt b/docs/internals/performance.txt index dcf122507..5665c61f2 100644 --- a/docs/internals/performance.txt +++ b/docs/internals/performance.txt @@ -14,11 +14,12 @@ Post 3.1.0: - Nick improved vg_SP_update_pass() to identify more small constant increments/decrements of SP, so the fast cases can be used more often. Saved 1--3% on a few programs. -- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use - jumps instead of call/return for calling translations, and also removed - the --profile-flags profiling from the dispatcher unless --profile-flags - is being used. Improved Nulgrind performance typically by 10--20%, - and Memcheck performance typically by 2--20%. +- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and + AMD64 use jumps instead of call/return for calling translations. + Also, on x86, amd64 and ppc32, --profile-flags style profiling was + removed from the despatch loop unless --profile-flags is being used. + Improved Nulgrind performance typically by 10--20%, and Memcheck + performance typically by 2--20%. COMPVBITS branch: - Nick converted to compress V bits, initial version saved 0--5% on most