Rewrite ppc32 dispatch loop to avoid profiling overhead, as per today's x86 and amd64 rewrites.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5352
Julian Seward 2005-12-15 21:40:34 +00:00
parent a75ddd7aaa
commit 02a7e5b5d0
2 changed files with 214 additions and 141 deletions

View File

@ -1,8 +1,8 @@
##--------------------------------------------------------------------##
##--- The core dispatch loop, for jumping to a code address. ---##
##--- dispatch-ppc32.S ---##
##--------------------------------------------------------------------##
/*--------------------------------------------------------------------*/
/*--- The core dispatch loop, for jumping to a code address. ---*/
/*--- dispatch-ppc32.S ---*/
/*--------------------------------------------------------------------*/
/*
This file is part of Valgrind, a dynamic binary instrumentation
@ -38,12 +38,20 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
/*----------------------------------------------------*/
/*--- Preamble (set everything up) ---*/
/*----------------------------------------------------*/
.globl VG_(run_innerloop)
/* signature:
UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
*/
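The second argument is new: it selects one of the two dispatch loops defined below, so the profiling decision is made once per call instead of once per loop iteration. A minimal C sketch of how a caller might use it; the run_thread_once wrapper and the VG_(clo_profile_flags) test are illustrative assumptions, not the exact scheduler code:

   /* Sketch only: UWord, Int and the VG_() name mangling come from the
      Valgrind headers; VG_(clo_profile_flags) is assumed to be the
      variable behind --profile-flags. */
   extern Int   VG_(clo_profile_flags);
   extern UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );

   static UWord run_thread_once ( void* guest_state )
   {
      /* Decide once, outside the hot loop, whether profiling is wanted. */
      UWord do_profiling = (VG_(clo_profile_flags) > 0) ? 1 : 0;
      return VG_(run_innerloop)( guest_state, do_profiling );
   }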
.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* ----- entry point to VG_(run_innerloop) ----- */
/* r3 holds guest_state */
/* r4 holds do_profiling */
/* ----- entry point to VG_(run_innerloop) ----- */
/* For Linux/ppc32 we need the SysV ABI, which uses
LR->4(parent_sp), CR->anywhere.
(The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@ -58,10 +66,10 @@ VG_(run_innerloop):
stwu 1,-496(1) /* sp should maintain 16-byte alignment */
/* Save callee-saved registers... */
/* r3 is live here (guest state ptr), so use r4 */
lis 4,VG_(machine_ppc32_has_FP)@ha
lwz 4,VG_(machine_ppc32_has_FP)@l(4)
cmplwi 4,0
/* r3, r4 are live here, so use r5 */
lis 5,VG_(machine_ppc32_has_FP)@ha
lwz 5,VG_(machine_ppc32_has_FP)@l(5)
cmplwi 5,0
beq LafterFP1
/* Floating-point reg save area : 144 bytes */
@ -111,43 +119,43 @@ LafterFP1:
/* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
The Linux kernel might not actually use VRSAVE for its intended
purpose, but it should be harmless to preserve anyway. */
/* r3 is live here (guest state ptr), so use r4 */
lis 4,VG_(machine_ppc32_has_VMX)@ha
lwz 4,VG_(machine_ppc32_has_VMX)@l(4)
cmplwi 4,0
/* r3, r4 are live here (guest state ptr), so use r5 */
lis 5,VG_(machine_ppc32_has_VMX)@ha
lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
cmplwi 5,0
beq LafterVMX1
/* VRSAVE save word : 32 bytes */
mfspr 4,256 /* vrsave reg is spr number 256 */
stw 4,244(1)
mfspr 5,256 /* vrsave reg is spr number 256 */
stw 5,244(1)
/* Alignment padding : 4 bytes */
/* Vector reg save area (quadword aligned) : 192 bytes */
li 4,224
stvx 31,4,1
li 4,208
stvx 30,4,1
li 4,192
stvx 29,4,1
li 4,176
stvx 28,4,1
li 4,160
stvx 27,4,1
li 4,144
stvx 26,4,1
li 4,128
stvx 25,4,1
li 4,112
stvx 24,4,1
li 4,96
stvx 23,4,1
li 4,80
stvx 22,4,1
li 4,64
stvx 21,4,1
li 4,48
stvx 20,4,1
li 5,224
stvx 31,5,1
li 5,208
stvx 30,5,1
li 5,192
stvx 29,5,1
li 5,176
stvx 28,5,1
li 5,160
stvx 27,5,1
li 5,144
stvx 26,5,1
li 5,128
stvx 25,5,1
li 5,112
stvx 24,5,1
li 5,96
stvx 23,5,1
li 5,80
stvx 22,5,1
li 5,64
stvx 21,5,1
li 5,48
stvx 20,5,1
LafterVMX1:
/* Save cr */
@ -159,8 +167,9 @@ LafterVMX1:
/* 32(sp) used later to check FPSCR[RM] */
/* r3 holds guest_state */
mr 31,3
stw 3,28(1) /* spill orig guest_state ptr */
/* r4 holds do_profiling */
mr 31,3 /* r31 (generated code gsp) = r3 */
stw 3,28(1) /* spill orig guest_state ptr */
/* 24(sp) used later to stop ctr reg being clobbered */
/* 20(sp) used later to load fpscr with zero */
@ -171,40 +180,37 @@ LafterVMX1:
0(sp) : back-chain
*/
// CAB TODO: Use a caller-saved reg for orig guest_state ptr
// - rem to set non-allocateable in isel.c
/* CAB TODO: Use a caller-saved reg for orig guest_state ptr
- rem to set non-allocateable in isel.c */
/* hold dispatch_ctr in ctr reg */
lis 17,VG_(dispatch_ctr)@ha
lwz 17,VG_(dispatch_ctr)@l(17)
mtctr 17
/* fetch %CIA into r30 */
lwz 30,OFFSET_ppc32_CIA(31)
lis 5,VG_(dispatch_ctr)@ha
lwz 5,VG_(dispatch_ctr)@l(5)
mtctr 5
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
more info. */
lis 3,VG_(machine_ppc32_has_FP)@ha
lwz 3,VG_(machine_ppc32_has_FP)@l(3)
cmplwi 3,0
lis 5,VG_(machine_ppc32_has_FP)@ha
lwz 5,VG_(machine_ppc32_has_FP)@l(5)
cmplwi 5,0
beq LafterFP2
/* get zero into f3 (tedious) */
/* note: fsub 3,3,3 is not a reliable way to do this,
since if f3 holds a NaN or similar then we don't necessarily
wind up with zero. */
li 3,0
stw 3,20(1)
/* get zero into f3 (tedious) */
/* note: fsub 3,3,3 is not a reliable way to do this,
since if f3 holds a NaN or similar then we don't necessarily
wind up with zero. */
li 5,0
stw 5,20(1)
lfs 3,20(1)
mtfsf 0xFF,3 /* fpscr = f3 */
LafterFP2:
/* set host AltiVec control word to the default mode expected
by VEX-generated code. */
lis 3,VG_(machine_ppc32_has_VMX)@ha
lwz 3,VG_(machine_ppc32_has_VMX)@l(3)
cmplwi 3,0
lis 5,VG_(machine_ppc32_has_VMX)@ha
lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
cmplwi 5,0
beq LafterVMX2
vspltisw 3,0x0 /* generate zero */
@ -214,36 +220,108 @@ LafterVMX2:
/* make a stack frame for the code we are calling */
stwu 1,-16(1)
/* fall into main loop */
/* fetch %CIA into r3 */
lwz 3,OFFSET_ppc32_CIA(31)
/* Live regs:
r1 (=sp)
r30 (=CIA = jump address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* fall into main loop (the right one) */
/* r4 = do_profiling. It's probably trashed after here,
but that's OK: we don't need it after here. */
cmplwi 4,0
beq VG_(run_innerloop__dispatch_unprofiled)
b VG_(run_innerloop__dispatch_profiled)
/*NOTREACHED*/
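This branch is the point of the rewrite: rather than testing a profiling flag on every iteration, the loop is duplicated and the choice is made once on entry. As C-like pseudocode, with the two function names purely illustrative:

   if (do_profiling)
      return dispatch_profiled(guest_state);    /* bumps per-bb counters  */
   else
      return dispatch_unprofiled(guest_state);  /* same loop, no counters */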
/*----------------------------------------------------*/
/*--- NO-PROFILING (standard) dispatcher ---*/
/*----------------------------------------------------*/
.global VG_(run_innerloop__dispatch_unprofiled)
VG_(run_innerloop__dispatch_unprofiled):
/* At entry: Live regs:
r1 (=sp)
r3 (=CIA = next guest address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* Has the guest state pointer been messed with? If yes, exit. */
lwz 5,44(1) /* original guest_state ptr */
cmpw 5,31
bne gsp_changed
dispatch_boring:
/* save the jump address in the guest state */
stw 30,OFFSET_ppc32_CIA(31)
stw 3,OFFSET_ppc32_CIA(31)
/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */
/* try a fast lookup in the translation cache */
/* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
rlwinm 4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
// CAB: use a caller-saved reg for this ?
rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
cmpw 30,6
cmpw 3,6
bne fast_lookup_failed
/* Found a match. Call tce[1], which is 8 bytes along, since
each tce element is a 64-bit int. */
addi 8,5,8
mtlr 8
/* stop ctr being clobbered */
mfctr 5
stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */
/* run the translation */
blrl
/* reinstate clobbered ctr */
lwz 5,40(1)
mtctr 5
/* start over */
b VG_(run_innerloop__dispatch_unprofiled)
/*NOTREACHED*/
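The rlwinm/lwz/cmpw sequence in the loop above is a single-probe, direct-mapped lookup in VG_(tt_fast). A C sketch of what it computes; the declarations of VG_(tt_fast) and the translation-cache entry are approximations inferred from the assembly (entries are 64-bit words, element 0 holding the guest address, element 1 onwards being the translated code), and VG_TT_FAST_MASK is the mask already used above:

   typedef unsigned int       UInt;    /* ppc32 guest word            */
   typedef unsigned long long ULong;   /* one 64-bit tt entry element */

   extern ULong* VG_(tt_fast)[];       /* VG_TT_FAST_MASK + 1 entries */

   /* Returns the host address to jump to for guest address 'cia', or
      NULL on a fast-cache miss, in which case the dispatcher exits
      with VG_TRC_INNER_FASTMISS and lets the scheduler do a full
      lookup. */
   static void* fast_lookup ( UInt cia )
   {
      ULong* tce = VG_(tt_fast)[ cia & VG_TT_FAST_MASK ];
      /* big-endian: the low 32 bits of tce[0] sit at byte offset 4,
         which is what the 'lwz 6,4(5)' above compares against. */
      if ( (UInt)tce[0] != cia )
         return 0;
      return (void*)&tce[1];           /* the code starts 8 bytes in  */
   }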
/*----------------------------------------------------*/
/*--- PROFILING dispatcher (can be much slower) ---*/
/*----------------------------------------------------*/
.global VG_(run_innerloop__dispatch_profiled)
VG_(run_innerloop__dispatch_profiled):
/* At entry: Live regs:
r1 (=sp)
r3 (=CIA = next guest address)
r31 (=guest_state)
ctr (=dispatch_ctr)
Stack state:
44(r1) (=orig guest_state)
*/
/* Has the guest state pointer been messed with? If yes, exit. */
lwz 5,44(1) /* original guest_state ptr */
cmpw 5,31
bne gsp_changed
/* save the jump address in the guest state */
stw 3,OFFSET_ppc32_CIA(31)
/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */
/* try a fast lookup in the translation cache */
/* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
cmpw 3,6
bne fast_lookup_failed
/* increment bb profile counter */
// CAB: use a caller-saved reg for this ?
addis 6,4,VG_(tt_fastN)@ha
lwz 7,VG_(tt_fastN)@l(6)
lwz 8,0(7)
@ -256,37 +334,57 @@ dispatch_boring:
mtlr 8
/* stop ctr being clobbered */
// CAB: use a caller-saved reg for this ?
// but then (bdz) => (decr, cmp, bc)... still better than a stw?
mfctr 9
stw 9,40(1) /* => 40-16 = 24(1) on our parent stack */
mfctr 5
stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */
/* run the translation */
blrl
/* On return from guest code:
r3 holds destination (original) address.
r31 may be unchanged (guest_state), or may indicate further
details of the control transfer requested to *r3.
If r31 is unchanged (== 44(r1)), just jump next to r3.
Otherwise fall out, back to the scheduler, and let it
figure out what to do next.
*/
/* reinstate clobbered ctr */
lwz 9,40(1)
mtctr 9
lwz 5,40(1)
mtctr 5
/* start over */
b VG_(run_innerloop__dispatch_profiled)
/*NOTREACHED*/
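Taken together with the exit paths below, the contract described in the "On return from guest code" comment gives each dispatcher the following shape, shown as C-like pseudocode (a sketch only: fast_lookup is the lookup sketched earlier, and run_translation stands for the blrl into the translation):

   for (;;) {
      if (gsp != orig_gsp) {            /* translation moved the guest   */
         orig_gsp->guest_CIA = cia;     /* state pointer: record CIA and */
         return (UWord)gsp;             /* let the scheduler sort it out */
      }
      gsp->guest_CIA = cia;             /* keep %CIA up to date          */
      if (--dispatch_ctr == 0) {
         dispatch_ctr++;                /* back out the bdz decrement    */
         return VG_TRC_INNER_COUNTERZERO;
      }
      host = fast_lookup(cia);
      if (host == NULL) {
         dispatch_ctr++;                /* back out the bdz decrement    */
         return VG_TRC_INNER_FASTMISS;
      }
      cia = run_translation(host);      /* blrl; next guest address comes
                                           back in r3, a changed gsp in r31 */
   }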
/*----------------------------------------------------*/
/*--- exit points ---*/
/*----------------------------------------------------*/
gsp_changed:
/* Someone messed with the gsp (in r31). Have to
defer to scheduler to resolve this. dispatch ctr
is not yet decremented, so no need to increment. */
/* %CIA is NOT up to date here. First, need to write
%r3 back to %CIA, but without trashing %r31 since
that holds the value we want to return to the scheduler.
Hence use %r5 transiently for the guest state pointer. */
lwz 5,44(1) /* original guest_state ptr */
stw 3,OFFSET_ppc32_CIA(5)
mr 3,31 /* r3 = new gsp value */
b run_innerloop_exit
/*NOTREACHED*/
counter_is_zero:
/* %CIA is up to date */
/* back out decrement of the dispatch counter */
mfctr 5
addi 5,5,1
mtctr 5
li 3,VG_TRC_INNER_COUNTERZERO
b run_innerloop_exit
fast_lookup_failed:
/* %CIA is up to date */
/* back out decrement of the dispatch counter */
mfctr 5
addi 5,5,1
mtctr 5
li 3,VG_TRC_INNER_FASTMISS
b run_innerloop_exit
mr 30,3 /* put CIA (=r3) in r30 */
lwz 16,44(1) /* original guest_state ptr */
cmpw 16,31
beq dispatch_boring /* r31 unchanged... */
mr 3,31 /* put return val (=r31) in r3 */
b dispatch_exceptional
/* All exits from the dispatcher go through here.
r3 holds the return value.
@ -301,8 +399,9 @@ run_innerloop_exit:
cmplwi 10,0
beq LafterFP8
/* This check avoidance may be removable if stfiwx is implemented. */
#if !defined(ENABLE_INNER)
/* This check avoidance may be removable if stfiwx is
implemented. */
# if !defined(ENABLE_INNER)
/* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
mffs 4 /* fpscr -> fpr */
li 5,48
@ -311,7 +410,7 @@ run_innerloop_exit:
andi. 6,6,0xFF /* mask wanted bits */
cmplwi 6,0x0 /* cmp with zero */
bne invariant_violation /* branch if not zero */
#endif
# endif
LafterFP8:
/* Using r11 - value used again further on, so don't trash! */
@ -445,36 +544,9 @@ LafterVMX9:
addi 1,1,496 /* stack_size */
blr
/* Other ways of getting out of the inner loop. Placed out-of-line to
make it look cleaner.
*/
dispatch_exceptional:
/* this is jumped to only, not fallen-through from above */
/* save r30 in %CIA and defer to sched */
lwz 16,44(1)
stw 30,OFFSET_ppc32_CIA(16)
b run_innerloop_exit
fast_lookup_failed:
/* %CIA is up to date here since dispatch_boring dominates */
mfctr 17
addi 17,17,1
mtctr 17
li 3,VG_TRC_INNER_FASTMISS
b run_innerloop_exit
counter_is_zero:
/* %CIA is up to date here since dispatch_boring dominates */
mfctr 17
addi 17,17,1
mtctr 17
li 3,VG_TRC_INNER_COUNTERZERO
b run_innerloop_exit
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
##--------------------------------------------------------------------##
##--- end ---##
##--------------------------------------------------------------------##
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/

View File

@ -14,11 +14,12 @@ Post 3.1.0:
- Nick improved vg_SP_update_pass() to identify more small constant
increments/decrements of SP, so the fast cases can be used more often.
Saved 1--3% on a few programs.
- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
jumps instead of call/return for calling translations, and also removed
the --profile-flags profiling from the dispatcher unless --profile-flags
is being used. Improved Nulgrind performance typically by 10--20%,
and Memcheck performance typically by 2--20%.
- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
AMD64 use jumps instead of call/return for calling translations.
Also, on x86, amd64 and ppc32, --profile-flags style profiling was
removed from the dispatch loop unless --profile-flags is being used.
Improved Nulgrind performance typically by 10--20%, and Memcheck
performance typically by 2--20%.
COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most