From 02a7e5b5d0ef37979fd57d62654ff2f40b854511 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Thu, 15 Dec 2005 21:40:34 +0000
Subject: [PATCH] Rewrite ppc32 dispatch loop to avoid profiling overhead, as
 per today's x86 and amd64 rewrites.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5352
---
 coregrind/m_dispatch/dispatch-ppc32-linux.S | 344 ++++++++++++--------
 docs/internals/performance.txt              |  11 +-
 2 files changed, 214 insertions(+), 141 deletions(-)

diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S
index 2220daa22..cd53ab53d 100644
--- a/coregrind/m_dispatch/dispatch-ppc32-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S
@@ -1,8 +1,8 @@
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address.      ---##
-##---                                            dispatch-ppc32.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address.      ---*/
+/*---                                            dispatch-ppc32.S ---*/
+/*--------------------------------------------------------------------*/
 
 /*
    This file is part of Valgrind, a dynamic binary instrumentation
@@ -38,12 +38,20 @@
 /*--- The dispatch loop.                                   ---*/
 /*------------------------------------------------------------*/
 
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up)                 ---*/
+/*----------------------------------------------------*/
 
-        .globl VG_(run_innerloop)
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
+.globl VG_(run_innerloop)
 VG_(run_innerloop):
-        /* ----- entry point to VG_(run_innerloop) ----- */
+        /* r3 holds guest_state */
+        /* r4 holds do_profiling */
+        /* ----- entry point to VG_(run_innerloop) ----- */
 
         /* For Linux/ppc32 we need the SysV ABI, which uses
            LR->4(parent_sp), CR->anywhere.
           (The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@@ -58,10 +66,10 @@ VG_(run_innerloop):
         stwu    1,-496(1)  /* sp should maintain 16-byte alignment */
 
         /* Save callee-saved registers... */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_FP)@ha
-        lwz     4,VG_(machine_ppc32_has_FP)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here, so use r5 */
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP1
 
         /* Floating-point reg save area : 144 bytes */
@@ -111,43 +119,43 @@ LafterFP1:
         /* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
            The Linux kernel might not actually use VRSAVE for its
            intended purpose, but it should be harmless to preserve anyway.
         */
-        /* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc32_has_VMX)@ha
-        lwz     4,VG_(machine_ppc32_has_VMX)@l(4)
-        cmplwi  4,0
+        /* r3, r4 are live here (guest state ptr), so use r5 */
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX1
 
         /* VRSAVE save word : 32 bytes */
-        mfspr   4,256      /* vrsave reg is spr number 256 */
-        stw     4,244(1)
+        mfspr   5,256      /* vrsave reg is spr number 256 */
+        stw     5,244(1)
 
         /* Alignment padding : 4 bytes */
 
         /* Vector reg save area (quadword aligned) : 192 bytes */
-        li      4,224
-        stvx    31,4,1
-        li      4,208
-        stvx    30,4,1
-        li      4,192
-        stvx    29,4,1
-        li      4,176
-        stvx    28,4,1
-        li      4,160
-        stvx    27,4,1
-        li      4,144
-        stvx    26,4,1
-        li      4,128
-        stvx    25,4,1
-        li      4,112
-        stvx    24,4,1
-        li      4,96
-        stvx    23,4,1
-        li      4,80
-        stvx    22,4,1
-        li      4,64
-        stvx    21,4,1
-        li      4,48
-        stvx    20,4,1
+        li      5,224
+        stvx    31,5,1
+        li      5,208
+        stvx    30,5,1
+        li      5,192
+        stvx    29,5,1
+        li      5,176
+        stvx    28,5,1
+        li      5,160
+        stvx    27,5,1
+        li      5,144
+        stvx    26,5,1
+        li      5,128
+        stvx    25,5,1
+        li      5,112
+        stvx    24,5,1
+        li      5,96
+        stvx    23,5,1
+        li      5,80
+        stvx    22,5,1
+        li      5,64
+        stvx    21,5,1
+        li      5,48
+        stvx    20,5,1
 LafterVMX1:
 
         /* Save cr */
@@ -159,8 +167,9 @@ LafterVMX1:
         /* 32(sp) used later to check FPSCR[RM] */
 
         /* r3 holds guest_state */
-        mr      31,3
-        stw     3,28(1)    /* spill orig guest_state ptr */
+        /* r4 holds do_profiling */
+        mr      31,3       /* r31 (generated code gsp) = r3 */
+        stw     3,28(1)    /* spill orig guest_state ptr */
 
         /* 24(sp) used later to stop ctr reg being clobbered */
         /* 20(sp) used later to load fpscr with zero */
@@ -171,40 +180,37 @@ LafterVMX1:
            0(sp) : back-chain
         */
 
-// CAB TODO: Use a caller-saved reg for orig guest_state ptr
-// - rem to set non-allocateable in isel.c
+        /* CAB TODO: Use a caller-saved reg for orig guest_state ptr
+           - rem to set non-allocateable in isel.c */
 
         /* hold dispatch_ctr in ctr reg */
-        lis     17,VG_(dispatch_ctr)@ha
-        lwz     17,VG_(dispatch_ctr)@l(17)
-        mtctr   17
-
-        /* fetch %CIA into r30 */
-        lwz     30,OFFSET_ppc32_CIA(31)
+        lis     5,VG_(dispatch_ctr)@ha
+        lwz     5,VG_(dispatch_ctr)@l(5)
+        mtctr   5
 
         /* set host FPU control word to the default mode expected
            by VEX-generated code.  See comments in libvex.h for more info. */
-        lis     3,VG_(machine_ppc32_has_FP)@ha
-        lwz     3,VG_(machine_ppc32_has_FP)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_FP)@ha
+        lwz     5,VG_(machine_ppc32_has_FP)@l(5)
+        cmplwi  5,0
         beq     LafterFP2
 
-        /* get zero into f3 (tedious) */
-        /* note: fsub 3,3,3 is not a reliable way to do this,
-           since if f3 holds a NaN or similar then we don't necessarily
-           wind up with zero. */
-        li      3,0
-        stw     3,20(1)
+        /* get zero into f3 (tedious) */
+        /* note: fsub 3,3,3 is not a reliable way to do this,
+           since if f3 holds a NaN or similar then we don't necessarily
+           wind up with zero. */
+        li      5,0
+        stw     5,20(1)
         lfs     3,20(1)
         mtfsf   0xFF,3   /* fpscr = f3 */
 LafterFP2:
 
         /* set host AltiVec control word to the default mode expected
           by VEX-generated code.
         */
-        lis     3,VG_(machine_ppc32_has_VMX)@ha
-        lwz     3,VG_(machine_ppc32_has_VMX)@l(3)
-        cmplwi  3,0
+        lis     5,VG_(machine_ppc32_has_VMX)@ha
+        lwz     5,VG_(machine_ppc32_has_VMX)@l(5)
+        cmplwi  5,0
         beq     LafterVMX2
 
         vspltisw 3,0x0   /* generate zero */
@@ -214,36 +220,108 @@ LafterVMX2:
         /* make a stack frame for the code we are calling */
         stwu    1,-16(1)
 
-        /* fall into main loop */
+        /* fetch %CIA into r3 */
+        lwz     3,OFFSET_ppc32_CIA(31)
 
-/* Live regs:
-   r1  (=sp)
-   r30 (=CIA = jump address)
-   r31 (=guest_state)
-   ctr (=dispatch_ctr)
-   Stack state:
-   44(r1) (=orig guest_state)
-*/
+        /* fall into main loop (the right one) */
+        /* r4 = do_profiling.  It's probably trashed after here,
+           but that's OK: we don't need it after here. */
+        cmplwi  4,0
+        beq     VG_(run_innerloop__dispatch_unprofiled)
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher           ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+        /* At entry: Live regs:
+           r1  (=sp)
+           r3  (=CIA = next guest address)
+           r31 (=guest_state)
+           ctr (=dispatch_ctr)
+           Stack state:
+           44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
 
-dispatch_boring:
         /* save the jump address in the guest state */
-        stw     30,OFFSET_ppc32_CIA(31)
+        stw     3,OFFSET_ppc32_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
         bdz     counter_is_zero   /* decrements ctr reg */
 
         /* try a fast lookup in the translation cache */
         /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
-        rlwinm  4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
-// CAB: use a caller-saved reg for this ?
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
         addis   5,4,VG_(tt_fast)@ha
         lwz     5,VG_(tt_fast)@l(5)
         lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
-        cmpw    30,6
+        cmpw    3,6
+        bne     fast_lookup_failed
+
+        /* Found a match.  Call tce[1], which is 8 bytes along, since
+           each tce element is a 64-bit int. */
+        addi    8,5,8
+        mtlr    8
+
+        /* stop ctr being clobbered */
+        mfctr   5
+        stw     5,40(1)   /* => 40-16 = 24(1) on our parent stack */
+
+        /* run the translation */
+        blrl
+
+        /* reinstate clobbered ctr */
+        lwz     5,40(1)
+        mtctr   5
+
+        /* start over */
+        b       VG_(run_innerloop__dispatch_unprofiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower)    ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+        /* At entry: Live regs:
+           r1  (=sp)
+           r3  (=CIA = next guest address)
+           r31 (=guest_state)
+           ctr (=dispatch_ctr)
+           Stack state:
+           44(r1) (=orig guest_state)
+        */
+
+        /* Has the guest state pointer been messed with?  If yes, exit. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        cmpw    5,31
+        bne     gsp_changed
+
+        /* save the jump address in the guest state */
+        stw     3,OFFSET_ppc32_CIA(31)
+
+        /* Are we out of timeslice?  If yes, defer to scheduler. */
+        bdz     counter_is_zero   /* decrements ctr reg */
+
+        /* try a fast lookup in the translation cache */
+        /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
+        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+        addis   5,4,VG_(tt_fast)@ha
+        lwz     5,VG_(tt_fast)@l(5)
+        lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
+        cmpw    3,6
         bne     fast_lookup_failed
 
         /* increment bb profile counter */
-// CAB: use a caller-saved reg for this ?
         addis   6,4,VG_(tt_fastN)@ha
         lwz     7,VG_(tt_fastN)@l(6)
         lwz     8,0(7)
         addi    8,8,1
         stw     8,0(7)
@@ -256,37 +334,57 @@ dispatch_boring:
         mtlr    8
 
         /* stop ctr being clobbered */
-// CAB: use a caller-saved reg for this ?
-//      but then (bdz) => (decr, cmp, bc)... still better than a stw?
-        mfctr   9
-        stw     9,40(1)   /* => 40-16 = 24(1) on our parent stack */
+        mfctr   5
+        stw     5,40(1)   /* => 40-16 = 24(1) on our parent stack */
 
+        /* run the translation */
         blrl
-
-        /* On return from guest code:
-           r3 holds destination (original) address.
-
-           r31 may be unchanged (guest_state), or may indicate further
-           details of the control transfer requested to *r3.
-
-           If r31 is unchanged (== 44(r1)), just jump next to r3.
-
-           Otherwise fall out, back to the scheduler, and let it
-           figure out what to do next.
-        */
 
         /* reinstate clobbered ctr */
-        lwz     9,40(1)
-        mtctr   9
+        lwz     5,40(1)
+        mtctr   5
+
+        /* start over */
+        b       VG_(run_innerloop__dispatch_profiled)
+        /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points                                  ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+        /* Someone messed with the gsp (in r31).  Have to
+           defer to scheduler to resolve this.  dispatch ctr
+           is not yet decremented, so no need to increment. */
+        /* %CIA is NOT up to date here.  First, need to write
+           %r3 back to %CIA, but without trashing %r31 since
+           that holds the value we want to return to the scheduler.
+           Hence use %r5 transiently for the guest state pointer. */
+        lwz     5,44(1)        /* original guest_state ptr */
+        stw     3,OFFSET_ppc32_CIA(5)
+        mr      3,31           /* r3 = new gsp value */
+        b       run_innerloop_exit
+        /*NOTREACHED*/
+
+counter_is_zero:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_COUNTERZERO
+        b       run_innerloop_exit
+
+fast_lookup_failed:
+        /* %CIA is up to date */
+        /* back out decrement of the dispatch counter */
+        mfctr   5
+        addi    5,5,1
+        mtctr   5
+        li      3,VG_TRC_INNER_FASTMISS
+        b       run_innerloop_exit
 
-        mr      30,3              /* put CIA (=r3) in r30 */
-        lwz     16,44(1)          /* original guest_state ptr */
-        cmpw    16,31
-        beq     dispatch_boring   /* r31 unchanged... */
-        mr      3,31              /* put return val (=r31) in r3 */
-        b       dispatch_exceptional
 
 /* All exits from the dispatcher go through here.
    r3 holds the return value.
@@ -301,8 +399,9 @@ run_innerloop_exit:
         cmplwi  10,0
         beq     LafterFP8
 
-/* This check avoidance may be removable if stfiwx is implemented. */
-#if !defined(ENABLE_INNER)
+        /* This check avoidance may be removable if stfiwx is
+           implemented. */
+# if !defined(ENABLE_INNER)
         /* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
         mffs    4                    /* fpscr -> fpr */
         li      5,48
@@ -311,7 +410,7 @@ run_innerloop_exit:
         andi.   6,6,0xFF             /* mask wanted bits */
         cmplwi  6,0x0                /* cmp with zero */
         bne     invariant_violation  /* branch if not zero */
-#endif
+# endif
 LafterFP8:
 
         /* Using r11 - value used again further on, so don't trash! */
@@ -445,36 +544,9 @@ LafterVMX9:
         addi    1,1,496   /* stack_size */
         blr
-
-/* Other ways of getting out of the inner loop.  Placed out-of-line to
-   make it look cleaner.
-*/ -dispatch_exceptional: - /* this is jumped to only, not fallen-through from above */ - /* save r30 in %CIA and defer to sched */ - lwz 16,44(1) - stw 30,OFFSET_ppc32_CIA(16) - b run_innerloop_exit - -fast_lookup_failed: - /* %CIA is up to date here since dispatch_boring dominates */ - mfctr 17 - addi 17,17,1 - mtctr 17 - li 3,VG_TRC_INNER_FASTMISS - b run_innerloop_exit - -counter_is_zero: - /* %CIA is up to date here since dispatch_boring dominates */ - mfctr 17 - addi 17,17,1 - mtctr 17 - li 3,VG_TRC_INNER_COUNTERZERO - b run_innerloop_exit - /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits -##--------------------------------------------------------------------## -##--- end ---## -##--------------------------------------------------------------------## +/*--------------------------------------------------------------------*/ +/*--- end ---*/ +/*--------------------------------------------------------------------*/ diff --git a/docs/internals/performance.txt b/docs/internals/performance.txt index dcf122507..5665c61f2 100644 --- a/docs/internals/performance.txt +++ b/docs/internals/performance.txt @@ -14,11 +14,12 @@ Post 3.1.0: - Nick improved vg_SP_update_pass() to identify more small constant increments/decrements of SP, so the fast cases can be used more often. Saved 1--3% on a few programs. -- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use - jumps instead of call/return for calling translations, and also removed - the --profile-flags profiling from the dispatcher unless --profile-flags - is being used. Improved Nulgrind performance typically by 10--20%, - and Memcheck performance typically by 2--20%. +- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and + AMD64 use jumps instead of call/return for calling translations. + Also, on x86, amd64 and ppc32, --profile-flags style profiling was + removed from the despatch loop unless --profile-flags is being used. + Improved Nulgrind performance typically by 10--20%, and Memcheck + performance typically by 2--20%. COMPVBITS branch: - Nick converted to compress V bits, initial version saved 0--5% on most