mirror of https://github.com/Zenithsiz/ftmemsim-valgrind.git (synced 2026-02-07 04:38:00 +00:00)
[This commit contains an implementation for all targets except amd64-solaris and x86-solaris, which will be completed shortly.]

In the baseline simulator, jumps to guest code addresses that are not known at JIT time have to be looked up in a guest->host mapping table. That means indirect branches, indirect calls and, most commonly, returns. Since there are huge numbers of these (often 10+ million/second), the mapping mechanism needs to be extremely cheap. Currently, this is implemented using a direct-mapped cache, VG_(tt_fast), with 2^15 (guest_addr, host_addr) pairs. This is queried in handwritten assembly in VG_(disp_cp_xindir) in dispatch-<arch>-<os>.S. If there is a miss in the cache then we fall back out to C land, and do a slow lookup using VG_(search_transtab).

Given that the size of the translation table(s) has expanded significantly in recent years in order to keep pace with increasing application sizes, two bad things have happened: (1) the cost of a miss in the fast cache has risen significantly, and (2) the miss rate on the fast cache has also increased significantly. This means that large (~ one-million-basic-blocks-JITted) applications that run for a long time end up spending a lot of time in VG_(search_transtab).

The proposed fix is to increase the associativity of the fast cache from 1 (direct-mapped) to 4. Simulations of various cache configurations using indirect-branch traces from a large application show this to be the best of the configurations tried. In an extreme case with 5.7 billion indirect branches:

* Increasing the associativity from 1 way to 4 ways, whilst keeping the overall cache size the same (32k guest/host pairs), reduces the miss rate by around a factor of 3, from 4.02% to 1.30%.

* Using a slightly better hash function than merely slicing off the bottom 15 bits of the address reduces the miss rate further, from 1.30% to 0.53%.

Overall the VG_(tt_fast) miss rate is almost unchanged on small workloads, but reduced by a factor of up to almost 8 on large workloads.

By implementing each (4-entry) cache set using a move-to-front scheme in the case of hits in ways 1, 2 or 3, the vast majority of hits can be made to happen in way 0. Hence the cost of having this extra associativity is almost zero in the case of a hit. The improved hash function costs an extra 2 ALU ops (a shift and an xor), but overall this seems performance neutral to a win.
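For reference, here is a minimal C model of the scheme. It mirrors the amd64 assembly in VG_(disp_cp_xindir) below; the type and constant names are illustrative (not the real Valgrind definitions), and the set count assumes the 32k-pair/4-way split described above.

    #include <stdint.h>
    #include <stdbool.h>

    #define FAST_BITS  13                  /* log2(#sets): 2^13 sets * 4 ways = 32k pairs */
    #define FAST_SETS  (1 << FAST_BITS)
    #define FAST_MASK  (FAST_SETS - 1)

    typedef struct {
       uintptr_t guest0, host0;   /* way 0: checked first */
       uintptr_t guest1, host1;
       uintptr_t guest2, host2;
       uintptr_t guest3, host3;
    } FastCacheSet;

    static FastCacheSet fast_cache[FAST_SETS];

    /* The improved hash: fold the upper address bits in with a shift
       and an xor, rather than just slicing off the bottom bits. */
    static inline uintptr_t fast_hash ( uintptr_t guest )
    {
       return ((guest >> FAST_BITS) ^ guest) & FAST_MASK;
    }

    /* 4-way lookup.  A hit at way N swaps that entry with way N-1, so
       hot entries migrate to way 0 and most hits cost one compare. */
    static inline bool fast_lookup ( uintptr_t guest, uintptr_t* host )
    {
       FastCacheSet* s = &fast_cache[fast_hash(guest)];
       uintptr_t g, h;
       if (s->guest0 == guest) { *host = s->host0; return true; }
       if (s->guest1 == guest) {                /* promote to way 0 */
          g = s->guest0;     h = s->host0;
          s->guest0 = guest; s->host0 = s->host1;
          s->guest1 = g;     s->host1 = h;
          *host = s->host0;  return true;
       }
       if (s->guest2 == guest) {                /* promote to way 1 */
          g = s->guest1;     h = s->host1;
          s->guest1 = guest; s->host1 = s->host2;
          s->guest2 = g;     s->host2 = h;
          *host = s->host1;  return true;
       }
       if (s->guest3 == guest) {                /* promote to way 2 */
          g = s->guest2;     h = s->host2;
          s->guest2 = guest; s->host2 = s->host3;
          s->guest3 = g;     s->host3 = h;
          *host = s->host2;  return true;
       }
       return false;   /* miss: fall back to VG_(search_transtab) */
    }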
/*--------------------------------------------------------------------*/
/*--- The core dispatch loop, for jumping to a code address.       ---*/
/*---                                       dispatch-amd64-linux.S ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2000-2017 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_core_basics_asm.h"

#if defined(VGP_amd64_linux)

#include "pub_core_dispatch_asm.h"
#include "pub_core_transtab_asm.h"
#include "libvex_guest_offsets.h"   /* for OFFSET_amd64_RIP */

/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- The dispatch loop.  VG_(disp_run_translations) is    ---*/
/*--- used to run all translations,                        ---*/
/*--- including no-redir ones.                             ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/*----------------------------------------------------*/
/*--- Entry and preamble (set everything up)       ---*/
/*----------------------------------------------------*/

/* signature:
   void VG_(disp_run_translations)( UWord* two_words,
                                    void*  guest_state,
                                    Addr   host_addr );
*/
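/* For orientation, a sketch of the caller side.  This is illustrative
   only (the real call site lives in Valgrind's C scheduler; 'tst' and
   'host_code' are assumed names, not definitions from this file):

      UWord two_words[2];
      VG_(disp_run_translations)( two_words, &tst->arch.vex, host_code );
      // two_words[0]: a TRC value saying why the run stopped
      // two_words[1]: optional extra word (e.g. the patch address,
      //               for the CHAIN_ME exits below)
*/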
.text
.globl VG_(disp_run_translations)
.type  VG_(disp_run_translations), @function
VG_(disp_run_translations):
        /* %rdi holds two_words   */
        /* %rsi holds guest_state */
        /* %rdx holds host_addr   */

        /* The preamble */

        /* Save integer registers, since this is a pseudo-function. */
        pushq   %rax
        pushq   %rbx
        pushq   %rcx
        pushq   %rdx
        pushq   %rsi
        pushq   %rbp
        pushq   %r8
        pushq   %r9
        pushq   %r10
        pushq   %r11
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        /* %rdi must be saved last */
        pushq   %rdi

        /* Get the host CPU in the state expected by generated code. */

        /* set host FPU control word to the default mode expected
           by VEX-generated code.  See comments in libvex.h for
           more info. */
        finit
        pushq   $0x027F
        fldcw   (%rsp)
        addq    $8, %rsp

        /* set host SSE control word to the default mode expected
           by VEX-generated code. */
        pushq   $0x1F80
        ldmxcsr (%rsp)
        addq    $8, %rsp

        /* set dir flag to known value */
        cld

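        /* Decoding those two constants, for reference: 0x027F sets the
           x87 control word to all-exceptions-masked, 53-bit (double)
           precision, round-to-nearest; 0x1F80 is the reset-default
           MXCSR, i.e. all SSE exceptions masked, round-to-nearest,
           with FTZ and DAZ off. */
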
        /* Set up the guest state pointer */
        movq    %rsi, %rbp

        /* and jump into the code cache.  Chained translations in
           the code cache run, until for whatever reason, they can't
           continue.  When that happens, the translation in question
           will jump (or call) to one of the continuation points
           VG_(cp_...) below. */
        jmpq    *%rdx
        /*NOTREACHED*/

/*----------------------------------------------------*/
/*--- Postamble and exit.                          ---*/
/*----------------------------------------------------*/

postamble:
        /* At this point, %rax and %rdx contain two
           words to be returned to the caller.  %rax
           holds a TRC value, and %rdx optionally may
           hold another word (for CHAIN_ME exits, the
           address of the place to patch.) */

        /* We're leaving.  Check that nobody messed with %mxcsr
           or %fpucw.  We can't mess with %rax or %rdx here as they
           hold the tentative return values, but any others are OK. */
#if !defined(ENABLE_INNER)
        /* This check fails for self-hosting, so skip in that case */
        pushq   $0
        fstcw   (%rsp)
        cmpl    $0x027F, (%rsp)
        popq    %r15    /* get rid of the word without trashing %rflags;
                           %r15 itself is restored from the stack in
                           remove_frame below */
        jnz     invariant_violation
#endif
        pushq   $0
        stmxcsr (%rsp)
        andl    $0xFFFFFFC0, (%rsp)  /* mask out status flags */
        cmpl    $0x1F80, (%rsp)
        popq    %r15
        jnz     invariant_violation
        /* otherwise we're OK */
        jmp     remove_frame
invariant_violation:
        movq    $VG_TRC_INVARIANT_FAILED, %rax
        movq    $0, %rdx

remove_frame:
        /* Pop %rdi, stash return values */
        popq    %rdi
        movq    %rax, 0(%rdi)
        movq    %rdx, 8(%rdi)
        /* Now pop everything else */
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %r11
        popq    %r10
        popq    %r9
        popq    %r8
        popq    %rbp
        popq    %rsi
        popq    %rdx
        popq    %rcx
        popq    %rbx
        popq    %rax
        ret

/*----------------------------------------------------*/
/*--- Continuation points                          ---*/
/*----------------------------------------------------*/

/* ------ Chain me to slow entry point ------ */
.global VG_(disp_cp_chain_me_to_slowEP)
VG_(disp_cp_chain_me_to_slowEP):
        /* We got called.  The return address indicates
           where the patching needs to happen.  Collect
           the return address and exit back to C land,
           handing the caller the pair (Chain_me_S, RA) */
        movq    $VG_TRC_CHAIN_ME_TO_SLOW_EP, %rax
        popq    %rdx
        /* 10 = movabsq $VG_(disp_cp_chain_me_to_slowEP), %r11;
            3 = call *%r11 */
        subq    $10+3, %rdx
        jmp     postamble

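/* For reference, the 13 bytes being backed over are the unchained
   call site emitted into the translation:

      49 BB <imm64>    movabsq $VG_(disp_cp_chain_me_to_slowEP), %r11
      41 FF D3         call    *%r11

   so after the subtraction %rdx points at the start of the movabsq,
   which is where the chaining code patches in a direct jump to the
   target translation. */
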
/* ------ Chain me to fast entry point ------ */
.global VG_(disp_cp_chain_me_to_fastEP)
VG_(disp_cp_chain_me_to_fastEP):
        /* We got called.  The return address indicates
           where the patching needs to happen.  Collect
           the return address and exit back to C land,
           handing the caller the pair (Chain_me_F, RA) */
        movq    $VG_TRC_CHAIN_ME_TO_FAST_EP, %rax
        popq    %rdx
        /* 10 = movabsq $VG_(disp_cp_chain_me_to_fastEP), %r11;
            3 = call *%r11 */
        subq    $10+3, %rdx
        jmp     postamble

/* ------ Indirect but boring jump ------ */
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
        /* Where are we going? */
        movq    OFFSET_amd64_RIP(%rbp), %rax   // "guest"

        /* stats only */
        addl    $1, VG_(stats__n_xIndirs_32)

        // LIVE: %rbp (guest state ptr), %rax (guest address to go to).
        // We use 4 temporaries:
        //   %r9 (to point at the relevant FastCacheSet),
        //   %r10, %r11 and %r12 (scratch).

        /* Try a fast lookup in the translation cache.  This is pretty
           much a handcoded version of VG_(lookupInFastCache). */

        // Compute %r9 = VG_TT_FAST_HASH(guest)
        movq    %rax, %r9                  // guest
        shrq    $VG_TT_FAST_BITS, %r9      // (guest >> VG_TT_FAST_BITS)
        xorq    %rax, %r9                  // (guest >> VG_TT_FAST_BITS) ^ guest
        andq    $VG_TT_FAST_MASK, %r9      // setNo

        // Compute %r9 = &VG_(tt_fast)[%r9]
        shlq    $VG_FAST_CACHE_SET_BITS, %r9   // setNo * sizeof(FastCacheSet)
        movabsq $VG_(tt_fast), %r10            // &VG_(tt_fast)[0]
        leaq    (%r10, %r9), %r9               // &VG_(tt_fast)[setNo]

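        /* Layout assumed by the FCS_* offsets (they come from
           pub_core_transtab_asm.h); roughly, in C:

              typedef struct {
                 Addr guest0; Addr host0;   // way 0
                 Addr guest1; Addr host1;   // way 1
                 Addr guest2; Addr host2;   // way 2
                 Addr guest3; Addr host3;   // way 3
              } FastCacheSet;   // 2^VG_FAST_CACHE_SET_BITS bytes

           FCS_g<i>/FCS_h<i> are the byte offsets of the i'th pair. */
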
        // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set)
        // try way 0
        cmpq    %rax, FCS_g0(%r9)   // cmp against .guest0
        jnz     1f
        // hit at way 0
        jmp     *FCS_h0(%r9)        // goto .host0
        ud2

1:      // try way 1
        cmpq    %rax, FCS_g1(%r9)   // cmp against .guest1
        jnz     2f
        // hit at way 1; swap upwards
        /* stats only */
        addl    $1, VG_(stats__n_xIndir_hits1_32)
        movq    FCS_g0(%r9), %r10   // r10 = old .guest0
        movq    FCS_h0(%r9), %r11   // r11 = old .host0
        movq    FCS_h1(%r9), %r12   // r12 = old .host1
        movq    %rax, FCS_g0(%r9)   // new .guest0 = guest
        movq    %r12, FCS_h0(%r9)   // new .host0 = old .host1
        movq    %r10, FCS_g1(%r9)   // new .guest1 = old .guest0
        movq    %r11, FCS_h1(%r9)   // new .host1 = old .host0
        jmp     *%r12               // goto old .host1 a.k.a. new .host0
        ud2

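        /* Note the promotion policy used here and in ways 2 and 3
           below: a hit at way N swaps that (guest, host) pair with
           way N-1, so an entry migrates one way towards way 0 per
           hit.  Repeatedly taken branches therefore end up hitting
           at way 0, where the lookup costs just one compare and one
           jump; this is the "move-to-front" scheme described in the
           commit message. */
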
2:      // try way 2
        cmpq    %rax, FCS_g2(%r9)   // cmp against .guest2
        jnz     3f
        // hit at way 2; swap upwards
        /* stats only */
        addl    $1, VG_(stats__n_xIndir_hits2_32)
        movq    FCS_g1(%r9), %r10   // r10 = old .guest1
        movq    FCS_h1(%r9), %r11   // r11 = old .host1
        movq    FCS_h2(%r9), %r12   // r12 = old .host2
        movq    %rax, FCS_g1(%r9)   // new .guest1 = guest
        movq    %r12, FCS_h1(%r9)   // new .host1 = old .host2
        movq    %r10, FCS_g2(%r9)   // new .guest2 = old .guest1
        movq    %r11, FCS_h2(%r9)   // new .host2 = old .host1
        jmp     *%r12               // goto old .host2 a.k.a. new .host1
        ud2

3:      // try way 3
        cmpq    %rax, FCS_g3(%r9)   // cmp against .guest3
        jnz     4f
        // hit at way 3; swap upwards
        /* stats only */
        addl    $1, VG_(stats__n_xIndir_hits3_32)
        movq    FCS_g2(%r9), %r10   // r10 = old .guest2
        movq    FCS_h2(%r9), %r11   // r11 = old .host2
        movq    FCS_h3(%r9), %r12   // r12 = old .host3
        movq    %rax, FCS_g2(%r9)   // new .guest2 = guest
        movq    %r12, FCS_h2(%r9)   // new .host2 = old .host3
        movq    %r10, FCS_g3(%r9)   // new .guest3 = old .guest2
        movq    %r11, FCS_h3(%r9)   // new .host3 = old .host2
        jmp     *%r12               // goto old .host3 a.k.a. new .host2
        ud2

4:      // fast lookup failed
        /* stats only */
        addl    $1, VG_(stats__n_xIndir_misses_32)

        movq    $VG_TRC_INNER_FASTMISS, %rax
        movq    $0, %rdx
        jmp     postamble

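        /* On a miss we hand VG_TRC_INNER_FASTMISS back to the
           scheduler, which falls back to the slow C-land lookup via
           VG_(search_transtab) (see the commit message above) and
           then resumes dispatch. */
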
/* ------ Assisted jump ------ */
.global VG_(disp_cp_xassisted)
VG_(disp_cp_xassisted):
        /* %rbp contains the TRC */
        movq    %rbp, %rax
        movq    $0, %rdx
        jmp     postamble

/* ------ Event check failed ------ */
.global VG_(disp_cp_evcheck_fail)
VG_(disp_cp_evcheck_fail):
        movq    $VG_TRC_INNER_COUNTERZERO, %rax
        movq    $0, %rdx
        jmp     postamble


.size VG_(disp_run_translations), .-VG_(disp_run_translations)

#endif // defined(VGP_amd64_linux)

/* Let the linker know we don't need an executable stack */
MARK_STACK_NO_EXEC

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/