From efcf1e61ec9305ce78b9baf0a3f9b6cccff88f4b Mon Sep 17 00:00:00 2001
From: Julian Seward <jseward@acm.org>
Date: Wed, 19 Jan 2005 11:55:34 +0000
Subject: [PATCH] Complete rewrite (I think this is the 4th incarnation) of translation table/cache management.

Two main changes. (1) Translation areas are recorded using VexGuestExtents,
so that Vex is now properly supported and code deletion works correctly.
(2) Low overhead BB profiling, enabled by the --profile-flags= option.
This finds the top N bbs at exit and shows them, so as to give a basis
from which to do performance tuning.

To support this, the way tt/tc work is changed. It is still a sectored
arrangement, but now each sector has its own hash table. This simplifies
a lot of things.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@3226
---
 coregrind/core.h | 39 +-
 coregrind/core_asm.h | 2 +-
 coregrind/vg_errcontext.c | 5 +-
 coregrind/vg_main.c | 45 +-
 coregrind/vg_memory.c | 2 +-
 coregrind/vg_scheduler.c | 15 +-
 coregrind/vg_symtab2.c | 4 +-
 coregrind/vg_translate.c | 56 +-
 coregrind/vg_transtab.c | 1237 +++++++++++++++++---------------
 coregrind/x86/dispatch.S | 23 +-
 include/basic_types.h | 1 +
 none/tests/cmdline2.stdout.exp | 6 +-
 12 files changed, 795 insertions(+), 640 deletions(-)

diff --git a/coregrind/core.h b/coregrind/core.h
index 6a8484086..06f44652a 100644
--- a/coregrind/core.h
+++ b/coregrind/core.h
@@ -289,11 +289,10 @@ extern Int VG_(clo_n_suppressions);
 /* The names of the suppression files. */
 extern Char* VG_(clo_suppressions)[VG_CLO_MAX_SFILES];
-/* PROFILE: collect bb profiling data? default: NO */
-extern Bool VG_(clo_bbprofile);
-
 /* DEBUG: print generated code? default: 00000000 ( == NO ) */
-extern Bool VG_(clo_trace_codegen);
+extern Bool VG_(clo_trace_flags);
+/* DEBUG: do bb profiling? default: 00000000 ( == NO ) */
+extern Bool VG_(clo_profile_flags);
 /* DEBUG: if tracing codegen, be quiet until after this bb ( 0 ) */
 extern Int VG_(clo_trace_notbelow);
 /* DEBUG: print system calls? default: NO */
@@ -1101,8 +1100,11 @@ extern void VG_(demangle) ( Char* orig, Char* result, Int result_size );
 Exports of vg_translate.c
 ------------------------------------------------------------------ */
-extern Bool VG_(translate) ( ThreadId tid, Addr orig_addr, Bool debugging );
-
+extern
+Bool VG_(translate) ( ThreadId tid,
+ Addr64 orig_addr,
+ Bool debugging_translation,
+ Int debugging_verbosity );
 /* ---------------------------------------------------------------------
 Exports of vg_execontext.c.
@@ -1711,21 +1713,32 @@ GEN_SYSCALL_WRAPPER(sys_mq_getsetattr); // * P?
 Exports of vg_transtab.c
 ------------------------------------------------------------------ */
-/* The fast-cache for tt-lookup. */
-extern Addr VG_(tt_fast)[VG_TT_FAST_SIZE];
+/* The fast-cache for tt-lookup, and for finding counters.
*/ +extern ULong* VG_(tt_fast) [VG_TT_FAST_SIZE]; +extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; extern void VG_(init_tt_tc) ( void ); -extern void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size, - Addr trans_addr, Int trans_size ); -extern Addr VG_(search_transtab) ( Addr original_addr ); -extern void VG_(invalidate_translations) ( Addr start, UInt range ); +extern +void VG_(add_to_trans_tab)( VexGuestExtents* vge, + Addr64 entry, + AddrH code, + UInt code_len ); + +extern Bool VG_(search_transtab) ( /*OUT*/AddrH* result, + Addr64 guest_addr, + Bool upd_cache ); + +extern void VG_(discard_translations) ( Addr64 start, UInt range ); extern void VG_(sanity_check_tt_tc) ( Char* caller ); extern void VG_(print_tt_tc_stats) ( void ); -extern Int VG_(get_bbs_translated) ( void ); +extern UInt VG_(get_bbs_translated) ( void ); + +extern void VG_(show_BB_profile) ( void ); + /* --------------------------------------------------------------------- Exports of vg_syscall.S diff --git a/coregrind/core_asm.h b/coregrind/core_asm.h index 6b78520f0..386b8e3f6 100644 --- a/coregrind/core_asm.h +++ b/coregrind/core_asm.h @@ -56,7 +56,7 @@ /* Constants for the fast translation lookup cache. */ -#define VG_TT_FAST_BITS 15 +#define VG_TT_FAST_BITS 16 #define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS) #define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1) diff --git a/coregrind/vg_errcontext.c b/coregrind/vg_errcontext.c index a6f8b2d8e..612f666ec 100644 --- a/coregrind/vg_errcontext.c +++ b/coregrind/vg_errcontext.c @@ -665,8 +665,9 @@ void VG_(show_all_errors) ( void ) pp_Error( p_min, False ); if ((i+1 == VG_(clo_dump_error))) { - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, - p_min->where->ips[0], /*debugging*/True); + VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + p_min->where->ips[0], /*debugging*/True, + 0xFE/*verbosity*/); } p_min->count = 1 << 30; diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c index 3ea5cbef0..2cc023143 100644 --- a/coregrind/vg_main.c +++ b/coregrind/vg_main.c @@ -1483,8 +1483,8 @@ Int VG_(clo_input_fd) = 0; /* stdin */ Int VG_(clo_n_suppressions) = 0; Char* VG_(clo_suppressions)[VG_CLO_MAX_SFILES]; Bool VG_(clo_profile) = False; -Bool VG_(clo_bbprofile) = False; -UChar VG_(clo_trace_codegen) = 0; // 00000000b +UChar VG_(clo_trace_flags) = 0; // 00000000b +UChar VG_(clo_profile_flags) = 0; // 00000000b Int VG_(clo_trace_notbelow) = 0; Bool VG_(clo_trace_syscalls) = False; Bool VG_(clo_trace_signals) = False; @@ -1561,9 +1561,9 @@ void usage ( Bool debug_help ) " --single-step=no|yes translate each instr separately? [no]\n" " --optimise=no|yes improve intermediate code? [yes]\n" " --profile=no|yes profile? (tool must be built for it) [no]\n" -" --bbprofile=no|yes profile bbs? [no]\n" " --branchpred=yes|no generate branch prediction hints [no]\n" -" --trace-codegen= show generated code? (X = 0|1) [00000000]\n" +" --trace-flags= show generated code? (X = 0|1) [00000000]\n" +" --profile-flags= ditto, but for profiling (X = 0|1) [00000000]\n" " --trace-notbelow= only show BBs above [0]\n" " --trace-syscalls=no|yes show all system calls? [no]\n" " --trace-signals=no|yes show signal handling details? [no]\n" @@ -1579,7 +1579,7 @@ void usage ( Bool debug_help ) " --vex-guest-max-insns 1 .. 100 [50]\n" " --vex-guest-chase-thresh 0 .. 
99 [10]\n" "\n" -" --trace-codegen values (omit the middle space):\n" +" --trace-flags and --profile-flags values (omit the middle space):\n" " 1000 0000 show conversion into IR\n" " 0100 0000 show after initial opt\n" " 0010 0000 show after instrumentation\n" @@ -1752,7 +1752,6 @@ static void process_cmd_line_options( UInt* client_auxv, const char* toolname ) else VG_BOOL_CLO("--pointercheck", VG_(clo_pointercheck)) else VG_BOOL_CLO("--support-elan3", VG_(clo_support_elan3)) else VG_BOOL_CLO("--profile", VG_(clo_profile)) - else VG_BOOL_CLO("--bbprofile", VG_(clo_bbprofile)) else VG_BOOL_CLO("--run-libc-freeres", VG_(clo_run_libc_freeres)) else VG_BOOL_CLO("--show-below-main", VG_(clo_show_below_main)) else VG_BOOL_CLO("--time-stamp", VG_(clo_time_stamp)) @@ -1834,21 +1833,42 @@ static void process_cmd_line_options( UInt* client_auxv, const char* toolname ) VG_(clo_n_suppressions)++; } - /* "vwxyz" --> 000zyxwv (binary) */ - else if (VG_CLO_STREQN(16, arg, "--trace-codegen=")) { + /* "stuvwxyz" --> stuvwxyz (binary) */ + else if (VG_CLO_STREQN(14, arg, "--trace-flags=")) { + Int j; + char* opt = & arg[14]; + + if (8 != VG_(strlen)(opt)) { + VG_(message)(Vg_UserMsg, + "--trace-flags argument must have 8 digits"); + VG_(bad_option)(arg); + } + for (j = 0; j < 8; j++) { + if ('0' == opt[j]) { /* do nothing */ } + else if ('1' == opt[j]) VG_(clo_trace_flags) |= (1 << (7-j)); + else { + VG_(message)(Vg_UserMsg, "--trace-flags argument can only " + "contain 0s and 1s"); + VG_(bad_option)(arg); + } + } + } + + /* "stuvwxyz" --> stuvwxyz (binary) */ + else if (VG_CLO_STREQN(16, arg, "--profile-flags=")) { Int j; char* opt = & arg[16]; if (8 != VG_(strlen)(opt)) { VG_(message)(Vg_UserMsg, - "--trace-codegen argument must have 8 digits"); + "--profile-flags argument must have 8 digits"); VG_(bad_option)(arg); } for (j = 0; j < 8; j++) { if ('0' == opt[j]) { /* do nothing */ } - else if ('1' == opt[j]) VG_(clo_trace_codegen) |= (1 << (7-j)); + else if ('1' == opt[j]) VG_(clo_profile_flags) |= (1 << (7-j)); else { - VG_(message)(Vg_UserMsg, "--trace-codegen argument can only " + VG_(message)(Vg_UserMsg, "--profile-flags argument can only " "contain 0s and 1s"); VG_(bad_option)(arg); } @@ -2808,6 +2828,9 @@ int main(int argc, char **argv) if (VG_(clo_profile)) VGP_(done_profiling)(); + if (VG_(clo_profile_flags) > 0) + VG_(show_BB_profile)(); + /* We're exiting, so nuke all the threads and clean up the proxy LWPs */ vg_assert(src == VgSrc_FatalSig || VG_(threads)[last_run_tid].status == VgTs_Runnable || diff --git a/coregrind/vg_memory.c b/coregrind/vg_memory.c index cc94abac0..f4fb1a9ae 100644 --- a/coregrind/vg_memory.c +++ b/coregrind/vg_memory.c @@ -91,7 +91,7 @@ Bool VG_(seg_overlaps)(const Segment *s, Addr p, SizeT len) static void recycleseg(Segment *s) { if (s->flags & SF_CODE) - VG_(invalidate_translations)(s->addr, s->len); + VG_(discard_translations)(s->addr, s->len); if (s->filename != NULL) VG_(arena_free)(VG_AR_CORE, (Char *)s->filename); diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c index 9ec067fba..89a17ecdd 100644 --- a/coregrind/vg_scheduler.c +++ b/coregrind/vg_scheduler.c @@ -720,7 +720,7 @@ VgSchedReturnCode do_scheduler ( Int* exitcode, ThreadId* last_run_tid ) UInt trc; Int done_this_time, n_in_bounded_wait; Int n_exists, n_waiting_for_reaper; - Addr trans_addr; + Bool found; /* Start with the root thread. tid in general indicates the currently runnable/just-finished-running thread. 
*/ @@ -880,12 +880,13 @@ VgSchedReturnCode do_scheduler ( Int* exitcode, ThreadId* last_run_tid ) /* Trivial event. Miss in the fast-cache. Do a full lookup for it. */ - trans_addr = VG_(search_transtab)( ip ); - if (trans_addr == (Addr)0) { + found = VG_(search_transtab)( NULL, + ip, True/*upd_fast_cache*/ ); + if (!found) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False )) { - trans_addr = VG_(search_transtab)( ip ); - if (trans_addr == (Addr)0) + if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/ )) { + found = VG_(search_transtab)( NULL, ip, True ); + if (!found) VG_(core_panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry"); } else { // If VG_(translate)() fails, it's because it had to throw @@ -3280,7 +3281,7 @@ void do_client_request ( ThreadId tid, UWord* arg ) " addr %p, len %d\n", (void*)arg[1], arg[2] ); - VG_(invalidate_translations)( arg[1], arg[2] ); + VG_(discard_translations)( arg[1], arg[2] ); SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */ break; diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c index ae0c138ce..33de4c69b 100644 --- a/coregrind/vg_symtab2.c +++ b/coregrind/vg_symtab2.c @@ -2352,7 +2352,7 @@ static Bool resolve_redir(CodeRedirect *redir, const SegInfo *si) redir->to_lib, redir->to_sym, redir->to_addr); } - if (VG_(search_transtab)(redir->from_addr) != 0) { + if (VG_(search_transtab)(NULL, redir->from_addr, False)) { /* For some given (from, to) redir, the "from" function got called before the .so containing "to" became available. We know this because there is already a translation for the @@ -2377,7 +2377,7 @@ static Bool resolve_redir(CodeRedirect *redir, const SegInfo *si) " %s (%p -> %p)", redir->from_sym, redir->from_addr, redir->to_addr ); } - VG_(invalidate_translations)(redir->from_addr, 1); + VG_(discard_translations)(redir->from_addr, 1); } VG_(SkipList_Insert)(&sk_resolved_redir, redir); diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c index 3c4790aa6..7d3510406 100644 --- a/coregrind/vg_translate.c +++ b/coregrind/vg_translate.c @@ -343,14 +343,17 @@ static Bool need_to_handle_SP_assignment(void) } -Bool VG_(translate) ( ThreadId tid, Addr orig_addr, - Bool debugging_translation ) +Bool VG_(translate) ( ThreadId tid, + Addr64 orig_addr, + Bool debugging_translation, + Int debugging_verbosity ) { - Addr redir, orig_addr0 = orig_addr; - Int orig_size, tmpbuf_used, verbosity; + Addr64 redir, orig_addr0 = orig_addr; + Int tmpbuf_used, verbosity; Bool notrace_until_done; UInt notrace_until_limit = 0; Segment* seg; + VexGuestExtents vge; /* Make sure Vex is initialised right. 
*/ VexTranslateResult tres; @@ -372,13 +375,24 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, redir = VG_(code_redirect)(orig_addr); if (redir != orig_addr && VG_(clo_verbosity) >= 2) { + Bool ok; Char name1[64] = ""; Char name2[64] = ""; - VG_(get_fnname_w_offset)(orig_addr, name1, 64); - VG_(get_fnname_w_offset)(redir, name2, 64); - name1[63] = name2[63] = 0; + name1[0] = name2[0] = 0; + ok = VG_(get_fnname_w_offset)(orig_addr, name1, 64); + if (ok) { + name1[63] = 0; + } else { + VG_(strcpy)(name1, "???"); + } + ok = VG_(get_fnname_w_offset)(redir, name2, 64); + if (ok) { + name2[63] = 0; + } else { + VG_(strcpy)(name2, "???"); + } VG_(message)(Vg_UserMsg, - "TRANSLATE: %p (%s) redirected to %p (%s)", + "TRANSLATE: 0x%llx (%s) redirected to 0x%llx (%s)", orig_addr, name1, redir, name2 ); } @@ -390,7 +404,8 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, few blocks translated prior to a failure. Set notrace_until_limit to be the number of translations to be made before --trace-codegen= style printing takes effect. */ - notrace_until_done = VG_(get_bbs_translated)() >= notrace_until_limit; + notrace_until_done + = VG_(get_bbs_translated)() >= notrace_until_limit; seg = VG_(find_segment)(orig_addr); @@ -414,11 +429,11 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, seg->flags |= SF_CODE; /* contains cached code */ /* If doing any code printing, print a basic block start marker */ - if (VG_(clo_trace_codegen) || debugging_translation) { + if (VG_(clo_trace_flags) || debugging_translation) { Char fnname[64] = ""; VG_(get_fnname_w_offset)(orig_addr, fnname, 64); VG_(printf)( - "==== BB %d %s(%p) approx BBs exec'd %llu ====\n", + "==== BB %d %s(0x%llx) approx BBs exec'd %lld ====\n", VG_(get_bbs_translated)(), fnname, orig_addr, VG_(bbs_done)); } @@ -426,21 +441,22 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, /* True if a debug trans., or if bit N set in VG_(clo_trace_codegen). */ verbosity = 0; if (debugging_translation) { - verbosity = 0xFE; + verbosity = debugging_verbosity; } else - if ( (VG_(clo_trace_codegen) > 0 + if ( (VG_(clo_trace_flags) > 0 && VG_(get_bbs_translated)() >= VG_(clo_trace_notbelow) )) { - verbosity = VG_(clo_trace_codegen); + verbosity = VG_(clo_trace_flags); } /* Actually do the translation. */ tres = LibVEX_Translate ( VG_(vex_arch), VG_(vex_subarch), VG_(vex_arch), VG_(vex_subarch), - (Char*)orig_addr, (Addr64)orig_addr, + (UChar*)orig_addr, + (Addr64)orig_addr, chase_into_ok, - &orig_size, + &vge, tmpbuf, N_TMPBUF, &tmpbuf_used, TL_(instrument), need_to_handle_SP_assignment() @@ -458,8 +474,6 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, #undef DECIDE_IF_PRINTING_CODEGEN /* Copy data at trans_addr into the translation cache. */ - /* Since the .orig_size and .trans_size fields are UShort, be paranoid. 
*/ - vg_assert(orig_size >= 0 && orig_size < 65536); vg_assert(tmpbuf_used > 0 && tmpbuf_used < 65536); // If debugging, don't do anything with the translated block; we @@ -467,8 +481,10 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, if (!debugging_translation) { // Note that we use orig_addr0, not orig_addr, which might have been // changed by the redirection - VG_(add_to_trans_tab)( orig_addr0, orig_size, - (Addr)(&tmpbuf[0]), tmpbuf_used ); + VG_(add_to_trans_tab)( &vge, + orig_addr0, + (Addr)(&tmpbuf[0]), + tmpbuf_used ); } VGP_POPCC(VgpTranslate); diff --git a/coregrind/vg_transtab.c b/coregrind/vg_transtab.c index 13dbc6d99..af3e1602b 100644 --- a/coregrind/vg_transtab.c +++ b/coregrind/vg_transtab.c @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright (C) 2000-2004 Julian Seward + Copyright (C) 2000-2005 Julian Seward jseward@acm.org This program is free software; you can redistribute it and/or @@ -40,602 +40,508 @@ /*------------------ CONSTANTS ------------------*/ -/* Number of sectors the TC is divided into. */ -#define VG_TC_N_SECTORS 8 +/* Number of sectors the TC is divided into. If you need a larger + overall translation cache, increase this value. */ +#define N_SECTORS 8 -/* Calculated once at startup and never changed. */ -static /* const */ Int vg_tc_sector_szB = 0; +/* Number of TC entries in each sector. This needs to be a prime + number to work properly, and it is strongly recommended not to + change this. */ +#define N_TTES_PER_SECTOR /*30011*/ 40009 -/* Number of entries in the translation table. This must be a prime - number in order to make the hashing work properly. */ -#define VG_TT_SIZE /*5281*/ /*100129*/ /*200191*/ 250829 /*300007*/ +/* Because each sector contains a hash table of TTEntries, we need to + specify the maximum allowable loading, after which the sector is + deemed full. */ +#define SECTOR_TT_LIMIT_PERCENT 60 -/* Do an LRU pass when the translation table becomes this full. */ -#define VG_TT_LIMIT_PERCENT /*67*/ 80 - -#define VG_TT_LIMIT ((VG_TT_SIZE * VG_TT_LIMIT_PERCENT) / 100) +/* The sector is deemed full when this many entries are in it. */ +#define N_TTES_PER_SECTOR_USABLE \ + ((N_TTES_PER_SECTOR * SECTOR_TT_LIMIT_PERCENT) / 100) /*------------------ TYPES ------------------*/ -/* An entry in TC. Payload always is always padded out to a - word-aligned quantity so that these structs are always - word-aligned. Note, the layout of this is known by - /dispatch.S, so do not change it unless you change them - too. */ -typedef - struct { - /* 32-bit or 64-bit offsets */ - /* +0 or 0 */ Addr orig_addr; - /* +4 or 8 */ UShort orig_size; - /* +6 or 10 */ UShort trans_size; - /* +8 or 12 */ UChar payload[0]; - } - TCEntry; - -/* An entry in TT. */ +/* A translation-cache entry is two parts: + - The guest address of the first (entry) bb in the translation, + as a 64-bit word. + - One or more 64-bit words containing the code. + It is supposed to be 64-bit aligned. +*/ +/* typedef struct { - Addr orig_addr; - TCEntry* tcentry; + Addr64 orig_addr; + ULong code[0]; + } + TCEntry; +*/ + +/* A translation-table entry. This indicates precisely which areas of + guest code are included in the translation, and contains all other + auxiliary info too. */ +typedef + struct { + /* Profiling only: the count and weight (arbitrary meaning) for + this translation. Weight is a property of the translation + itself and computed once when the translation is created. 
+ Count is an entry count for the translation and is + incremented by 1 every time the translation is used, if we + are profiling. */ + UInt count; + UShort weight; + + /* Status of the slot. Note, we need to be able to do lazy + deletion, hence the Deleted state. */ + enum { InUse, Deleted, Empty } status; + + /* Pointer to the corresponding TCEntry (must be in the same + sector!) */ + ULong* tce; + + /* This is the original guest address that purportedly is the + entry point of the translation. You might think that .entry + should be the same as .vge->base[0], and most of the time it + is. However, when doing redirections, that is not the case. + .vge must always correctly describe the guest code sections + from which this translation was made. However, .entry may or + may not be a lie, depending on whether or not we're doing + redirection. */ + Addr64 entry; + + /* This structure describes precisely what ranges of guest code + the translation covers, so we can decide whether or not to + delete it when translations of a given address range are + invalidated. */ + VexGuestExtents vge; } TTEntry; -#define PAYLOAD_OFFSET (sizeof(void*)==8 ? 12 : 8) -#define CODE_ALIGNMENT sizeof(void*) // alignment of TCEntries -#define CODE_ALIGN(a) (((a)+CODE_ALIGNMENT-1) & ~(CODE_ALIGNMENT-1)) -#define IS_ALIGNED(a) (((a) & (CODE_ALIGNMENT-1)) == 0) +/* Finally, a sector itself. Each sector contains an array of + TCEntries, which hold code, and an array of TTEntries, containing + all required administrative info. Profiling is supported using the + TTEntry .count and .weight fields, if required. Each sector is + independent in that no cross-sector references are allowed. + If the sector is not in use, all three pointers are NULL and + tt_n_inuse is zero. +*/ +typedef + struct { + /* The TCEntry area. Size of this depends on the average + translation size. We try and size it so it becomes full + precisely when this sector's translation table (tt) reaches + its load limit (SECTOR_TT_LIMIT_PERCENT). */ + ULong* tc; + /* The TTEntry array. This is a fixed size, always containing + exactly N_TTES_PER_SECTOR entries. */ + TTEntry* tt; -/* Denotes an empty TT slot, when TTEntry.orig_addr holds this - value. */ -#define VG_TTE_EMPTY ((Addr)1) + /* This points to the current allocation point in tc. */ + ULong* tc_next; -/* Denotes an empty TT slot, when TTEntry.orig_addr holds this - value. */ -#define VG_TTE_DELETED ((Addr)3) - -/* A bogus TCEntry which hopefully does not match code from any valid - address. This is what all VG_(tt_fast) entries are made to point - at when we want to invalidate it. */ -static const TCEntry vg_tc_bogus_TCEntry = { ((Addr)5), 0, 0 }; + /* The count of tt entries with state InUse. */ + Int tt_n_inuse; + } + Sector; /*------------------ DECLS ------------------*/ -/* The translation cache sectors. These are NULL until allocated - dynamically. */ -static UChar* vg_tc[VG_TC_N_SECTORS]; +/* The root data structure is an array of sectors. The index of the + youngest sector is recorded, and new translations are put into that + sector. When it fills up, we move along to the next sector and + start to fill that up, wrapping around at the end of the array. + That way, once all N_TC_SECTORS have been bought into use for the + first time, and are full, we then re-use the oldest sector, + endlessly. -/* Count of bytes used in each sector of the TC. */ -static Int vg_tc_used[VG_TC_N_SECTORS]; + When running, youngest sector should be between >= 0 and < + N_TC_SECTORS. 
The initial -1 value indicates the TT/TC system is + not yet initialised. +*/ +static Sector sectors[N_SECTORS]; +static Int youngest_sector = -1; -/* The age of each sector, so we can find the oldest. We just use the - global count of translations made when the sector was brought into - use. Doesn't matter if this mechanism gets confused (wraps around - 4G) once in a while. */ -static Int vg_tc_age[VG_TC_N_SECTORS]; - -/* The number of the sector currently being allocated in. */ -static Int vg_tc_current; - -/* Count of number of translations, orig and new bytes in each sector. - For stats purposes only. */ -static Int vg_tc_stats_count[VG_TC_N_SECTORS]; -static Int vg_tc_stats_osize[VG_TC_N_SECTORS]; -static Int vg_tc_stats_tsize[VG_TC_N_SECTORS]; - -static UInt n_tt_fast_misses = 0; // number of lookups missing fast TT helper -static UInt n_tc_discards = 0; // number of TT/TC discards - -// Number and total original/translated size of translations overall. -static UInt overall_in_count = 0; -static UInt overall_in_osize = 0; -static UInt overall_in_tsize = 0; -// Number and total original/t size of discards overall. -static UInt overall_out_count = 0; -static UInt overall_out_osize = 0; -static UInt overall_out_tsize = 0; +/* The number of ULongs in each TCEntry area. This is computed once + at startup and does not change. */ +static Int tc_sector_szQ; - -/*------------------ TRANSLATION TABLE ------------------*/ - -/* The translation table. An array of VG_TT_SIZE TTEntrys. */ -static TTEntry* vg_tt = NULL; - -/* Count of non-empty TT entries. This includes deleted ones. */ -static Int vg_tt_used = 0; - -/* Fast helper for the TT. A direct-mapped cache which holds a +/* Fast helper for the TC. A direct-mapped cache which holds a pointer to a TC entry which may or may not be the correct one, but which we hope usually is. This array is referred to directly from - vg_dispatch.S. */ -Addr /* TCEntry*, really */ VG_(tt_fast)[VG_TT_FAST_SIZE]; + /dispatch.S. -static void for_each_tc(Int sector, void (*fn)(TCEntry *)); + Entries in tt_fast may point to any valid TC entry, regardless of + which sector it's in. Consequently we must be very careful to + invalidate this cache when TC entries are changed or disappear. + + A special TCEntry -- bogus_tc_entry -- must be pointed at to cause + that cache entry to miss. This relies on the assumption that no + guest code actually has an address of 0x1. +*/ +/*global*/ ULong* VG_(tt_fast)[VG_TT_FAST_SIZE]; + +static ULong bogus_tc_entry = (Addr64)1; -/*------------------ TT HELPERS ------------------*/ +/* For profiling, we have a parallel array of pointers to .count + fields in TT entries. Again, these pointers must be invalidated + when translations disappear. A NULL pointer suffices to indicate + an unused slot. -static -void pp_tt_tc_status ( Char* submsg ) + tt_fast and tt_fastN change together: if tt_fast[i] points to + bogus_tc_entry then the corresponding tt_fastN[i] must be null. If + tt_fast[i] points to some TC entry somewhere, then tt_fastN[i] + *must* point to the .count field of the corresponding TT entry. + + tt_fast and tt_fastN are referred to from assembly code + (dispatch.S). +*/ +/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; + + +/*------------------ STATS DECLS ------------------*/ + +/* Number of fast-cache updates and flushes done. */ +ULong n_fast_flushes = 0; +ULong n_fast_updates = 0; + +/* Number of full lookups done. */ +ULong n_full_lookups = 0; +ULong n_lookup_probes = 0; + +/* Number/osize/tsize of translations entered. 
*/
+ULong n_in_count = 0;
+ULong n_in_osize = 0;
+ULong n_in_tsize = 0;
+
+/* Number/osize of translations discarded due to lack of space. */
+ULong n_dump_count = 0;
+ULong n_dump_osize = 0;
+
+/* Number/osize of translations discarded due to requests to do so. */
+ULong n_disc_count = 0;
+ULong n_disc_osize = 0;
+
+
+
+/*-------------------------------------------------------------*/
+/*--- Add/delete/find translations ---*/
+/*-------------------------------------------------------------*/
+
+static UInt vge_osize ( VexGuestExtents* vge )
 {
- Int tc_used, s;
- if (VG_(clo_verbosity) <= 2)
- return;
- tc_used = 0;
- for (s = 0; s < VG_TC_N_SECTORS; s++)
- tc_used += vg_tc_used[s];
-
- VG_(message)(Vg_DebugMsg,
- "%lluk bbs: tt %d, tc %d: %s",
- VG_(bbs_done) / 1000,
- vg_tt_used, tc_used, submsg );
+ UInt i, n = 0;
+ for (i = 0; i < vge->n_used; i++)
+ n += (UInt)vge->len[i];
+ return n;
 }
-/* Invalidate the tt_fast cache, for whatever reason, by pointing all
- entries at vg_tc_bogus_TCEntry. */
-static
-void vg_invalidate_tt_fast( void )
+static Bool isValidSector ( Int sector )
 {
- Int j;
- for (j = 0; j < VG_TT_FAST_SIZE; j++)
- VG_(tt_fast)[j] = (Addr)&vg_tc_bogus_TCEntry;
+ if (sector < 0 || sector >= N_SECTORS)
+ return False;
+ return True;
+}
+
+static inline UInt HASH_TT ( Addr64 key )
+{
+ UInt kHi = (UInt)(key >> 32);
+ UInt kLo = (UInt)key;
+ return (kHi ^ kLo) % N_TTES_PER_SECTOR;
+}
+
+static void setFastCacheEntry ( Addr64 key, ULong* tce, UInt* count )
+{
+ UInt cno = ((UInt)key) & VG_TT_FAST_MASK;
+ VG_(tt_fast)[cno] = tce;
+ VG_(tt_fastN)[cno] = count;
+ n_fast_updates++;
+}
+
+static void invalidateFastCache ( void )
+{
+ UInt j;
+ for (j = 0; j < VG_TT_FAST_SIZE; j++) {
+ VG_(tt_fast)[j] = &bogus_tc_entry;
+ VG_(tt_fastN)[j] = NULL;
+ }
+ n_fast_flushes++;
+}
+
+static void initialiseSector ( Int sno )
+{
+ Int i;
+ vg_assert(isValidSector(sno));
+
+ if (sectors[sno].tc == NULL) {
+ /* Sector has never been used before. Need to allocate tt and
+ tc. */
+ vg_assert(sectors[sno].tt == NULL);
+ vg_assert(sectors[sno].tc_next == NULL);
+ vg_assert(sectors[sno].tt_n_inuse == 0);
+ sectors[sno].tc
+ = VG_(get_memory_from_mmap)
+ ( 8 * tc_sector_szQ, "sectors[sno].tc" );
+ sectors[sno].tt
+ = VG_(get_memory_from_mmap)
+ ( N_TTES_PER_SECTOR * sizeof(TTEntry), "sectors[sno].tt" );
+ if (VG_(clo_verbosity) > 2)
+ VG_(message)(Vg_DebugMsg, "TT/TC: initialise sector %d", sno);
+ } else {
+ /* Sector has been used before. */
+ vg_assert(sectors[sno].tt != NULL);
+ vg_assert(sectors[sno].tc_next != NULL);
+ n_dump_count += sectors[sno].tt_n_inuse;
+ for (i = 0; i < N_TTES_PER_SECTOR; i++) {
+ if (sectors[sno].tt[i].status == InUse) {
+ n_dump_osize += vge_osize(&sectors[sno].tt[i].vge);
+ }
+ }
+ if (VG_(clo_verbosity) > 2)
+ VG_(message)(Vg_DebugMsg, "TT/TC: recycle sector %d", sno);
+ }
+
+ sectors[sno].tc_next = sectors[sno].tc;
+ sectors[sno].tt_n_inuse = 0;
+ for (i = 0; i < N_TTES_PER_SECTOR; i++)
+ sectors[sno].tt[i].status = Empty;
+
+ invalidateFastCache();
 }
-static
-void add_tt_entry ( TCEntry* tce )
+/* Add a translation of vge to TT/TC. The translation is temporarily
+ in code[0 .. code_len-1].
+
+ pre: youngest_sector points to a valid (although possibly full)
+ sector.
+*/
+void VG_(add_to_trans_tab)( VexGuestExtents* vge,
+ Addr64 entry,
+ AddrH code,
+ UInt code_len )
 {
- UInt i;
- /* VG_(printf)("add_TT_entry orig_addr %p\n", tce->orig_addr); */
- /* Hash to get initial probe point.
*/
- i = tce->orig_addr % VG_TT_SIZE;
+ Int tcAvailQ, reqdQ, y, i;
+ ULong *tce, *tce2;
+ UChar* srcP;
+ UChar* dstP;
+
+ vg_assert(vge->n_used >= 1 && vge->n_used <= 3);
+ vg_assert(code_len > 0 && code_len < 20000);
+
+ if (0)
+ VG_(printf)("add_to_trans_tab(entry = 0x%llx, len = %d)\n",
+ entry, code_len);
+
+ n_in_count++;
+ n_in_tsize += code_len;
+ n_in_osize += vge_osize(vge);
+
+ y = youngest_sector;
+ vg_assert(isValidSector(y));
+
+ if (sectors[y].tc == NULL)
+ initialiseSector(y);
+
+ /* Try putting the translation in this sector. */
+ reqdQ = 1 + ((code_len + 7) >> 3);
+
+ /* Will it fit in tc? */
+ tcAvailQ = ((ULong*)(&sectors[y].tc[tc_sector_szQ]))
+ - ((ULong*)(sectors[y].tc_next));
+ vg_assert(tcAvailQ >= 0);
+ vg_assert(tcAvailQ <= tc_sector_szQ);
+
+ if (tcAvailQ < reqdQ
+ || sectors[y].tt_n_inuse >= N_TTES_PER_SECTOR_USABLE) {
+ /* No. So move on to the next sector. Either it's never been
+ used before, in which case it will get its tt/tc allocated
+ now, or it has been used before, in which case it is set to be
+ empty, hence throwing out the oldest sector. */
+ youngest_sector++;
+ if (youngest_sector >= N_SECTORS)
+ youngest_sector = 0;
+ y = youngest_sector;
+ initialiseSector(y);
+ }
+
+ /* Be sure ... */
+ tcAvailQ = ((ULong*)(&sectors[y].tc[tc_sector_szQ]))
+ - ((ULong*)(sectors[y].tc_next));
+ vg_assert(tcAvailQ >= 0);
+ vg_assert(tcAvailQ <= tc_sector_szQ);
+ vg_assert(tcAvailQ >= reqdQ);
+ vg_assert(sectors[y].tt_n_inuse < N_TTES_PER_SECTOR_USABLE);
+ vg_assert(sectors[y].tt_n_inuse >= 0);
+
+ /* Copy into tc. */
+ tce = sectors[y].tc_next;
+ vg_assert(tce >= &sectors[y].tc[0]);
+ vg_assert(tce <= &sectors[y].tc[tc_sector_szQ]);
+
+ tce[0] = entry;
+ dstP = (UChar*)(&tce[1]);
+ srcP = (UChar*)code;
+ for (i = 0; i < code_len; i++)
+ dstP[i] = srcP[i];
+ sectors[y].tc_next += reqdQ;
+ sectors[y].tt_n_inuse++;
+
+ /* more paranoia */
+ tce2 = sectors[y].tc_next;
+ vg_assert(tce2 >= &sectors[y].tc[0]);
+ vg_assert(tce2 <= &sectors[y].tc[tc_sector_szQ]);
+
+ /* Find an empty tt slot, and use it. There must be such a slot
+ since tt is never allowed to get completely full. */
+ i = HASH_TT(entry);
+ vg_assert(i >= 0 && i < N_TTES_PER_SECTOR);
 while (True) {
- if (vg_tt[i].orig_addr == tce->orig_addr)
- VG_(core_panic)("add_TT_entry: duplicate");
- if (vg_tt[i].orig_addr == VG_TTE_EMPTY)
+ if (sectors[y].tt[i].status == Empty
+ || sectors[y].tt[i].status == Deleted)
 break;
 i++;
- if (i == VG_TT_SIZE)
+ if (i >= N_TTES_PER_SECTOR)
 i = 0;
 }
- vg_tt[i].orig_addr = tce->orig_addr;
- vg_tt[i].tcentry = tce;
- vg_tt_used++;
- /* sanity ... */
- vg_assert(vg_tt_used < VG_TT_SIZE-1000);
+ sectors[y].tt[i].status = InUse;
+ sectors[y].tt[i].tce = tce;
+ sectors[y].tt[i].count = 0;
+ sectors[y].tt[i].weight = 1;
+ sectors[y].tt[i].vge = *vge;
+ sectors[y].tt[i].entry = entry;
+
+ setFastCacheEntry( entry, tce, &sectors[y].tt[i].count );
 }
-/* Search TT to find the translated address of the supplied original,
- or NULL if not found. This routine is used when we miss in
- VG_(tt_fast).
+/* Search for the translation of the given guest address. If
+ requested, a successful search can also cause the fast-caches to be
+ updated.
 */
-static __inline__
-TTEntry* search_tt ( Addr orig_addr )
+Bool VG_(search_transtab) ( /*OUT*/AddrH* result,
+ Addr64 guest_addr,
+ Bool upd_cache )
 {
- Int i;
- /* Hash to get initial probe point.
*/
- i = orig_addr % VG_TT_SIZE;
- while (True) {
- if (vg_tt[i].orig_addr == orig_addr)
- return &vg_tt[i];
- if (vg_tt[i].orig_addr == VG_TTE_EMPTY)
- return NULL;
- i++;
- if (i == VG_TT_SIZE) i = 0;
- }
-}
+ Int i, j, k, kstart, sno;
+ /* Find the initial probe point just once. It will be the same in
+ all sectors and avoids multiple expensive % operations. */
+ n_full_lookups++;
+ k = -1;
+ kstart = HASH_TT(guest_addr);
+ vg_assert(kstart >= 0 && kstart < N_TTES_PER_SECTOR);
+ /* Search in all the sectors. Although the order should not matter,
+ it might be most efficient to search in the order youngest to
+ oldest. */
+ sno = youngest_sector;
+ for (i = 0; i < N_SECTORS; i++) {
-static
-void initialise_tt ( void )
-{
- Int i;
- vg_tt_used = 0;
- for (i = 0; i < VG_TT_SIZE; i++) {
- vg_tt[i].orig_addr = VG_TTE_EMPTY;
- }
- vg_invalidate_tt_fast();
-}
+ if (sectors[sno].tc == NULL)
+ goto notfound; /* sector not in use. */
-
-static
-void rebuild_TT ( void )
-{
- Int s;
-
- /* Throw away TT. */
- initialise_tt();
-
- /* Rebuild TT from the remaining quarters. */
- for (s = 0; s < VG_TC_N_SECTORS; s++) {
- for_each_tc(s, add_tt_entry);
- }
- pp_tt_tc_status ( "after rebuild of TC" );
-# if 1 /* def DEBUG_TRANSTAB */
- VG_(sanity_check_tt_tc)("rebuild_TT");
-# endif
-
-}
-
-
-/*------------------ TC HELPERS ------------------*/
-
-static
-void for_each_tc(Int s, void (*fn)(TCEntry *))
-{
- UChar *pc;
- UChar *pc_lim;
- TCEntry *tce;
-
- pc = &(vg_tc[s][0]);
- pc_lim = &(vg_tc[s][vg_tc_used[s]]);
- while (True) {
- if (pc >= pc_lim) break;
- tce = (TCEntry*)pc;
- pc += sizeof(TCEntry) + tce->trans_size;
- if (tce->orig_addr != VG_TTE_DELETED)
- (*fn)(tce);
- }
-}
-
-/* Find the oldest non-NULL, non-empty sector, or -1 if none such. */
-static
-Int find_oldest_sector ( void )
-{
- Int oldest_age, oldest, i;
- oldest_age = 1000 * 1000 * 1000;
- oldest = -1;
- for (i = 0; i < VG_TC_N_SECTORS; i++) {
- if (vg_tc[i] == NULL)
- continue;
- if (vg_tc_used[i] == 0)
- continue;
- if (vg_tc_age[i] < oldest_age) {
- oldest = i;
- oldest_age = vg_tc_age[i];
+ k = kstart;
+ for (j = 0; j < N_TTES_PER_SECTOR; j++) {
+ n_lookup_probes++;
+ if (sectors[sno].tt[k].status == InUse
+ && sectors[sno].tt[k].entry == guest_addr) {
+ /* found it */
+ if (upd_cache)
+ setFastCacheEntry(
+ guest_addr, sectors[sno].tt[k].tce,
+ &sectors[sno].tt[k].count );
+ if (result)
+ *result = sizeof(Addr64) + (AddrH)sectors[sno].tt[k].tce;
+ return True;
+ }
+ if (sectors[sno].tt[k].status == Empty)
+ break; /* not found in this sector */
+ k++;
+ if (k == N_TTES_PER_SECTOR)
+ k = 0;
 }
+
+ /* If we fall off the end, all entries are InUse and not
+ matching, or Deleted. In any case we did not find it in this
+ sector. */
+
+ notfound:
+ /* move to the next oldest sector */
+ sno = sno==0 ? (N_SECTORS-1) : (sno-1);
 }
- return oldest;
+
+ /* Not found in any sector. */
+ return False;
 }
-/* Discard the oldest sector, if any such exists. */
-static
-void discard_oldest_sector ( void )
-{
- Char msg[100];
- Int s = find_oldest_sector();
- if (s != -1) {
- vg_assert(s >= 0 && s < VG_TC_N_SECTORS);
- VG_(sprintf)(msg, "before discard of sector %d (%d bytes)",
- s, vg_tc_used[s]);
- pp_tt_tc_status ( msg );
- overall_out_count += vg_tc_stats_count[s];
- overall_out_osize += vg_tc_stats_osize[s];
- overall_out_tsize += vg_tc_stats_tsize[s];
- vg_tc_used[s] = 0;
- vg_tc_stats_count[s] = 0;
- vg_tc_stats_osize[s] = 0;
- vg_tc_stats_tsize[s] = 0;
- n_tc_discards++;
- }
-}
-
-
-/* Find an empty sector and bring it into use.
If there isn't one, - try and allocate one. If that fails, return -1. */ -static -Int maybe_commission_sector ( void ) -{ - Char msg[100]; - Int s; - for (s = 0; s < VG_TC_N_SECTORS; s++) { - if (vg_tc[s] != NULL && vg_tc_used[s] == 0) { - vg_tc_age[s] = overall_in_count; - VG_(sprintf)(msg, "after commission of sector %d " - "at time %d", - s, vg_tc_age[s]); - pp_tt_tc_status ( msg ); -# if 1 /* def DEBUG_TRANSTAB */ - VG_(sanity_check_tt_tc)("maybe_commission_sector"); -# endif - return s; - } - } - for (s = 0; s < VG_TC_N_SECTORS; s++) { - if (vg_tc[s] == NULL) { - vg_tc[s] = VG_(get_memory_from_mmap) - ( vg_tc_sector_szB, "trans-cache(sector)" ); - vg_tc_used[s] = 0; - VG_(sprintf)(msg, "after allocation of sector %d (size %d)", - s, vg_tc_sector_szB ); - pp_tt_tc_status ( msg ); - return maybe_commission_sector(); - } - } - return -1; -} - - -static -UChar* allocate ( Int nBytes ) -{ - vg_assert(IS_ALIGNED(nBytes)); - - /* Ensure the TT is still OK. */ - while (vg_tt_used >= VG_TT_LIMIT) { - discard_oldest_sector(); - rebuild_TT(); - vg_assert(vg_tt_used < VG_TT_LIMIT); - } - - /* Can we get it into the current sector? */ - if (vg_tc_current >= 0 - && vg_tc_current < VG_TC_N_SECTORS - && vg_tc[vg_tc_current] != NULL - && vg_tc_used[vg_tc_current] + nBytes <= vg_tc_sector_szB) { - /* Yes. */ - UChar* p = &(vg_tc[vg_tc_current][ vg_tc_used[vg_tc_current] ]); - vg_tc_used[vg_tc_current] += nBytes; - return p; - } - - /* Perhaps we can bring a new sector into use, for the first - time. */ - vg_tc_current = maybe_commission_sector(); - if (vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS) - return allocate(nBytes); - - /* That didn't work. We'll have to dump the oldest. */ - discard_oldest_sector(); - - rebuild_TT(); - vg_tc_current = maybe_commission_sector(); - vg_assert(vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS); -# ifdef DEBUG_TRANSTAB - VG_(sanity_check_tt_tc)(); -# endif - - return allocate(nBytes); -} - - -/* Just so these counts can be queried without making them globally - visible. */ -void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used ) -{ - Int s; - *tt_used = vg_tt_used; - *tc_used = 0; - for (s = 0; s < VG_TC_N_SECTORS; s++) - *tc_used += vg_tc_used[s]; -} - - -/* Do a sanity check on TT/TC. +/* Delete all translations which intersect with any part of the + specified guest address range. Note, this is SLOW. 
*/
+
+static inline
+Bool overlap1 ( Addr64 s1, UInt r1, Addr64 s2, UInt r2 )
+{
+ Addr64 e1 = s1 + (ULong)r1 - 1ULL;
+ Addr64 e2 = s2 + (ULong)r1 - 1ULL;
+ if (e1 < s2 || e2 < s1)
+ return False;
+ return True;
+}
+
+static inline
+Bool overlaps ( Addr64 start, UInt range, VexGuestExtents* vge )
+{
+ if (overlap1(start, range, vge->base[0], (UInt)vge->len[0]))
+ return True;
+ if (vge->n_used < 2)
+ return False;
+ if (overlap1(start, range, vge->base[1], (UInt)vge->len[1]))
+ return True;
+ if (vge->n_used < 3)
+ return False;
+ if (overlap1(start, range, vge->base[2], (UInt)vge->len[2]))
+ return True;
+ return False;
+}
+
+
+void VG_(discard_translations) ( Addr64 guest_start, UInt range )
+{
+ Int sno, i;
+ Bool anyDeleted = False;
+
+ for (sno = 0; sno < N_SECTORS; sno++) {
+ if (sectors[sno].tc == NULL)
+ continue;
+ for (i = 0; i < N_TTES_PER_SECTOR; i++) {
+ if (sectors[sno].tt[i].status == InUse
+ && overlaps( guest_start, range, &sectors[sno].tt[i].vge )) {
+ sectors[sno].tt[i].status = Deleted;
+ sectors[sno].tt_n_inuse--;
+ anyDeleted = True;
+ n_disc_count++;
+ n_disc_osize += vge_osize(&sectors[sno].tt[i].vge);
+ }
+ }
+ }
+
+ if (anyDeleted)
+ invalidateFastCache();
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Sanity checking ---*/
+/*------------------------------------------------------------*/
+
 void VG_(sanity_check_tt_tc) ( Char* who )
 {
- Int i, s;
- TTEntry* tte;
- TCEntry* tce;
- Char msg[200];
-
- vg_assert(VG_(strlen)(who) < 50);
- VG_(sprintf)(msg, "sanity_check_tt_tc: begin (%s)", who );
- pp_tt_tc_status ( msg );
-
- /* Some basic checks on the sector array. */
- for (i = 0; i < VG_TC_N_SECTORS; i++) {
- if (vg_tc[i] == NULL) {
- vg_assert(vg_tc_used[i] == 0);
- vg_assert(vg_tc_age[i] == 0);
- } else {
- vg_assert(vg_tc_used[i] <= vg_tc_sector_szB);
- }
- }
-
- /* Checks:
- - Each TT entry points to a valid and corresponding TC entry.
- */
- for (i = 0; i < VG_TT_SIZE; i++) {
- tte = &vg_tt[i];
- /* empty slots are harmless. */
- if (tte->orig_addr == VG_TTE_EMPTY) continue;
- /* all others should agree with the TC entry. */
- tce = tte->tcentry;
- // XXX: 64-bit cleanness: should this be IS_WORD_ALIGNED?
- vg_assert(IS_4_ALIGNED(tce));
- /* does this point into a valid TC sector? */
- for (s = 0; s < VG_TC_N_SECTORS; s++)
- if (vg_tc[s] != NULL
- && ((Addr)tce) >= (Addr)&vg_tc[s][0]
- && ((Addr)tce) < (Addr)&vg_tc[s][ vg_tc_used[s] ])
- break;
- vg_assert(s < VG_TC_N_SECTORS);
- /* It should agree with the TC entry on the orig_addr. This may
- be VG_TTE_DELETED, or a real orig addr. */
- vg_assert(tte->orig_addr == tce->orig_addr);
- }
-
- VG_(sprintf)(msg, "sanity_check_tt_tc: done (%s)", who );
- pp_tt_tc_status ( msg );
-}
-
-
-static __inline__ Int safe_idiv(Int a, Int b)
-{
- return (b == 0 ? 0 : a / b);
-}
-
-void VG_(print_tt_tc_stats)(void)
-{
- VG_(message)(Vg_DebugMsg,
- " TT/TC: %d tc sectors discarded.",
- n_tc_discards );
- VG_(message)(Vg_DebugMsg,
- " %d tt_fast misses.",
- n_tt_fast_misses);
- VG_(message)(Vg_DebugMsg,
- "translate: new %d (%d -> %d; ratio %d:10)",
- overall_in_count, overall_in_osize, overall_in_tsize,
- safe_idiv(10*overall_in_tsize, overall_in_osize));
- VG_(message)(Vg_DebugMsg,
- " discard %d (%d -> %d; ratio %d:10).",
- overall_out_count, overall_out_osize, overall_out_tsize,
- safe_idiv(10*overall_out_tsize, overall_out_osize));
-}
-
-Int VG_(get_bbs_translated) ( void )
-{
- return overall_in_count;
-}
-
-/* Add this already-filled-in entry to the TT.
Assumes that the - relevant code chunk has been placed in TC, along with a dummy back - pointer, which is inserted here. -*/ -void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size, - Addr trans_addr, Int trans_size ) -{ - Int i, nBytes, trans_size_aligned; - TCEntry* tce; - /* - VG_(printf)("add_to_trans_tab(%d) %x %d %x %d\n", - vg_tt_used, tte->orig_addr, tte->orig_size, - tte->trans_addr, tte->trans_size); - */ - - // paranoia - vg_assert(offsetof(TCEntry, payload) == PAYLOAD_OFFSET); - vg_assert(trans_size > 0); - - /* figure out how many bytes we require. */ - nBytes = CODE_ALIGN(trans_size + sizeof(TCEntry)); - trans_size_aligned = nBytes-sizeof(TCEntry); - vg_assert(IS_ALIGNED(nBytes)); - - tce = (TCEntry*)allocate(nBytes); - /* - VG_(printf)("allocate returned %p (code start %p)\n", - tce, &tce->payload[0]); - */ - vg_assert(vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS); - vg_assert(vg_tc_sector_szB > 0); - - /* Range check for writing in the trans cache. */ - vg_assert( ((UChar*)(tce)) - >= ((UChar*)(&vg_tc[vg_tc_current][0])) ); - vg_assert( ((UChar*)(&tce->payload[trans_size_aligned-1])) - < ((UChar*)(&vg_tc[vg_tc_current][vg_tc_sector_szB])) ); - - tce->orig_addr = orig_addr; - tce->orig_size = (UShort)orig_size; /* what's the point of storing this? */ - tce->trans_size = (UShort)trans_size_aligned; - for (i = 0; i < trans_size; i++) { - tce->payload[i] = ((UChar*)trans_addr)[i]; - } - - add_tt_entry(tce); - - /* Update stats. */ - overall_in_count ++; - overall_in_osize += orig_size; - overall_in_tsize += trans_size; - - vg_tc_stats_count[vg_tc_current] ++; - vg_tc_stats_osize[vg_tc_current] += orig_size; - vg_tc_stats_tsize[vg_tc_current] += trans_size; -} - - -/* Find the translation address for a given (original) code address. - If found, update VG_(tt_fast) so subsequent lookups are fast. If - no translation can be found, return zero. This routine is (the - only one) called from vg_run_innerloop. */ -Addr VG_(search_transtab) ( Addr original_addr ) -{ - TTEntry* tte; - VGP_PUSHCC(VgpSlowFindT); - tte = search_tt ( original_addr ); - if (tte == NULL) { - /* We didn't find it. vg_run_innerloop will have to request a - translation. */ - VGP_POPCC(VgpSlowFindT); - return (Addr)0; - } else { - /* Found it. Put the search result into the fast cache now. */ - UInt cno = (UInt)original_addr & VG_TT_FAST_MASK; - VG_(tt_fast)[cno] = (Addr)(tte->tcentry); - n_tt_fast_misses++; - VGP_POPCC(VgpSlowFindT); - return (Addr)&(tte->tcentry->payload[0]); - } -} - - -/* Invalidate translations of original code [start .. start + range - 1]. - This is slow, so you *really* don't want to call it very often. 
-*/ -void VG_(invalidate_translations) ( Addr start, UInt range ) -{ - Addr i_start, i_end, o_start, o_end; - UInt out_count, out_osize, out_tsize; - Int i; - TCEntry* tce; -# ifdef DEBUG_TRANSTAB - VG_(sanity_check_tt_tc)(); -# endif - i_start = start; - i_end = start + range - 1; - out_count = out_osize = out_tsize = 0; - - for (i = 0; i < VG_TT_SIZE; i++) { - if (vg_tt[i].orig_addr == VG_TTE_EMPTY - || vg_tt[i].orig_addr == VG_TTE_DELETED) continue; - tce = vg_tt[i].tcentry; - o_start = tce->orig_addr; - o_end = o_start + tce->trans_size - 1; - if (o_end < i_start || o_start > i_end) - continue; - - if (VG_(needs).basic_block_discards) - TL_(discard_basic_block_info)( tce->orig_addr, - tce->orig_size ); - - vg_tt[i].orig_addr = VG_TTE_DELETED; - tce->orig_addr = VG_TTE_DELETED; - - overall_out_count ++; - overall_out_osize += tce->orig_size; - overall_out_tsize += tce->trans_size; - out_count ++; - out_osize += tce->orig_size; - out_tsize += tce->trans_size; - } - - if (out_count > 0) { - vg_invalidate_tt_fast(); - VG_(sanity_check_tt_tc)("invalidate_translations"); -# ifdef DEBUG_TRANSTAB - { Addr aa; - for (aa = i_start; aa <= i_end; aa++) - vg_assert(search_tt ( aa ) == NULL); - } -# endif - } - - if (VG_(clo_verbosity) > 2) - VG_(message)(Vg_UserMsg, - "discard %d (%d -> %d) translations in range %p .. %p", - out_count, out_osize, out_tsize, i_start, i_end ); } @@ -645,52 +551,247 @@ void VG_(invalidate_translations) ( Addr start, UInt range ) void VG_(init_tt_tc) ( void ) { - Int s; + Int i, avg_codeszQ; /* Otherwise lots of things go wrong... */ - vg_assert(offsetof(TCEntry, payload) == PAYLOAD_OFFSET); - - /* Figure out how big each sector should be. */ - vg_tc_sector_szB - = (VG_TT_LIMIT /* max TT entries we expect */ - * (VG_(details).avg_translation_sizeB - + sizeof(TCEntry) - + (CODE_ALIGNMENT/2) /* avg alignment loss */) - ) - / VG_TC_N_SECTORS; + vg_assert(sizeof(ULong) == 8); + vg_assert(sizeof(Addr64) == 8); + + if (VG_(clo_verbosity) > 2) + VG_(message)(Vg_DebugMsg, + "TT/TC: VG_(init_tt_tc) " + "(startup of code management)"); + + /* Figure out how big each tc area should be. */ + avg_codeszQ + = (VG_(details).avg_translation_sizeB + 7) / 8; + + tc_sector_szQ + = N_TTES_PER_SECTOR_USABLE * (1 + avg_codeszQ); + /* Ensure the calculated value is not way crazy. */ - vg_assert(vg_tc_sector_szB >= 50000); - vg_assert(vg_tc_sector_szB <= 11500000); + vg_assert(tc_sector_szQ >= 2 * N_TTES_PER_SECTOR_USABLE); + vg_assert(tc_sector_szQ <= 50 * N_TTES_PER_SECTOR_USABLE); - for (s = 0; s < VG_TC_N_SECTORS; s++) { - vg_tc[s] = NULL; - vg_tc_used[s] = 0; - vg_tc_age[s] = 0; - vg_tc_stats_count[s] = 0; - vg_tc_stats_osize[s] = 0; - vg_tc_stats_tsize[s] = 0; + /* Initialise the sectors */ + youngest_sector = 0; + for (i = 0; i < N_SECTORS; i++) { + sectors[i].tc = NULL; + sectors[i].tt = NULL; + sectors[i].tc_next = NULL; + sectors[i].tt_n_inuse = 0; } - vg_tc_current = 0; - vg_tt = VG_(get_memory_from_mmap) ( VG_TT_SIZE * sizeof(TTEntry), - "trans-table" ); - /* The main translation table is empty. */ - initialise_tt(); + /* and the fast caches. 
*/
+ invalidateFastCache();
 if (VG_(clo_verbosity) > 2) {
 VG_(message)(Vg_DebugMsg,
- "Translation Cache: using %d sectors of %d bytes each",
- VG_TC_N_SECTORS, vg_tc_sector_szB );
+ "TT/TC: cache: %d sectors of %d bytes each = %d total",
+ N_SECTORS, 8 * tc_sector_szQ,
+ N_SECTORS * 8 * tc_sector_szQ );
 VG_(message)(Vg_DebugMsg,
- "Translation Table: %d total entries, max occupancy %d (%d%%)",
- VG_TT_SIZE, VG_TT_LIMIT, VG_TT_LIMIT_PERCENT );
+ "TT/TC: table: %d total entries, max occupancy %d (%d%%)",
+ N_SECTORS * N_TTES_PER_SECTOR,
+ N_SECTORS * N_TTES_PER_SECTOR_USABLE,
+ SECTOR_TT_LIMIT_PERCENT );
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Printing out statistics. ---*/
+/*------------------------------------------------------------*/
+
+static ULong safe_idiv( ULong a, ULong b )
+{
+ return (b == 0 ? 0 : a / b);
+}
+
+UInt VG_(get_bbs_translated) ( void )
+{
+ return n_in_count;
+}
+
+void VG_(print_tt_tc_stats) ( void )
+{
+ VG_(message)(Vg_DebugMsg,
+ " tt/tc: %llu tt lookups requiring %llu probes",
+ n_full_lookups, n_lookup_probes );
+ VG_(message)(Vg_DebugMsg,
+ " tt/tc: %llu fast-cache updates, %llu flushes",
+ n_fast_updates, n_fast_flushes );
+
+ VG_(message)(Vg_DebugMsg,
+ "translate: new %lld (%lld -> %lld; ratio %lld:10)",
+ n_in_count, n_in_osize, n_in_tsize,
+ safe_idiv(10*n_in_tsize, n_in_osize));
+ VG_(message)(Vg_DebugMsg,
+ "translate: dumped %lld (%lld -> ?" "?)",
+ n_dump_count, n_dump_osize );
+ VG_(message)(Vg_DebugMsg,
+ "translate: discarded %lld (%lld -> ?" "?)",
+ n_disc_count, n_disc_osize );
+}
+
+/*------------------------------------------------------------*/
+/*--- Printing out of profiling results. ---*/
+/*------------------------------------------------------------*/
+
+/* Only the top N_MAX bbs will be displayed. */
+#define N_MAX 10
+
+static TTEntry* tops[N_MAX];
+
+static ULong score ( TTEntry* tte )
+{
+ return ((ULong)tte->weight) * ((ULong)tte->count);
+}
+
+static Bool heavier ( TTEntry* t1, TTEntry* t2 )
+{
+ return score(t1) > score(t2);
+}
+
+/* Print n/m in form xx.yy% */
+static
+void percentify ( ULong n, ULong m, Int field_width, Char* buf)
+{
+ Int i, len, space;
+ ULong lo, hi;
+ if (m == 0) m = 1; /* stay sane */
+ hi = (n * 100) / m;
+ lo = (((n * 100) - hi * m) * 100) / m;
+ vg_assert(lo < 100);
+ if (lo < 10)
+ VG_(sprintf)(buf, "%lld.0%lld%%", hi, lo);
+ else
+ VG_(sprintf)(buf, "%lld.%lld%%", hi, lo);
+
+ len = VG_(strlen)(buf);
+ space = field_width - len;
+ if (space < 0) space = 0; /* Allow for v. small field_width */
+ i = len;
+
+ /* Right justify in field */
+ for ( ; i >= 0; i--) buf[i + space] = buf[i];
+ for (i = 0; i < space; i++) buf[i] = ' ';
+}
+
+
+void VG_(show_BB_profile) ( void )
+{
+ Char name[64];
+ Int sno, i, r, s;
+ ULong score_total, score_cumul, score_here;
+ Char buf_cumul[10];
+ Char buf_here[10];
+
+ /* First, compute the total weighted count, and find the top N
+ ttes. tops contains pointers to the most-used N_MAX blocks, in
+ descending order (viz, tops[0] is the highest scorer). */
+ for (i = 0; i < N_MAX; i++)
+ tops[i] = NULL;
+
+ score_total = 0;
+
+ for (sno = 0; sno < N_SECTORS; sno++) {
+ if (sectors[sno].tc == NULL)
+ continue;
+ for (i = 0; i < N_TTES_PER_SECTOR; i++) {
+ if (sectors[sno].tt[i].status != InUse)
+ continue;
+ score_total += score(&sectors[sno].tt[i]);
+ /* Find the rank for sectors[sno].tt[i].
*/
+ r = N_MAX-1;
+ while (True) {
+ if (r == -1)
+ break;
+ if (tops[r] == NULL) {
+ r--;
+ continue;
+ }
+ if (heavier(&sectors[sno].tt[i], tops[r])) {
+ r--;
+ continue;
+ }
+ break;
+ }
+ r++;
+ vg_assert(r >= 0 && r <= N_MAX);
+ /* This bb should be placed at r, and bbs above it shifted
+ upwards one slot. */
+ if (r < N_MAX) {
+ for (s = N_MAX-1; s > r; s--)
+ tops[s] = tops[s-1];
+ tops[r] = &sectors[sno].tt[i];
+ }
+ }
 }
-# ifdef DEBUG_TRANSTAB
- VG_(sanity_check_tt_tc)();
-# endif
+ VG_(printf)("\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("--- BEGIN BB Profile (summary of scores) ---\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("\n");
+
+ VG_(printf)("Total score = %lld\n\n", score_total);
+
+ score_cumul = 0;
+ for (r = 0; r < N_MAX; r++) {
+ if (tops[r] == NULL)
+ continue;
+ name[0] = 0;
+ VG_(get_fnname_w_offset)(tops[r]->entry, name, 64);
+ name[63] = 0;
+ score_here = score(tops[r]);
+ score_cumul += score_here;
+ percentify(score_cumul, score_total, 6, buf_cumul);
+ percentify(score_here, score_total, 6, buf_here);
+ VG_(printf)("%3d: (%9lld %s) %9lld %s 0x%llx %s\n",
+ r,
+ score_cumul, buf_cumul,
+ score_here, buf_here, tops[r]->entry, name );
+ }
+
+ VG_(printf)("\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("--- BB Profile (BB details) ---\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("\n");
+
+ score_cumul = 0;
+ for (r = 0; r < N_MAX; r++) {
+ if (tops[r] == NULL)
+ continue;
+ name[0] = 0;
+ VG_(get_fnname_w_offset)(tops[r]->entry, name, 64);
+ name[63] = 0;
+ score_here = score(tops[r]);
+ score_cumul += score_here;
+ percentify(score_cumul, score_total, 6, buf_cumul);
+ percentify(score_here, score_total, 6, buf_here);
+ VG_(printf)("\n");
+ VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= begin BB rank %d "
+ "=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r);
+ VG_(printf)("%3d: (%9lld %s) %9lld %s 0x%llx %s\n",
+ r,
+ score_cumul, buf_cumul,
+ score_here, buf_here, tops[r]->entry, name );
+ VG_(printf)("\n");
+ VG_(translate)(0, tops[r]->entry, True, VG_(clo_profile_flags));
+ VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= end BB rank %d "
+ "=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r);
+ }
+
+ VG_(printf)("\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("--- END BB Profile ---\n");
+ VG_(printf)("------------------------------------------------------------\n");
+ VG_(printf)("\n");
 }
+
 /*--------------------------------------------------------------------*/
 /*--- end vg_transtab.c ---*/
 /*--------------------------------------------------------------------*/
diff --git a/coregrind/x86/dispatch.S b/coregrind/x86/dispatch.S
index 504053449..19489cca2 100644
--- a/coregrind/x86/dispatch.S
+++ b/coregrind/x86/dispatch.S
@@ -37,13 +37,6 @@
 /*--- The dispatch loop.
---*/ /*------------------------------------------------------------*/ -#define TT_LOOKUP(reg, fail) \ - movl %eax, reg; \ - andl $VG_TT_FAST_MASK, reg; \ - movl VG_(tt_fast)(,reg,4), reg; \ - cmpl %eax, (reg); \ - jnz fail - /* signature: UInt VG_(run_innerloop) ( void* guest_state ) */ .globl VG_(run_innerloop) @@ -99,13 +92,19 @@ dispatch_boring: jz counter_is_zero /* try a fast lookup in the translation cache */ - TT_LOOKUP(%ebx, fast_lookup_failed) + movl %eax, %ebx + andl $VG_TT_FAST_MASK, %ebx + movl VG_(tt_fast)(,%ebx,4), %ecx + cmpl %eax, (%ecx) + jnz fast_lookup_failed + movl VG_(tt_fastN)(,%ebx,4), %edx + incl (%edx) - /* Found a match. Call the tce.payload field. The magic 8 - value is offsetof(TCEntry,payload) on a 32-bit platform. */ + /* Found a match. Call tce[1], which is 8 bytes along, since + each tce element is a 64-bit int. */ - addl $8, %ebx - call *%ebx + addl $8, %ecx + call *%ecx /* %eax holds destination (original) address. diff --git a/include/basic_types.h b/include/basic_types.h index 5a5c2ea31..112b26e29 100644 --- a/include/basic_types.h +++ b/include/basic_types.h @@ -50,6 +50,7 @@ typedef unsigned long UWord; // 32 64 typedef signed long Word; // 32 64 typedef UWord Addr; // 32 64 +typedef UWord AddrH; // 32 64 typedef UWord SizeT; // 32 64 typedef Word SSizeT; // 32 64 diff --git a/none/tests/cmdline2.stdout.exp b/none/tests/cmdline2.stdout.exp index 10e906c4f..614689a04 100644 --- a/none/tests/cmdline2.stdout.exp +++ b/none/tests/cmdline2.stdout.exp @@ -43,9 +43,9 @@ usage: valgrind --tool= [options] prog-and-args --single-step=no|yes translate each instr separately? [no] --optimise=no|yes improve intermediate code? [yes] --profile=no|yes profile? (tool must be built for it) [no] - --bbprofile=no|yes profile bbs? [no] --branchpred=yes|no generate branch prediction hints [no] - --trace-codegen= show generated code? (X = 0|1) [00000000] + --trace-flags= show generated code? (X = 0|1) [00000000] + --profile-flags= ditto, but for profiling (X = 0|1) [00000000] --trace-notbelow= only show BBs above [0] --trace-syscalls=no|yes show all system calls? [no] --trace-signals=no|yes show signal handling details? [no] @@ -61,7 +61,7 @@ usage: valgrind --tool= [options] prog-and-args --vex-guest-max-insns 1 .. 100 [50] --vex-guest-chase-thresh 0 .. 99 [10] - --trace-codegen values (omit the middle space): + --trace-flags and --profile-flags values (omit the middle space): 1000 0000 show conversion into IR 0100 0000 show after initial opt 0010 0000 show after instrumentation