diff --git a/coregrind/core.h b/coregrind/core.h index 6a8484086..06f44652a 100644 --- a/coregrind/core.h +++ b/coregrind/core.h @@ -289,11 +289,10 @@ extern Int VG_(clo_n_suppressions); /* The names of the suppression files. */ extern Char* VG_(clo_suppressions)[VG_CLO_MAX_SFILES]; -/* PROFILE: collect bb profiling data? default: NO */ -extern Bool VG_(clo_bbprofile); - /* DEBUG: print generated code? default: 00000000 ( == NO ) */ -extern Bool VG_(clo_trace_codegen); +extern Bool VG_(clo_trace_flags); +/* DEBUG: do bb profiling? default: 00000000 ( == NO ) */ +extern Bool VG_(clo_profile_flags); /* DEBUG: if tracing codegen, be quiet until after this bb ( 0 ) */ extern Int VG_(clo_trace_notbelow); /* DEBUG: print system calls? default: NO */ @@ -1101,8 +1100,11 @@ extern void VG_(demangle) ( Char* orig, Char* result, Int result_size ); Exports of vg_translate.c ------------------------------------------------------------------ */ -extern Bool VG_(translate) ( ThreadId tid, Addr orig_addr, Bool debugging ); - +extern +Bool VG_(translate) ( ThreadId tid, + Addr64 orig_addr, + Bool debugging_translation, + Int debugging_verbosity ); /* --------------------------------------------------------------------- Exports of vg_execontext.c. @@ -1711,21 +1713,32 @@ GEN_SYSCALL_WRAPPER(sys_mq_getsetattr); // * P? Exports of vg_transtab.c ------------------------------------------------------------------ */ -/* The fast-cache for tt-lookup. */ -extern Addr VG_(tt_fast)[VG_TT_FAST_SIZE]; +/* The fast-cache for tt-lookup, and for finding counters. 
*/ +extern ULong* VG_(tt_fast) [VG_TT_FAST_SIZE]; +extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; extern void VG_(init_tt_tc) ( void ); -extern void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size, - Addr trans_addr, Int trans_size ); -extern Addr VG_(search_transtab) ( Addr original_addr ); -extern void VG_(invalidate_translations) ( Addr start, UInt range ); +extern +void VG_(add_to_trans_tab)( VexGuestExtents* vge, + Addr64 entry, + AddrH code, + UInt code_len ); + +extern Bool VG_(search_transtab) ( /*OUT*/AddrH* result, + Addr64 guest_addr, + Bool upd_cache ); + +extern void VG_(discard_translations) ( Addr64 start, UInt range ); extern void VG_(sanity_check_tt_tc) ( Char* caller ); extern void VG_(print_tt_tc_stats) ( void ); -extern Int VG_(get_bbs_translated) ( void ); +extern UInt VG_(get_bbs_translated) ( void ); + +extern void VG_(show_BB_profile) ( void ); + /* --------------------------------------------------------------------- Exports of vg_syscall.S diff --git a/coregrind/core_asm.h b/coregrind/core_asm.h index 6b78520f0..386b8e3f6 100644 --- a/coregrind/core_asm.h +++ b/coregrind/core_asm.h @@ -56,7 +56,7 @@ /* Constants for the fast translation lookup cache. 
*/ -#define VG_TT_FAST_BITS 15 +#define VG_TT_FAST_BITS 16 #define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS) #define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1) diff --git a/coregrind/vg_errcontext.c b/coregrind/vg_errcontext.c index a6f8b2d8e..612f666ec 100644 --- a/coregrind/vg_errcontext.c +++ b/coregrind/vg_errcontext.c @@ -665,8 +665,9 @@ void VG_(show_all_errors) ( void ) pp_Error( p_min, False ); if ((i+1 == VG_(clo_dump_error))) { - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, - p_min->where->ips[0], /*debugging*/True); + VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + p_min->where->ips[0], /*debugging*/True, + 0xFE/*verbosity*/); } p_min->count = 1 << 30; diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c index 3ea5cbef0..2cc023143 100644 --- a/coregrind/vg_main.c +++ b/coregrind/vg_main.c @@ -1483,8 +1483,8 @@ Int VG_(clo_input_fd) = 0; /* stdin */ Int VG_(clo_n_suppressions) = 0; Char* VG_(clo_suppressions)[VG_CLO_MAX_SFILES]; Bool VG_(clo_profile) = False; -Bool VG_(clo_bbprofile) = False; -UChar VG_(clo_trace_codegen) = 0; // 00000000b +UChar VG_(clo_trace_flags) = 0; // 00000000b +UChar VG_(clo_profile_flags) = 0; // 00000000b Int VG_(clo_trace_notbelow) = 0; Bool VG_(clo_trace_syscalls) = False; Bool VG_(clo_trace_signals) = False; @@ -1561,9 +1561,9 @@ void usage ( Bool debug_help ) " --single-step=no|yes translate each instr separately? [no]\n" " --optimise=no|yes improve intermediate code? [yes]\n" " --profile=no|yes profile? (tool must be built for it) [no]\n" -" --bbprofile=no|yes profile bbs? [no]\n" " --branchpred=yes|no generate branch prediction hints [no]\n" -" --trace-codegen= show generated code? (X = 0|1) [00000000]\n" +" --trace-flags= show generated code? (X = 0|1) [00000000]\n" +" --profile-flags= ditto, but for profiling (X = 0|1) [00000000]\n" " --trace-notbelow= only show BBs above [0]\n" " --trace-syscalls=no|yes show all system calls? 
[no]\n" " --trace-signals=no|yes show signal handling details? [no]\n" @@ -1579,7 +1579,7 @@ void usage ( Bool debug_help ) " --vex-guest-max-insns 1 .. 100 [50]\n" " --vex-guest-chase-thresh 0 .. 99 [10]\n" "\n" -" --trace-codegen values (omit the middle space):\n" +" --trace-flags and --profile-flags values (omit the middle space):\n" " 1000 0000 show conversion into IR\n" " 0100 0000 show after initial opt\n" " 0010 0000 show after instrumentation\n" @@ -1752,7 +1752,6 @@ static void process_cmd_line_options( UInt* client_auxv, const char* toolname ) else VG_BOOL_CLO("--pointercheck", VG_(clo_pointercheck)) else VG_BOOL_CLO("--support-elan3", VG_(clo_support_elan3)) else VG_BOOL_CLO("--profile", VG_(clo_profile)) - else VG_BOOL_CLO("--bbprofile", VG_(clo_bbprofile)) else VG_BOOL_CLO("--run-libc-freeres", VG_(clo_run_libc_freeres)) else VG_BOOL_CLO("--show-below-main", VG_(clo_show_below_main)) else VG_BOOL_CLO("--time-stamp", VG_(clo_time_stamp)) @@ -1834,21 +1833,42 @@ static void process_cmd_line_options( UInt* client_auxv, const char* toolname ) VG_(clo_n_suppressions)++; } - /* "vwxyz" --> 000zyxwv (binary) */ - else if (VG_CLO_STREQN(16, arg, "--trace-codegen=")) { + /* "stuvwxyz" --> stuvwxyz (binary) */ + else if (VG_CLO_STREQN(14, arg, "--trace-flags=")) { + Int j; + char* opt = & arg[14]; + + if (8 != VG_(strlen)(opt)) { + VG_(message)(Vg_UserMsg, + "--trace-flags argument must have 8 digits"); + VG_(bad_option)(arg); + } + for (j = 0; j < 8; j++) { + if ('0' == opt[j]) { /* do nothing */ } + else if ('1' == opt[j]) VG_(clo_trace_flags) |= (1 << (7-j)); + else { + VG_(message)(Vg_UserMsg, "--trace-flags argument can only " + "contain 0s and 1s"); + VG_(bad_option)(arg); + } + } + } + + /* "stuvwxyz" --> stuvwxyz (binary) */ + else if (VG_CLO_STREQN(16, arg, "--profile-flags=")) { Int j; char* opt = & arg[16]; if (8 != VG_(strlen)(opt)) { VG_(message)(Vg_UserMsg, - "--trace-codegen argument must have 8 digits"); + "--profile-flags argument must have 8 
digits"); VG_(bad_option)(arg); } for (j = 0; j < 8; j++) { if ('0' == opt[j]) { /* do nothing */ } - else if ('1' == opt[j]) VG_(clo_trace_codegen) |= (1 << (7-j)); + else if ('1' == opt[j]) VG_(clo_profile_flags) |= (1 << (7-j)); else { - VG_(message)(Vg_UserMsg, "--trace-codegen argument can only " + VG_(message)(Vg_UserMsg, "--profile-flags argument can only " "contain 0s and 1s"); VG_(bad_option)(arg); } @@ -2808,6 +2828,9 @@ int main(int argc, char **argv) if (VG_(clo_profile)) VGP_(done_profiling)(); + if (VG_(clo_profile_flags) > 0) + VG_(show_BB_profile)(); + /* We're exiting, so nuke all the threads and clean up the proxy LWPs */ vg_assert(src == VgSrc_FatalSig || VG_(threads)[last_run_tid].status == VgTs_Runnable || diff --git a/coregrind/vg_memory.c b/coregrind/vg_memory.c index cc94abac0..f4fb1a9ae 100644 --- a/coregrind/vg_memory.c +++ b/coregrind/vg_memory.c @@ -91,7 +91,7 @@ Bool VG_(seg_overlaps)(const Segment *s, Addr p, SizeT len) static void recycleseg(Segment *s) { if (s->flags & SF_CODE) - VG_(invalidate_translations)(s->addr, s->len); + VG_(discard_translations)(s->addr, s->len); if (s->filename != NULL) VG_(arena_free)(VG_AR_CORE, (Char *)s->filename); diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c index 9ec067fba..89a17ecdd 100644 --- a/coregrind/vg_scheduler.c +++ b/coregrind/vg_scheduler.c @@ -720,7 +720,7 @@ VgSchedReturnCode do_scheduler ( Int* exitcode, ThreadId* last_run_tid ) UInt trc; Int done_this_time, n_in_bounded_wait; Int n_exists, n_waiting_for_reaper; - Addr trans_addr; + Bool found; /* Start with the root thread. tid in general indicates the currently runnable/just-finished-running thread. */ @@ -880,12 +880,13 @@ VgSchedReturnCode do_scheduler ( Int* exitcode, ThreadId* last_run_tid ) /* Trivial event. Miss in the fast-cache. Do a full lookup for it. 
*/ - trans_addr = VG_(search_transtab)( ip ); - if (trans_addr == (Addr)0) { + found = VG_(search_transtab)( NULL, + ip, True/*upd_fast_cache*/ ); + if (!found) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False )) { - trans_addr = VG_(search_transtab)( ip ); - if (trans_addr == (Addr)0) + if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/ )) { + found = VG_(search_transtab)( NULL, ip, True ); + if (!found) VG_(core_panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry"); } else { // If VG_(translate)() fails, it's because it had to throw @@ -3280,7 +3281,7 @@ void do_client_request ( ThreadId tid, UWord* arg ) " addr %p, len %d\n", (void*)arg[1], arg[2] ); - VG_(invalidate_translations)( arg[1], arg[2] ); + VG_(discard_translations)( arg[1], arg[2] ); SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */ break; diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c index ae0c138ce..33de4c69b 100644 --- a/coregrind/vg_symtab2.c +++ b/coregrind/vg_symtab2.c @@ -2352,7 +2352,7 @@ static Bool resolve_redir(CodeRedirect *redir, const SegInfo *si) redir->to_lib, redir->to_sym, redir->to_addr); } - if (VG_(search_transtab)(redir->from_addr) != 0) { + if (VG_(search_transtab)(NULL, redir->from_addr, False)) { /* For some given (from, to) redir, the "from" function got called before the .so containing "to" became available. 
We know this because there is already a translation for the @@ -2377,7 +2377,7 @@ static Bool resolve_redir(CodeRedirect *redir, const SegInfo *si) " %s (%p -> %p)", redir->from_sym, redir->from_addr, redir->to_addr ); } - VG_(invalidate_translations)(redir->from_addr, 1); + VG_(discard_translations)(redir->from_addr, 1); } VG_(SkipList_Insert)(&sk_resolved_redir, redir); diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c index 3c4790aa6..7d3510406 100644 --- a/coregrind/vg_translate.c +++ b/coregrind/vg_translate.c @@ -343,14 +343,17 @@ static Bool need_to_handle_SP_assignment(void) } -Bool VG_(translate) ( ThreadId tid, Addr orig_addr, - Bool debugging_translation ) +Bool VG_(translate) ( ThreadId tid, + Addr64 orig_addr, + Bool debugging_translation, + Int debugging_verbosity ) { - Addr redir, orig_addr0 = orig_addr; - Int orig_size, tmpbuf_used, verbosity; + Addr64 redir, orig_addr0 = orig_addr; + Int tmpbuf_used, verbosity; Bool notrace_until_done; UInt notrace_until_limit = 0; Segment* seg; + VexGuestExtents vge; /* Make sure Vex is initialised right. 
*/ VexTranslateResult tres; @@ -372,13 +375,24 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, redir = VG_(code_redirect)(orig_addr); if (redir != orig_addr && VG_(clo_verbosity) >= 2) { + Bool ok; Char name1[64] = ""; Char name2[64] = ""; - VG_(get_fnname_w_offset)(orig_addr, name1, 64); - VG_(get_fnname_w_offset)(redir, name2, 64); - name1[63] = name2[63] = 0; + name1[0] = name2[0] = 0; + ok = VG_(get_fnname_w_offset)(orig_addr, name1, 64); + if (ok) { + name1[63] = 0; + } else { + VG_(strcpy)(name1, "???"); + } + ok = VG_(get_fnname_w_offset)(redir, name2, 64); + if (ok) { + name2[63] = 0; + } else { + VG_(strcpy)(name2, "???"); + } VG_(message)(Vg_UserMsg, - "TRANSLATE: %p (%s) redirected to %p (%s)", + "TRANSLATE: 0x%llx (%s) redirected to 0x%llx (%s)", orig_addr, name1, redir, name2 ); } @@ -390,7 +404,8 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, few blocks translated prior to a failure. Set notrace_until_limit to be the number of translations to be made before --trace-codegen= style printing takes effect. */ - notrace_until_done = VG_(get_bbs_translated)() >= notrace_until_limit; + notrace_until_done + = VG_(get_bbs_translated)() >= notrace_until_limit; seg = VG_(find_segment)(orig_addr); @@ -414,11 +429,11 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, seg->flags |= SF_CODE; /* contains cached code */ /* If doing any code printing, print a basic block start marker */ - if (VG_(clo_trace_codegen) || debugging_translation) { + if (VG_(clo_trace_flags) || debugging_translation) { Char fnname[64] = ""; VG_(get_fnname_w_offset)(orig_addr, fnname, 64); VG_(printf)( - "==== BB %d %s(%p) approx BBs exec'd %llu ====\n", + "==== BB %d %s(0x%llx) approx BBs exec'd %lld ====\n", VG_(get_bbs_translated)(), fnname, orig_addr, VG_(bbs_done)); } @@ -426,21 +441,22 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, /* True if a debug trans., or if bit N set in VG_(clo_trace_codegen). 
*/ verbosity = 0; if (debugging_translation) { - verbosity = 0xFE; + verbosity = debugging_verbosity; } else - if ( (VG_(clo_trace_codegen) > 0 + if ( (VG_(clo_trace_flags) > 0 && VG_(get_bbs_translated)() >= VG_(clo_trace_notbelow) )) { - verbosity = VG_(clo_trace_codegen); + verbosity = VG_(clo_trace_flags); } /* Actually do the translation. */ tres = LibVEX_Translate ( VG_(vex_arch), VG_(vex_subarch), VG_(vex_arch), VG_(vex_subarch), - (Char*)orig_addr, (Addr64)orig_addr, + (UChar*)orig_addr, + (Addr64)orig_addr, chase_into_ok, - &orig_size, + &vge, tmpbuf, N_TMPBUF, &tmpbuf_used, TL_(instrument), need_to_handle_SP_assignment() @@ -458,8 +474,6 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, #undef DECIDE_IF_PRINTING_CODEGEN /* Copy data at trans_addr into the translation cache. */ - /* Since the .orig_size and .trans_size fields are UShort, be paranoid. */ - vg_assert(orig_size >= 0 && orig_size < 65536); vg_assert(tmpbuf_used > 0 && tmpbuf_used < 65536); // If debugging, don't do anything with the translated block; we @@ -467,8 +481,10 @@ Bool VG_(translate) ( ThreadId tid, Addr orig_addr, if (!debugging_translation) { // Note that we use orig_addr0, not orig_addr, which might have been // changed by the redirection - VG_(add_to_trans_tab)( orig_addr0, orig_size, - (Addr)(&tmpbuf[0]), tmpbuf_used ); + VG_(add_to_trans_tab)( &vge, + orig_addr0, + (Addr)(&tmpbuf[0]), + tmpbuf_used ); } VGP_POPCC(VgpTranslate); diff --git a/coregrind/vg_transtab.c b/coregrind/vg_transtab.c index 13dbc6d99..af3e1602b 100644 --- a/coregrind/vg_transtab.c +++ b/coregrind/vg_transtab.c @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright (C) 2000-2004 Julian Seward + Copyright (C) 2000-2005 Julian Seward jseward@acm.org This program is free software; you can redistribute it and/or @@ -40,602 +40,508 @@ /*------------------ CONSTANTS ------------------*/ -/* Number of sectors the TC is divided into. 
*/ -#define VG_TC_N_SECTORS 8 +/* Number of sectors the TC is divided into. If you need a larger + overall translation cache, increase this value. */ +#define N_SECTORS 8 -/* Calculated once at startup and never changed. */ -static /* const */ Int vg_tc_sector_szB = 0; +/* Number of TC entries in each sector. This needs to be a prime + number to work properly, and it is strongly recommended not to + change this. */ +#define N_TTES_PER_SECTOR /*30011*/ 40009 -/* Number of entries in the translation table. This must be a prime - number in order to make the hashing work properly. */ -#define VG_TT_SIZE /*5281*/ /*100129*/ /*200191*/ 250829 /*300007*/ +/* Because each sector contains a hash table of TTEntries, we need to + specify the maximum allowable loading, after which the sector is + deemed full. */ +#define SECTOR_TT_LIMIT_PERCENT 60 -/* Do an LRU pass when the translation table becomes this full. */ -#define VG_TT_LIMIT_PERCENT /*67*/ 80 - -#define VG_TT_LIMIT ((VG_TT_SIZE * VG_TT_LIMIT_PERCENT) / 100) +/* The sector is deemed full when this many entries are in it. */ +#define N_TTES_PER_SECTOR_USABLE \ + ((N_TTES_PER_SECTOR * SECTOR_TT_LIMIT_PERCENT) / 100) /*------------------ TYPES ------------------*/ -/* An entry in TC. Payload always is always padded out to a - word-aligned quantity so that these structs are always - word-aligned. Note, the layout of this is known by - /dispatch.S, so do not change it unless you change them - too. */ -typedef - struct { - /* 32-bit or 64-bit offsets */ - /* +0 or 0 */ Addr orig_addr; - /* +4 or 8 */ UShort orig_size; - /* +6 or 10 */ UShort trans_size; - /* +8 or 12 */ UChar payload[0]; - } - TCEntry; - -/* An entry in TT. */ +/* A translation-cache entry is two parts: + - The guest address of the first (entry) bb in the translation, + as a 64-bit word. + - One or more 64-bit words containing the code. + It is supposed to be 64-bit aligned. 
+*/ +/* typedef struct { - Addr orig_addr; - TCEntry* tcentry; + Addr64 orig_addr; + ULong code[0]; + } + TCEntry; +*/ + +/* A translation-table entry. This indicates precisely which areas of + guest code are included in the translation, and contains all other + auxiliary info too. */ +typedef + struct { + /* Profiling only: the count and weight (arbitrary meaning) for + this translation. Weight is a property of the translation + itself and computed once when the translation is created. + Count is an entry count for the translation and is + incremented by 1 every time the translation is used, if we + are profiling. */ + UInt count; + UShort weight; + + /* Status of the slot. Note, we need to be able to do lazy + deletion, hence the Deleted state. */ + enum { InUse, Deleted, Empty } status; + + /* Pointer to the corresponding TCEntry (must be in the same + sector!) */ + ULong* tce; + + /* This is the original guest address that purportedly is the + entry point of the translation. You might think that .entry + should be the same as .vge->base[0], and most of the time it + is. However, when doing redirections, that is not the case. + .vge must always correctly describe the guest code sections + from which this translation was made. However, .entry may or + may not be a lie, depending on whether or not we're doing + redirection. */ + Addr64 entry; + + /* This structure describes precisely what ranges of guest code + the translation covers, so we can decide whether or not to + delete it when translations of a given address range are + invalidated. */ + VexGuestExtents vge; } TTEntry; -#define PAYLOAD_OFFSET (sizeof(void*)==8 ? 12 : 8) -#define CODE_ALIGNMENT sizeof(void*) // alignment of TCEntries -#define CODE_ALIGN(a) (((a)+CODE_ALIGNMENT-1) & ~(CODE_ALIGNMENT-1)) -#define IS_ALIGNED(a) (((a) & (CODE_ALIGNMENT-1)) == 0) +/* Finally, a sector itself. 
Each sector contains an array of + TCEntries, which hold code, and an array of TTEntries, containing + all required administrative info. Profiling is supported using the + TTEntry .count and .weight fields, if required. Each sector is + independent in that no cross-sector references are allowed. + If the sector is not in use, all three pointers are NULL and + tt_n_inuse is zero. +*/ +typedef + struct { + /* The TCEntry area. Size of this depends on the average + translation size. We try and size it so it becomes full + precisely when this sector's translation table (tt) reaches + its load limit (SECTOR_TT_LIMIT_PERCENT). */ + ULong* tc; + /* The TTEntry array. This is a fixed size, always containing + exactly N_TTES_PER_SECTOR entries. */ + TTEntry* tt; -/* Denotes an empty TT slot, when TTEntry.orig_addr holds this - value. */ -#define VG_TTE_EMPTY ((Addr)1) + /* This points to the current allocation point in tc. */ + ULong* tc_next; -/* Denotes an empty TT slot, when TTEntry.orig_addr holds this - value. */ -#define VG_TTE_DELETED ((Addr)3) - -/* A bogus TCEntry which hopefully does not match code from any valid - address. This is what all VG_(tt_fast) entries are made to point - at when we want to invalidate it. */ -static const TCEntry vg_tc_bogus_TCEntry = { ((Addr)5), 0, 0 }; + /* The count of tt entries with state InUse. */ + Int tt_n_inuse; + } + Sector; /*------------------ DECLS ------------------*/ -/* The translation cache sectors. These are NULL until allocated - dynamically. */ -static UChar* vg_tc[VG_TC_N_SECTORS]; +/* The root data structure is an array of sectors. The index of the + youngest sector is recorded, and new translations are put into that + sector. When it fills up, we move along to the next sector and + start to fill that up, wrapping around at the end of the array. + That way, once all N_TC_SECTORS have been bought into use for the + first time, and are full, we then re-use the oldest sector, + endlessly. 
-/* Count of bytes used in each sector of the TC. */ -static Int vg_tc_used[VG_TC_N_SECTORS]; + When running, youngest sector should be between >= 0 and < + N_TC_SECTORS. The initial -1 value indicates the TT/TC system is + not yet initialised. +*/ +static Sector sectors[N_SECTORS]; +static Int youngest_sector = -1; -/* The age of each sector, so we can find the oldest. We just use the - global count of translations made when the sector was brought into - use. Doesn't matter if this mechanism gets confused (wraps around - 4G) once in a while. */ -static Int vg_tc_age[VG_TC_N_SECTORS]; - -/* The number of the sector currently being allocated in. */ -static Int vg_tc_current; - -/* Count of number of translations, orig and new bytes in each sector. - For stats purposes only. */ -static Int vg_tc_stats_count[VG_TC_N_SECTORS]; -static Int vg_tc_stats_osize[VG_TC_N_SECTORS]; -static Int vg_tc_stats_tsize[VG_TC_N_SECTORS]; - -static UInt n_tt_fast_misses = 0; // number of lookups missing fast TT helper -static UInt n_tc_discards = 0; // number of TT/TC discards - -// Number and total original/translated size of translations overall. -static UInt overall_in_count = 0; -static UInt overall_in_osize = 0; -static UInt overall_in_tsize = 0; -// Number and total original/t size of discards overall. -static UInt overall_out_count = 0; -static UInt overall_out_osize = 0; -static UInt overall_out_tsize = 0; +/* The number of ULongs in each TCEntry area. This is computed once + at startup and does not change. */ +static Int tc_sector_szQ; - -/*------------------ TRANSLATION TABLE ------------------*/ - -/* The translation table. An array of VG_TT_SIZE TTEntrys. */ -static TTEntry* vg_tt = NULL; - -/* Count of non-empty TT entries. This includes deleted ones. */ -static Int vg_tt_used = 0; - -/* Fast helper for the TT. A direct-mapped cache which holds a +/* Fast helper for the TC. 
A direct-mapped cache which holds a pointer to a TC entry which may or may not be the correct one, but which we hope usually is. This array is referred to directly from - vg_dispatch.S. */ -Addr /* TCEntry*, really */ VG_(tt_fast)[VG_TT_FAST_SIZE]; + /dispatch.S. -static void for_each_tc(Int sector, void (*fn)(TCEntry *)); + Entries in tt_fast may point to any valid TC entry, regardless of + which sector it's in. Consequently we must be very careful to + invalidate this cache when TC entries are changed or disappear. + + A special TCEntry -- bogus_tc_entry -- must be pointed at to cause + that cache entry to miss. This relies on the assumption that no + guest code actually has an address of 0x1. +*/ +/*global*/ ULong* VG_(tt_fast)[VG_TT_FAST_SIZE]; + +static ULong bogus_tc_entry = (Addr64)1; -/*------------------ TT HELPERS ------------------*/ +/* For profiling, we have a parallel array of pointers to .count + fields in TT entries. Again, these pointers must be invalidated + when translations disappear. A NULL pointer suffices to indicate + an unused slot. -static -void pp_tt_tc_status ( Char* submsg ) + tt_fast and tt_fastN change together: if tt_fast[i] points to + bogus_tc_entry then the corresponding tt_fastN[i] must be null. If + tt_fast[i] points to some TC entry somewhere, then tt_fastN[i] + *must* point to the .count field of the corresponding TT entry. + + tt_fast and tt_fastN are referred to from assembly code + (dispatch.S). +*/ +/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; + + +/*------------------ STATS DECLS ------------------*/ + +/* Number of fast-cache updates and flushes done. */ +ULong n_fast_flushes = 0; +ULong n_fast_updates = 0; + +/* Number of full lookups done. */ +ULong n_full_lookups = 0; +ULong n_lookup_probes = 0; + +/* Number/osize/tsize of translations entered. */ +ULong n_in_count = 0; +ULong n_in_osize = 0; +ULong n_in_tsize = 0; + +/* Number/osize of translations discarded due to lack of space. 
*/ +ULong n_dump_count = 0; +ULong n_dump_osize = 0; + +/* Number/osize of translations discarded due to requests to do so. */ +ULong n_disc_count = 0; +ULong n_disc_osize = 0; + + + +/*-------------------------------------------------------------*/ +/*--- Add/delete/find translations ---*/ +/*-------------------------------------------------------------*/ + +static UInt vge_osize ( VexGuestExtents* vge ) { - Int tc_used, s; - if (VG_(clo_verbosity) <= 2) - return; - tc_used = 0; - for (s = 0; s < VG_TC_N_SECTORS; s++) - tc_used += vg_tc_used[s]; - - VG_(message)(Vg_DebugMsg, - "%lluk bbs: tt %d, tc %d: %s", - VG_(bbs_done) / 1000, - vg_tt_used, tc_used, submsg ); + UInt i, n = 0; + for (i = 0; i < vge->n_used; i++) + n += (UInt)vge->len[i]; + return n; } -/* Invalidate the tt_fast cache, for whatever reason, by pointing all - entries at vg_tc_bogus_TCEntry. */ -static -void vg_invalidate_tt_fast( void ) +static Bool isValidSector ( Int sector ) { - Int j; - for (j = 0; j < VG_TT_FAST_SIZE; j++) - VG_(tt_fast)[j] = (Addr)&vg_tc_bogus_TCEntry; + if (sector < 0 || sector >= N_SECTORS) + return False; + return True; +} + +static inline UInt HASH_TT ( Addr64 key ) +{ + UInt kHi = (UInt)(key >> 32); + UInt kLo = (UInt)key; + return (kHi ^ kLo) % N_TTES_PER_SECTOR; +} + +static void setFastCacheEntry ( Addr64 key, ULong* tce, UInt* count ) +{ + UInt cno = ((UInt)key) & VG_TT_FAST_MASK; + VG_(tt_fast)[cno] = tce; + VG_(tt_fastN)[cno] = count; + n_fast_updates++; +} + +static void invalidateFastCache ( void ) +{ + UInt j; + for (j = 0; j < VG_TT_FAST_SIZE; j++) { + VG_(tt_fast)[j] = &bogus_tc_entry; + VG_(tt_fastN)[j] = NULL; + } + n_fast_flushes++; +} + +static void initialiseSector ( Int sno ) +{ + Int i; + vg_assert(isValidSector(sno)); + + if (sectors[sno].tc == NULL) { + /* Sector has never been used before. Need to allocate tt and + tc. 
*/ + vg_assert(sectors[sno].tt == NULL); + vg_assert(sectors[sno].tc_next == NULL); + vg_assert(sectors[sno].tt_n_inuse == 0); + sectors[sno].tc + = VG_(get_memory_from_mmap) + ( 8 * tc_sector_szQ, "sectors[sno].tc" ); + sectors[sno].tt + = VG_(get_memory_from_mmap) + ( N_TTES_PER_SECTOR * sizeof(TTEntry), "sectors[sno].tt" ); + if (VG_(clo_verbosity) > 2) + VG_(message)(Vg_DebugMsg, "TT/TC: initialise sector %d", sno); + } else { + /* Sector has been used before. */ + vg_assert(sectors[sno].tt != NULL); + vg_assert(sectors[sno].tc_next != NULL); + n_dump_count += sectors[sno].tt_n_inuse; + for (i = 0; i < N_TTES_PER_SECTOR; i++) { + if (sectors[sno].tt[i].status == InUse) { + n_dump_osize += vge_osize(&sectors[sno].tt[i].vge); + } + } + if (VG_(clo_verbosity) > 2) + VG_(message)(Vg_DebugMsg, "TT/TC: recycle sector %d", sno); + } + + sectors[sno].tc_next = sectors[sno].tc; + sectors[sno].tt_n_inuse = 0; + for (i = 0; i < N_TTES_PER_SECTOR; i++) + sectors[sno].tt[i].status = Empty; + + invalidateFastCache(); } -static -void add_tt_entry ( TCEntry* tce ) +/* Add a translation of vge to TT/TC. The translation is temporarily + in code[0 .. code_len-1]. + + pre: youngest_sector points to a valid (although possibly full) + sector. +*/ +void VG_(add_to_trans_tab)( VexGuestExtents* vge, + Addr64 entry, + AddrH code, + UInt code_len ) { - UInt i; - /* VG_(printf)("add_TT_entry orig_addr %p\n", tce->orig_addr); */ - /* Hash to get initial probe point. 
*/ - i = tce->orig_addr % VG_TT_SIZE; + Int tcAvailQ, reqdQ, y, i; + ULong *tce, *tce2; + UChar* srcP; + UChar* dstP; + + vg_assert(vge->n_used >= 1 && vge->n_used <= 3); + vg_assert(code_len > 0 && code_len < 20000); + + if (0) + VG_(printf)("add_to_trans_tab(entry = 0x%llx, len = %d)\n", + entry, code_len); + + n_in_count++; + n_in_tsize += code_len; + n_in_osize += vge_osize(vge); + + y = youngest_sector; + vg_assert(isValidSector(y)); + + if (sectors[y].tc == NULL) + initialiseSector(y); + + /* Try putting the translation in this sector. */ + reqdQ = 1 + ((code_len + 7) >> 3); + + /* Will it fit in tc? */ + tcAvailQ = ((ULong*)(&sectors[y].tc[tc_sector_szQ])) + - ((ULong*)(sectors[y].tc_next)); + vg_assert(tcAvailQ >= 0); + vg_assert(tcAvailQ <= tc_sector_szQ); + + if (tcAvailQ < reqdQ + || sectors[y].tt_n_inuse >= N_TTES_PER_SECTOR_USABLE) { + /* No. So move on to the next sector. Either it's never been + used before, in which case it will get its tt/tc allocated + now, or it has been used before, in which case it is set to be + empty, hence throwing out the oldest sector. */ + youngest_sector++; + if (youngest_sector >= N_SECTORS) + youngest_sector = 0; + y = youngest_sector; + initialiseSector(y); + } + + /* Be sure ... */ + tcAvailQ = ((ULong*)(&sectors[y].tc[tc_sector_szQ])) + - ((ULong*)(sectors[y].tc_next)); + vg_assert(tcAvailQ >= 0); + vg_assert(tcAvailQ <= tc_sector_szQ); + vg_assert(tcAvailQ >= reqdQ); + vg_assert(sectors[y].tt_n_inuse < N_TTES_PER_SECTOR_USABLE); + vg_assert(sectors[y].tt_n_inuse >= 0); + + /* Copy into tc. 
*/ + tce = sectors[y].tc_next; + vg_assert(tce >= &sectors[y].tc[0]); + vg_assert(tce <= &sectors[y].tc[tc_sector_szQ]); + + tce[0] = entry; + dstP = (UChar*)(&tce[1]); + srcP = (UChar*)code; + for (i = 0; i < code_len; i++) + dstP[i] = srcP[i]; + sectors[y].tc_next += reqdQ; + sectors[y].tt_n_inuse++; + + /* more paranoia */ + tce2 = sectors[y].tc_next; + vg_assert(tce2 >= &sectors[y].tc[0]); + vg_assert(tce2 <= &sectors[y].tc[tc_sector_szQ]); + + /* Find an empty tt slot, and use it. There must be such a slot + since tt is never allowed to get completely full. */ + i = HASH_TT(entry); + vg_assert(i >= 0 && i < N_TTES_PER_SECTOR); while (True) { - if (vg_tt[i].orig_addr == tce->orig_addr) - VG_(core_panic)("add_TT_entry: duplicate"); - if (vg_tt[i].orig_addr == VG_TTE_EMPTY) + if (sectors[y].tt[i].status == Empty + || sectors[y].tt[i].status == Deleted) break; i++; - if (i == VG_TT_SIZE) + if (i >= N_TTES_PER_SECTOR) i = 0; } - vg_tt[i].orig_addr = tce->orig_addr; - vg_tt[i].tcentry = tce; - vg_tt_used++; - /* sanity ... */ - vg_assert(vg_tt_used < VG_TT_SIZE-1000); + sectors[y].tt[i].status = InUse; + sectors[y].tt[i].tce = tce; + sectors[y].tt[i].count = 0; + sectors[y].tt[i].weight = 1; + sectors[y].tt[i].vge = *vge; + sectors[y].tt[i].entry = entry; + + setFastCacheEntry( entry, tce, &sectors[y].tt[i].count ); } -/* Search TT to find the translated address of the supplied original, - or NULL if not found. This routine is used when we miss in - VG_(tt_fast). +/* Search for the translation of the given guest address. If + requested, a successful search can also cause the fast-caches to be + updated. */ -static __inline__ -TTEntry* search_tt ( Addr orig_addr ) +Bool VG_(search_transtab) ( /*OUT*/AddrH* result, + Addr64 guest_addr, + Bool upd_cache ) { - Int i; - /* Hash to get initial probe point. 
*/ - i = orig_addr % VG_TT_SIZE; - while (True) { - if (vg_tt[i].orig_addr == orig_addr) - return &vg_tt[i]; - if (vg_tt[i].orig_addr == VG_TTE_EMPTY) - return NULL; - i++; - if (i == VG_TT_SIZE) i = 0; - } -} + Int i, j, k, kstart, sno; + /* Find the initial probe point just once. It will be the same in + all sectors and avoids multiple expensive % operations. */ + n_full_lookups++; + k = -1; + kstart = HASH_TT(guest_addr); + vg_assert(kstart >= 0 && kstart < N_TTES_PER_SECTOR); + /* Search in all the sectors. Although the order should not matter, + it might be most efficient to search in the order youngest to + oldest. */ + sno = youngest_sector; + for (i = 0; i < N_SECTORS; i++) { -static -void initialise_tt ( void ) -{ - Int i; - vg_tt_used = 0; - for (i = 0; i < VG_TT_SIZE; i++) { - vg_tt[i].orig_addr = VG_TTE_EMPTY; - } - vg_invalidate_tt_fast(); -} + if (sectors[sno].tc == NULL) + goto notfound; /* sector not in use. */ - -static -void rebuild_TT ( void ) -{ - Int s; - - /* Throw away TT. */ - initialise_tt(); - - /* Rebuild TT from the remaining quarters. */ - for (s = 0; s < VG_TC_N_SECTORS; s++) { - for_each_tc(s, add_tt_entry); - } - pp_tt_tc_status ( "after rebuild of TC" ); -# if 1 /* def DEBUG_TRANSTAB */ - VG_(sanity_check_tt_tc)("rebuild_TT"); -# endif - -} - - -/*------------------ TC HELPERS ------------------*/ - -static -void for_each_tc(Int s, void (*fn)(TCEntry *)) -{ - UChar *pc; - UChar *pc_lim; - TCEntry *tce; - - pc = &(vg_tc[s][0]); - pc_lim = &(vg_tc[s][vg_tc_used[s]]); - while (True) { - if (pc >= pc_lim) break; - tce = (TCEntry*)pc; - pc += sizeof(TCEntry) + tce->trans_size; - if (tce->orig_addr != VG_TTE_DELETED) - (*fn)(tce); - } -} - -/* Find the oldest non-NULL, non-empty sector, or -1 if none such. 
*/ -static -Int find_oldest_sector ( void ) -{ - Int oldest_age, oldest, i; - oldest_age = 1000 * 1000 * 1000; - oldest = -1; - for (i = 0; i < VG_TC_N_SECTORS; i++) { - if (vg_tc[i] == NULL) - continue; - if (vg_tc_used[i] == 0) - continue; - if (vg_tc_age[i] < oldest_age) { - oldest = i; - oldest_age = vg_tc_age[i]; + k = kstart; + for (j = 0; j < N_TTES_PER_SECTOR; j++) { + n_lookup_probes++; + if (sectors[sno].tt[k].status == InUse + && sectors[sno].tt[k].entry == guest_addr) { + /* found it */ + if (upd_cache) + setFastCacheEntry( + guest_addr, sectors[sno].tt[k].tce, + §ors[sno].tt[k].count ); + if (result) + *result = sizeof(Addr64) + (AddrH)sectors[sno].tt[k].tce; + return True; + } + if (sectors[sno].tt[k].status == Empty) + break; /* not found in this sector */ + k++; + if (k == N_TTES_PER_SECTOR) + k = 0; } + + /* If we fall off the end, all entries are InUse and not + matching, or Deleted. In any case we did not find it in this + sector. */ + + notfound: + /* move to the next oldest sector */ + sno = sno==0 ? (N_SECTORS-1) : (sno-1); } - return oldest; + + /* Not found in any sector. */ + return False; } -/* Discard the oldest sector, if any such exists. */ -static -void discard_oldest_sector ( void ) -{ - Char msg[100]; - Int s = find_oldest_sector(); - if (s != -1) { - vg_assert(s >= 0 && s < VG_TC_N_SECTORS); - VG_(sprintf)(msg, "before discard of sector %d (%d bytes)", - s, vg_tc_used[s]); - pp_tt_tc_status ( msg ); - overall_out_count += vg_tc_stats_count[s]; - overall_out_osize += vg_tc_stats_osize[s]; - overall_out_tsize += vg_tc_stats_tsize[s]; - vg_tc_used[s] = 0; - vg_tc_stats_count[s] = 0; - vg_tc_stats_osize[s] = 0; - vg_tc_stats_tsize[s] = 0; - n_tc_discards++; - } -} - - -/* Find an empty sector and bring it into use. If there isn't one, - try and allocate one. If that fails, return -1. 
*/ -static -Int maybe_commission_sector ( void ) -{ - Char msg[100]; - Int s; - for (s = 0; s < VG_TC_N_SECTORS; s++) { - if (vg_tc[s] != NULL && vg_tc_used[s] == 0) { - vg_tc_age[s] = overall_in_count; - VG_(sprintf)(msg, "after commission of sector %d " - "at time %d", - s, vg_tc_age[s]); - pp_tt_tc_status ( msg ); -# if 1 /* def DEBUG_TRANSTAB */ - VG_(sanity_check_tt_tc)("maybe_commission_sector"); -# endif - return s; - } - } - for (s = 0; s < VG_TC_N_SECTORS; s++) { - if (vg_tc[s] == NULL) { - vg_tc[s] = VG_(get_memory_from_mmap) - ( vg_tc_sector_szB, "trans-cache(sector)" ); - vg_tc_used[s] = 0; - VG_(sprintf)(msg, "after allocation of sector %d (size %d)", - s, vg_tc_sector_szB ); - pp_tt_tc_status ( msg ); - return maybe_commission_sector(); - } - } - return -1; -} - - -static -UChar* allocate ( Int nBytes ) -{ - vg_assert(IS_ALIGNED(nBytes)); - - /* Ensure the TT is still OK. */ - while (vg_tt_used >= VG_TT_LIMIT) { - discard_oldest_sector(); - rebuild_TT(); - vg_assert(vg_tt_used < VG_TT_LIMIT); - } - - /* Can we get it into the current sector? */ - if (vg_tc_current >= 0 - && vg_tc_current < VG_TC_N_SECTORS - && vg_tc[vg_tc_current] != NULL - && vg_tc_used[vg_tc_current] + nBytes <= vg_tc_sector_szB) { - /* Yes. */ - UChar* p = &(vg_tc[vg_tc_current][ vg_tc_used[vg_tc_current] ]); - vg_tc_used[vg_tc_current] += nBytes; - return p; - } - - /* Perhaps we can bring a new sector into use, for the first - time. */ - vg_tc_current = maybe_commission_sector(); - if (vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS) - return allocate(nBytes); - - /* That didn't work. We'll have to dump the oldest. */ - discard_oldest_sector(); - - rebuild_TT(); - vg_tc_current = maybe_commission_sector(); - vg_assert(vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS); -# ifdef DEBUG_TRANSTAB - VG_(sanity_check_tt_tc)(); -# endif - - return allocate(nBytes); -} - - -/* Just so these counts can be queried without making them globally - visible. 
*/ -void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used ) -{ - Int s; - *tt_used = vg_tt_used; - *tc_used = 0; - for (s = 0; s < VG_TC_N_SECTORS; s++) - *tc_used += vg_tc_used[s]; -} - - -/* Do a sanity check on TT/TC. +/* Delete all translations which intersect with any part of the + specified guest address range. Note, this is SLOW. */ + +static inline +Bool overlap1 ( Addr64 s1, UInt r1, Addr64 s2, UInt r2 ) +{ + Addr64 e1 = s1 + (ULong)r1 - 1ULL; + Addr64 e2 = s2 + (ULong)r1 - 1ULL; + if (e1 < s2 || e2 < s1) + return False; + return True; +} + +static inline +Bool overlaps ( Addr64 start, UInt range, VexGuestExtents* vge ) +{ + if (overlap1(start, range, vge->base[0], (UInt)vge->len[0])) + return True; + if (vge->n_used < 2) + return False; + if (overlap1(start, range, vge->base[1], (UInt)vge->len[1])) + return True; + if (vge->n_used < 3) + return False; + if (overlap1(start, range, vge->base[2], (UInt)vge->len[2])) + return True; + return False; +} + + +void VG_(discard_translations) ( Addr64 guest_start, UInt range ) +{ + Int sno, i; + Bool anyDeleted = False; + + for (sno = 0; sno < N_SECTORS; sno++) { + if (sectors[sno].tc == NULL) + continue; + for (i = 0; i < N_TTES_PER_SECTOR; i++) { + if (sectors[sno].tt[i].status == InUse + && overlaps( guest_start, range, §ors[sno].tt[i].vge )) { + sectors[sno].tt[i].status = Deleted; + sectors[sno].tt_n_inuse--; + anyDeleted = True; + n_disc_count++; + n_disc_osize += vge_osize(§ors[sno].tt[i].vge); + } + } + } + + if (anyDeleted) + invalidateFastCache(); +} + + +/*------------------------------------------------------------*/ +/*--- Sanity checking ---*/ +/*------------------------------------------------------------*/ + void VG_(sanity_check_tt_tc) ( Char* who ) { - Int i, s; - TTEntry* tte; - TCEntry* tce; - Char msg[200]; - - vg_assert(VG_(strlen)(who) < 50); - VG_(sprintf)(msg, "sanity_check_tt_tc: begin (%s)", who ); - pp_tt_tc_status ( msg ); - - /* Some basic checks on the sector array. 
*/ - for (i = 0; i < VG_TC_N_SECTORS; i++) { - if (vg_tc[i] == NULL) { - vg_assert(vg_tc_used[i] == 0); - vg_assert(vg_tc_age[i] == 0); - } else { - vg_assert(vg_tc_used[i] <= vg_tc_sector_szB); - } - } - - /* Checks: - - Each TT entry points to a valid and corresponding TC entry. - */ - for (i = 0; i < VG_TT_SIZE; i++) { - tte = &vg_tt[i]; - /* empty slots are harmless. */ - if (tte->orig_addr == VG_TTE_EMPTY) continue; - /* all others should agree with the TC entry. */ - tce = tte->tcentry; - // XXX: 64-bit cleanness: should this be IS_WORD_ALIGNED? - vg_assert(IS_4_ALIGNED(tce)); - /* does this point into a valid TC sector? */ - for (s = 0; s < VG_TC_N_SECTORS; s++) - if (vg_tc[s] != NULL - && ((Addr)tce) >= (Addr)&vg_tc[s][0] - && ((Addr)tce) < (Addr)&vg_tc[s][ vg_tc_used[s] ]) - break; - vg_assert(s < VG_TC_N_SECTORS); - /* It should agree with the TC entry on the orig_addr. This may - be VG_TTE_DELETED, or a real orig addr. */ - vg_assert(tte->orig_addr == tce->orig_addr); - } - - VG_(sprintf)(msg, "sanity_check_tt_tc: done (%s)", who ); - pp_tt_tc_status ( msg ); -} - - -static __inline__ Int safe_idiv(Int a, Int b) -{ - return (b == 0 ? 0 : a / b); -} - -void VG_(print_tt_tc_stats)(void) -{ - VG_(message)(Vg_DebugMsg, - " TT/TC: %d tc sectors discarded.", - n_tc_discards ); - VG_(message)(Vg_DebugMsg, - " %d tt_fast misses.", - n_tt_fast_misses); - VG_(message)(Vg_DebugMsg, - "translate: new %d (%d -> %d; ratio %d:10)", - overall_in_count, overall_in_osize, overall_in_tsize, - safe_idiv(10*overall_in_tsize, overall_in_osize)); - VG_(message)(Vg_DebugMsg, - " discard %d (%d -> %d; ratio %d:10).", - overall_out_count, overall_out_osize, overall_out_tsize, - safe_idiv(10*overall_out_tsize, overall_out_osize)); -} - -Int VG_(get_bbs_translated) ( void ) -{ - return overall_in_count; -} - -/* Add this already-filled-in entry to the TT. Assumes that the - relevant code chunk has been placed in TC, along with a dummy back - pointer, which is inserted here. 
-*/ -void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size, - Addr trans_addr, Int trans_size ) -{ - Int i, nBytes, trans_size_aligned; - TCEntry* tce; - /* - VG_(printf)("add_to_trans_tab(%d) %x %d %x %d\n", - vg_tt_used, tte->orig_addr, tte->orig_size, - tte->trans_addr, tte->trans_size); - */ - - // paranoia - vg_assert(offsetof(TCEntry, payload) == PAYLOAD_OFFSET); - vg_assert(trans_size > 0); - - /* figure out how many bytes we require. */ - nBytes = CODE_ALIGN(trans_size + sizeof(TCEntry)); - trans_size_aligned = nBytes-sizeof(TCEntry); - vg_assert(IS_ALIGNED(nBytes)); - - tce = (TCEntry*)allocate(nBytes); - /* - VG_(printf)("allocate returned %p (code start %p)\n", - tce, &tce->payload[0]); - */ - vg_assert(vg_tc_current >= 0 && vg_tc_current < VG_TC_N_SECTORS); - vg_assert(vg_tc_sector_szB > 0); - - /* Range check for writing in the trans cache. */ - vg_assert( ((UChar*)(tce)) - >= ((UChar*)(&vg_tc[vg_tc_current][0])) ); - vg_assert( ((UChar*)(&tce->payload[trans_size_aligned-1])) - < ((UChar*)(&vg_tc[vg_tc_current][vg_tc_sector_szB])) ); - - tce->orig_addr = orig_addr; - tce->orig_size = (UShort)orig_size; /* what's the point of storing this? */ - tce->trans_size = (UShort)trans_size_aligned; - for (i = 0; i < trans_size; i++) { - tce->payload[i] = ((UChar*)trans_addr)[i]; - } - - add_tt_entry(tce); - - /* Update stats. */ - overall_in_count ++; - overall_in_osize += orig_size; - overall_in_tsize += trans_size; - - vg_tc_stats_count[vg_tc_current] ++; - vg_tc_stats_osize[vg_tc_current] += orig_size; - vg_tc_stats_tsize[vg_tc_current] += trans_size; -} - - -/* Find the translation address for a given (original) code address. - If found, update VG_(tt_fast) so subsequent lookups are fast. If - no translation can be found, return zero. This routine is (the - only one) called from vg_run_innerloop. 
*/ -Addr VG_(search_transtab) ( Addr original_addr ) -{ - TTEntry* tte; - VGP_PUSHCC(VgpSlowFindT); - tte = search_tt ( original_addr ); - if (tte == NULL) { - /* We didn't find it. vg_run_innerloop will have to request a - translation. */ - VGP_POPCC(VgpSlowFindT); - return (Addr)0; - } else { - /* Found it. Put the search result into the fast cache now. */ - UInt cno = (UInt)original_addr & VG_TT_FAST_MASK; - VG_(tt_fast)[cno] = (Addr)(tte->tcentry); - n_tt_fast_misses++; - VGP_POPCC(VgpSlowFindT); - return (Addr)&(tte->tcentry->payload[0]); - } -} - - -/* Invalidate translations of original code [start .. start + range - 1]. - This is slow, so you *really* don't want to call it very often. -*/ -void VG_(invalidate_translations) ( Addr start, UInt range ) -{ - Addr i_start, i_end, o_start, o_end; - UInt out_count, out_osize, out_tsize; - Int i; - TCEntry* tce; -# ifdef DEBUG_TRANSTAB - VG_(sanity_check_tt_tc)(); -# endif - i_start = start; - i_end = start + range - 1; - out_count = out_osize = out_tsize = 0; - - for (i = 0; i < VG_TT_SIZE; i++) { - if (vg_tt[i].orig_addr == VG_TTE_EMPTY - || vg_tt[i].orig_addr == VG_TTE_DELETED) continue; - tce = vg_tt[i].tcentry; - o_start = tce->orig_addr; - o_end = o_start + tce->trans_size - 1; - if (o_end < i_start || o_start > i_end) - continue; - - if (VG_(needs).basic_block_discards) - TL_(discard_basic_block_info)( tce->orig_addr, - tce->orig_size ); - - vg_tt[i].orig_addr = VG_TTE_DELETED; - tce->orig_addr = VG_TTE_DELETED; - - overall_out_count ++; - overall_out_osize += tce->orig_size; - overall_out_tsize += tce->trans_size; - out_count ++; - out_osize += tce->orig_size; - out_tsize += tce->trans_size; - } - - if (out_count > 0) { - vg_invalidate_tt_fast(); - VG_(sanity_check_tt_tc)("invalidate_translations"); -# ifdef DEBUG_TRANSTAB - { Addr aa; - for (aa = i_start; aa <= i_end; aa++) - vg_assert(search_tt ( aa ) == NULL); - } -# endif - } - - if (VG_(clo_verbosity) > 2) - VG_(message)(Vg_UserMsg, - "discard %d (%d 
-> %d) translations in range %p .. %p", - out_count, out_osize, out_tsize, i_start, i_end ); } @@ -645,52 +551,247 @@ void VG_(invalidate_translations) ( Addr start, UInt range ) void VG_(init_tt_tc) ( void ) { - Int s; + Int i, avg_codeszQ; /* Otherwise lots of things go wrong... */ - vg_assert(offsetof(TCEntry, payload) == PAYLOAD_OFFSET); - - /* Figure out how big each sector should be. */ - vg_tc_sector_szB - = (VG_TT_LIMIT /* max TT entries we expect */ - * (VG_(details).avg_translation_sizeB - + sizeof(TCEntry) - + (CODE_ALIGNMENT/2) /* avg alignment loss */) - ) - / VG_TC_N_SECTORS; + vg_assert(sizeof(ULong) == 8); + vg_assert(sizeof(Addr64) == 8); + + if (VG_(clo_verbosity) > 2) + VG_(message)(Vg_DebugMsg, + "TT/TC: VG_(init_tt_tc) " + "(startup of code management)"); + + /* Figure out how big each tc area should be. */ + avg_codeszQ + = (VG_(details).avg_translation_sizeB + 7) / 8; + + tc_sector_szQ + = N_TTES_PER_SECTOR_USABLE * (1 + avg_codeszQ); + /* Ensure the calculated value is not way crazy. */ - vg_assert(vg_tc_sector_szB >= 50000); - vg_assert(vg_tc_sector_szB <= 11500000); + vg_assert(tc_sector_szQ >= 2 * N_TTES_PER_SECTOR_USABLE); + vg_assert(tc_sector_szQ <= 50 * N_TTES_PER_SECTOR_USABLE); - for (s = 0; s < VG_TC_N_SECTORS; s++) { - vg_tc[s] = NULL; - vg_tc_used[s] = 0; - vg_tc_age[s] = 0; - vg_tc_stats_count[s] = 0; - vg_tc_stats_osize[s] = 0; - vg_tc_stats_tsize[s] = 0; + /* Initialise the sectors */ + youngest_sector = 0; + for (i = 0; i < N_SECTORS; i++) { + sectors[i].tc = NULL; + sectors[i].tt = NULL; + sectors[i].tc_next = NULL; + sectors[i].tt_n_inuse = 0; } - vg_tc_current = 0; - vg_tt = VG_(get_memory_from_mmap) ( VG_TT_SIZE * sizeof(TTEntry), - "trans-table" ); - /* The main translation table is empty. */ - initialise_tt(); + /* and the fast caches. 
*/ + invalidateFastCache(); if (VG_(clo_verbosity) > 2) { VG_(message)(Vg_DebugMsg, - "Translation Cache: using %d sectors of %d bytes each", - VG_TC_N_SECTORS, vg_tc_sector_szB ); + "TT/TC: cache: %d sectors of %d bytes each = %d total", + N_SECTORS, 8 * tc_sector_szQ, + N_SECTORS * 8 * tc_sector_szQ ); VG_(message)(Vg_DebugMsg, - "Translation Table: %d total entries, max occupancy %d (%d%%)", - VG_TT_SIZE, VG_TT_LIMIT, VG_TT_LIMIT_PERCENT ); + "TT/TC: table: %d total entries, max occupancy %d (%d%%)", + N_SECTORS * N_TTES_PER_SECTOR, + N_SECTORS * N_TTES_PER_SECTOR_USABLE, + SECTOR_TT_LIMIT_PERCENT ); + } +} + + +/*------------------------------------------------------------*/ +/*--- Printing out statistics. ---*/ +/*------------------------------------------------------------*/ + +static ULong safe_idiv( ULong a, ULong b ) +{ + return (b == 0 ? 0 : a / b); +} + +UInt VG_(get_bbs_translated) ( void ) +{ + return n_in_count; +} + +void VG_(print_tt_tc_stats) ( void ) +{ + VG_(message)(Vg_DebugMsg, + " tt/tc: %llu tt lookups requiring %llu probes", + n_full_lookups, n_lookup_probes ); + VG_(message)(Vg_DebugMsg, + " tt/tc: %llu fast-cache updates, %llu flushes", + n_fast_updates, n_fast_flushes ); + + VG_(message)(Vg_DebugMsg, + "translate: new %lld (%lld -> %lld; ratio %lld:10)", + n_in_count, n_in_osize, n_in_tsize, + safe_idiv(10*n_in_tsize, n_in_osize)); + VG_(message)(Vg_DebugMsg, + "translate: dumped %lld (%lld -> ?" "?)", + n_dump_count, n_dump_osize ); + VG_(message)(Vg_DebugMsg, + "translate: discarded %lld (%lld -> ?" "?)", + n_disc_count, n_disc_osize ); +} + +/*------------------------------------------------------------*/ +/*--- Printing out of profiling results. ---*/ +/*------------------------------------------------------------*/ + +/* Only the top N_MAX bbs will be displayed. 
*/ +#define N_MAX 10 + +static TTEntry* tops[N_MAX]; + +static ULong score ( TTEntry* tte ) +{ + return ((ULong)tte->weight) * ((ULong)tte->count); +} + +static Bool heavier ( TTEntry* t1, TTEntry* t2 ) +{ + return score(t1) > score(t2); +} + +/* Print n/m in form xx.yy% */ +static +void percentify ( ULong n, ULong m, Int field_width, Char* buf) +{ + Int i, len, space; + ULong lo, hi; + if (m == 0) m = 1; /* stay sane */ + hi = (n * 100) / m; + lo = (((n * 100) - hi * m) * 100) / m; + vg_assert(lo < 100); + if (lo < 10) + VG_(sprintf)(buf, "%lld.0%lld%%", hi, lo); + else + VG_(sprintf)(buf, "%lld.%lld%%", hi, lo); + + len = VG_(strlen)(buf); + space = field_width - len; + if (space < 0) space = 0; /* Allow for v. small field_width */ + i = len; + + /* Right justify in field */ + for ( ; i >= 0; i--) buf[i + space] = buf[i]; + for (i = 0; i < space; i++) buf[i] = ' '; +} + + +void VG_(show_BB_profile) ( void ) +{ + Char name[64]; + Int sno, i, r, s; + ULong score_total, score_cumul, score_here; + Char buf_cumul[10]; + Char buf_here[10]; + + /* First, compute the total weighted count, and find the top N + ttes. tops contains pointers to the most-used N_MAX blocks, in + descending order (viz, tops[0] is the highest scorer). */ + for (i = 0; i < N_MAX; i++) + tops[i] = NULL; + + score_total = 0; + + for (sno = 0; sno < N_SECTORS; sno++) { + if (sectors[sno].tc == NULL) + continue; + for (i = 0; i < N_TTES_PER_SECTOR; i++) { + if (sectors[sno].tt[i].status != InUse) + continue; + score_total += score(§ors[sno].tt[i]); + /* Find the rank for sectors[sno].tt[i]. */ + r = N_MAX-1; + while (True) { + if (r == -1) + break; + if (tops[r] == NULL) { + r--; + continue; + } + if (heavier(§ors[sno].tt[i], tops[r])) { + r--; + continue; + } + break; + } + r++; + vg_assert(r >= 0 && r <= N_MAX); + /* This bb should be placed at r, and bbs above it shifted + upwards one slot. 
*/ + if (r < N_MAX) { + for (s = N_MAX-1; s > r; s--) + tops[s] = tops[s-1]; + tops[r] = §ors[sno].tt[i]; + } + } } -# ifdef DEBUG_TRANSTAB - VG_(sanity_check_tt_tc)(); -# endif + VG_(printf)("\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("--- BEGIN BB Profile (summary of scores) ---\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("\n"); + + VG_(printf)("Total score = %lld\n\n", score_total); + + score_cumul = 0; + for (r = 0; r < N_MAX; r++) { + if (tops[r] == NULL) + continue; + name[0] = 0; + VG_(get_fnname_w_offset)(tops[r]->entry, name, 64); + name[63] = 0; + score_here = score(tops[r]); + score_cumul += score_here; + percentify(score_cumul, score_total, 6, buf_cumul); + percentify(score_here, score_total, 6, buf_here); + VG_(printf)("%3d: (%9lld %s) %9lld %s 0x%llx %s\n", + r, + score_cumul, buf_cumul, + score_here, buf_here, tops[r]->entry, name ); + } + + VG_(printf)("\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("--- BB Profile (BB details) ---\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("\n"); + + score_cumul = 0; + for (r = 0; r < N_MAX; r++) { + if (tops[r] == NULL) + continue; + name[0] = 0; + VG_(get_fnname_w_offset)(tops[r]->entry, name, 64); + name[63] = 0; + score_here = score(tops[r]); + score_cumul += score_here; + percentify(score_cumul, score_total, 6, buf_cumul); + percentify(score_here, score_total, 6, buf_here); + VG_(printf)("\n"); + VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= begin BB rank %d " + "=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r); + VG_(printf)("%3d: (%9lld %s) %9lld %s 0x%llx %s\n", + r, + score_cumul, buf_cumul, + score_here, buf_here, tops[r]->entry, name ); + VG_(printf)("\n"); + VG_(translate)(0, tops[r]->entry, True, VG_(clo_profile_flags)); + VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= end BB rank %d " + 
"=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r); + } + + VG_(printf)("\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("--- END BB Profile ---\n"); + VG_(printf)("------------------------------------------------------------\n"); + VG_(printf)("\n"); } + /*--------------------------------------------------------------------*/ /*--- end vg_transtab.c ---*/ /*--------------------------------------------------------------------*/ diff --git a/coregrind/x86/dispatch.S b/coregrind/x86/dispatch.S index 504053449..19489cca2 100644 --- a/coregrind/x86/dispatch.S +++ b/coregrind/x86/dispatch.S @@ -37,13 +37,6 @@ /*--- The dispatch loop. ---*/ /*------------------------------------------------------------*/ -#define TT_LOOKUP(reg, fail) \ - movl %eax, reg; \ - andl $VG_TT_FAST_MASK, reg; \ - movl VG_(tt_fast)(,reg,4), reg; \ - cmpl %eax, (reg); \ - jnz fail - /* signature: UInt VG_(run_innerloop) ( void* guest_state ) */ .globl VG_(run_innerloop) @@ -99,13 +92,19 @@ dispatch_boring: jz counter_is_zero /* try a fast lookup in the translation cache */ - TT_LOOKUP(%ebx, fast_lookup_failed) + movl %eax, %ebx + andl $VG_TT_FAST_MASK, %ebx + movl VG_(tt_fast)(,%ebx,4), %ecx + cmpl %eax, (%ecx) + jnz fast_lookup_failed + movl VG_(tt_fastN)(,%ebx,4), %edx + incl (%edx) - /* Found a match. Call the tce.payload field. The magic 8 - value is offsetof(TCEntry,payload) on a 32-bit platform. */ + /* Found a match. Call tce[1], which is 8 bytes along, since + each tce element is a 64-bit int. */ - addl $8, %ebx - call *%ebx + addl $8, %ecx + call *%ecx /* %eax holds destination (original) address. 
diff --git a/include/basic_types.h b/include/basic_types.h index 5a5c2ea31..112b26e29 100644 --- a/include/basic_types.h +++ b/include/basic_types.h @@ -50,6 +50,7 @@ typedef unsigned long UWord; // 32 64 typedef signed long Word; // 32 64 typedef UWord Addr; // 32 64 +typedef UWord AddrH; // 32 64 typedef UWord SizeT; // 32 64 typedef Word SSizeT; // 32 64 diff --git a/none/tests/cmdline2.stdout.exp b/none/tests/cmdline2.stdout.exp index 10e906c4f..614689a04 100644 --- a/none/tests/cmdline2.stdout.exp +++ b/none/tests/cmdline2.stdout.exp @@ -43,9 +43,9 @@ usage: valgrind --tool= [options] prog-and-args --single-step=no|yes translate each instr separately? [no] --optimise=no|yes improve intermediate code? [yes] --profile=no|yes profile? (tool must be built for it) [no] - --bbprofile=no|yes profile bbs? [no] --branchpred=yes|no generate branch prediction hints [no] - --trace-codegen= show generated code? (X = 0|1) [00000000] + --trace-flags= show generated code? (X = 0|1) [00000000] + --profile-flags= ditto, but for profiling (X = 0|1) [00000000] --trace-notbelow= only show BBs above [0] --trace-syscalls=no|yes show all system calls? [no] --trace-signals=no|yes show signal handling details? [no] @@ -61,7 +61,7 @@ usage: valgrind --tool= [options] prog-and-args --vex-guest-max-insns 1 .. 100 [50] --vex-guest-chase-thresh 0 .. 99 [10] - --trace-codegen values (omit the middle space): + --trace-flags and --profile-flags values (omit the middle space): 1000 0000 show conversion into IR 0100 0000 show after initial opt 0010 0000 show after instrumentation