/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss --> one miss (not two)
*/

/* Cache configuration */
#include "cg_arch.c"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
} line_use;

typedef struct {
   Addr      memline, iaddr;
   line_use* dep_use; /* point to higher-level cacheblock for this memline */
   ULong*    use_base;
} line_loaded;

/* Cache state */
typedef struct {
   const HChar* name;
   int          size;        /* bytes */
   int          assoc;
   int          line_size;   /* bytes */
   Bool         sectored;    /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   HChar        desc_line[128];    // large enough
   UWord*       tags;

   /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref    = False;
static Bool clo_simulate_sectors   = False;
static Bool clo_collect_cacheuse   = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_LL_AcCost = 2;
static Int off_LL_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   LL_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;


/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;
         c->use[i].mask  = 0;
         c->use[i].count = 0;
         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1u<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * Should always be true as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}
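
/* Editor's worked example (not part of the simulator; the cache_t
 * initializer layout {size, assoc, line_size} is assumed from cg_arch).
 * For a hypothetical LL configuration of 64 KiB, 8-way, 64 B lines,
 * cachesim_initcache computes
 *    sets           = (65536 / 64) / 8  = 128
 *    line_size_bits = log2(64)          = 6
 *    tag_shift      = 6 + log2(128)     = 13
 *    tag_mask       = ~((1u<<13)-1)     = 0xffffe000
 * i.e. an address splits into [tag | set index | line offset].
 */
#if 0
static void geometry_example(void)
{
   cache_t  config = { 65536, 8, 64 };   /* hypothetical values */
   cache_t2 c;
   cachesim_initcache(config, &c);
   CLG_ASSERT(c.sets == 128);
   CLG_ASSERT(c.line_size_bits == 6);
   CLG_ASSERT(c.tag_shift == 13);
}
#endif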

#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Simple Cache Simulation                              ---*/
/*------------------------------------------------------------*/

/*
 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 *        with write-allocate.
 *
 * For simple cache hit/miss counts, we do not have to
 * maintain the dirty state of lines (no need to distinguish
 * read/write references), and the resulting counts are the
 * same for write-through and write-back caches.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is one other than the MRU, move it into the MRU spot */
   /* and shuffle the rest down.                                      */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;

   return Miss;
}
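
/* Editor's illustration (assumed tag values): each set is kept in MRU
 * order, so slot 0 is the most recently used way and slot assoc-1 is
 * the eviction candidate. A minimal sketch for one 4-way set:
 */
#if 0
static void lru_example(void)
{
   UWord tags[4] = { 10, 11, 12, 13 };              /* MRU first */
   cache_t2 c = { .assoc = 4, .sets = 1, .tags = tags };
   /* hit on a resident tag moves it to the MRU spot */
   CLG_ASSERT( cachesim_setref(&c, 0, 12) == Hit );
   CLG_ASSERT( tags[0] == 12 && tags[1] == 10 );    /* {12,10,11,13} */
   /* miss installs the new tag as MRU and drops the LRU tag (13) */
   CLG_ASSERT( cachesim_setref(&c, 0, 42) == Miss );
   CLG_ASSERT( tags[0] == 42 );                     /* {42,12,10,11} */
}
#endif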

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UWord block1 =  a         >> c->line_size_bits;
   UWord block2 = (a+size-1) >> c->line_size_bits;
   UInt  set1   = block1 & c->sets_min_1;
   /* the tag does not need to include bits specifying the set,
    * but it can, and this saves instructions */
   UWord tag1   = block1;

   /* Access entirely within line. */
   if (block1 == block2)
      return cachesim_setref(c, set1, tag1);

   /* Access straddles two lines. */
   else if (block1 + 1 == block2) {
      UInt  set2 = block2 & c->sets_min_1;
      UWord tag2 = block2;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref(c, set1, tag1);
      CacheResult res2 = cachesim_setref(c, set2, tag2);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  blocks: %lu %lu",
                  a, size, block1, block2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 write-through, LL write-back.
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss evicting a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing in the reference
 * type (Read/Write), the line gets dirty on a write.
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is one other than the MRU, move it into the MRU spot */
   /* and shuffle the rest down.                                      */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets_min_1)) == set2) {
      UWord tag2 = (a+size-1) & c->tag_mask;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  sets: %u %u", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the LL to make the LL line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &LL, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}
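
/* Editor's sketch (assumed tag value): the dirty-flag encoding in
 * action. Tags are line-aligned, so with CACHELINE_DIRTY == 1 bit 0 of
 * a stored tag is free; a write ORs the Write ref type (== 1) into it,
 * and on eviction that bit decides between Miss and MissDirty (which
 * becomes WriteBackMemAccess upstream).
 */
#if 0
static void dirty_bit_example(void)
{
   UWord tag    = 0x12340;       /* hypothetical, bit 0 clear */
   UWord stored = tag | Write;   /* line becomes dirty on write */
   CLG_ASSERT( (stored & ~CACHELINE_DIRTY) == tag );  /* compare still works */
   CLG_ASSERT( stored & CACHELINE_DIRTY );  /* eviction must write back */
}
#endif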


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for(i=0;i<PF_STREAMS;i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation:
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_LL_doref(Addr a)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block  = ( a >> LL.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if      (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] >0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&LL, a + 5 * LL.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] <0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&LL, a - 5 * LL.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
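
/* Editor's illustration (assumed addresses, 64 B LL lines): three
 * consecutive blocks within one 4 KiB page arm an upward stream and
 * trigger a prefetch 5 lines ahead:
 *    prefetch_LL_doref(0x10000)  -> lastblock set, seqblocks stays 0
 *    prefetch_LL_doref(0x10040)  -> seqblocks = 1 (sequential up)
 *    prefetch_LL_doref(0x10080)  -> seqblocks = 2 -> prefetch of
 *                                   0x10080 + 5*64 into LL
 * A repeated access to the same block changes nothing; a non-sequential
 * block resets seqblocks to 0. The downward case is symmetric.
 */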

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_LL_doref(a);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the LL to make the LL line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &LL, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                          sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                          sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                   sizeof(int) * c->line_size);
   c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                 sizeof(int) * c->line_size);

   c->line_size_mask = c->line_size-1;

   /* Meaning of line_start_mask/line_end_mask
    * Example: for a given cache line, you get an access starting at
    * byte offset 5, length 4, i.e. bytes 5 - 8 were touched. For a cache
    * line size of 32, you have 1 bit per byte in the mask:
    *
    *   bit31   bit8 bit5  bit 0
    *       |      |  |    |
    *       11..111111100000   line_start_mask[5]
    *       00..000111111111   line_end_mask[(5+4)-1]
    *
    *  use_mask |= line_start_mask[5] & line_end_mask[8]
    *
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32/c->line_size;
      start_mask = (1<<bits_per_byte)-1;
      end_mask   = start_mask << (32-bits_per_byte);
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size/32;
      start_mask = 1;
      end_mask   = 1u << 31;
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
            start_val &= ~start_mask;
            end_val   &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for(i=0;i<c->line_size;i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
   }

   /* We use lower tag bits as offset pointers to cache use info.
    * I.e. some cache parameters don't work.
    */
   if ( (1<<c->tag_shift) < c->assoc) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!\n",
                   (1<<c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}
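
/* Editor's worked example (values follow the loop above): for a 32 B
 * line there is exactly 1 mask bit per byte. An access at line offset 5
 * of length 4 touches bytes 5..8, so
 *    line_start_mask[5] = 0xffffffe0   (bits 5..31 set)
 *    line_end_mask[8]   = 0x000001ff   (bits 0..8 set)
 *    use_mask = line_start_mask[5] & line_end_mask[8] = 0x000001e0
 * i.e. exactly bits 5..8 are set, and countBits(use_mask) == 4 equals
 * the number of bytes touched.
 */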

/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag = a & L.tag_mask;                                              \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n",                \
             L.name, a, size, set1, set2);                                  \
                                                                            \
   /* First case: word entirely within line. */                            \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         return L1_Hit;                                                     \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot */\
      /* and shuffle the rest down.                                      */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss; install this tag as MRU, shuffle rest down. */             \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
                              use_mask, a &~ L.line_size_mask);             \
                                                                            \
   /* Second case: word straddles two lines. */                            \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: LL miss */         \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
                               use_mask, a &~ L.line_size_mask);            \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2 = (a+size-1) & L.tag_mask;                                       \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == (set[i] & L.tag_mask)) {                               \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
                               use_mask, (a+size-1) &~ L.line_size_mask);   \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                            \
   } else {                                                                 \
      VG_(printf)("addr: %#lx  size: %u  sets: %u %u", a, size, set1, set2);\
      VG_(tool_panic)("item straddles more than two cache sets");           \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
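
/* Editor's sanity checks (not in the original): each step folds
 * adjacent bit-count fields of width S[k], so the result is the
 * population count of the 32-bit input.
 */
#if 0
static void countbits_example(void)
{
   CLG_ASSERT( countBits(0x0)        == 0 );
   CLG_ASSERT( countBits(0x000001e0) == 4 );  /* use_mask example above */
   CLG_ASSERT( countBits(~0u)        == 32 );
}
#endif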

static void update_LL_use(int idx, Addr memline)
{
   line_loaded* loaded = &(LL.loaded[idx]);
   line_use* use = &(LL.use[idx]);
   int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
             idx, CLG_(bb_base) + current_ii->instr_offset, memline);
   if (use->count>0) {
      CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_LL_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
                      CLG_(current_state).nonskipped->skipped :
                      CLG_(cost_base) + current_ii->cost_offset;
}
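
/* Editor's worked numbers (assumed scenario): the two use metrics
 * charged on eviction. Suppose a 64 B line was accessed 4 times before
 * eviction with 16 of its 32 mask bits set:
 *    AcCost += 1000 / 4                = 250  (per-access cost, x1000)
 *    SpLoss += ((32 - 16) * 64) >> 5   = 32   (bytes loaded, never used)
 * Fully used, frequently hit lines therefore contribute small values to
 * both counters.
 */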

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
      l1_loaded->dep_use = &(LL.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
                idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                LL.use[idx].mask, LL.use[idx].count);
      return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
      if (tag == (set[i] & LL.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
         l1_loaded->dep_use = &(LL.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
                   i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                   LL.use[idx].mask, LL.use[idx].count);
         return LL_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}


#define UPDATE_USE(L)                                                      \
                                                                           \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx,       \
                                           UInt mask, Addr memline)        \
{                                                                          \
   line_loaded* loaded = &(cache->loaded[idx]);                            \
   line_use* use = &(cache->use[idx]);                                     \
   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;            \
                                                                           \
   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n",\
             cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
   if (use->count>0) {                                                     \
      CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\
                use->count, c, use->mask, loaded->memline, loaded->iaddr); \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",                        \
                CLG_(current_state).collect, loaded->use_base);            \
                                                                           \
      if (CLG_(current_state).collect && loaded->use_base) {               \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;        \
         (loaded->use_base)[off_##L##_SpLoss] += c;                        \
                                                                           \
         /* FIXME (?): L1/LL line sizes must be equal ! */                 \
         loaded->dep_use->mask |= use->mask;                               \
         loaded->dep_use->count += use->count;                             \
      }                                                                    \
   }                                                                       \
                                                                           \
   use->count = 1;                                                         \
   use->mask  = mask;                                                      \
   loaded->memline = memline;                                              \
   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;             \
   loaded->use_base = (CLG_(current_state).nonskipped) ?                   \
                      CLG_(current_state).nonskipped->skipped :            \
                      CLG_(cost_base) + current_ii->cost_offset;           \
                                                                           \
   if (memline == 0) return LL_Hit;                                        \
   return cacheuse_LL_access(memline, loaded);                             \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   CLG_(bb_base)   = 0;
   current_ii      = &ii; /* needs to be set for update_XX_use */
   CLG_(cost_base) = 0;

   /* update usage counters */
   if (I1.use)
      for (i = 0; i < I1.sets * I1.assoc; i++)
         if (I1.loaded[i].use_base)
            update_I1_use( &I1, i, 0,0);

   if (D1.use)
      for (i = 0; i < D1.sets * D1.assoc; i++)
         if (D1.loaded[i].use_base)
            update_D1_use( &D1, i, 0,0);

   if (LL.use)
      for (i = 0; i < LL.sets * LL.assoc; i++)
         if (LL.loaded[i].use_base)
            update_LL_use(i, 0);

   current_ii = 0;
}


/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/

static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch(r) {
      case WriteBackMemAccess:
         if (clo_simulate_writeback) {
            c1[3]++;
            c2[3]++;
         }
         // fall through

      case MemAccess:
         c1[2]++;
         c2[2]++;
         // fall through

      case LL_Hit:
         c1[1]++;
         c2[1]++;
         // fall through

      default:
         c1[0]++;
         c2[0]++;
   }
}
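
/* Editor's illustration: the deliberate fall-through makes each result
 * also increment all cheaper levels. For a counter array laid out as
 * [accesses, L1 misses, LL misses, writebacks]:
 *    L1_Hit             -> c[0]++
 *    LL_Hit             -> c[0]++, c[1]++
 *    MemAccess          -> c[0]++, c[1]++, c[2]++
 *    WriteBackMemAccess -> c[0..2]++, plus c[3]++ with --simulate-wb=yes
 */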

static
const HChar* cacheRes(CacheModelResult r)
{
   switch(r) {
      case L1_Hit:    return "L1 Hit ";
      case LL_Hit:    return "LL Hit ";
      case MemAccess: return "LL Miss";
      case WriteBackMemAccess: return "LL Miss (dirty)";
      default:
         tl_assert(0);
   }
   return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
      else
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
   }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
   CacheModelResult Ir1Res, Ir2Res;
   ULong *global_cost_Ir;

   current_ii = ii1;
   Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   current_ii = ii2;
   Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

   CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
             CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
             CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

   if (!CLG_(current_state).collect) return;

   global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   if (CLG_(current_state).nonskipped) {
      ULong* skipped_cost_Ir =
         CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

      inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
      return;
   }

   inc_costs(Ir1Res, global_cost_Ir,
             CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
   inc_costs(Ir2Res, global_cost_Ir,
             CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
   CacheModelResult Ir1Res, Ir2Res, Ir3Res;
   ULong *global_cost_Ir;

   current_ii = ii1;
   Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   current_ii = ii2;
   Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
   current_ii = ii3;
   Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

   CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
             CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
             CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
             CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

   if (!CLG_(current_state).collect) return;

   global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   if (CLG_(current_state).nonskipped) {
      ULong* skipped_cost_Ir =
         CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
      inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
      return;
   }

   inc_costs(Ir1Res, global_cost_Ir,
             CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
   inc_costs(Ir2Res, global_cost_Ir,
             CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
   inc_costs(Ir3Res, global_cost_Ir,
             CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data_addr, data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
             data_addr, data_size, cacheRes(DrRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
         cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
      }
      else {
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
         cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + fullOffset(EG_DR) );
   }
}


/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
   have exactly the same prototype. If you change them, you must
   change addEvent_D_guarded too. */
VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data_addr, data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n",
             data_addr, data_size, cacheRes(DrRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped)
         cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
      else
         cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + fullOffset(EG_DR) );
   }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data_addr, data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
             data_addr, data_size, cacheRes(DwRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
         cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
      }
      else {
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
         cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + fullOffset(EG_DW) );
   }
}

/* See comment on log_0I1Dr. */
VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data_addr, data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n",
             data_addr, data_size, cacheRes(DwRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped)
         cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
      else
         cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + fullOffset(EG_DW) );
   }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, LLc;

   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D       = 0;
      CLG_(cachesim).log_1I0D_name  = "(no function)";
      CLG_(cachesim).log_2I0D       = 0;
      CLG_(cachesim).log_2I0D_name  = "(no function)";
      CLG_(cachesim).log_3I0D       = 0;
      CLG_(cachesim).log_3I0D_name  = "(no function)";

      CLG_(cachesim).log_1I1Dr      = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw      = 0;
      CLG_(cachesim).log_1I1Dw_name = "(no function)";

      CLG_(cachesim).log_0I1Dr      = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw      = 0;
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      return;
   }

   /* Configuration of caches only needed with real cache simulation */
   VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                       &clo_I1_cache,
                                       &clo_D1_cache,
                                       &clo_LL_cache);

   I1.name = "I1";
   D1.name = "D1";
   LL.name = "LL";

   // min_line_size is used to make sure that we never feed
   // accesses to the simulator straddling more than two
   // cache lines at any cache level
   CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
                         ? I1c.line_size : D1c.line_size;
   CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
                         ? LLc.line_size : CLG_(min_line_size);

   Int largest_load_or_store_size
      = VG_(machine_get_size_of_largest_guest_register)();
   if (CLG_(min_line_size) < largest_load_or_store_size) {
      /* We can't continue, because the cache simulation might
         straddle more than 2 lines, and it will assert. So let's
         just stop before we start. */
      VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
                (Int)CLG_(min_line_size));
      VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
                largest_load_or_store_size );
      VG_(umsg)("  but it is not. Exiting now.\n");
      VG_(exit)(1);
   }

   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
   cachesim_initcache(LLc, &LL);

   /* the other cache simulators use the standard helpers
    * with dispatching via simulator struct */

   CLG_(cachesim).log_1I0D       = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
   CLG_(cachesim).log_2I0D       = log_2I0D;
   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
   CLG_(cachesim).log_3I0D       = log_3I0D;
   CLG_(cachesim).log_3I0D_name  = "log_3I0D";

   CLG_(cachesim).log_1I1Dr      = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw      = log_1I1Dw;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

   CLG_(cachesim).log_0I1Dr      = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw      = log_0I1Dw;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

   if (clo_collect_cacheuse) {

      /* Output warning for not supported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation can not be "
                      "used with cache usage\n");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation can not be "
                      "used with cache usage\n");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }

      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}


/* Clear simulator state. Has to be initialized beforehand. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&LL);

   prefetch_clear();
}


static void cachesim_dump_desc(VgFile *fp)
{
   VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
   VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
}

static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options (does cache simulation if used):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n");
   VG_(print_cache_clo_opts)();
}

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(const HChar* arg)
{
   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with fine-grained dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if (VG_(str_clo_cache_opt)(arg,
                                   &clo_I1_cache,
                                   &clo_D1_cache,
                                   &clo_LL_cache)) {}

   else
      return False;

   return True;
}

static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
   FullCost total = CLG_(total_cost), D_total = 0;
   ULong LL_total_m, LL_total_mr, LL_total_mw,
         LL_total, LL_total_r, LL_total_w;

   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
      VG_(message)(Vg_DebugMsg, "Prefetch Up:    %llu\n",
                   prefetch_up);
      VG_(message)(Vg_DebugMsg, "Prefetch Down:  %llu\n",
                   prefetch_down);
      VG_(message)(Vg_DebugMsg, "\n");
   }

   VG_(message)(Vg_UserMsg, "I1  misses:    %'*llu\n", l1,
                total[fullOffset(EG_IR) +1]);

   VG_(message)(Vg_UserMsg, "LLi misses:    %'*llu\n", l1,
                total[fullOffset(EG_IR) +2]);

   if (0 == total[fullOffset(EG_IR)])
      total[fullOffset(EG_IR)] = 1;

   VG_(message)(Vg_UserMsg, "I1  miss rate: %*.2f%%\n", l1,
                total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);

   VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
                total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);

   VG_(message)(Vg_UserMsg, "\n");

   /* D cache results.
    * Use the D_refs.rd and D_refs.wr values to determine the
    * width of columns 2 & 3. */

   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   CLG_(init_cost)( CLG_(sets).full, D_total);
   // we only use the first 3 values of D_total, adding up Dr and Dw costs
   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

   VG_(message)(Vg_UserMsg, "D   refs:      %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[0],
                l2, total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)]);

   VG_(message)(Vg_UserMsg, "D1  misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[1],
                l2, total[fullOffset(EG_DR)+1],
                l3, total[fullOffset(EG_DW)+1]);

   VG_(message)(Vg_UserMsg, "LLd misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[2],
                l2, total[fullOffset(EG_DR)+2],
                l3, total[fullOffset(EG_DW)+2]);

   if (0 == D_total[0])               D_total[0] = 1;
   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

   VG_(message)(Vg_UserMsg, "D1  miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, D_total[1] * 100.0 / D_total[0],
                l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);

   VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, D_total[2] * 100.0 / D_total[0],
                l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
   VG_(message)(Vg_UserMsg, "\n");


   /* LL overall results */

   LL_total =
      total[fullOffset(EG_DR) +1] +
      total[fullOffset(EG_DW) +1] +
      total[fullOffset(EG_IR) +1];
   LL_total_r =
      total[fullOffset(EG_DR) +1] +
      total[fullOffset(EG_IR) +1];
   LL_total_w = total[fullOffset(EG_DW) +1];
   VG_(message)(Vg_UserMsg, "LL refs:       %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, LL_total, l2, LL_total_r, l3, LL_total_w);

   LL_total_m =
      total[fullOffset(EG_DR) +2] +
      total[fullOffset(EG_DW) +2] +
      total[fullOffset(EG_IR) +2];
   LL_total_mr =
      total[fullOffset(EG_DR) +2] +
      total[fullOffset(EG_IR) +2];
   LL_total_mw = total[fullOffset(EG_DW) +2];
   VG_(message)(Vg_UserMsg, "LL misses:     %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);

   VG_(message)(Vg_UserMsg, "LL miss rate:  %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, LL_total_m  * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
                l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
                l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
}


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)()
{
   // Event groups from which the event sets are composed.
   // The "Use" group is only used with "cacheuse" simulation.
   if (clo_collect_cacheuse)
      CLG_(register_event_group4)(EG_USE,
                                  "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

   if (!CLG_(clo).simulate_cache)
      CLG_(register_event_group)(EG_IR, "Ir");
   else if (!clo_simulate_writeback) {
      CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
      CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
      CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
   }
   else { // clo_simulate_writeback
      CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
      CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
      CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
   }

   if (CLG_(clo).simulate_branch) {
      CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
      CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
   }

   if (CLG_(clo).collect_bus)
      CLG_(register_event_group)(EG_BUS, "Ge");

   if (CLG_(clo).collect_alloc)
      CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

   if (CLG_(clo).collect_systime != systime_no) {
      if (CLG_(clo).collect_systime == systime_nsec)
         CLG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime");
      else
         CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
   }

   // event set used as base for instruction self cost
   CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

   // event set comprising all event groups, used for inclusive cost
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
   CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

   CLG_DEBUGIF(1) {
      CLG_DEBUG(1, "EventSets:\n");
      CLG_(print_eventset)(-2, CLG_(sets).base);
      CLG_(print_eventset)(-2, CLG_(sets).full);
   }

   /* Not-existing events are silently ignored */
   CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
   CLG_(append_event)(CLG_(dumpmap), "Ir");
   CLG_(append_event)(CLG_(dumpmap), "Dr");
   CLG_(append_event)(CLG_(dumpmap), "Dw");
   CLG_(append_event)(CLG_(dumpmap), "I1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mw");
   CLG_(append_event)(CLG_(dumpmap), "ILmr");
   CLG_(append_event)(CLG_(dumpmap), "DLmr");
   CLG_(append_event)(CLG_(dumpmap), "DLmw");
   CLG_(append_event)(CLG_(dumpmap), "ILdmr");
   CLG_(append_event)(CLG_(dumpmap), "DLdmr");
   CLG_(append_event)(CLG_(dumpmap), "DLdmw");
   CLG_(append_event)(CLG_(dumpmap), "Bc");
   CLG_(append_event)(CLG_(dumpmap), "Bcm");
   CLG_(append_event)(CLG_(dumpmap), "Bi");
   CLG_(append_event)(CLG_(dumpmap), "Bim");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   CLG_(append_event)(CLG_(dumpmap), "Ge");
   CLG_(append_event)(CLG_(dumpmap), "allocCount");
   CLG_(append_event)(CLG_(dumpmap), "allocSize");
   CLG_(append_event)(CLG_(dumpmap), "sysCount");
   CLG_(append_event)(CLG_(dumpmap), "sysTime");
   CLG_(append_event)(CLG_(dumpmap), "sysCpuTime");
}


/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
   if (!CLG_(clo).simulate_cache)
      cost[ fullOffset(EG_IR) ] += exe_count;

   if (ii->eventset)
      CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
                                ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
   if (clo_collect_cacheuse)
      cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
   .print_opts    = cachesim_print_opts,
   .parse_opt     = cachesim_parse_opt,
   .post_clo_init = cachesim_post_clo_init,
   .clear         = cachesim_clear,
   .dump_desc     = cachesim_dump_desc,
   .printstat     = cachesim_printstat,
   .add_icost     = cachesim_add_icost,
   .finish        = cachesim_finish,

   /* these will be set by cachesim_post_clo_init */
   .log_1I0D  = 0,
   .log_2I0D  = 0,
   .log_3I0D  = 0,

   .log_1I1Dr = 0,
   .log_1I1Dw = 0,

   .log_0I1Dr = 0,
   .log_0I1Dw = 0,

   .log_1I0D_name = "(no function)",
   .log_2I0D_name = "(no function)",
   .log_3I0D_name = "(no function)",

   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",

   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
};


/*--------------------------------------------------------------------*/
/*--- end                                                 ct_sim.c ---*/
/*--------------------------------------------------------------------*/