mirror of
https://github.com/Zenithsiz/ftmemsim-valgrind.git
synced 2026-02-03 18:13:01 +00:00
Change Cachegrind/Callgrind to talk about the LL (last-level) cache instead
of the L2 cache. This is to accommodate machines with three levels of cache. We still only simulate two levels, the first and the last.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11404
This commit is contained in:
parent
cb3fbb46d7
commit
60d9b410d4
19
NEWS
19
NEWS
@ -16,6 +16,20 @@ Improvements:
|
||||
--threshold option has changed; this is unlikely to affect many people, if
|
||||
you do use it please see the user manual for details.
|
||||
|
||||
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
|
||||
In addition, it optionally can count the number of executed global bus events.
|
||||
Both can be used for a better approximation of a "Cycle Estimation" as
|
||||
derived event (you need to update the event formula in KCachegrind yourself).
|
||||
|
||||
- Cachegrind and Callgrind now refer to the LL (last-level) cache rather
|
||||
than the L2 cache. This is to accommodate machines with three levels of
|
||||
cache -- if Cachegrind/Callgrind auto-detects the cache configuration of
|
||||
such a machine it will run the simulation as if the L2 cache isn't
|
||||
present. This means the results are less likely to match the true result
|
||||
for the machine, but Cachegrind/Callgrind's results are already only
|
||||
approximate, and should not be considered authoritative. The results are
|
||||
still useful for giving a general idea about a program's locality.
|
||||
|
||||
- Massif has a new option, --pages-as-heap, which is disabled by default.
|
||||
When enabled, instead of tracking allocations at the level of heap blocks
|
||||
(as allocated with malloc/new/new[]), it instead tracks memory allocations
|
||||
@ -24,11 +38,6 @@ Improvements:
|
||||
harder than the heap-level output, but this option is useful if you want
|
||||
to account for every byte of memory used by a program.
|
||||
|
||||
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
|
||||
In addition, it optionally can count the number of executed global bus events.
|
||||
Both can be used for a better approximation of a "Cycle Estimation" as
|
||||
derived event (you need to update the event formula in KCachegrind yourself).
|
||||
|
||||
- Added new memcheck command-line option --show-possibly-lost.
|
||||
|
||||
|
||||
|
||||
@ -37,13 +37,13 @@
|
||||
|
||||
#include "cg_arch.h"
|
||||
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
|
||||
Bool all_caches_clo_defined)
|
||||
{
|
||||
// Set caches to default (for Cortex-A8 ?)
|
||||
*I1c = (cache_t) { 16384, 4, 64 };
|
||||
*D1c = (cache_t) { 16384, 4, 64 };
|
||||
*L2c = (cache_t) { 262144, 8, 64 };
|
||||
*LLc = (cache_t) { 262144, 8, 64 };
|
||||
|
||||
if (!all_caches_clo_defined) {
|
||||
VG_(message)(Vg_DebugMsg,
|
||||
|
||||
@ -37,13 +37,13 @@
|
||||
|
||||
#include "cg_arch.h"
|
||||
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
|
||||
Bool all_caches_clo_defined)
|
||||
{
|
||||
// Set caches to default.
|
||||
*I1c = (cache_t) { 65536, 2, 64 };
|
||||
*D1c = (cache_t) { 65536, 2, 64 };
|
||||
*L2c = (cache_t) { 262144, 8, 64 };
|
||||
*LLc = (cache_t) { 262144, 8, 64 };
|
||||
|
||||
// Warn if config not completely specified from cmd line. Note that
|
||||
// this message is slightly different from the one we give on x86/AMD64
|
||||
|
||||
@ -37,13 +37,13 @@
|
||||
|
||||
#include "cg_arch.h"
|
||||
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
|
||||
Bool all_caches_clo_defined)
|
||||
{
|
||||
// Set caches to default.
|
||||
*I1c = (cache_t) { 65536, 2, 64 };
|
||||
*D1c = (cache_t) { 65536, 2, 64 };
|
||||
*L2c = (cache_t) { 262144, 8, 64 };
|
||||
*LLc = (cache_t) { 262144, 8, 64 };
|
||||
|
||||
// Warn if config not completely specified from cmd line. Note that
|
||||
// this message is slightly different from the one we give on x86/AMD64
|
||||
|
||||
@ -54,9 +54,12 @@ static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
|
||||
* array of pre-defined configurations for various parts of the memory
|
||||
* hierarchy.
|
||||
* According to Intel Processor Identification, App Note 485.
|
||||
*
|
||||
* If an L3 cache is found, then data for it rather than the L2
|
||||
* is returned via *LLc.
|
||||
*/
|
||||
static
|
||||
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
|
||||
{
|
||||
Int cpuid1_eax;
|
||||
Int cpuid1_ignore;
|
||||
@ -65,6 +68,14 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
UChar info[16];
|
||||
Int i, trials;
|
||||
Bool L2_found = False;
|
||||
/* If we see L3 cache info, copy it into L3c. Then, at the end,
|
||||
copy it into *LLc. Hence if an L3 cache is specified, *LLc will
|
||||
eventually contain a description of it rather than the L2 cache.
|
||||
The use of the L3c intermediary makes this process independent
|
||||
of the order in which the cache specifications appear in
|
||||
info[]. */
|
||||
Bool L3_found = False;
|
||||
cache_t L3c = { 0, 0, 0 };
|
||||
|
||||
if (level < 2) {
|
||||
VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
|
||||
@ -121,18 +132,39 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
case 0x90: case 0x96: case 0x9b:
|
||||
VG_(tool_panic)("IA-64 cache detected?!");
|
||||
|
||||
case 0x22: case 0x23: case 0x25: case 0x29:
|
||||
case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
|
||||
case 0xe2: case 0xe3: case 0xe4: case 0xea: case 0xeb: case 0xec:
|
||||
VG_(dmsg)("warning: L3 cache detected but ignored\n");
|
||||
break;
|
||||
/* L3 cache info. */
|
||||
case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
|
||||
case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
|
||||
case 0x25: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
|
||||
case 0x29: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
|
||||
case 0x46: L3c = (cache_t) { 4096, 4, 64 }; L3_found = True; break;
|
||||
case 0x47: L3c = (cache_t) { 8192, 8, 64 }; L3_found = True; break;
|
||||
case 0x4a: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
|
||||
case 0x4b: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
|
||||
case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
|
||||
case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
|
||||
case 0xd0: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
|
||||
case 0xd1: L3c = (cache_t) { 1024, 4, 64 }; L3_found = True; break;
|
||||
case 0xd2: L3c = (cache_t) { 2048, 4, 64 }; L3_found = True; break;
|
||||
case 0xd6: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
|
||||
case 0xd7: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
|
||||
case 0xd8: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
|
||||
case 0xdc: L3c = (cache_t) { 1536, 12, 64 }; L3_found = True; break;
|
||||
case 0xdd: L3c = (cache_t) { 3072, 12, 64 }; L3_found = True; break;
|
||||
case 0xde: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
|
||||
case 0xe2: L3c = (cache_t) { 2048, 16, 64 }; L3_found = True; break;
|
||||
case 0xe3: L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; break;
|
||||
case 0xe4: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
|
||||
case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
|
||||
case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
|
||||
case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;
|
||||
|
||||
/* Described as "MLC" in Intel documentation */
|
||||
case 0x21: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
|
||||
case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
|
||||
|
||||
/* These are sectored, whatever that means */
|
||||
case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break;
|
||||
case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break;
|
||||
case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break;
|
||||
case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break;
|
||||
|
||||
/* If a P6 core, this means "no L2 cache".
|
||||
If a P4 core, this means "no L3 cache".
|
||||
@ -141,20 +173,21 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
case 0x40:
|
||||
break;
|
||||
|
||||
case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break;
|
||||
case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break;
|
||||
case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
|
||||
case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
|
||||
case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
|
||||
case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
|
||||
case 0x41: *LLc = (cache_t) { 128, 4, 32 }; L2_found = True; break;
|
||||
case 0x42: *LLc = (cache_t) { 256, 4, 32 }; L2_found = True; break;
|
||||
case 0x43: *LLc = (cache_t) { 512, 4, 32 }; L2_found = True; break;
|
||||
case 0x44: *LLc = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
|
||||
case 0x45: *LLc = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
|
||||
case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
|
||||
case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
|
||||
case 0x49:
|
||||
if ((family == 15) && (model == 6))
|
||||
/* On Xeon MP (family F, model 6), this is for L3 */
|
||||
VG_(dmsg)("warning: L3 cache detected but ignored\n");
|
||||
else
|
||||
*L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
|
||||
break;
|
||||
case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
|
||||
if (family == 15 && model == 6) {
|
||||
/* On Xeon MP (family F, model 6), this is for L3 */
|
||||
L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
|
||||
} else {
|
||||
*LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
|
||||
}
|
||||
break;
|
||||
|
||||
/* These are sectored, whatever that means */
|
||||
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
|
||||
@ -181,26 +214,24 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
break;
|
||||
|
||||
/* not sectored, whatever that might mean */
|
||||
case 0x78: *L2c = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
|
||||
case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
|
||||
|
||||
/* These are sectored, whatever that means */
|
||||
case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break;
|
||||
case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
|
||||
case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
|
||||
case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
|
||||
case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
|
||||
case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break;
|
||||
|
||||
case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break;
|
||||
case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
|
||||
|
||||
case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break;
|
||||
case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break;
|
||||
case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break;
|
||||
case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
|
||||
case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
|
||||
case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break;
|
||||
case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
|
||||
case 0x79: *LLc = (cache_t) { 128, 8, 64 }; L2_found = True; break;
|
||||
case 0x7a: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
|
||||
case 0x7b: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
|
||||
case 0x7c: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
|
||||
case 0x7d: *LLc = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
|
||||
case 0x7e: *LLc = (cache_t) { 256, 8, 128 }; L2_found = True; break;
|
||||
case 0x7f: *LLc = (cache_t) { 512, 2, 64 }; L2_found = True; break;
|
||||
case 0x80: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
|
||||
case 0x81: *LLc = (cache_t) { 128, 8, 32 }; L2_found = True; break;
|
||||
case 0x82: *LLc = (cache_t) { 256, 8, 32 }; L2_found = True; break;
|
||||
case 0x83: *LLc = (cache_t) { 512, 8, 32 }; L2_found = True; break;
|
||||
case 0x84: *LLc = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
|
||||
case 0x85: *LLc = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
|
||||
case 0x86: *LLc = (cache_t) { 512, 4, 64 }; L2_found = True; break;
|
||||
case 0x87: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
|
||||
|
||||
/* Ignore prefetch information */
|
||||
case 0xf0: case 0xf1:
|
||||
@ -213,8 +244,15 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
}
|
||||
}
|
||||
|
||||
/* If we found an L3 cache, throw away the L2 data and use the L3's instead. */
|
||||
if (L3_found) {
|
||||
VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
|
||||
*LLc = L3c;
|
||||
L2_found = True;
|
||||
}
|
||||
|
||||
if (!L2_found)
|
||||
VG_(dmsg)("warning: L2 cache not installed, ignore L2 results.\n");
|
||||
VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -241,14 +279,37 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
* 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
|
||||
* so we detect that.
|
||||
*
|
||||
* Returns 0 on success, non-zero on failure.
|
||||
* Returns 0 on success, non-zero on failure. As with the Intel code
|
||||
* above, if an L3 cache is found, then data for it rather than the L2
|
||||
* is returned via *LLc.
|
||||
*/
|
||||
|
||||
/* A small helper */
|
||||
static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
|
||||
{
|
||||
/* Decode a L2/L3 associativity indication. It is encoded
|
||||
differently from the I1/D1 associativity. Returns 1
|
||||
(direct-map) as a safe but suboptimal result for unknown
|
||||
encodings. */
|
||||
switch (bits_15_12 & 0xF) {
|
||||
case 1: return 1; case 2: return 2;
|
||||
case 4: return 4; case 6: return 8;
|
||||
case 8: return 16; case 0xA: return 32;
|
||||
case 0xB: return 48; case 0xC: return 64;
|
||||
case 0xD: return 96; case 0xE: return 128;
|
||||
case 0xF: /* fully associative */
|
||||
case 0: /* L2/L3 cache or TLB is disabled */
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
|
||||
{
|
||||
UInt ext_level;
|
||||
UInt dummy, model;
|
||||
UInt I1i, D1i, L2i;
|
||||
UInt I1i, D1i, L2i, L3i;
|
||||
|
||||
VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
|
||||
|
||||
@ -259,7 +320,7 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
}
|
||||
|
||||
VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
|
||||
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
|
||||
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);
|
||||
|
||||
VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
|
||||
|
||||
@ -277,15 +338,26 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
I1c->assoc = (I1i >> 16) & 0xff;
|
||||
I1c->line_size = (I1i >> 0) & 0xff;
|
||||
|
||||
L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
|
||||
L2c->assoc = (L2i >> 12) & 0xf;
|
||||
L2c->line_size = (L2i >> 0) & 0xff;
|
||||
LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
|
||||
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
|
||||
LLc->line_size = (L2i >> 0) & 0xff;
|
||||
|
||||
if (((L3i >> 18) & 0x3fff) > 0) {
|
||||
/* There's an L3 cache. Replace *LLc contents with this info. */
|
||||
/* NB: the test in the if is "if L3 size > 0 ". I don't know if
|
||||
this is the right way to test presence-vs-absence of L3. I
|
||||
can't see any guidance on this in the AMD documentation. */
|
||||
LLc->size = ((L3i >> 18) & 0x3fff) * 512;
|
||||
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
|
||||
LLc->line_size = (L3i >> 0) & 0xff;
|
||||
VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
|
||||
{
|
||||
Int level, ret;
|
||||
Char vendor_id[13];
|
||||
@ -306,10 +378,10 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
|
||||
/* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
|
||||
if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
|
||||
ret = Intel_cache_info(level, I1c, D1c, L2c);
|
||||
ret = Intel_cache_info(level, I1c, D1c, LLc);
|
||||
|
||||
} else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
|
||||
ret = AMD_cache_info(I1c, D1c, L2c);
|
||||
ret = AMD_cache_info(I1c, D1c, LLc);
|
||||
|
||||
} else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
|
||||
/* Total kludge. Pretend to be a VIA Nehemiah. */
|
||||
@ -319,9 +391,9 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
I1c->size = 64;
|
||||
I1c->assoc = 4;
|
||||
I1c->line_size = 16;
|
||||
L2c->size = 64;
|
||||
L2c->assoc = 16;
|
||||
L2c->line_size = 16;
|
||||
LLc->size = 64;
|
||||
LLc->assoc = 16;
|
||||
LLc->line_size = 16;
|
||||
ret = 0;
|
||||
|
||||
} else {
|
||||
@ -332,13 +404,13 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
/* Successful! Convert sizes from KB to bytes */
|
||||
I1c->size *= 1024;
|
||||
D1c->size *= 1024;
|
||||
L2c->size *= 1024;
|
||||
LLc->size *= 1024;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
|
||||
Bool all_caches_clo_defined)
|
||||
{
|
||||
Int res;
|
||||
@ -346,10 +418,10 @@ void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
// Set caches to default.
|
||||
*I1c = (cache_t) { 65536, 2, 64 };
|
||||
*D1c = (cache_t) { 65536, 2, 64 };
|
||||
*L2c = (cache_t) { 262144, 8, 64 };
|
||||
*LLc = (cache_t) { 262144, 8, 64 };
|
||||
|
||||
// Then replace with any info we can get from CPUID.
|
||||
res = get_caches_from_CPUID(I1c, D1c, L2c);
|
||||
res = get_caches_from_CPUID(I1c, D1c, LLc);
|
||||
|
||||
// Warn if CPUID failed and config not completely specified from cmd line.
|
||||
if (res != 0 && !all_caches_clo_defined) {
|
||||
|
||||
@ -33,14 +33,14 @@
|
||||
|
||||
// For cache simulation
|
||||
typedef struct {
|
||||
int size; // bytes
|
||||
int assoc;
|
||||
int line_size; // bytes
|
||||
Int size; // bytes
|
||||
Int assoc;
|
||||
Int line_size; // bytes
|
||||
} cache_t;
|
||||
|
||||
// Gives the configuration of I1, D1 and L2 caches. They get overridden
|
||||
// Gives the configuration of I1, D1 and LL caches. They get overridden
|
||||
// by any cache configurations specified on the command line.
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
|
||||
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
|
||||
Bool all_caches_clo_defined);
|
||||
|
||||
#endif // __CG_ARCH_H
|
||||
|
||||
@ -77,7 +77,7 @@ typedef
|
||||
struct {
|
||||
ULong a; /* total # memory accesses of this kind */
|
||||
ULong m1; /* misses in the first level cache */
|
||||
ULong m2; /* misses in the second level cache */
|
||||
ULong mL; /* misses in the last-level (LL) cache */
|
||||
}
|
||||
CacheCC;
|
||||
|
||||
@ -268,13 +268,13 @@ static LineCC* get_lineCC(Addr origAddr)
|
||||
lineCC->loc.line = loc.line;
|
||||
lineCC->Ir.a = 0;
|
||||
lineCC->Ir.m1 = 0;
|
||||
lineCC->Ir.m2 = 0;
|
||||
lineCC->Ir.mL = 0;
|
||||
lineCC->Dr.a = 0;
|
||||
lineCC->Dr.m1 = 0;
|
||||
lineCC->Dr.m2 = 0;
|
||||
lineCC->Dr.mL = 0;
|
||||
lineCC->Dw.a = 0;
|
||||
lineCC->Dw.m1 = 0;
|
||||
lineCC->Dw.m2 = 0;
|
||||
lineCC->Dw.mL = 0;
|
||||
lineCC->Bc.b = 0;
|
||||
lineCC->Bc.mp = 0;
|
||||
lineCC->Bi.b = 0;
|
||||
@ -319,7 +319,7 @@ void log_1I_0D_cache_access(InstrInfo* n)
|
||||
//VG_(printf)("1I_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
|
||||
// n, n->instr_addr, n->instr_len);
|
||||
cachesim_I1_doref(n->instr_addr, n->instr_len,
|
||||
&n->parent->Ir.m1, &n->parent->Ir.m2);
|
||||
&n->parent->Ir.m1, &n->parent->Ir.mL);
|
||||
n->parent->Ir.a++;
|
||||
}
|
||||
|
||||
@ -331,10 +331,10 @@ void log_2I_0D_cache_access(InstrInfo* n, InstrInfo* n2)
|
||||
// n, n->instr_addr, n->instr_len,
|
||||
// n2, n2->instr_addr, n2->instr_len);
|
||||
cachesim_I1_doref(n->instr_addr, n->instr_len,
|
||||
&n->parent->Ir.m1, &n->parent->Ir.m2);
|
||||
&n->parent->Ir.m1, &n->parent->Ir.mL);
|
||||
n->parent->Ir.a++;
|
||||
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
|
||||
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
|
||||
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
|
||||
n2->parent->Ir.a++;
|
||||
}
|
||||
|
||||
@ -348,13 +348,13 @@ void log_3I_0D_cache_access(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
|
||||
// n2, n2->instr_addr, n2->instr_len,
|
||||
// n3, n3->instr_addr, n3->instr_len);
|
||||
cachesim_I1_doref(n->instr_addr, n->instr_len,
|
||||
&n->parent->Ir.m1, &n->parent->Ir.m2);
|
||||
&n->parent->Ir.m1, &n->parent->Ir.mL);
|
||||
n->parent->Ir.a++;
|
||||
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
|
||||
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
|
||||
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
|
||||
n2->parent->Ir.a++;
|
||||
cachesim_I1_doref(n3->instr_addr, n3->instr_len,
|
||||
&n3->parent->Ir.m1, &n3->parent->Ir.m2);
|
||||
&n3->parent->Ir.m1, &n3->parent->Ir.mL);
|
||||
n3->parent->Ir.a++;
|
||||
}
|
||||
|
||||
@ -365,11 +365,11 @@ void log_1I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
|
||||
// " daddr=0x%010lx, dsize=%lu\n",
|
||||
// n, n->instr_addr, n->instr_len, data_addr, data_size);
|
||||
cachesim_I1_doref(n->instr_addr, n->instr_len,
|
||||
&n->parent->Ir.m1, &n->parent->Ir.m2);
|
||||
&n->parent->Ir.m1, &n->parent->Ir.mL);
|
||||
n->parent->Ir.a++;
|
||||
|
||||
cachesim_D1_doref(data_addr, data_size,
|
||||
&n->parent->Dr.m1, &n->parent->Dr.m2);
|
||||
&n->parent->Dr.m1, &n->parent->Dr.mL);
|
||||
n->parent->Dr.a++;
|
||||
}
|
||||
|
||||
@ -380,11 +380,11 @@ void log_1I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
|
||||
// " daddr=0x%010lx, dsize=%lu\n",
|
||||
// n, n->instr_addr, n->instr_len, data_addr, data_size);
|
||||
cachesim_I1_doref(n->instr_addr, n->instr_len,
|
||||
&n->parent->Ir.m1, &n->parent->Ir.m2);
|
||||
&n->parent->Ir.m1, &n->parent->Ir.mL);
|
||||
n->parent->Ir.a++;
|
||||
|
||||
cachesim_D1_doref(data_addr, data_size,
|
||||
&n->parent->Dw.m1, &n->parent->Dw.m2);
|
||||
&n->parent->Dw.m1, &n->parent->Dw.mL);
|
||||
n->parent->Dw.a++;
|
||||
}
|
||||
|
||||
@ -394,7 +394,7 @@ void log_0I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
|
||||
//VG_(printf)("0I_1Dr: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
|
||||
// n, data_addr, data_size);
|
||||
cachesim_D1_doref(data_addr, data_size,
|
||||
&n->parent->Dr.m1, &n->parent->Dr.m2);
|
||||
&n->parent->Dr.m1, &n->parent->Dr.mL);
|
||||
n->parent->Dr.a++;
|
||||
}
|
||||
|
||||
@ -404,7 +404,7 @@ void log_0I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
|
||||
//VG_(printf)("0I_1Dw: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
|
||||
// n, data_addr, data_size);
|
||||
cachesim_D1_doref(data_addr, data_size,
|
||||
&n->parent->Dw.m1, &n->parent->Dw.m2);
|
||||
&n->parent->Dw.m1, &n->parent->Dw.mL);
|
||||
n->parent->Dw.a++;
|
||||
}
|
||||
|
||||
@ -1234,7 +1234,7 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
|
||||
|
||||
static cache_t clo_I1_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_D1_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_L2_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_LL_cache = UNDEFINED_CACHE;
|
||||
|
||||
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
|
||||
// string otherwise.
|
||||
@ -1273,7 +1273,7 @@ static Char* check_cache(cache_t* cache)
|
||||
}
|
||||
|
||||
static
|
||||
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
|
||||
{
|
||||
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
|
||||
|
||||
@ -1283,22 +1283,22 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
Bool all_caches_clo_defined =
|
||||
(DEFINED(clo_I1_cache) &&
|
||||
DEFINED(clo_D1_cache) &&
|
||||
DEFINED(clo_L2_cache));
|
||||
DEFINED(clo_LL_cache));
|
||||
|
||||
// Set the cache config (using auto-detection, if supported by the
|
||||
// architecture).
|
||||
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
|
||||
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
|
||||
|
||||
// Check the default/auto-detected values.
|
||||
checkRes = check_cache(I1c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(D1c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(L2c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(LLc); tl_assert(!checkRes);
|
||||
|
||||
// Then replace with any defined on the command line. (Already checked in
|
||||
// parse_cache_opt().)
|
||||
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
|
||||
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
|
||||
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
|
||||
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
|
||||
|
||||
if (VG_(clo_verbosity) >= 2) {
|
||||
VG_(umsg)("Cache configuration used:\n");
|
||||
@ -1306,8 +1306,8 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
I1c->size, I1c->assoc, I1c->line_size);
|
||||
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
|
||||
D1c->size, D1c->assoc, D1c->line_size);
|
||||
VG_(umsg)(" L2: %dB, %d-way, %dB lines\n",
|
||||
L2c->size, L2c->assoc, L2c->line_size);
|
||||
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
|
||||
LLc->size, LLc->assoc, LLc->line_size);
|
||||
}
|
||||
#undef CMD_LINE_DEFINED
|
||||
}
|
||||
@ -1354,12 +1354,12 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
VG_(free)(cachegrind_out_file);
|
||||
}
|
||||
|
||||
// "desc:" lines (giving I1/D1/L2 cache configuration). The spaces after
|
||||
// "desc:" lines (giving I1/D1/LL cache configuration). The spaces after
|
||||
// the 2nd colon makes cg_annotate's output look nicer.
|
||||
VG_(sprintf)(buf, "desc: I1 cache: %s\n"
|
||||
"desc: D1 cache: %s\n"
|
||||
"desc: L2 cache: %s\n",
|
||||
I1.desc_line, D1.desc_line, L2.desc_line);
|
||||
"desc: LL cache: %s\n",
|
||||
I1.desc_line, D1.desc_line, LL.desc_line);
|
||||
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
|
||||
|
||||
// "cmd:" line
|
||||
@ -1379,11 +1379,11 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
}
|
||||
// "events:" line
|
||||
if (clo_cache_sim && clo_branch_sim) {
|
||||
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
|
||||
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
|
||||
"Bc Bcm Bi Bim\n");
|
||||
}
|
||||
else if (clo_cache_sim && !clo_branch_sim) {
|
||||
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
|
||||
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
|
||||
"\n");
|
||||
}
|
||||
else if (!clo_cache_sim && clo_branch_sim) {
|
||||
@ -1430,9 +1430,9 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu %llu\n",
|
||||
lineCC->loc.line,
|
||||
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
|
||||
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
|
||||
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
|
||||
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
|
||||
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
|
||||
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
|
||||
lineCC->Bc.b, lineCC->Bc.mp,
|
||||
lineCC->Bi.b, lineCC->Bi.mp);
|
||||
}
|
||||
@ -1441,9 +1441,9 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu\n",
|
||||
lineCC->loc.line,
|
||||
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
|
||||
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
|
||||
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
|
||||
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
|
||||
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
|
||||
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
|
||||
}
|
||||
else if (!clo_cache_sim && clo_branch_sim) {
|
||||
VG_(sprintf)(buf, "%u %llu"
|
||||
@ -1464,13 +1464,13 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
// Update summary stats
|
||||
Ir_total.a += lineCC->Ir.a;
|
||||
Ir_total.m1 += lineCC->Ir.m1;
|
||||
Ir_total.m2 += lineCC->Ir.m2;
|
||||
Ir_total.mL += lineCC->Ir.mL;
|
||||
Dr_total.a += lineCC->Dr.a;
|
||||
Dr_total.m1 += lineCC->Dr.m1;
|
||||
Dr_total.m2 += lineCC->Dr.m2;
|
||||
Dr_total.mL += lineCC->Dr.mL;
|
||||
Dw_total.a += lineCC->Dw.a;
|
||||
Dw_total.m1 += lineCC->Dw.m1;
|
||||
Dw_total.m2 += lineCC->Dw.m2;
|
||||
Dw_total.mL += lineCC->Dw.mL;
|
||||
Bc_total.b += lineCC->Bc.b;
|
||||
Bc_total.mp += lineCC->Bc.mp;
|
||||
Bi_total.b += lineCC->Bi.b;
|
||||
@ -1487,9 +1487,9 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu %llu\n",
|
||||
Ir_total.a, Ir_total.m1, Ir_total.m2,
|
||||
Dr_total.a, Dr_total.m1, Dr_total.m2,
|
||||
Dw_total.a, Dw_total.m1, Dw_total.m2,
|
||||
Ir_total.a, Ir_total.m1, Ir_total.mL,
|
||||
Dr_total.a, Dr_total.m1, Dr_total.mL,
|
||||
Dw_total.a, Dw_total.m1, Dw_total.mL,
|
||||
Bc_total.b, Bc_total.mp,
|
||||
Bi_total.b, Bi_total.mp);
|
||||
}
|
||||
@ -1498,9 +1498,9 @@ static void fprint_CC_table_and_calc_totals(void)
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu"
|
||||
" %llu %llu %llu\n",
|
||||
Ir_total.a, Ir_total.m1, Ir_total.m2,
|
||||
Dr_total.a, Dr_total.m1, Dr_total.m2,
|
||||
Dw_total.a, Dw_total.m1, Dw_total.m2);
|
||||
Ir_total.a, Ir_total.m1, Ir_total.mL,
|
||||
Dr_total.a, Dr_total.m1, Dr_total.mL,
|
||||
Dw_total.a, Dw_total.m1, Dw_total.mL);
|
||||
}
|
||||
else if (!clo_cache_sim && clo_branch_sim) {
|
||||
VG_(sprintf)(buf, "summary:"
|
||||
@ -1537,8 +1537,8 @@ static void cg_fini(Int exitcode)
|
||||
|
||||
CacheCC D_total;
|
||||
BranchCC B_total;
|
||||
ULong L2_total_m, L2_total_mr, L2_total_mw,
|
||||
L2_total, L2_total_r, L2_total_w;
|
||||
ULong LL_total_m, LL_total_mr, LL_total_mw,
|
||||
LL_total, LL_total_r, LL_total_w;
|
||||
Int l1, l2, l3;
|
||||
|
||||
fprint_CC_table_and_calc_totals();
|
||||
@ -1565,21 +1565,21 @@ static void cg_fini(Int exitcode)
|
||||
miss numbers */
|
||||
if (clo_cache_sim) {
|
||||
VG_(umsg)(fmt, "I1 misses: ", Ir_total.m1);
|
||||
VG_(umsg)(fmt, "L2i misses: ", Ir_total.m2);
|
||||
VG_(umsg)(fmt, "LLi misses: ", Ir_total.mL);
|
||||
|
||||
if (0 == Ir_total.a) Ir_total.a = 1;
|
||||
VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
|
||||
VG_(umsg)("I1 miss rate: %s\n", buf1);
|
||||
|
||||
VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
|
||||
VG_(umsg)("L2i miss rate: %s\n", buf1);
|
||||
VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
|
||||
VG_(umsg)("LLi miss rate: %s\n", buf1);
|
||||
VG_(umsg)("\n");
|
||||
|
||||
/* D cache results. Use the D_refs.rd and D_refs.wr values to
|
||||
* determine the width of columns 2 & 3. */
|
||||
D_total.a = Dr_total.a + Dw_total.a;
|
||||
D_total.m1 = Dr_total.m1 + Dw_total.m1;
|
||||
D_total.m2 = Dr_total.m2 + Dw_total.m2;
|
||||
D_total.mL = Dr_total.mL + Dw_total.mL;
|
||||
|
||||
/* Make format string, getting width right for numbers */
|
||||
VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)\n",
|
||||
@ -1589,8 +1589,8 @@ static void cg_fini(Int exitcode)
|
||||
D_total.a, Dr_total.a, Dw_total.a);
|
||||
VG_(umsg)(fmt, "D1 misses: ",
|
||||
D_total.m1, Dr_total.m1, Dw_total.m1);
|
||||
VG_(umsg)(fmt, "L2d misses: ",
|
||||
D_total.m2, Dr_total.m2, Dw_total.m2);
|
||||
VG_(umsg)(fmt, "LLd misses: ",
|
||||
D_total.mL, Dr_total.mL, Dw_total.mL);
|
||||
|
||||
if (0 == D_total.a) D_total.a = 1;
|
||||
if (0 == Dr_total.a) Dr_total.a = 1;
|
||||
@ -1600,30 +1600,30 @@ static void cg_fini(Int exitcode)
|
||||
VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
|
||||
VG_(umsg)("D1 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
|
||||
|
||||
VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
|
||||
VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
|
||||
VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
|
||||
VG_(umsg)("L2d miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
|
||||
VG_(percentify)( D_total.mL, D_total.a, 1, l1+1, buf1);
|
||||
VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
|
||||
VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
|
||||
VG_(umsg)("LLd miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
|
||||
VG_(umsg)("\n");
|
||||
|
||||
/* L2 overall results */
|
||||
/* LL overall results */
|
||||
|
||||
L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
|
||||
L2_total_r = Dr_total.m1 + Ir_total.m1;
|
||||
L2_total_w = Dw_total.m1;
|
||||
VG_(umsg)(fmt, "L2 refs: ",
|
||||
L2_total, L2_total_r, L2_total_w);
|
||||
LL_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
|
||||
LL_total_r = Dr_total.m1 + Ir_total.m1;
|
||||
LL_total_w = Dw_total.m1;
|
||||
VG_(umsg)(fmt, "LL refs: ",
|
||||
LL_total, LL_total_r, LL_total_w);
|
||||
|
||||
L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
|
||||
L2_total_mr = Dr_total.m2 + Ir_total.m2;
|
||||
L2_total_mw = Dw_total.m2;
|
||||
VG_(umsg)(fmt, "L2 misses: ",
|
||||
L2_total_m, L2_total_mr, L2_total_mw);
|
||||
LL_total_m = Dr_total.mL + Dw_total.mL + Ir_total.mL;
|
||||
LL_total_mr = Dr_total.mL + Ir_total.mL;
|
||||
LL_total_mw = Dw_total.mL;
|
||||
VG_(umsg)(fmt, "LL misses: ",
|
||||
LL_total_m, LL_total_mr, LL_total_mw);
|
||||
|
||||
VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
|
||||
VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
|
||||
VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
|
||||
VG_(umsg)("L2 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
|
||||
VG_(percentify)(LL_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
|
||||
VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
|
||||
VG_(percentify)(LL_total_mw, Dw_total.a, 1, l3+1, buf3);
|
||||
VG_(umsg)("LL miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
|
||||
}
|
||||
|
||||
/* If branch profiling is enabled, show branch overall results. */
|
||||
@ -1760,8 +1760,9 @@ static Bool cg_process_cmd_line_option(Char* arg)
|
||||
parse_cache_opt(&clo_I1_cache, arg, tmp_str);
|
||||
else if VG_STR_CLO(arg, "--D1", tmp_str)
|
||||
parse_cache_opt(&clo_D1_cache, arg, tmp_str);
|
||||
else if VG_STR_CLO(arg, "--L2", tmp_str)
|
||||
parse_cache_opt(&clo_L2_cache, arg, tmp_str);
|
||||
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
|
||||
VG_STR_CLO(arg, "--LL", tmp_str))
|
||||
parse_cache_opt(&clo_LL_cache, arg, tmp_str);
|
||||
|
||||
else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
|
||||
else if VG_BOOL_CLO(arg, "--cache-sim", clo_cache_sim) {}
|
||||
@ -1777,7 +1778,7 @@ static void cg_print_usage(void)
|
||||
VG_(printf)(
|
||||
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
|
||||
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
|
||||
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
|
||||
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
|
||||
" --cache-sim=yes|no [yes] collect cache stats?\n"
|
||||
" --branch-sim=yes|no [no] collect branch prediction stats?\n"
|
||||
" --cachegrind-out-file=<file> output file name [cachegrind.out.%%p]\n"
|
||||
@ -1819,7 +1820,7 @@ static void cg_pre_clo_init(void)
|
||||
|
||||
static void cg_post_clo_init(void)
|
||||
{
|
||||
cache_t I1c, D1c, L2c;
|
||||
cache_t I1c, D1c, LLc;
|
||||
|
||||
CC_table =
|
||||
VG_(OSetGen_Create)(offsetof(LineCC, loc),
|
||||
@ -1837,11 +1838,11 @@ static void cg_post_clo_init(void)
|
||||
VG_(malloc), "cg.main.cpci.3",
|
||||
VG_(free));
|
||||
|
||||
configure_caches(&I1c, &D1c, &L2c);
|
||||
configure_caches(&I1c, &D1c, &LLc);
|
||||
|
||||
cachesim_I1_initcache(I1c);
|
||||
cachesim_D1_initcache(D1c);
|
||||
cachesim_L2_initcache(L2c);
|
||||
cachesim_LL_initcache(LLc);
|
||||
}
|
||||
|
||||
VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)
|
||||
|
||||
@ -96,7 +96,7 @@ static void cachesim_##L##_initcache(cache_t config) \
|
||||
/* bigger than its usual limit. Inlining gains around 5--10% speedup. */ \
|
||||
__attribute__((always_inline)) \
|
||||
static __inline__ \
|
||||
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
|
||||
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL) \
|
||||
{ \
|
||||
UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
|
||||
UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
|
||||
@ -188,9 +188,9 @@ miss_treatment: \
|
||||
return; \
|
||||
}
|
||||
|
||||
CACHESIM(L2, (*m2)++ );
|
||||
CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
|
||||
CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
|
||||
CACHESIM(LL, (*mL)++ );
|
||||
CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
|
||||
CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
|
||||
|
||||
/*--------------------------------------------------------------------*/
|
||||
/*--- end cg_sim.c ---*/
|
||||
|
||||
@ -16,33 +16,45 @@ Valgrind command line.</para>
|
||||
|
||||
<para>Cachegrind simulates how your program interacts with a machine's cache
|
||||
hierarchy and (optionally) branch predictor. It simulates a machine with
|
||||
independent first level instruction and data caches (I1 and D1), backed by a
|
||||
unified second level cache (L2). This configuration is used by almost all
|
||||
modern machines.</para>
|
||||
independent first-level instruction and data caches (I1 and D1), backed by a
|
||||
unified second-level cache (L2). This exactly matches the configuration of
|
||||
many modern machines.</para>
|
||||
|
||||
<para>However, some modern machines have three levels of cache. For these
|
||||
machines (in the cases where Cachegrind can auto-detect the cache
|
||||
configuration) Cachegrind simulates the first-level and third-level caches.
|
||||
The reason for this choice is that the L3 cache has the most influence on
|
||||
runtime, as it masks accesses to main memory. Furthermore, the L1 caches
|
||||
often have low associativity, so simulating them can detect cases where the
|
||||
code interacts badly with this cache (e.g. traversing a matrix column-wise
|
||||
with the row length being a power of 2).</para>
|
||||
|
||||
<para>Therefore, Cachegrind always refers to the I1, D1 and LL (last-level)
|
||||
caches.</para>
|
||||
|
||||
<para>
|
||||
It gathers the following statistics (abbreviations used for each statistic
|
||||
Cachegrind gathers the following statistics (the abbreviation used for each statistic
|
||||
is given in parentheses):</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>I cache reads (<computeroutput>Ir</computeroutput>,
|
||||
which equals the number of instructions executed),
|
||||
I1 cache read misses (<computeroutput>I1mr</computeroutput>) and
|
||||
L2 cache instruction read misses (<computeroutput>I1mr</computeroutput>).
|
||||
LL cache instruction read misses (<computeroutput>ILmr</computeroutput>).
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>D cache reads (<computeroutput>Dr</computeroutput>, which
|
||||
equals the number of memory reads),
|
||||
D1 cache read misses (<computeroutput>D1mr</computeroutput>), and
|
||||
L2 cache data read misses (<computeroutput>D2mr</computeroutput>).
|
||||
LL cache data read misses (<computeroutput>DLmr</computeroutput>).
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>D cache writes (<computeroutput>Dw</computeroutput>, which equals
|
||||
the number of memory writes),
|
||||
D1 cache write misses (<computeroutput>D1mw</computeroutput>), and
|
||||
L2 cache data write misses (<computeroutput>D2mw</computeroutput>).
|
||||
LL cache data write misses (<computeroutput>DLmw</computeroutput>).
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
@ -59,10 +71,10 @@ is given in parentheses):</para>
|
||||
|
||||
<para>Note that D1 total accesses is given by
|
||||
<computeroutput>D1mr</computeroutput> +
|
||||
<computeroutput>D1mw</computeroutput>, and that L2 total
|
||||
accesses is given by <computeroutput>I2mr</computeroutput> +
|
||||
<computeroutput>D2mr</computeroutput> +
|
||||
<computeroutput>D2mw</computeroutput>.
|
||||
<computeroutput>D1mw</computeroutput>, and that LL total
|
||||
accesses is given by <computeroutput>ILmr</computeroutput> +
|
||||
<computeroutput>DLmr</computeroutput> +
|
||||
<computeroutput>DLmw</computeroutput>.
|
||||
</para>
|
||||
|
||||
<para>These statistics are presented for the entire program and for each
|
||||
@ -70,7 +82,7 @@ function in the program. You can also annotate each line of source code in
|
||||
the program with the counts that were caused directly by it.</para>
|
||||
|
||||
<para>On a modern machine, an L1 miss will typically cost
|
||||
around 10 cycles, an L2 miss can cost as much as 200
|
||||
around 10 cycles, an LL miss can cost as much as 200
|
||||
cycles, and a mispredicted branch costs in the region of 10
|
||||
to 30 cycles. Detailed cache and branch profiling can be very useful
|
||||
for understanding how your program interacts with the machine and thus how
|
||||
@ -118,24 +130,24 @@ summary statistics that look like this will be printed:</para>
|
||||
<programlisting><![CDATA[
|
||||
==31751== I refs: 27,742,716
|
||||
==31751== I1 misses: 276
|
||||
==31751== L2i misses: 275
|
||||
==31751== LLi misses: 275
|
||||
==31751== I1 miss rate: 0.0%
|
||||
==31751== L2i miss rate: 0.0%
|
||||
==31751== LLi miss rate: 0.0%
|
||||
==31751==
|
||||
==31751== D refs: 15,430,290 (10,955,517 rd + 4,474,773 wr)
|
||||
==31751== D1 misses: 41,185 ( 21,905 rd + 19,280 wr)
|
||||
==31751== L2d misses: 23,085 ( 3,987 rd + 19,098 wr)
|
||||
==31751== LLd misses: 23,085 ( 3,987 rd + 19,098 wr)
|
||||
==31751== D1 miss rate: 0.2% ( 0.1% + 0.4%)
|
||||
==31751== L2d miss rate: 0.1% ( 0.0% + 0.4%)
|
||||
==31751== LLd miss rate: 0.1% ( 0.0% + 0.4%)
|
||||
==31751==
|
||||
==31751== L2 misses: 23,360 ( 4,262 rd + 19,098 wr)
|
||||
==31751== L2 miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
|
||||
==31751== LL misses: 23,360 ( 4,262 rd + 19,098 wr)
|
||||
==31751== LL miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
|
||||
|
||||
<para>Cache accesses for instruction fetches are summarised
|
||||
first, giving the number of fetches made (this is the number of
|
||||
instructions executed, which can be useful to know in its own
|
||||
right), the number of I1 misses, and the number of L2 instruction
|
||||
(<computeroutput>L2i</computeroutput>) misses.</para>
|
||||
right), the number of I1 misses, and the number of LL instruction
|
||||
(<computeroutput>LLi</computeroutput>) misses.</para>
|
||||
|
||||
<para>Cache accesses for data follow. The information is similar
|
||||
to that of the instruction fetches, except that the values are
|
||||
@ -144,12 +156,12 @@ also shown split between reads and writes (note each row's
|
||||
<computeroutput>wr</computeroutput> values add up to the row's
|
||||
total).</para>
|
||||
|
||||
<para>Combined instruction and data figures for the L2 cache
|
||||
follow that. Note that the L2 miss rate is computed relative to the total
|
||||
<para>Combined instruction and data figures for the LL cache
|
||||
follow that. Note that the LL miss rate is computed relative to the total
|
||||
number of memory accesses, not the number of L1 misses. I.e. it is
|
||||
<computeroutput>(I2mr + D2mr + D2mw) / (Ir + Dr + Dw)</computeroutput>
|
||||
<computeroutput>(ILmr + DLmr + DLmw) / (Ir + Dr + Dw)</computeroutput>
|
||||
not
|
||||
<computeroutput>(I2mr + D2mr + D2mw) / (I1mr + D1mr + D1mw)</computeroutput>
|
||||
<computeroutput>(ILmr + DLmr + DLmw) / (I1mr + D1mr + D1mw)</computeroutput>
|
||||
</para>
|
||||
|
||||
<para>Branch prediction statistics are not collected by default.
|
||||
@ -208,11 +220,11 @@ wide if possible, as the output lines can be quite long.</para>
|
||||
--------------------------------------------------------------------------------
|
||||
I1 cache: 65536 B, 64 B, 2-way associative
|
||||
D1 cache: 65536 B, 64 B, 2-way associative
|
||||
L2 cache: 262144 B, 64 B, 8-way associative
|
||||
LL cache: 262144 B, 64 B, 8-way associative
|
||||
Command: concord vg_to_ucode.c
|
||||
Events recorded: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
|
||||
Events shown: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
|
||||
Event sort order: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
|
||||
Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
|
||||
Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
|
||||
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
|
||||
Threshold: 99%
|
||||
Chosen for annotation:
|
||||
Auto-annotation: off
|
||||
@ -224,7 +236,7 @@ Auto-annotation: off
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>I1 cache, D1 cache, L2 cache: cache configuration. So
|
||||
<para>I1 cache, D1 cache, LL cache: cache configuration. So
|
||||
you know the configuration with which these results were
|
||||
obtained.</para>
|
||||
</listitem>
|
||||
@ -300,7 +312,7 @@ program:</para>
|
||||
|
||||
<programlisting><![CDATA[
|
||||
--------------------------------------------------------------------------------
|
||||
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
|
||||
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
|
||||
--------------------------------------------------------------------------------
|
||||
27,742,716 276 275 10,955,517 21,905 3,987 4,474,773 19,280 19,098 PROGRAM TOTALS]]></programlisting>
|
||||
|
||||
@ -312,7 +324,7 @@ These are similar to the summary provided when Cachegrind finishes running.
|
||||
|
||||
<programlisting><![CDATA[
|
||||
--------------------------------------------------------------------------------
|
||||
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw file:function
|
||||
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw file:function
|
||||
--------------------------------------------------------------------------------
|
||||
8,821,482 5 5 2,242,702 1,621 73 1,794,230 0 0 getc.c:_IO_getc
|
||||
5,222,023 4 4 2,276,334 16 12 875,959 1 1 concord.c:get_word
|
||||
@ -367,7 +379,7 @@ produces the same output as above followed by an annotated version of
|
||||
--------------------------------------------------------------------------------
|
||||
-- User-annotated source: concord.c
|
||||
--------------------------------------------------------------------------------
|
||||
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
|
||||
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
|
||||
|
||||
. . . . . . . . . void init_hash_table(char *file_name, Word_Node *table[])
|
||||
3 1 1 . . . 1 0 0 {
|
||||
@ -687,7 +699,7 @@ programs. It does however check that the
|
||||
<computeroutput>Events:</computeroutput> lines of all the inputs are
|
||||
identical, so as to ensure that the addition of costs makes sense.
|
||||
For example, it would be nonsensical for it to add a number indicating
|
||||
D1 read references to a number from a different file indicating L2
|
||||
D1 read references to a number from a different file indicating LL
|
||||
write misses.</para>
|
||||
|
||||
<para>
|
||||
@ -746,7 +758,7 @@ programs. It does however check that the
|
||||
<computeroutput>Events:</computeroutput> lines of all the inputs are
|
||||
identical, so as to ensure that the addition of costs makes sense.
|
||||
For example, it would be nonsensical for it to add a number indicating
|
||||
D1 read references to a number from a different file indicating L2
|
||||
D1 read references to a number from a different file indicating LL
|
||||
write misses.</para>
|
||||
|
||||
<para>
|
||||
@ -810,12 +822,12 @@ this case.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="opt.L2" xreflabel="--L2">
|
||||
<varlistentry id="opt.LL" xreflabel="--LL">
|
||||
<term>
|
||||
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
|
||||
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
|
||||
</term>
|
||||
<listitem>
|
||||
<para>Specify the size, associativity and line size of the level 2
|
||||
<para>Specify the size, associativity and line size of the last-level
|
||||
cache.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@ -903,9 +915,9 @@ this case.</para>
|
||||
order). Default is to use all present in the
|
||||
<filename>cachegrind.out.<pid></filename> file (and
|
||||
use the order in the file). Useful if you want to concentrate on, for
|
||||
example, I cache misses (<option>--show=I1mr,I2mr</option>), or data
|
||||
read misses (<option>--show=D1mr,D2mr</option>), or L2 data misses
|
||||
(<option>--show=D2mr,D2mw</option>). Best used in conjunction with
|
||||
example, I cache misses (<option>--show=I1mr,ILmr</option>), or data
|
||||
read misses (<option>--show=D1mr,DLmr</option>), or LL data misses
|
||||
(<option>--show=DLmr,DLmw</option>). Best used in conjunction with
|
||||
<option>--sort</option>.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@ -935,9 +947,9 @@ this case.</para>
|
||||
events by appending any events for the
|
||||
<option>--sort</option> option with a colon
|
||||
and a number (no spaces, though). E.g. if you want to see
|
||||
each function that covers more than 1% of L2 read misses or 1% of L2
|
||||
each function that covers more than 1% of LL read misses or 1% of LL
|
||||
write misses, use this option:</para>
|
||||
<para><option>--sort=D2mr:1,D2mw:1</option></para>
|
||||
<para><option>--sort=DLmr:1,DLmw:1</option></para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
@ -1059,13 +1071,13 @@ information, but they can still be very useful for identifying
|
||||
bottlenecks.</para>
|
||||
|
||||
<para>
|
||||
After that, we have found that L2 misses are typically a much bigger source
|
||||
After that, we have found that LL misses are typically a much bigger source
|
||||
of slow-downs than L1 misses. So it's worth looking for any snippets of
|
||||
code with high <computeroutput>D2mr</computeroutput> or
|
||||
<computeroutput>D2mw</computeroutput> counts. (You can use
|
||||
<option>--show=D2mr
|
||||
--sort=D2mr</option> with cg_annotate to focus just on
|
||||
<literal>D2mr</literal> counts, for example.) If you find any, it's still
|
||||
code with high <computeroutput>DLmr</computeroutput> or
|
||||
<computeroutput>DLmw</computeroutput> counts. (You can use
|
||||
<option>--show=DLmr
|
||||
--sort=DLmr</option> with cg_annotate to focus just on
|
||||
<literal>DLmr</literal> counts, for example.) If you find any, it's still
|
||||
not always easy to work out how to improve things. You need to have a
|
||||
reasonable understanding of how caches work, the principles of locality, and
|
||||
your program's data access patterns. Improving things may require
|
||||
@ -1153,12 +1165,12 @@ follows:</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>Inclusive L2 cache: the L2 cache typically replicates all
|
||||
<para>Inclusive LL cache: the LL cache typically replicates all
|
||||
the entries of the L1 caches, because fetching into L1 involves
|
||||
fetching into L2 first (this does not guarantee strict inclusiveness,
|
||||
as lines evicted from L2 still could reside in L1). This is
|
||||
fetching into LL first (this does not guarantee strict inclusiveness,
|
||||
as lines evicted from LL still could reside in L1). This is
|
||||
standard on Pentium chips, but AMD Opterons, Athlons and Durons
|
||||
use an exclusive L2 cache that only holds
|
||||
use an exclusive LL cache that only holds
|
||||
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
|
||||
</listitem>
|
||||
|
||||
@ -1172,10 +1184,10 @@ early incarnation that doesn't give any cache information, then
|
||||
Cachegrind will fall back to using a default configuration (that
|
||||
of a model 3/4 Athlon). Cachegrind will tell you if this
|
||||
happens. You can manually specify one, two or all three levels
|
||||
(I1/D1/L2) of the cache from the command line using the
|
||||
(I1/D1/LL) of the cache from the command line using the
|
||||
<option>--I1</option>,
|
||||
<option>--D1</option> and
|
||||
<option>--L2</option> options.
|
||||
<option>--LL</option> options.
|
||||
For cache parameters to be valid for simulation, the number
|
||||
of sets (with associativity being the number of cache lines in
|
||||
each set) has to be a power of two.</para>
|
||||
@ -1186,7 +1198,7 @@ determine the cache configuration, so you will
|
||||
need to specify it with the
|
||||
<option>--I1</option>,
|
||||
<option>--D1</option> and
|
||||
<option>--L2</option> options.</para>
|
||||
<option>--LL</option> options.</para>
|
||||
|
||||
|
||||
<para>Other noteworthy behaviour:</para>
|
||||
|
||||
@ -2,16 +2,16 @@
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -2,16 +2,16 @@
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -7,11 +7,11 @@ $dir/../../tests/filter_stderr_basic |
|
||||
# Remove "Cachegrind, ..." line and the following copyright line.
|
||||
sed "/^Cachegrind, a cache and branch-prediction profiler/ , /./ d" |
|
||||
|
||||
# Remove numbers from I/D/L2 "refs:" lines
|
||||
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
|
||||
# Remove numbers from I/D/LL "refs:" lines
|
||||
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
|
||||
|
||||
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
|
||||
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
|
||||
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rate:" lines
|
||||
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
|
||||
|
||||
# Remove CPUID warnings lines for P4s and other machines
|
||||
sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
|
||||
|
||||
@ -2,16 +2,16 @@
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
prog: ../../tests/true
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
|
||||
cleanup: rm cachegrind.out.*
|
||||
|
||||
@ -2,16 +2,16 @@
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -2,16 +2,16 @@
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -414,7 +414,7 @@ for "Ir and "Dr".</para>
|
||||
<para>This specifies various information for this dump. For some
|
||||
types, the semantic is defined, but any description type is allowed.
|
||||
Unknown types should be ignored.</para>
|
||||
<para>There are the types "I1 cache", "D1 cache", "L2 cache", which
|
||||
<para>There are the types "I1 cache", "D1 cache", "LL cache", which
|
||||
specify parameters used for the cache simulator. These are the only
|
||||
types originally used by Cachegrind. Additionally, Callgrind uses
|
||||
the following types: "Timerange" gives a rough range of the basic
|
||||
@ -457,7 +457,7 @@ for "Ir and "Dr".</para>
|
||||
<para><command>I1mr</command>: Instruction Level 1 read cache miss</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para><command>I2mr</command>: Instruction Level 2 read cache miss</para>
|
||||
<para><command>ILmr</command>: Instruction last-level read cache miss</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>...</para>
|
||||
|
||||
@ -933,9 +933,9 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
|
||||
<para>Specify if you want to do full cache simulation. By default,
|
||||
only instruction read accesses will be counted ("Ir").
|
||||
With cache simulation, further event counters are enabled:
|
||||
Cache misses on instruction reads ("I1mr"/"I2mr"),
|
||||
data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
|
||||
data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
|
||||
Cache misses on instruction reads ("I1mr"/"ILmr"),
|
||||
data read accesses ("Dr") and related cache misses ("D1mr"/"DLmr"),
|
||||
data write accesses ("Dw") and related cache misses ("D1mw"/"DLmw").
|
||||
For more information, see <xref linkend="cg-manual"/>.
|
||||
</para>
|
||||
</listitem>
|
||||
@ -972,13 +972,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
|
||||
</term>
|
||||
<listitem>
|
||||
<para>Specify whether write-back behavior should be simulated, allowing
|
||||
to distinguish L2 caches misses with and without write backs.
|
||||
to distinguish LL cache misses with and without write backs.
|
||||
The cache model of Cachegrind/Callgrind does not specify write-through
|
||||
vs. write-back behavior, and this also is not relevant for the number
|
||||
of generated miss counts. However, with explicit write-back simulation
|
||||
it can be decided whether a miss triggers not only the loading of a new
|
||||
cache line, but also if a write back of a dirty cache line had to take
|
||||
place before. The new dirty miss events are I2dmr, D2dmr, and D2dmw,
|
||||
place before. The new dirty miss events are ILdmr, DLdmr, and DLdmw,
|
||||
for misses because of instruction read, data read, and data write,
|
||||
respectively. As they produce two memory transactions, they should
|
||||
account for a doubled time estimation in relation to a normal miss.
|
||||
@ -1016,13 +1016,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
|
||||
bad access behavior). The new counters are defined in a way such
|
||||
that worse behavior results in higher cost.
|
||||
AcCost1 and AcCost2 are counters showing bad temporal locality
|
||||
for L1 and L2 caches, respectively. This is done by summing up
|
||||
for L1 and LL caches, respectively. This is done by summing up
|
||||
reciprocal values of the numbers of accesses of each cache line,
|
||||
multiplied by 1000 (as only integer costs are allowed). E.g. for
|
||||
a given source line with 5 read accesses, a value of 5000 AcCost
|
||||
means that for every access, a new cache line was loaded and directly
|
||||
evicted afterwards without further accesses. Similarly, SpLoss1/2
|
||||
shows bad spatial locality for L1 and L2 caches, respectively. It
|
||||
shows bad spatial locality for L1 and LL caches, respectively. It
|
||||
gives the <emphasis>spatial loss</emphasis> count of bytes which
|
||||
were loaded into cache but never accessed. It pinpoints at code
|
||||
accessing data in a way such that cache space is wasted. This hints
|
||||
@ -1059,12 +1059,12 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="opt.L2" xreflabel="--L2">
|
||||
<varlistentry id="opt.LL" xreflabel="--LL">
|
||||
<term>
|
||||
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
|
||||
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
|
||||
</term>
|
||||
<listitem>
|
||||
<para>Specify the size, associativity and line size of the level 2
|
||||
<para>Specify the size, associativity and line size of the last-level
|
||||
cache.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
282
callgrind/sim.c
282
callgrind/sim.c
@ -91,7 +91,7 @@ typedef struct {
|
||||
* States of flat caches in our model.
|
||||
* We use a 2-level hierarchy,
|
||||
*/
|
||||
static cache_t2 I1, D1, L2;
|
||||
static cache_t2 I1, D1, LL;
|
||||
|
||||
/* Lower bits of cache tags are used as flags for a cache line */
|
||||
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
|
||||
@ -123,8 +123,8 @@ static Int off_I1_AcCost = 0;
|
||||
static Int off_I1_SpLoss = 1;
|
||||
static Int off_D1_AcCost = 0;
|
||||
static Int off_D1_SpLoss = 1;
|
||||
static Int off_L2_AcCost = 2;
|
||||
static Int off_L2_SpLoss = 3;
|
||||
static Int off_LL_AcCost = 2;
|
||||
static Int off_LL_SpLoss = 3;
|
||||
|
||||
/* Cache access types */
|
||||
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
|
||||
@ -135,7 +135,7 @@ typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
|
||||
/* Result of a reference into a hierarchical cache model */
|
||||
typedef enum {
|
||||
L1_Hit,
|
||||
L2_Hit,
|
||||
LL_Hit,
|
||||
MemAccess,
|
||||
WriteBackMemAccess } CacheModelResult;
|
||||
|
||||
@ -231,7 +231,7 @@ static void print_cache(cache_t2* c)
|
||||
/*------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Simple model: L1 & L2 Write Through
|
||||
* Simple model: L1 & LL Write Through
|
||||
* Does not distinguish among read and write references
|
||||
*
|
||||
* Simulator functions:
|
||||
@ -305,7 +305,7 @@ static
|
||||
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
|
||||
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
|
||||
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
|
||||
return MemAccess;
|
||||
}
|
||||
|
||||
@ -313,7 +313,7 @@ static
|
||||
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
|
||||
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
|
||||
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
|
||||
return MemAccess;
|
||||
}
|
||||
|
||||
@ -323,7 +323,7 @@ CacheModelResult cachesim_D1_ref(Addr a, UChar size)
|
||||
/*------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* More complex model: L1 Write-through, L2 Write-back
|
||||
* More complex model: L1 Write-through, LL Write-back
|
||||
* This needs to distinguish among read and write references.
|
||||
*
|
||||
* Simulator functions:
|
||||
@ -412,8 +412,8 @@ static
|
||||
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
|
||||
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -424,8 +424,8 @@ static
|
||||
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
|
||||
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -437,14 +437,14 @@ CacheModelResult cachesim_D1_Write(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) {
|
||||
/* Even for an L1 hit, the write-through L1 passes
|
||||
* the write to the L2 to make the L2 line dirty.
|
||||
* the write to the LL to make the LL line dirty.
|
||||
* But this causes no latency, so return the hit.
|
||||
*/
|
||||
cachesim_ref_wb( &L2, Write, a, size);
|
||||
cachesim_ref_wb( &LL, Write, a, size);
|
||||
return L1_Hit;
|
||||
}
|
||||
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -479,10 +479,10 @@ void prefetch_clear(void)
|
||||
* One stream can be detected per 4k page.
|
||||
*/
|
||||
static __inline__
|
||||
void prefetch_L2_doref(Addr a)
|
||||
void prefetch_LL_doref(Addr a)
|
||||
{
|
||||
UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
|
||||
UInt block = ( a >> L2.line_size_bits);
|
||||
UInt block = ( a >> LL.line_size_bits);
|
||||
|
||||
if (block != pf_lastblock[stream]) {
|
||||
if (pf_seqblocks[stream] == 0) {
|
||||
@ -494,7 +494,7 @@ void prefetch_L2_doref(Addr a)
|
||||
pf_seqblocks[stream]++;
|
||||
if (pf_seqblocks[stream] >= 2) {
|
||||
prefetch_up++;
|
||||
cachesim_ref(&L2, a + 5 * L2.line_size,1);
|
||||
cachesim_ref(&LL, a + 5 * LL.line_size,1);
|
||||
}
|
||||
}
|
||||
else pf_seqblocks[stream] = 0;
|
||||
@ -504,7 +504,7 @@ void prefetch_L2_doref(Addr a)
|
||||
pf_seqblocks[stream]--;
|
||||
if (pf_seqblocks[stream] <= -2) {
|
||||
prefetch_down++;
|
||||
cachesim_ref(&L2, a - 5 * L2.line_size,1);
|
||||
cachesim_ref(&LL, a - 5 * LL.line_size,1);
|
||||
}
|
||||
}
|
||||
else pf_seqblocks[stream] = 0;
|
||||
@ -519,8 +519,8 @@ static
|
||||
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
|
||||
prefetch_L2_doref(a);
|
||||
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
|
||||
prefetch_LL_doref(a);
|
||||
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
|
||||
return MemAccess;
|
||||
}
|
||||
|
||||
@ -528,8 +528,8 @@ static
|
||||
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
|
||||
prefetch_L2_doref(a);
|
||||
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
|
||||
prefetch_LL_doref(a);
|
||||
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
|
||||
return MemAccess;
|
||||
}
|
||||
|
||||
@ -540,9 +540,9 @@ static
|
||||
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
|
||||
prefetch_L2_doref(a);
|
||||
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
prefetch_LL_doref(a);
|
||||
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -553,9 +553,9 @@ static
|
||||
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
|
||||
{
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
|
||||
prefetch_L2_doref(a);
|
||||
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
prefetch_LL_doref(a);
|
||||
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -565,17 +565,17 @@ CacheModelResult prefetch_D1_Read(Addr a, UChar size)
|
||||
static
|
||||
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
|
||||
{
|
||||
prefetch_L2_doref(a);
|
||||
prefetch_LL_doref(a);
|
||||
if ( cachesim_ref( &D1, a, size) == Hit ) {
|
||||
/* Even for an L1 hit, the write-through L1 passes
|
||||
* the write to the L2 to make the L2 line dirty.
|
||||
* the write to the LL to make the LL line dirty.
|
||||
* But this causes no latency, so return the hit.
|
||||
*/
|
||||
cachesim_ref_wb( &L2, Write, a, size);
|
||||
cachesim_ref_wb( &LL, Write, a, size);
|
||||
return L1_Hit;
|
||||
}
|
||||
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
|
||||
case Hit: return L2_Hit;
|
||||
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
|
||||
case Hit: return LL_Hit;
|
||||
case Miss: return MemAccess;
|
||||
default: break;
|
||||
}
|
||||
@ -736,7 +736,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
|
||||
/* Second case: word straddles two lines. */ \
|
||||
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
|
||||
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
|
||||
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
|
||||
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
|
||||
set = &(L.tags[set1 * L.assoc]); \
|
||||
use_mask = L.line_start_mask[a & L.line_size_mask]; \
|
||||
if (tag == (set[0] & L.tag_mask)) { \
|
||||
@ -809,7 +809,7 @@ block2: \
|
||||
idx = (set2 * L.assoc) + tmp_tag; \
|
||||
miss2 = update_##L##_use(&L, idx, \
|
||||
use_mask, (a+size-1) &~ L.line_size_mask); \
|
||||
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
|
||||
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
|
||||
\
|
||||
} else { \
|
||||
VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
|
||||
@ -837,13 +837,13 @@ static __inline__ unsigned int countBits(unsigned int bits)
|
||||
return c;
|
||||
}
|
||||
|
||||
static void update_L2_use(int idx, Addr memline)
|
||||
static void update_LL_use(int idx, Addr memline)
|
||||
{
|
||||
line_loaded* loaded = &(L2.loaded[idx]);
|
||||
line_use* use = &(L2.use[idx]);
|
||||
int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
|
||||
line_loaded* loaded = &(LL.loaded[idx]);
|
||||
line_use* use = &(LL.use[idx]);
|
||||
int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
|
||||
|
||||
CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
|
||||
CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
|
||||
idx, CLG_(bb_base) + current_ii->instr_offset, memline);
|
||||
if (use->count>0) {
|
||||
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
|
||||
@ -852,8 +852,8 @@ static void update_L2_use(int idx, Addr memline)
|
||||
CLG_(current_state).collect, loaded->use_base);
|
||||
|
||||
if (CLG_(current_state).collect && loaded->use_base) {
|
||||
(loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
|
||||
(loaded->use_base)[off_L2_SpLoss] += i;
|
||||
(loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
|
||||
(loaded->use_base)[off_LL_SpLoss] += i;
|
||||
}
|
||||
}
|
||||
|
||||
@ -868,53 +868,53 @@ static void update_L2_use(int idx, Addr memline)
|
||||
}
|
||||
|
||||
static
|
||||
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
|
||||
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
|
||||
{
|
||||
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
|
||||
UWord* set = &(L2.tags[setNo * L2.assoc]);
|
||||
UWord tag = memline & L2.tag_mask;
|
||||
UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
|
||||
UWord* set = &(LL.tags[setNo * LL.assoc]);
|
||||
UWord tag = memline & LL.tag_mask;
|
||||
|
||||
int i, j, idx;
|
||||
UWord tmp_tag;
|
||||
|
||||
CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
|
||||
CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
|
||||
|
||||
if (tag == (set[0] & L2.tag_mask)) {
|
||||
idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
|
||||
l1_loaded->dep_use = &(L2.use[idx]);
|
||||
if (tag == (set[0] & LL.tag_mask)) {
|
||||
idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
|
||||
l1_loaded->dep_use = &(LL.use[idx]);
|
||||
|
||||
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
|
||||
idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
|
||||
L2.use[idx].mask, L2.use[idx].count);
|
||||
return L2_Hit;
|
||||
idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
|
||||
LL.use[idx].mask, LL.use[idx].count);
|
||||
return LL_Hit;
|
||||
}
|
||||
for (i = 1; i < L2.assoc; i++) {
|
||||
if (tag == (set[i] & L2.tag_mask)) {
|
||||
for (i = 1; i < LL.assoc; i++) {
|
||||
if (tag == (set[i] & LL.tag_mask)) {
|
||||
tmp_tag = set[i];
|
||||
for (j = i; j > 0; j--) {
|
||||
set[j] = set[j - 1];
|
||||
}
|
||||
set[0] = tmp_tag;
|
||||
idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
|
||||
l1_loaded->dep_use = &(L2.use[idx]);
|
||||
idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
|
||||
l1_loaded->dep_use = &(LL.use[idx]);
|
||||
|
||||
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
|
||||
i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
|
||||
L2.use[idx].mask, L2.use[idx].count);
|
||||
return L2_Hit;
|
||||
i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
|
||||
LL.use[idx].mask, LL.use[idx].count);
|
||||
return LL_Hit;
|
||||
}
|
||||
}
|
||||
|
||||
/* A miss; install this tag as MRU, shuffle rest down. */
|
||||
tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
|
||||
for (j = L2.assoc - 1; j > 0; j--) {
|
||||
tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
|
||||
for (j = LL.assoc - 1; j > 0; j--) {
|
||||
set[j] = set[j - 1];
|
||||
}
|
||||
set[0] = tag | tmp_tag;
|
||||
idx = (setNo * L2.assoc) + tmp_tag;
|
||||
l1_loaded->dep_use = &(L2.use[idx]);
|
||||
idx = (setNo * LL.assoc) + tmp_tag;
|
||||
l1_loaded->dep_use = &(LL.use[idx]);
|
||||
|
||||
update_L2_use(idx, memline);
|
||||
update_LL_use(idx, memline);
|
||||
|
||||
return MemAccess;
|
||||
}
|
||||
@ -943,7 +943,7 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
|
||||
(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
|
||||
(loaded->use_base)[off_##L##_SpLoss] += c; \
|
||||
\
|
||||
/* FIXME (?): L1/L2 line sizes must be equal ! */ \
|
||||
/* FIXME (?): L1/LL line sizes must be equal ! */ \
|
||||
loaded->dep_use->mask |= use->mask; \
|
||||
loaded->dep_use->count += use->count; \
|
||||
} \
|
||||
@ -957,8 +957,8 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
|
||||
CLG_(current_state).nonskipped->skipped : \
|
||||
CLG_(cost_base) + current_ii->cost_offset; \
|
||||
\
|
||||
if (memline == 0) return L2_Hit; \
|
||||
return cacheuse_L2_access(memline, loaded); \
|
||||
if (memline == 0) return LL_Hit; \
|
||||
return cacheuse_LL_access(memline, loaded); \
|
||||
}
|
||||
|
||||
UPDATE_USE(I1);
|
||||
@ -991,10 +991,10 @@ void cacheuse_finish(void)
|
||||
if (D1.loaded[i].use_base)
|
||||
update_D1_use( &D1, i, 0,0);
|
||||
|
||||
if (L2.use)
|
||||
for (i = 0; i < L2.sets * L2.assoc; i++)
|
||||
if (L2.loaded[i].use_base)
|
||||
update_L2_use(i, 0);
|
||||
if (LL.use)
|
||||
for (i = 0; i < LL.sets * LL.assoc; i++)
|
||||
if (LL.loaded[i].use_base)
|
||||
update_LL_use(i, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -1020,7 +1020,7 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
|
||||
c2[2]++;
|
||||
// fall through
|
||||
|
||||
case L2_Hit:
|
||||
case LL_Hit:
|
||||
c1[1]++;
|
||||
c2[1]++;
|
||||
// fall through
|
||||
@ -1036,9 +1036,9 @@ Char* cacheRes(CacheModelResult r)
|
||||
{
|
||||
switch(r) {
|
||||
case L1_Hit: return "L1 Hit ";
|
||||
case L2_Hit: return "L2 Hit ";
|
||||
case MemAccess: return "L2 Miss";
|
||||
case WriteBackMemAccess: return "L2 Miss (dirty)";
|
||||
case LL_Hit: return "LL Hit ";
|
||||
case MemAccess: return "LL Miss";
|
||||
case WriteBackMemAccess: return "LL Miss (dirty)";
|
||||
default:
|
||||
tl_assert(0);
|
||||
}
|
||||
@ -1268,7 +1268,7 @@ static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
|
||||
|
||||
static cache_t clo_I1_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_D1_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_L2_cache = UNDEFINED_CACHE;
|
||||
static cache_t clo_LL_cache = UNDEFINED_CACHE;
|
||||
|
||||
|
||||
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
|
||||
@ -1308,7 +1308,7 @@ static Char* check_cache(cache_t* cache)
|
||||
}
|
||||
|
||||
static
|
||||
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
|
||||
{
|
||||
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
|
||||
|
||||
@ -1317,30 +1317,30 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
Bool all_caches_clo_defined =
|
||||
(DEFINED(clo_I1_cache) &&
|
||||
DEFINED(clo_D1_cache) &&
|
||||
DEFINED(clo_L2_cache));
|
||||
DEFINED(clo_LL_cache));
|
||||
|
||||
// Set the cache config (using auto-detection, if supported by the
|
||||
// architecture).
|
||||
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
|
||||
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
|
||||
|
||||
// Check the default/auto-detected values.
|
||||
checkRes = check_cache(I1c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(D1c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(L2c); tl_assert(!checkRes);
|
||||
checkRes = check_cache(LLc); tl_assert(!checkRes);
|
||||
|
||||
// Then replace with any defined on the command line.
|
||||
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
|
||||
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
|
||||
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
|
||||
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
|
||||
|
||||
if (VG_(clo_verbosity) > 1) {
|
||||
VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
|
||||
VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines\n",
|
||||
I1c->size, I1c->assoc, I1c->line_size);
|
||||
VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines\n",
|
||||
D1c->size, D1c->assoc, D1c->line_size);
|
||||
VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines\n",
|
||||
L2c->size, L2c->assoc, L2c->line_size);
|
||||
VG_(umsg)("Cache configuration used:\n");
|
||||
VG_(umsg)(" I1: %dB, %d-way, %dB lines\n",
|
||||
I1c->size, I1c->assoc, I1c->line_size);
|
||||
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
|
||||
D1c->size, D1c->assoc, D1c->line_size);
|
||||
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
|
||||
LLc->size, LLc->assoc, LLc->line_size);
|
||||
}
|
||||
#undef CMD_LINE_DEFINED
|
||||
}
|
||||
@ -1350,7 +1350,7 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
|
||||
static void cachesim_post_clo_init(void)
|
||||
{
|
||||
/* Cache configurations. */
|
||||
cache_t I1c, D1c, L2c;
|
||||
cache_t I1c, D1c, LLc;
|
||||
|
||||
/* Initialize access handlers */
|
||||
if (!CLG_(clo).simulate_cache) {
|
||||
@ -1374,15 +1374,15 @@ static void cachesim_post_clo_init(void)
|
||||
}
|
||||
|
||||
/* Configuration of caches only needed with real cache simulation */
|
||||
configure_caches(&I1c, &D1c, &L2c);
|
||||
configure_caches(&I1c, &D1c, &LLc);
|
||||
|
||||
I1.name = "I1";
|
||||
D1.name = "D1";
|
||||
L2.name = "L2";
|
||||
LL.name = "LL";
|
||||
|
||||
cachesim_initcache(I1c, &I1);
|
||||
cachesim_initcache(D1c, &D1);
|
||||
cachesim_initcache(L2c, &L2);
|
||||
cachesim_initcache(LLc, &LL);
|
||||
|
||||
/* the other cache simulators use the standard helpers
|
||||
* with dispatching via simulator struct */
|
||||
@ -1463,7 +1463,7 @@ void cachesim_clear(void)
|
||||
{
|
||||
cachesim_clearcache(&I1);
|
||||
cachesim_clearcache(&D1);
|
||||
cachesim_clearcache(&L2);
|
||||
cachesim_clearcache(&LL);
|
||||
|
||||
prefetch_clear();
|
||||
}
|
||||
@ -1474,7 +1474,7 @@ static void cachesim_getdesc(Char* buf)
|
||||
Int p;
|
||||
p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
|
||||
p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
|
||||
VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
|
||||
VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
|
||||
}
|
||||
|
||||
static
|
||||
@ -1490,11 +1490,12 @@ void cachesim_print_opts(void)
|
||||
" --cacheuse=no|yes Collect cache block use [no]\n"
|
||||
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
|
||||
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
|
||||
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
|
||||
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
|
||||
);
|
||||
}
|
||||
|
||||
static void parse_opt ( cache_t* cache, char* opt, Char* optval )
|
||||
static void parse_opt ( cache_t* cache,
|
||||
char* opt, Char* optval, UChar kind )
|
||||
{
|
||||
Long i1, i2, i3;
|
||||
Char* endptr;
|
||||
@ -1550,11 +1551,12 @@ static Bool cachesim_parse_opt(Char* arg)
|
||||
}
|
||||
|
||||
else if VG_STR_CLO(arg, "--I1", tmp_str)
|
||||
parse_opt(&clo_I1_cache, arg, tmp_str);
|
||||
parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
|
||||
else if VG_STR_CLO(arg, "--D1", tmp_str)
|
||||
parse_opt(&clo_D1_cache, arg, tmp_str);
|
||||
else if VG_STR_CLO(arg, "--L2", tmp_str)
|
||||
parse_opt(&clo_L2_cache, arg, tmp_str);
|
||||
parse_opt(&clo_D1_cache, arg, tmp_str, '1');
|
||||
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
|
||||
VG_STR_CLO(arg, "--LL", tmp_str))
|
||||
parse_opt(&clo_LL_cache, arg, tmp_str, '2');
|
||||
else
|
||||
return False;
|
||||
|
||||
@ -1613,8 +1615,8 @@ static
|
||||
void cachesim_printstat(Int l1, Int l2, Int l3)
|
||||
{
|
||||
FullCost total = CLG_(total_cost), D_total = 0;
|
||||
ULong L2_total_m, L2_total_mr, L2_total_mw,
|
||||
L2_total, L2_total_r, L2_total_w;
|
||||
ULong LL_total_m, LL_total_mr, LL_total_mw,
|
||||
LL_total, LL_total_r, LL_total_w;
|
||||
char buf1[RESULTS_BUF_LEN],
|
||||
buf2[RESULTS_BUF_LEN],
|
||||
buf3[RESULTS_BUF_LEN];
|
||||
@ -1632,7 +1634,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
|
||||
VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
|
||||
|
||||
commify(total[fullOffset(EG_IR) +2], l1, buf1);
|
||||
VG_(message)(Vg_UserMsg, "L2i misses: %s\n", buf1);
|
||||
VG_(message)(Vg_UserMsg, "LLi misses: %s\n", buf1);
|
||||
|
||||
p = 100;
|
||||
|
||||
@ -1645,7 +1647,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
|
||||
|
||||
percentify(total[fullOffset(EG_IR)+2] * 100 * p /
|
||||
total[fullOffset(EG_IR)], p, l1+1, buf1);
|
||||
VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
|
||||
VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
|
||||
VG_(message)(Vg_UserMsg, "\n");
|
||||
|
||||
/* D cache results.
|
||||
@ -1673,7 +1675,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
|
||||
commify( D_total[2], l1, buf1);
|
||||
commify(total[fullOffset(EG_DR)+2], l2, buf2);
|
||||
commify(total[fullOffset(EG_DW)+2], l3, buf3);
|
||||
VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)\n",
|
||||
VG_(message)(Vg_UserMsg, "LLd misses: %s (%s rd + %s wr)\n",
|
||||
buf1, buf2, buf3);
|
||||
|
||||
p = 10;
|
||||
@ -1695,50 +1697,50 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
|
||||
total[fullOffset(EG_DR)], p, l2+1, buf2);
|
||||
percentify(total[fullOffset(EG_DW)+2] * 100 * p /
|
||||
total[fullOffset(EG_DW)], p, l3+1, buf3);
|
||||
VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )\n",
|
||||
VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s + %s )\n",
|
||||
buf1, buf2,buf3);
|
||||
VG_(message)(Vg_UserMsg, "\n");
|
||||
|
||||
|
||||
|
||||
/* L2 overall results */
|
||||
/* LL overall results */
|
||||
|
||||
L2_total =
|
||||
LL_total =
|
||||
total[fullOffset(EG_DR) +1] +
|
||||
total[fullOffset(EG_DW) +1] +
|
||||
total[fullOffset(EG_IR) +1];
|
||||
L2_total_r =
|
||||
LL_total_r =
|
||||
total[fullOffset(EG_DR) +1] +
|
||||
total[fullOffset(EG_IR) +1];
|
||||
L2_total_w = total[fullOffset(EG_DW) +1];
|
||||
commify(L2_total, l1, buf1);
|
||||
commify(L2_total_r, l2, buf2);
|
||||
commify(L2_total_w, l3, buf3);
|
||||
VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)\n",
|
||||
LL_total_w = total[fullOffset(EG_DW) +1];
|
||||
commify(LL_total, l1, buf1);
|
||||
commify(LL_total_r, l2, buf2);
|
||||
commify(LL_total_w, l3, buf3);
|
||||
VG_(message)(Vg_UserMsg, "LL refs: %s (%s rd + %s wr)\n",
|
||||
buf1, buf2, buf3);
|
||||
|
||||
L2_total_m =
|
||||
LL_total_m =
|
||||
total[fullOffset(EG_DR) +2] +
|
||||
total[fullOffset(EG_DW) +2] +
|
||||
total[fullOffset(EG_IR) +2];
|
||||
L2_total_mr =
|
||||
LL_total_mr =
|
||||
total[fullOffset(EG_DR) +2] +
|
||||
total[fullOffset(EG_IR) +2];
|
||||
L2_total_mw = total[fullOffset(EG_DW) +2];
|
||||
commify(L2_total_m, l1, buf1);
|
||||
commify(L2_total_mr, l2, buf2);
|
||||
commify(L2_total_mw, l3, buf3);
|
||||
VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)\n",
|
||||
LL_total_mw = total[fullOffset(EG_DW) +2];
|
||||
commify(LL_total_m, l1, buf1);
|
||||
commify(LL_total_mr, l2, buf2);
|
||||
commify(LL_total_mw, l3, buf3);
|
||||
VG_(message)(Vg_UserMsg, "LL misses: %s (%s rd + %s wr)\n",
|
||||
buf1, buf2, buf3);
|
||||
|
||||
percentify(L2_total_m * 100 * p /
|
||||
percentify(LL_total_m * 100 * p /
|
||||
(total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
|
||||
percentify(L2_total_mr * 100 * p /
|
||||
percentify(LL_total_mr * 100 * p /
|
||||
(total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
|
||||
p, l2+1, buf2);
|
||||
percentify(L2_total_mw * 100 * p /
|
||||
percentify(LL_total_mw * 100 * p /
|
||||
total[fullOffset(EG_DW)], p, l3+1, buf3);
|
||||
VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )\n",
|
||||
VG_(message)(Vg_UserMsg, "LL miss rate: %s (%s + %s )\n",
|
||||
buf1, buf2,buf3);
|
||||
}
|
||||
|
||||
@ -1760,14 +1762,14 @@ void CLG_(init_eventsets)()
|
||||
if (!CLG_(clo).simulate_cache)
|
||||
CLG_(register_event_group)(EG_IR, "Ir");
|
||||
else if (!clo_simulate_writeback) {
|
||||
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
|
||||
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
|
||||
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
|
||||
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
|
||||
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
|
||||
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
|
||||
}
|
||||
else { // clo_simulate_writeback
|
||||
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
|
||||
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
|
||||
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
|
||||
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
|
||||
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
|
||||
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
|
||||
}
|
||||
|
||||
if (CLG_(clo).simulate_branch) {
|
||||
@ -1807,12 +1809,12 @@ void CLG_(init_eventsets)()
|
||||
CLG_(append_event)(CLG_(dumpmap), "I1mr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D1mr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D1mw");
|
||||
CLG_(append_event)(CLG_(dumpmap), "I2mr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D2mr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D2mw");
|
||||
CLG_(append_event)(CLG_(dumpmap), "I2dmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D2dmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "D2dmw");
|
||||
CLG_(append_event)(CLG_(dumpmap), "ILmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "DLmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "DLmw");
|
||||
CLG_(append_event)(CLG_(dumpmap), "ILdmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "DLdmr");
|
||||
CLG_(append_event)(CLG_(dumpmap), "DLdmw");
|
||||
CLG_(append_event)(CLG_(dumpmap), "Bc");
|
||||
CLG_(append_event)(CLG_(dumpmap), "Bcm");
|
||||
CLG_(append_event)(CLG_(dumpmap), "Bi");
|
||||
|
||||
@ -13,11 +13,11 @@ sed "/^For interactive control,.*$/d" |
|
||||
# Remove numbers from "Collected" line
|
||||
sed "s/^\(Collected *:\)[ 0-9]*$/\1/" |
|
||||
|
||||
# Remove numbers from I/D/L2 "refs:" lines
|
||||
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
|
||||
# Remove numbers from I/D/LL "refs:" lines
|
||||
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
|
||||
|
||||
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
|
||||
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
|
||||
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
|
||||
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
|
||||
|
||||
# Remove numbers from "Branches:", "Mispredicts:", and "Mispred rate:" lines
|
||||
perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
prog: ../../tests/true
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-hwpref=yes
|
||||
cleanup: rm callgrind.out.*
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
prog: ../../tests/true
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --cacheuse=yes
|
||||
cleanup: rm callgrind.out.*
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
prog: ../../tests/true
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-wb=yes
|
||||
cleanup: rm callgrind.out.*
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
prog: ../../tests/true
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
|
||||
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
|
||||
cleanup: rm callgrind.out.*
|
||||
|
||||
@ -1,23 +1,23 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
Branches:
|
||||
Mispredicts:
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
|
||||
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
|
||||
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
|
||||
Collected :
|
||||
|
||||
I refs:
|
||||
I1 misses:
|
||||
L2i misses:
|
||||
LLi misses:
|
||||
I1 miss rate:
|
||||
L2i miss rate:
|
||||
LLi miss rate:
|
||||
|
||||
D refs:
|
||||
D1 misses:
|
||||
L2d misses:
|
||||
LLd misses:
|
||||
D1 miss rate:
|
||||
L2d miss rate:
|
||||
LLd miss rate:
|
||||
|
||||
L2 refs:
|
||||
L2 misses:
|
||||
L2 miss rate:
|
||||
LL refs:
|
||||
LL misses:
|
||||
LL miss rate:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user