Change Cachegrind/Callgrind to talk about the LL (last-level) cache instead
of the L2 cache.  This is to accommodate machines with three levels of
cache.  We still only simulate two levels, the first and the last.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11404
Nicholas Nethercote 2010-10-06 22:46:31 +00:00
parent cb3fbb46d7
commit 60d9b410d4
34 changed files with 586 additions and 490 deletions
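The simulation itself stays two-level: a reference that misses in I1 or D1 falls through to the LL cache, and an LL miss counts as a memory access. A minimal standalone sketch of that control flow (hypothetical hit predicates and counts, not Valgrind's actual data structures; the real chaining lives in the CACHESIM macros of cg_sim.c, shown later in this diff):

#include <stdio.h>

typedef struct { unsigned long m1, mL; } Counters;

/* Stand-in hit predicates; the real simulator checks tag arrays. */
static int hits_L1(unsigned long addr) { return addr % 4 != 0; }
static int hits_LL(unsigned long addr) { return addr % 16 != 0; }

static void simulate_ref(unsigned long addr, Counters* c)
{
   if (hits_L1(addr)) return;   /* first-level hit: done */
   c->m1++;                     /* first-level miss: falls through to LL */
   if (hits_LL(addr)) return;   /* last-level hit */
   c->mL++;                     /* last-level miss: goes to memory */
}

int main(void)
{
   Counters c = { 0, 0 };
   unsigned long a;
   for (a = 0; a < 64; a++)
      simulate_ref(a, &c);
   printf("L1 misses: %lu, LL misses: %lu\n", c.m1, c.mL);   /* 16, 4 */
   return 0;
}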

NEWS
View File

@ -16,6 +16,20 @@ Improvements:
--threshold option has changed; this is unlikely to affect many people, if
you do use it please see the user manual for details.
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
In addition, it optionally can count the number of executed global bus events.
Both can be used for a better approximation of a "Cycle Estimation" as
derived event (you need to update the event formula in KCachegrind yourself).
- Cachegrind and Callgrind now refer to the LL (last-level) cache rather
than the L2 cache. This is to accommodate machines with three levels of
caches -- if Cachegrind/Callgrind auto-detects the cache configuration of
such a machine it will run the simulation as if the L2 cache isn't
present. This means the results are less likely to match the true result
for the machine, but Cachegrind/Callgrind's results are already only
approximate, and should not be considered authoritative. The results are
still useful for giving a general idea about a program's locality.
- Massif has a new option, --pages-as-heap, which is disabled by default.
When enabled, instead of tracking allocations at the level of heap blocks
(as allocated with malloc/new/new[]), it instead tracks memory allocations
@ -24,11 +38,6 @@ Improvements:
harder than the heap-level output, but this option is useful if you want
to account for every byte of memory used by a program.
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
In addition, it optionally can count the number of executed global bus events.
Both can be used for a better approximation of a "Cycle Estimation" as
derived event (you need to update the event formula in KCachegrind yourself).
- Added new memcheck command-line option --show-possibly-lost.

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default (for Cortex-A8 ?)
*I1c = (cache_t) { 16384, 4, 64 };
*D1c = (cache_t) { 16384, 4, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
if (!all_caches_clo_defined) {
VG_(message)(Vg_DebugMsg,

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64

View File

@ -54,9 +54,12 @@ static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
* array of pre-defined configurations for various parts of the memory
* hierarchy.
* According to Intel Processor Identification, App Note 485.
*
* If a L3 cache is found, then data for it rather than the L2
* is returned via *LLc.
*/
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int cpuid1_eax;
Int cpuid1_ignore;
@ -65,6 +68,14 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
UChar info[16];
Int i, trials;
Bool L2_found = False;
/* If we see L3 cache info, copy it into L3c. Then, at the end,
copy it into *LLc. Hence if a L3 cache is specified, *LLc will
eventually contain a description of it rather than the L2 cache.
The use of the L3c intermediary makes this process independent
of the order in which the cache specifications appear in
info[]. */
Bool L3_found = False;
cache_t L3c = { 0, 0, 0 };
if (level < 2) {
VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
@ -121,18 +132,39 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x90: case 0x96: case 0x9b:
VG_(tool_panic)("IA-64 cache detected?!");
case 0x22: case 0x23: case 0x25: case 0x29:
case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
case 0xe2: case 0xe3: case 0xe4: case 0xea: case 0xeb: case 0xec:
VG_(dmsg)("warning: L3 cache detected but ignored\n");
break;
/* L3 cache info. */
case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
case 0x25: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
case 0x29: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
case 0x46: L3c = (cache_t) { 4096, 4, 64 }; L3_found = True; break;
case 0x47: L3c = (cache_t) { 8192, 8, 64 }; L3_found = True; break;
case 0x4a: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
case 0x4b: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
case 0xd0: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
case 0xd1: L3c = (cache_t) { 1024, 4, 64 }; L3_found = True; break;
case 0xd2: L3c = (cache_t) { 2048, 4, 64 }; L3_found = True; break;
case 0xd6: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
case 0xd7: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
case 0xd8: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
case 0xdc: L3c = (cache_t) { 1536, 12, 64 }; L3_found = True; break;
case 0xdd: L3c = (cache_t) { 3072, 12, 64 }; L3_found = True; break;
case 0xde: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
case 0xe2: L3c = (cache_t) { 2048, 16, 64 }; L3_found = True; break;
case 0xe3: L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; break;
case 0xe4: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;
/* Described as "MLC" in Intel documentation */
case 0x21: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break;
case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break;
case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break;
case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break;
/* If a P6 core, this means "no L2 cache".
If a P4 core, this means "no L3 cache".
@ -141,20 +173,21 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x40:
break;
case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break;
case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break;
case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
case 0x41: *LLc = (cache_t) { 128, 4, 32 }; L2_found = True; break;
case 0x42: *LLc = (cache_t) { 256, 4, 32 }; L2_found = True; break;
case 0x43: *LLc = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *LLc = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *LLc = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
case 0x49:
if ((family == 15) && (model == 6))
/* On Xeon MP (family F, model 6), this is for L3 */
VG_(dmsg)("warning: L3 cache detected but ignored\n");
else
*L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
break;
case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
if (family == 15 && model == 6) {
/* On Xeon MP (family F, model 6), this is for L3 */
L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
} else {
*LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
}
break;
/* These are sectored, whatever that means */
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
@ -181,26 +214,24 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
break;
/* not sectored, whatever that might mean */
case 0x78: *L2c = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break;
case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break;
case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break;
case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break;
case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break;
case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break;
case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break;
case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x79: *LLc = (cache_t) { 128, 8, 64 }; L2_found = True; break;
case 0x7a: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x7b: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x7c: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x7d: *LLc = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
case 0x7e: *LLc = (cache_t) { 256, 8, 128 }; L2_found = True; break;
case 0x7f: *LLc = (cache_t) { 512, 2, 64 }; L2_found = True; break;
case 0x80: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x81: *LLc = (cache_t) { 128, 8, 32 }; L2_found = True; break;
case 0x82: *LLc = (cache_t) { 256, 8, 32 }; L2_found = True; break;
case 0x83: *LLc = (cache_t) { 512, 8, 32 }; L2_found = True; break;
case 0x84: *LLc = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
case 0x85: *LLc = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
case 0x86: *LLc = (cache_t) { 512, 4, 64 }; L2_found = True; break;
case 0x87: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
/* Ignore prefetch information */
case 0xf0: case 0xf1:
@ -213,8 +244,15 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
}
/* If we found a L3 cache, throw away the L2 data and use the L3's instead. */
if (L3_found) {
VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
*LLc = L3c;
L2_found = True;
}
if (!L2_found)
VG_(dmsg)("warning: L2 cache not installed, ignore L2 results.\n");
VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
return 0;
}
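A minimal sketch (an editorial illustration, using two descriptor codes taken from the tables above) of why staging the L3 data in the local L3c makes the outcome independent of the order in which descriptors appear in info[]: *LLc is only overwritten once the whole array has been scanned.

#include <stdio.h>

typedef struct { int size, assoc, line_size; } cache_t;

static void scan_descriptors(const unsigned char* info, int n, cache_t* LLc)
{
   cache_t L3c = { 0, 0, 0 };
   int L3_found = 0;
   int i;
   for (i = 0; i < n; i++) {
      switch (info[i]) {
         case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; break;               /* an L2 code */
         case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = 1; break; /* an L3 code */
         default: break;
      }
   }
   if (L3_found)
      *LLc = L3c;   /* L3 wins whether 0x23 came before or after 0x3c */
}

int main(void)
{
   cache_t a, b;
   unsigned char order1[] = { 0x3c, 0x23 }, order2[] = { 0x23, 0x3c };
   scan_descriptors(order1, 2, &a);
   scan_descriptors(order2, 2, &b);
   printf("%d KB == %d KB\n", a.size, b.size);   /* 1024 == 1024 */
   return 0;
}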
@ -241,14 +279,37 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
* 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
* so we detect that.
*
* Returns 0 on success, non-zero on failure.
* Returns 0 on success, non-zero on failure. As with the Intel code
* above, if a L3 cache is found, then data for it rather than the L2
* is returned via *LLc.
*/
/* A small helper */
static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
{
/* Decode a L2/L3 associativity indication. It is encoded
differently from the I1/D1 associativity. Returns 1
(direct-map) as a safe but suboptimal result for unknown
encodings. */
switch (bits_15_12 & 0xF) {
case 1: return 1; case 2: return 2;
case 4: return 4; case 6: return 8;
case 8: return 16; case 0xA: return 32;
case 0xB: return 48; case 0xC: return 64;
case 0xD: return 96; case 0xE: return 128;
case 0xF: /* fully associative */
case 0: /* L2/L3 cache or TLB is disabled */
default:
return 1;
}
}
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
UInt ext_level;
UInt dummy, model;
UInt I1i, D1i, L2i;
UInt I1i, D1i, L2i, L3i;
VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
@ -259,7 +320,7 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);
VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
@ -277,15 +338,26 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->assoc = (I1i >> 16) & 0xff;
I1c->line_size = (I1i >> 0) & 0xff;
L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
L2c->assoc = (L2i >> 12) & 0xf;
L2c->line_size = (L2i >> 0) & 0xff;
LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
LLc->line_size = (L2i >> 0) & 0xff;
if (((L3i >> 18) & 0x3fff) > 0) {
/* There's an L3 cache. Replace *LLc contents with this info. */
/* NB: the test in the if is "if L3 size > 0 ". I don't know if
this is the right way to test presence-vs-absence of L3. I
can't see any guidance on this in the AMD documentation. */
LLc->size = ((L3i >> 18) & 0x3fff) * 512;
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
LLc->line_size = (L3i >> 0) & 0xff;
VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
}
return 0;
}
static
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int level, ret;
Char vendor_id[13];
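As a worked check of the AMD bit decoding above (an illustrative register value, not read from real hardware): the L3 size sits in EDX bits 31:18 in 512 KB units, the associativity code in bits 15:12, and the line size in bits 7:0.

#include <stdio.h>

int main(void)
{
   /* Pretend CPUID(0x80000006) returned EDX describing a 6 MB,
      16-way (code 0x8 in the L2/L3 encoding), 64 B line L3:
      (12 << 18) | (0x8 << 12) | 64. */
   unsigned int L3i = (12u << 18) | (0x8u << 12) | 64u;

   unsigned int size_kb    = ((L3i >> 18) & 0x3fff) * 512;   /* 6144 KB */
   unsigned int assoc_code = (L3i >> 12) & 0xf;   /* 0x8 -> 16-way per the helper above */
   unsigned int line_size  = L3i & 0xff;          /* 64 B */

   printf("L3: %u KB, assoc code 0x%x, %u B lines\n",
          size_kb, assoc_code, line_size);
   return 0;
}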
@ -306,10 +378,10 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
ret = Intel_cache_info(level, I1c, D1c, L2c);
ret = Intel_cache_info(level, I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
ret = AMD_cache_info(I1c, D1c, L2c);
ret = AMD_cache_info(I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
/* Total kludge. Pretend to be a VIA Nehemiah. */
@ -319,9 +391,9 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size = 64;
I1c->assoc = 4;
I1c->line_size = 16;
L2c->size = 64;
L2c->assoc = 16;
L2c->line_size = 16;
LLc->size = 64;
LLc->assoc = 16;
LLc->line_size = 16;
ret = 0;
} else {
@ -332,13 +404,13 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Successful! Convert sizes from KB to bytes */
I1c->size *= 1024;
D1c->size *= 1024;
L2c->size *= 1024;
LLc->size *= 1024;
return ret;
}
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
Int res;
@ -346,10 +418,10 @@ void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Then replace with any info we can get from CPUID.
res = get_caches_from_CPUID(I1c, D1c, L2c);
res = get_caches_from_CPUID(I1c, D1c, LLc);
// Warn if CPUID failed and config not completely specified from cmd line.
if (res != 0 && !all_caches_clo_defined) {

View File

@ -33,14 +33,14 @@
// For cache simulation
typedef struct {
int size; // bytes
int assoc;
int line_size; // bytes
Int size; // bytes
Int assoc;
Int line_size; // bytes
} cache_t;
// Gives the configuration of I1, D1 and L2 caches. They get overridden
// Gives the configuration of I1, D1 and LL caches. They get overridden
// by any cache configurations specified on the command line.
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined);
#endif // __CG_ARCH_H

View File

@ -77,7 +77,7 @@ typedef
struct {
ULong a; /* total # memory accesses of this kind */
ULong m1; /* misses in the first level cache */
ULong m2; /* misses in the second level cache */
ULong mL; /* misses in the last-level (LL) cache */
}
CacheCC;
@ -268,13 +268,13 @@ static LineCC* get_lineCC(Addr origAddr)
lineCC->loc.line = loc.line;
lineCC->Ir.a = 0;
lineCC->Ir.m1 = 0;
lineCC->Ir.m2 = 0;
lineCC->Ir.mL = 0;
lineCC->Dr.a = 0;
lineCC->Dr.m1 = 0;
lineCC->Dr.m2 = 0;
lineCC->Dr.mL = 0;
lineCC->Dw.a = 0;
lineCC->Dw.m1 = 0;
lineCC->Dw.m2 = 0;
lineCC->Dw.mL = 0;
lineCC->Bc.b = 0;
lineCC->Bc.mp = 0;
lineCC->Bi.b = 0;
@ -319,7 +319,7 @@ void log_1I_0D_cache_access(InstrInfo* n)
//VG_(printf)("1I_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
// n, n->instr_addr, n->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
}
@ -331,10 +331,10 @@ void log_2I_0D_cache_access(InstrInfo* n, InstrInfo* n2)
// n, n->instr_addr, n->instr_len,
// n2, n2->instr_addr, n2->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
}
@ -348,13 +348,13 @@ void log_3I_0D_cache_access(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
// n2, n2->instr_addr, n2->instr_len,
// n3, n3->instr_addr, n3->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
cachesim_I1_doref(n3->instr_addr, n3->instr_len,
&n3->parent->Ir.m1, &n3->parent->Ir.m2);
&n3->parent->Ir.m1, &n3->parent->Ir.mL);
n3->parent->Ir.a++;
}
@ -365,11 +365,11 @@ void log_1I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dr.m1, &n->parent->Dr.m2);
&n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@ -380,11 +380,11 @@ void log_1I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dw.m1, &n->parent->Dw.m2);
&n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@ -394,7 +394,7 @@ void log_0I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dr: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dr.m1, &n->parent->Dr.m2);
&n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@ -404,7 +404,7 @@ void log_0I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dw: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dw.m1, &n->parent->Dw.m2);
&n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@ -1234,7 +1234,7 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
// string otherwise.
@ -1273,7 +1273,7 @@ static Char* check_cache(cache_t* cache)
}
static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@ -1283,22 +1283,22 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
DEFINED(clo_L2_cache));
DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
checkRes = check_cache(L2c); tl_assert(!checkRes);
checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line. (Already checked in
// parse_cache_opt().)
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) >= 2) {
VG_(umsg)("Cache configuration used:\n");
@ -1306,8 +1306,8 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size, I1c->assoc, I1c->line_size);
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(umsg)(" L2: %dB, %d-way, %dB lines\n",
L2c->size, L2c->assoc, L2c->line_size);
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@ -1354,12 +1354,12 @@ static void fprint_CC_table_and_calc_totals(void)
VG_(free)(cachegrind_out_file);
}
// "desc:" lines (giving I1/D1/L2 cache configuration). The spaces after
// "desc:" lines (giving I1/D1/LL cache configuration). The spaces after
// the 2nd colon makes cg_annotate's output look nicer.
VG_(sprintf)(buf, "desc: I1 cache: %s\n"
"desc: D1 cache: %s\n"
"desc: L2 cache: %s\n",
I1.desc_line, D1.desc_line, L2.desc_line);
"desc: LL cache: %s\n",
I1.desc_line, D1.desc_line, LL.desc_line);
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
// "cmd:" line
@ -1379,11 +1379,11 @@ static void fprint_CC_table_and_calc_totals(void)
}
// "events:" line
if (clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"Bc Bcm Bi Bim\n");
}
else if (clo_cache_sim && !clo_branch_sim) {
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"\n");
}
else if (!clo_cache_sim && clo_branch_sim) {
@ -1430,9 +1430,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
lineCC->loc.line,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
lineCC->Bc.b, lineCC->Bc.mp,
lineCC->Bi.b, lineCC->Bi.mp);
}
@ -1441,9 +1441,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu\n",
lineCC->loc.line,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "%u %llu"
@ -1464,13 +1464,13 @@ static void fprint_CC_table_and_calc_totals(void)
// Update summary stats
Ir_total.a += lineCC->Ir.a;
Ir_total.m1 += lineCC->Ir.m1;
Ir_total.m2 += lineCC->Ir.m2;
Ir_total.mL += lineCC->Ir.mL;
Dr_total.a += lineCC->Dr.a;
Dr_total.m1 += lineCC->Dr.m1;
Dr_total.m2 += lineCC->Dr.m2;
Dr_total.mL += lineCC->Dr.mL;
Dw_total.a += lineCC->Dw.a;
Dw_total.m1 += lineCC->Dw.m1;
Dw_total.m2 += lineCC->Dw.m2;
Dw_total.mL += lineCC->Dw.mL;
Bc_total.b += lineCC->Bc.b;
Bc_total.mp += lineCC->Bc.mp;
Bi_total.b += lineCC->Bi.b;
@ -1487,9 +1487,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
Ir_total.a, Ir_total.m1, Ir_total.m2,
Dr_total.a, Dr_total.m1, Dr_total.m2,
Dw_total.a, Dw_total.m1, Dw_total.m2,
Ir_total.a, Ir_total.m1, Ir_total.mL,
Dr_total.a, Dr_total.m1, Dr_total.mL,
Dw_total.a, Dw_total.m1, Dw_total.mL,
Bc_total.b, Bc_total.mp,
Bi_total.b, Bi_total.mp);
}
@ -1498,9 +1498,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu\n",
Ir_total.a, Ir_total.m1, Ir_total.m2,
Dr_total.a, Dr_total.m1, Dr_total.m2,
Dw_total.a, Dw_total.m1, Dw_total.m2);
Ir_total.a, Ir_total.m1, Ir_total.mL,
Dr_total.a, Dr_total.m1, Dr_total.mL,
Dw_total.a, Dw_total.m1, Dw_total.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "summary:"
@ -1537,8 +1537,8 @@ static void cg_fini(Int exitcode)
CacheCC D_total;
BranchCC B_total;
ULong L2_total_m, L2_total_mr, L2_total_mw,
L2_total, L2_total_r, L2_total_w;
ULong LL_total_m, LL_total_mr, LL_total_mw,
LL_total, LL_total_r, LL_total_w;
Int l1, l2, l3;
fprint_CC_table_and_calc_totals();
@ -1565,21 +1565,21 @@ static void cg_fini(Int exitcode)
miss numbers */
if (clo_cache_sim) {
VG_(umsg)(fmt, "I1 misses: ", Ir_total.m1);
VG_(umsg)(fmt, "L2i misses: ", Ir_total.m2);
VG_(umsg)(fmt, "LLi misses: ", Ir_total.mL);
if (0 == Ir_total.a) Ir_total.a = 1;
VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("I1 miss rate: %s\n", buf1);
VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("L2i miss rate: %s\n", buf1);
VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("LLi miss rate: %s\n", buf1);
VG_(umsg)("\n");
/* D cache results. Use the D_refs.rd and D_refs.wr values to
* determine the width of columns 2 & 3. */
D_total.a = Dr_total.a + Dw_total.a;
D_total.m1 = Dr_total.m1 + Dw_total.m1;
D_total.m2 = Dr_total.m2 + Dw_total.m2;
D_total.mL = Dr_total.mL + Dw_total.mL;
/* Make format string, getting width right for numbers */
VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)\n",
@ -1589,8 +1589,8 @@ static void cg_fini(Int exitcode)
D_total.a, Dr_total.a, Dw_total.a);
VG_(umsg)(fmt, "D1 misses: ",
D_total.m1, Dr_total.m1, Dw_total.m1);
VG_(umsg)(fmt, "L2d misses: ",
D_total.m2, Dr_total.m2, Dw_total.m2);
VG_(umsg)(fmt, "LLd misses: ",
D_total.mL, Dr_total.mL, Dw_total.mL);
if (0 == D_total.a) D_total.a = 1;
if (0 == Dr_total.a) Dr_total.a = 1;
@ -1600,30 +1600,30 @@ static void cg_fini(Int exitcode)
VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("D1 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("L2d miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)( D_total.mL, D_total.a, 1, l1+1, buf1);
VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("LLd miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(umsg)("\n");
/* L2 overall results */
/* LL overall results */
L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
L2_total_r = Dr_total.m1 + Ir_total.m1;
L2_total_w = Dw_total.m1;
VG_(umsg)(fmt, "L2 refs: ",
L2_total, L2_total_r, L2_total_w);
LL_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
LL_total_r = Dr_total.m1 + Ir_total.m1;
LL_total_w = Dw_total.m1;
VG_(umsg)(fmt, "LL refs: ",
LL_total, LL_total_r, LL_total_w);
L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
L2_total_mr = Dr_total.m2 + Ir_total.m2;
L2_total_mw = Dw_total.m2;
VG_(umsg)(fmt, "L2 misses: ",
L2_total_m, L2_total_mr, L2_total_mw);
LL_total_m = Dr_total.mL + Dw_total.mL + Ir_total.mL;
LL_total_mr = Dr_total.mL + Ir_total.mL;
LL_total_mw = Dw_total.mL;
VG_(umsg)(fmt, "LL misses: ",
LL_total_m, LL_total_mr, LL_total_mw);
VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("L2 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)(LL_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
VG_(percentify)(LL_total_mw, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("LL miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
}
/* If branch profiling is enabled, show branch overall results. */
@ -1760,8 +1760,9 @@ static Bool cg_process_cmd_line_option(Char* arg)
parse_cache_opt(&clo_I1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--D1", tmp_str)
parse_cache_opt(&clo_D1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--L2", tmp_str)
parse_cache_opt(&clo_L2_cache, arg, tmp_str);
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
VG_STR_CLO(arg, "--LL", tmp_str))
parse_cache_opt(&clo_LL_cache, arg, tmp_str);
else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
else if VG_BOOL_CLO(arg, "--cache-sim", clo_cache_sim) {}
@ -1777,7 +1778,7 @@ static void cg_print_usage(void)
VG_(printf)(
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
" --cache-sim=yes|no [yes] collect cache stats?\n"
" --branch-sim=yes|no [no] collect branch prediction stats?\n"
" --cachegrind-out-file=<file> output file name [cachegrind.out.%%p]\n"
@ -1819,7 +1820,7 @@ static void cg_pre_clo_init(void)
static void cg_post_clo_init(void)
{
cache_t I1c, D1c, L2c;
cache_t I1c, D1c, LLc;
CC_table =
VG_(OSetGen_Create)(offsetof(LineCC, loc),
@ -1837,11 +1838,11 @@ static void cg_post_clo_init(void)
VG_(malloc), "cg.main.cpci.3",
VG_(free));
configure_caches(&I1c, &D1c, &L2c);
configure_caches(&I1c, &D1c, &LLc);
cachesim_I1_initcache(I1c);
cachesim_D1_initcache(D1c);
cachesim_L2_initcache(L2c);
cachesim_LL_initcache(LLc);
}
VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)

View File

@ -96,7 +96,7 @@ static void cachesim_##L##_initcache(cache_t config) \
/* bigger than its usual limit. Inlining gains around 5--10% speedup. */ \
__attribute__((always_inline)) \
static __inline__ \
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL) \
{ \
UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
@ -188,9 +188,9 @@ miss_treatment: \
return; \
}
CACHESIM(L2, (*m2)++ );
CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
CACHESIM(LL, (*mL)++ );
CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
/*--------------------------------------------------------------------*/
/*--- end cg_sim.c ---*/

View File

@ -16,33 +16,45 @@ Valgrind command line.</para>
<para>Cachegrind simulates how your program interacts with a machine's cache
hierarchy and (optionally) branch predictor. It simulates a machine with
independent first level instruction and data caches (I1 and D1), backed by a
unified second level cache (L2). This configuration is used by almost all
modern machines.</para>
independent first-level instruction and data caches (I1 and D1), backed by a
unified second-level cache (L2). This exactly matches the configuration of
many modern machines.</para>
<para>However, some modern machines have three levels of cache. For these
machines (in the cases where Cachegrind can auto-detect the cache
configuration) Cachegrind simulates the first-level and third-level caches.
The reason for this choice is that the L3 cache has the most influence on
runtime, as it masks accesses to main memory. Furthermore, the L1 caches
often have low associativity, so simulating them can detect cases where the
code interacts badly with this cache (eg. traversing a matrix column-wise
with the row length being a power of 2).</para>
<para>Therefore, Cachegrind always refers to the I1, D1 and LL (last-level)
caches.</para>
<para>
It gathers the following statistics (abbreviations used for each statistic
Cachegrind gathers the following statistics (abbreviations used for each statistic
is given in parentheses):</para>
<itemizedlist>
<listitem>
<para>I cache reads (<computeroutput>Ir</computeroutput>,
which equals the number of instructions executed),
I1 cache read misses (<computeroutput>I1mr</computeroutput>) and
L2 cache instruction read misses (<computeroutput>I1mr</computeroutput>).
LL cache instruction read misses (<computeroutput>ILmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache reads (<computeroutput>Dr</computeroutput>, which
equals the number of memory reads),
D1 cache read misses (<computeroutput>D1mr</computeroutput>), and
L2 cache data read misses (<computeroutput>D2mr</computeroutput>).
LL cache data read misses (<computeroutput>DLmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache writes (<computeroutput>Dw</computeroutput>, which equals
the number of memory writes),
D1 cache write misses (<computeroutput>D1mw</computeroutput>), and
L2 cache data write misses (<computeroutput>D2mw</computeroutput>).
LL cache data write misses (<computeroutput>DLmw</computeroutput>).
</para>
</listitem>
<listitem>
@ -59,10 +71,10 @@ is given in parentheses):</para>
<para>Note that D1 total accesses is given by
<computeroutput>D1mr</computeroutput> +
<computeroutput>D1mw</computeroutput>, and that L2 total
accesses is given by <computeroutput>I2mr</computeroutput> +
<computeroutput>D2mr</computeroutput> +
<computeroutput>D2mw</computeroutput>.
<computeroutput>D1mw</computeroutput>, and that LL total
accesses is given by <computeroutput>ILmr</computeroutput> +
<computeroutput>DLmr</computeroutput> +
<computeroutput>DLmw</computeroutput>.
</para>
<para>These statistics are presented for the entire program and for each
@ -70,7 +82,7 @@ function in the program. You can also annotate each line of source code in
the program with the counts that were caused directly by it.</para>
<para>On a modern machine, an L1 miss will typically cost
around 10 cycles, an L2 miss can cost as much as 200
around 10 cycles, an LL miss can cost as much as 200
cycles, and a mispredicted branch costs in the region of 10
to 30 cycles. Detailed cache and branch profiling can be very useful
for understanding how your program interacts with the machine and thus how
@ -118,24 +130,24 @@ summary statistics that look like this will be printed:</para>
<programlisting><![CDATA[
==31751== I refs: 27,742,716
==31751== I1 misses: 276
==31751== L2i misses: 275
==31751== LLi misses: 275
==31751== I1 miss rate: 0.0%
==31751== L2i miss rate: 0.0%
==31751== LLi miss rate: 0.0%
==31751==
==31751== D refs: 15,430,290 (10,955,517 rd + 4,474,773 wr)
==31751== D1 misses: 41,185 ( 21,905 rd + 19,280 wr)
==31751== L2d misses: 23,085 ( 3,987 rd + 19,098 wr)
==31751== LLd misses: 23,085 ( 3,987 rd + 19,098 wr)
==31751== D1 miss rate: 0.2% ( 0.1% + 0.4%)
==31751== L2d miss rate: 0.1% ( 0.0% + 0.4%)
==31751== LLd miss rate: 0.1% ( 0.0% + 0.4%)
==31751==
==31751== L2 misses: 23,360 ( 4,262 rd + 19,098 wr)
==31751== L2 miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
==31751== LL misses: 23,360 ( 4,262 rd + 19,098 wr)
==31751== LL miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
<para>Cache accesses for instruction fetches are summarised
first, giving the number of fetches made (this is the number of
instructions executed, which can be useful to know in its own
right), the number of I1 misses, and the number of L2 instruction
(<computeroutput>L2i</computeroutput>) misses.</para>
right), the number of I1 misses, and the number of LL instruction
(<computeroutput>LLi</computeroutput>) misses.</para>
<para>Cache accesses for data follow. The information is similar
to that of the instruction fetches, except that the values are
@ -144,12 +156,12 @@ also shown split between reads and writes (note each row's
<computeroutput>wr</computeroutput> values add up to the row's
total).</para>
<para>Combined instruction and data figures for the L2 cache
follow that. Note that the L2 miss rate is computed relative to the total
<para>Combined instruction and data figures for the LL cache
follow that. Note that the LL miss rate is computed relative to the total
number of memory accesses, not the number of L1 misses. I.e. it is
<computeroutput>(I2mr + D2mr + D2mw) / (Ir + Dr + Dw)</computeroutput>
<computeroutput>(ILmr + DLmr + DLmw) / (Ir + Dr + Dw)</computeroutput>
not
<computeroutput>(I2mr + D2mr + D2mw) / (I1mr + D1mr + D1mw)</computeroutput>
<computeroutput>(ILmr + DLmr + DLmw) / (I1mr + D1mr + D1mw)</computeroutput>
</para>
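As a worked check using the sample run above: LL misses are 23,360 against Ir + Dr + Dw = 27,742,716 + 10,955,517 + 4,474,773 = 43,173,006 accesses, i.e. roughly 0.05%, which the one-decimal summary prints as 0.0%. Dividing by the L1 misses instead (276 + 21,905 + 19,280 = 41,461) would give about 56%, which is why the choice of denominator is spelled out here.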
<para>Branch prediction statistics are not collected by default.
@ -208,11 +220,11 @@ wide if possible, as the output lines can be quite long.</para>
--------------------------------------------------------------------------------
I1 cache: 65536 B, 64 B, 2-way associative
D1 cache: 65536 B, 64 B, 2-way associative
L2 cache: 262144 B, 64 B, 8-way associative
LL cache: 262144 B, 64 B, 8-way associative
Command: concord vg_to_ucode.c
Events recorded: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Events shown: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Event sort order: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Threshold: 99%
Chosen for annotation:
Auto-annotation: off
@ -224,7 +236,7 @@ Auto-annotation: off
<itemizedlist>
<listitem>
<para>I1 cache, D1 cache, L2 cache: cache configuration. So
<para>I1 cache, D1 cache, LL cache: cache configuration. So
you know the configuration with which these results were
obtained.</para>
</listitem>
@ -300,7 +312,7 @@ program:</para>
<programlisting><![CDATA[
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
--------------------------------------------------------------------------------
27,742,716 276 275 10,955,517 21,905 3,987 4,474,773 19,280 19,098 PROGRAM TOTALS]]></programlisting>
@ -312,7 +324,7 @@ These are similar to the summary provided when Cachegrind finishes running.
<programlisting><![CDATA[
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw file:function
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw file:function
--------------------------------------------------------------------------------
8,821,482 5 5 2,242,702 1,621 73 1,794,230 0 0 getc.c:_IO_getc
5,222,023 4 4 2,276,334 16 12 875,959 1 1 concord.c:get_word
@ -367,7 +379,7 @@ produces the same output as above followed by an annotated version of
--------------------------------------------------------------------------------
-- User-annotated source: concord.c
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
. . . . . . . . . void init_hash_table(char *file_name, Word_Node *table[])
3 1 1 . . . 1 0 0 {
@ -687,7 +699,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
D1 read references to a number from a different file indicating L2
D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@ -746,7 +758,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
D1 read references to a number from a different file indicating L2
D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@ -810,12 +822,12 @@ this case.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.L2" xreflabel="--L2">
<varlistentry id="opt.LL" xreflabel="--LL">
<term>
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
<para>Specify the size, associativity and line size of the level 2
<para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>
@ -903,9 +915,9 @@ this case.</para>
order). Default is to use all present in the
<filename>cachegrind.out.&lt;pid&gt;</filename> file (and
use the order in the file). Useful if you want to concentrate on, for
example, I cache misses (<option>--show=I1mr,I2mr</option>), or data
read misses (<option>--show=D1mr,D2mr</option>), or L2 data misses
(<option>--show=D2mr,D2mw</option>). Best used in conjunction with
example, I cache misses (<option>--show=I1mr,ILmr</option>), or data
read misses (<option>--show=D1mr,DLmr</option>), or LL data misses
(<option>--show=DLmr,DLmw</option>). Best used in conjunction with
<option>--sort</option>.</para>
</listitem>
</varlistentry>
@ -935,9 +947,9 @@ this case.</para>
events by appending any events for the
<option>--sort</option> option with a colon
and a number (no spaces, though). E.g. if you want to see
each function that covers more than 1% of L2 read misses or 1% of L2
each function that covers more than 1% of LL read misses or 1% of LL
write misses, use this option:</para>
<para><option>--sort=D2mr:1,D2mw:1</option></para>
<para><option>--sort=DLmr:1,DLmw:1</option></para>
</listitem>
</varlistentry>
@ -1059,13 +1071,13 @@ information, but they can still be very useful for identifying
bottlenecks.</para>
<para>
After that, we have found that L2 misses are typically a much bigger source
After that, we have found that LL misses are typically a much bigger source
of slow-downs than L1 misses. So it's worth looking for any snippets of
code with high <computeroutput>D2mr</computeroutput> or
<computeroutput>D2mw</computeroutput> counts. (You can use
<option>--show=D2mr
--sort=D2mr</option> with cg_annotate to focus just on
<literal>D2mr</literal> counts, for example.) If you find any, it's still
code with high <computeroutput>DLmr</computeroutput> or
<computeroutput>DLmw</computeroutput> counts. (You can use
<option>--show=DLmr
--sort=DLmr</option> with cg_annotate to focus just on
<literal>DLmr</literal> counts, for example.) If you find any, it's still
not always easy to work out how to improve things. You need to have a
reasonable understanding of how caches work, the principles of locality, and
your program's data access patterns. Improving things may require
@ -1153,12 +1165,12 @@ follows:</para>
</listitem>
<listitem>
<para>Inclusive L2 cache: the L2 cache typically replicates all
<para>Inclusive LL cache: the LL cache typically replicates all
the entries of the L1 caches, because fetching into L1 involves
fetching into L2 first (this does not guarantee strict inclusiveness,
as lines evicted from L2 still could reside in L1). This is
fetching into LL first (this does not guarantee strict inclusiveness,
as lines evicted from LL still could reside in L1). This is
standard on Pentium chips, but AMD Opterons, Athlons and Durons
use an exclusive L2 cache that only holds
use an exclusive LL cache that only holds
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
</listitem>
@ -1172,10 +1184,10 @@ early incarnation that doesn't give any cache information, then
Cachegrind will fall back to using a default configuration (that
of a model 3/4 Athlon). Cachegrind will tell you if this
happens. You can manually specify one, two or all three levels
(I1/D1/L2) of the cache from the command line using the
(I1/D1/LL) of the cache from the command line using the
<option>--I1</option>,
<option>--D1</option> and
<option>--L2</option> options.
<option>--LL</option> options.
For cache parameters to be valid for simulation, the number
of sets (with associativity being the number of cache lines in
each set) has to be a power of two.</para>
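For instance (an illustrative invocation; ./myprog stands in for the program under test), a machine whose caches cannot be auto-detected could be described by hand as:

  valgrind --tool=cachegrind --I1=32768,8,64 --D1=32768,8,64 --LL=8388608,16,64 ./myprog

Each cache here satisfies the power-of-two constraint on the number of sets: 32768 / (8 * 64) = 64 sets for each L1 cache and 8388608 / (16 * 64) = 8192 sets for the LL cache.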
@ -1186,7 +1198,7 @@ determine the cache configuration, so you will
need to specify it with the
<option>--I1</option>,
<option>--D1</option> and
<option>--L2</option> options.</para>
<option>--LL</option> options.</para>
<para>Other noteworthy behaviour:</para>

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -7,11 +7,11 @@ $dir/../../tests/filter_stderr_basic |
# Remove "Cachegrind, ..." line and the following copyright line.
sed "/^Cachegrind, a cache and branch-prediction profiler/ , /./ d" |
# Remove numbers from I/D/L2 "refs:" lines
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I/D/LL "refs:" lines
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove CPUID warnings lines for P4s and other machines
sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm cachegrind.out.*

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -414,7 +414,7 @@ for "Ir and "Dr".</para>
<para>This specifies various information for this dump. For some
types, the semantic is defined, but any description type is allowed.
Unknown types should be ignored.</para>
<para>There are the types "I1 cache", "D1 cache", "L2 cache", which
<para>There are the types "I1 cache", "D1 cache", "LL cache", which
specify parameters used for the cache simulator. These are the only
types originally used by Cachegrind. Additionally, Callgrind uses
the following types: "Timerange" gives a rough range of the basic
@ -457,7 +457,7 @@ for "Ir and "Dr".</para>
<para><command>I1mr</command>: Instruction Level 1 read cache miss</para>
</listitem>
<listitem>
<para><command>I2mr</command>: Instruction Level 2 read cache miss</para>
<para><command>ILmr</command>: Instruction last-level read cache miss</para>
</listitem>
<listitem>
<para>...</para>

View File

@ -933,9 +933,9 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
<para>Specify if you want to do full cache simulation. By default,
only instruction read accesses will be counted ("Ir").
With cache simulation, further event counters are enabled:
Cache misses on instruction reads ("I1mr"/"I2mr"),
data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
Cache misses on instruction reads ("I1mr"/"ILmr"),
data read accesses ("Dr") and related cache misses ("D1mr"/"DLmr"),
data write accesses ("Dw") and related cache misses ("D1mw"/"DLmw").
For more information, see <xref linkend="cg-manual"/>.
</para>
</listitem>
@ -972,13 +972,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</term>
<listitem>
<para>Specify whether write-back behavior should be simulated, allowing
to distinguish L2 caches misses with and without write backs.
to distinguish LL caches misses with and without write backs.
The cache model of Cachegrind/Callgrind does not specify write-through
vs. write-back behavior, and this also is not relevant for the number
of generated miss counts. However, with explicit write-back simulation
it can be decided whether a miss triggers not only the loading of a new
cache line, but also if a write back of a dirty cache line had to take
place before. The new dirty miss events are I2dmr, D2dmr, and D2dmw,
place before. The new dirty miss events are ILdmr, DLdmr, and DLdmw,
for misses because of instruction read, data read, and data write,
respectively. As they produce two memory transactions, they should
account for a doubled time estimation in relation to a normal miss.
@ -1016,13 +1016,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
bad access behavior). The new counters are defined in a way such
that worse behavior results in higher cost.
AcCost1 and AcCost2 are counters showing bad temporal locality
for L1 and L2 caches, respectively. This is done by summing up
for L1 and LL caches, respectively. This is done by summing up
reciprocal values of the numbers of accesses of each cache line,
multiplied by 1000 (as only integer costs are allowed). E.g. for
a given source line with 5 read accesses, a value of 5000 AcCost
means that for every access, a new cache line was loaded and directly
evicted afterwards without further accesses. Similarly, SpLoss1/2
shows bad spatial locality for L1 and L2 caches, respectively. It
shows bad spatial locality for L1 and LL caches, respectively. It
gives the <emphasis>spatial loss</emphasis> count of bytes which
were loaded into cache but never accessed. It pinpoints at code
accessing data in a way such that cache space is wasted. This hints
@ -1059,12 +1059,12 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.L2" xreflabel="--L2">
<varlistentry id="opt.LL" xreflabel="--LL">
<term>
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
<para>Specify the size, associativity and line size of the level 2
<para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>

View File

@ -91,7 +91,7 @@ typedef struct {
* States of flat caches in our model.
* We use a 2-level hierarchy,
*/
static cache_t2 I1, D1, L2;
static cache_t2 I1, D1, LL;
/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
@ -123,8 +123,8 @@ static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;
static Int off_LL_AcCost = 2;
static Int off_LL_SpLoss = 3;
/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
@ -135,7 +135,7 @@ typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
/* Result of a reference into a hierarchical cache model */
typedef enum {
L1_Hit,
L2_Hit,
LL_Hit,
MemAccess,
WriteBackMemAccess } CacheModelResult;
@ -231,7 +231,7 @@ static void print_cache(cache_t2* c)
/*------------------------------------------------------------*/
/*
* Simple model: L1 & L2 Write Through
* Simple model: L1 & LL Write Through
* Does not distinguish among read and write references
*
* Simulator functions:
@ -305,7 +305,7 @@ static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -313,7 +313,7 @@ static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -323,7 +323,7 @@ CacheModelResult cachesim_D1_ref(Addr a, UChar size)
/*------------------------------------------------------------*/
/*
* More complex model: L1 Write-through, L2 Write-back
* More complex model: L1 Write-through, LL Write-back
* This needs to distinguish among read and write references.
*
* Simulator functions:
@ -412,8 +412,8 @@ static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -424,8 +424,8 @@ static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -437,14 +437,14 @@ CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) {
/* Even for a L1 hit, the write-trough L1 passes
* the write to the L2 to make the L2 line dirty.
* the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
cachesim_ref_wb( &L2, Write, a, size);
cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -479,10 +479,10 @@ void prefetch_clear(void)
* One stream can be detected per 4k page.
*/
static __inline__
void prefetch_L2_doref(Addr a)
void prefetch_LL_doref(Addr a)
{
UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
UInt block = ( a >> L2.line_size_bits);
UInt block = ( a >> LL.line_size_bits);
if (block != pf_lastblock[stream]) {
if (pf_seqblocks[stream] == 0) {
@ -494,7 +494,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]++;
if (pf_seqblocks[stream] >= 2) {
prefetch_up++;
cachesim_ref(&L2, a + 5 * L2.line_size,1);
cachesim_ref(&LL, a + 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
@ -504,7 +504,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]--;
if (pf_seqblocks[stream] <= -2) {
prefetch_down++;
cachesim_ref(&L2, a - 5 * L2.line_size,1);
cachesim_ref(&LL, a - 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
@ -519,8 +519,8 @@ static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
prefetch_LL_doref(a);
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -528,8 +528,8 @@ static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
prefetch_LL_doref(a);
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -540,9 +540,9 @@ static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
prefetch_LL_doref(a);
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -553,9 +553,9 @@ static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
prefetch_LL_doref(a);
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -565,17 +565,17 @@ CacheModelResult prefetch_D1_Read(Addr a, UChar size)
static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
prefetch_L2_doref(a);
prefetch_LL_doref(a);
if ( cachesim_ref( &D1, a, size) == Hit ) {
/* Even for an L1 hit, the write-through L1 passes
* the write to the L2 to make the L2 line dirty.
* the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
cachesim_ref_wb( &L2, Write, a, size);
cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -736,7 +736,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
/* Second case: word straddles two lines. */ \
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask]; \
if (tag == (set[0] & L.tag_mask)) { \
@ -809,7 +809,7 @@ block2: \
idx = (set2 * L.assoc) + tmp_tag; \
miss2 = update_##L##_use(&L, idx, \
use_mask, (a+size-1) &~ L.line_size_mask); \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
\
} else { \
VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
@ -837,13 +837,13 @@ static __inline__ unsigned int countBits(unsigned int bits)
return c;
}
static void update_L2_use(int idx, Addr memline)
static void update_LL_use(int idx, Addr memline)
{
line_loaded* loaded = &(L2.loaded[idx]);
line_use* use = &(L2.use[idx]);
int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
line_loaded* loaded = &(LL.loaded[idx]);
line_use* use = &(LL.use[idx]);
int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
idx, CLG_(bb_base) + current_ii->instr_offset, memline);
if (use->count>0) {
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
@ -852,8 +852,8 @@ static void update_L2_use(int idx, Addr memline)
CLG_(current_state).collect, loaded->use_base);
if (CLG_(current_state).collect && loaded->use_base) {
(loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
(loaded->use_base)[off_L2_SpLoss] += i;
(loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
(loaded->use_base)[off_LL_SpLoss] += i;
}
}
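
The SpLoss term above is easiest to see with concrete numbers; a hypothetical eviction, assuming a 64-byte LL line:

/* The 32-bit use mask covers the line at line_size/32 = 2 bytes per
 * bit. If only 8 of the 32 bits were ever set when the line is
 * replaced:
 *
 *   i = ((32 - countBits(mask)) * LL.line_size) >> 5
 *     = ((32 - 8) * 64) / 32
 *     = 48 bytes loaded into the line but never touched.
 *
 * AcCost is charged as 1000 / use->count, so lines that are loaded
 * often but referenced rarely show up with a high access cost. */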
@ -868,53 +868,53 @@ static void update_L2_use(int idx, Addr memline)
}
static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
UWord* set = &(L2.tags[setNo * L2.assoc]);
UWord tag = memline & L2.tag_mask;
UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
UWord* set = &(LL.tags[setNo * LL.assoc]);
UWord tag = memline & LL.tag_mask;
int i, j, idx;
UWord tmp_tag;
CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
if (tag == (set[0] & L2.tag_mask)) {
idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
if (tag == (set[0] & LL.tag_mask)) {
idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
L2.use[idx].mask, L2.use[idx].count);
return L2_Hit;
idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
LL.use[idx].mask, LL.use[idx].count);
return LL_Hit;
}
for (i = 1; i < L2.assoc; i++) {
if (tag == (set[i] & L2.tag_mask)) {
for (i = 1; i < LL.assoc; i++) {
if (tag == (set[i] & LL.tag_mask)) {
tmp_tag = set[i];
for (j = i; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tmp_tag;
idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
L2.use[idx].mask, L2.use[idx].count);
return L2_Hit;
i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
LL.use[idx].mask, LL.use[idx].count);
return LL_Hit;
}
}
/* A miss; install this tag as MRU, shuffle rest down. */
tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
for (j = L2.assoc - 1; j > 0; j--) {
tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
for (j = LL.assoc - 1; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tag | tmp_tag;
idx = (setNo * L2.assoc) + tmp_tag;
l1_loaded->dep_use = &(L2.use[idx]);
idx = (setNo * LL.assoc) + tmp_tag;
l1_loaded->dep_use = &(LL.use[idx]);
update_L2_use(idx, memline);
update_LL_use(idx, memline);
return MemAccess;
}
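
The tag shuffling above implements LRU by keeping each set ordered most-recently-used first; a hypothetical 4-way set:

/* hit on tag C:     before [A, B, C, D]  ->  after [C, A, B, D]
 * miss (new tag N): D, the least-recently-used entry, is evicted
 *                   and N becomes the MRU entry:   [N, A, B, C]
 * (the low bits of the evicted slot, tmp_tag & ~LL.tag_mask, keep
 * the slot's index so the per-line use data stays attached) */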
@ -943,7 +943,7 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
(loaded->use_base)[off_##L##_SpLoss] += c; \
\
/* FIXME (?): L1/L2 line sizes must be equal ! */ \
/* FIXME (?): L1/LL line sizes must be equal ! */ \
loaded->dep_use->mask |= use->mask; \
loaded->dep_use->count += use->count; \
} \
@ -957,8 +957,8 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
CLG_(current_state).nonskipped->skipped : \
CLG_(cost_base) + current_ii->cost_offset; \
\
if (memline == 0) return L2_Hit; \
return cacheuse_L2_access(memline, loaded); \
if (memline == 0) return LL_Hit; \
return cacheuse_LL_access(memline, loaded); \
}
UPDATE_USE(I1);
@ -991,10 +991,10 @@ void cacheuse_finish(void)
if (D1.loaded[i].use_base)
update_D1_use( &D1, i, 0,0);
if (L2.use)
for (i = 0; i < L2.sets * L2.assoc; i++)
if (L2.loaded[i].use_base)
update_L2_use(i, 0);
if (LL.use)
for (i = 0; i < LL.sets * LL.assoc; i++)
if (LL.loaded[i].use_base)
update_LL_use(i, 0);
}
@ -1020,7 +1020,7 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
c2[2]++;
// fall through
case L2_Hit:
case LL_Hit:
c1[1]++;
c2[1]++;
// fall through
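
Since the hunk only shows the middle of the switch, here is a hedged standalone reconstruction of the whole fall-through accounting (counter layout assumed: index 0 = references, 1 = L1 misses, 2 = LL misses, 3 = LL misses needing a write-back):

#include <stdio.h>

typedef enum { L1_Hit, LL_Hit, MemAccess, WriteBackMemAccess } Result;

static void inc_costs(Result r, unsigned long* c)
{
    switch (r) {
    case WriteBackMemAccess:
        c[3]++;     /* the real tool only counts this with --simulate-wb=yes */
        /* fall through */
    case MemAccess:
        c[2]++;     /* LL miss */
        /* fall through */
    case LL_Hit:
        c[1]++;     /* L1 miss */
        /* fall through */
    default:
        c[0]++;     /* every result is one reference */
    }
}

int main(void)
{
    unsigned long c[4] = { 0, 0, 0, 0 };
    inc_costs(WriteBackMemAccess, c);   /* bumps all four counters  */
    inc_costs(L1_Hit, c);               /* bumps only the ref count */
    printf("refs=%lu l1m=%lu llm=%lu wb=%lu\n", c[0], c[1], c[2], c[3]);
    return 0;   /* prints: refs=2 l1m=1 llm=1 wb=1 */
}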
@ -1036,9 +1036,9 @@ Char* cacheRes(CacheModelResult r)
{
switch(r) {
case L1_Hit: return "L1 Hit ";
case L2_Hit: return "L2 Hit ";
case MemAccess: return "L2 Miss";
case WriteBackMemAccess: return "L2 Miss (dirty)";
case LL_Hit: return "LL Hit ";
case MemAccess: return "LL Miss";
case WriteBackMemAccess: return "LL Miss (dirty)";
default:
tl_assert(0);
}
@ -1268,7 +1268,7 @@ static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
@ -1308,7 +1308,7 @@ static Char* check_cache(cache_t* cache)
}
static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@ -1317,30 +1317,30 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
DEFINED(clo_L2_cache));
DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
checkRes = check_cache(L2c); tl_assert(!checkRes);
checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line.
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) > 1) {
VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines\n",
I1c->size, I1c->assoc, I1c->line_size);
VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines\n",
L2c->size, L2c->assoc, L2c->line_size);
VG_(umsg)("Cache configuration used:\n");
VG_(umsg)(" I1: %dB, %d-way, %dB lines\n",
I1c->size, I1c->assoc, I1c->line_size);
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@ -1350,7 +1350,7 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
static void cachesim_post_clo_init(void)
{
/* Cache configurations. */
cache_t I1c, D1c, L2c;
cache_t I1c, D1c, LLc;
/* Initialize access handlers */
if (!CLG_(clo).simulate_cache) {
@ -1374,15 +1374,15 @@ static void cachesim_post_clo_init(void)
}
/* Configuration of caches only needed with real cache simulation */
configure_caches(&I1c, &D1c, &L2c);
configure_caches(&I1c, &D1c, &LLc);
I1.name = "I1";
D1.name = "D1";
L2.name = "L2";
LL.name = "LL";
cachesim_initcache(I1c, &I1);
cachesim_initcache(D1c, &D1);
cachesim_initcache(L2c, &L2);
cachesim_initcache(LLc, &LL);
/* the other cache simulators use the standard helpers
* with dispatching via simulator struct */
@ -1463,7 +1463,7 @@ void cachesim_clear(void)
{
cachesim_clearcache(&I1);
cachesim_clearcache(&D1);
cachesim_clearcache(&L2);
cachesim_clearcache(&LL);
prefetch_clear();
}
@ -1474,7 +1474,7 @@ static void cachesim_getdesc(Char* buf)
Int p;
p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
}
static
@ -1490,11 +1490,12 @@ void cachesim_print_opts(void)
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
);
}
static void parse_opt ( cache_t* cache, char* opt, Char* optval )
static void parse_opt ( cache_t* cache,
char* opt, Char* optval, UChar kind )
{
Long i1, i2, i3;
Char* endptr;
@ -1550,11 +1551,12 @@ static Bool cachesim_parse_opt(Char* arg)
}
else if VG_STR_CLO(arg, "--I1", tmp_str)
parse_opt(&clo_I1_cache, arg, tmp_str);
parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
else if VG_STR_CLO(arg, "--D1", tmp_str)
parse_opt(&clo_D1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--L2", tmp_str)
parse_opt(&clo_L2_cache, arg, tmp_str);
parse_opt(&clo_D1_cache, arg, tmp_str, '1');
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
VG_STR_CLO(arg, "--LL", tmp_str))
parse_opt(&clo_LL_cache, arg, tmp_str, '2');
else
return False;
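
The net effect for users is that the old spelling keeps working. A minimal sketch of the alias handling (simplified; the real code goes through Valgrind's VG_STR_CLO machinery):

#include <stdio.h>
#include <string.h>

/* Both --LL= and the legacy --L2= fill the same last-level slot. */
static const char* cache_slot_for(const char* arg)
{
    if (strncmp(arg, "--I1=", 5) == 0) return "clo_I1_cache";
    if (strncmp(arg, "--D1=", 5) == 0) return "clo_D1_cache";
    if (strncmp(arg, "--LL=", 5) == 0 ||
        strncmp(arg, "--L2=", 5) == 0)  /* backwards compatibility */
        return "clo_LL_cache";
    return "unrecognized";
}

int main(void)
{
    printf("%s\n", cache_slot_for("--L2=3145728,12,64"));  /* clo_LL_cache */
    printf("%s\n", cache_slot_for("--LL=3145728,12,64"));  /* clo_LL_cache */
    return 0;
}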
@ -1613,8 +1615,8 @@ static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
FullCost total = CLG_(total_cost), D_total = 0;
ULong L2_total_m, L2_total_mr, L2_total_mw,
L2_total, L2_total_r, L2_total_w;
ULong LL_total_m, LL_total_mr, LL_total_mw,
LL_total, LL_total_r, LL_total_w;
char buf1[RESULTS_BUF_LEN],
buf2[RESULTS_BUF_LEN],
buf3[RESULTS_BUF_LEN];
@ -1632,7 +1634,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
commify(total[fullOffset(EG_IR) +2], l1, buf1);
VG_(message)(Vg_UserMsg, "L2i misses: %s\n", buf1);
VG_(message)(Vg_UserMsg, "LLi misses: %s\n", buf1);
p = 100;
@ -1645,7 +1647,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
percentify(total[fullOffset(EG_IR)+2] * 100 * p /
total[fullOffset(EG_IR)], p, l1+1, buf1);
VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
VG_(message)(Vg_UserMsg, "\n");
/* D cache results.
@ -1673,7 +1675,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
commify( D_total[2], l1, buf1);
commify(total[fullOffset(EG_DR)+2], l2, buf2);
commify(total[fullOffset(EG_DW)+2], l3, buf3);
VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)\n",
VG_(message)(Vg_UserMsg, "LLd misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
p = 10;
@ -1695,50 +1697,50 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
total[fullOffset(EG_DR)], p, l2+1, buf2);
percentify(total[fullOffset(EG_DW)+2] * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )\n",
VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
VG_(message)(Vg_UserMsg, "\n");
/* L2 overall results */
/* LL overall results */
L2_total =
LL_total =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_DW) +1] +
total[fullOffset(EG_IR) +1];
L2_total_r =
LL_total_r =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_IR) +1];
L2_total_w = total[fullOffset(EG_DW) +1];
commify(L2_total, l1, buf1);
commify(L2_total_r, l2, buf2);
commify(L2_total_w, l3, buf3);
VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)\n",
LL_total_w = total[fullOffset(EG_DW) +1];
commify(LL_total, l1, buf1);
commify(LL_total_r, l2, buf2);
commify(LL_total_w, l3, buf3);
VG_(message)(Vg_UserMsg, "LL refs: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
L2_total_m =
LL_total_m =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_DW) +2] +
total[fullOffset(EG_IR) +2];
L2_total_mr =
LL_total_mr =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_IR) +2];
L2_total_mw = total[fullOffset(EG_DW) +2];
commify(L2_total_m, l1, buf1);
commify(L2_total_mr, l2, buf2);
commify(L2_total_mw, l3, buf3);
VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)\n",
LL_total_mw = total[fullOffset(EG_DW) +2];
commify(LL_total_m, l1, buf1);
commify(LL_total_mr, l2, buf2);
commify(LL_total_mw, l3, buf3);
VG_(message)(Vg_UserMsg, "LL misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
percentify(L2_total_m * 100 * p /
percentify(LL_total_m * 100 * p /
(total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
percentify(L2_total_mr * 100 * p /
percentify(LL_total_mr * 100 * p /
(total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
p, l2+1, buf2);
percentify(L2_total_mw * 100 * p /
percentify(LL_total_mw * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )\n",
VG_(message)(Vg_UserMsg, "LL miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
}
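
Worth noting when reading these formulas: the overall LL miss rate is taken against all references (instruction plus data), not against LL refs. With hypothetical counts:

/*   I refs    = 900,000    D refs = 100,000
 *   LL misses =   5,000
 *
 *   LL miss rate = 5,000 / (900,000 + 100,000) = 0.5%
 *
 * which is why the LL rate is typically far below the L1 rates even
 * when a large share of L1 misses also miss in LL. */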
@ -1760,14 +1762,14 @@ void CLG_(init_eventsets)()
if (!CLG_(clo).simulate_cache)
CLG_(register_event_group)(EG_IR, "Ir");
else if (!clo_simulate_writeback) {
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
}
else { // clo_simulate_writeback
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
}
if (CLG_(clo).simulate_branch) {
@ -1807,12 +1809,12 @@ void CLG_(init_eventsets)()
CLG_(append_event)(CLG_(dumpmap), "I1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mw");
CLG_(append_event)(CLG_(dumpmap), "I2mr");
CLG_(append_event)(CLG_(dumpmap), "D2mr");
CLG_(append_event)(CLG_(dumpmap), "D2mw");
CLG_(append_event)(CLG_(dumpmap), "I2dmr");
CLG_(append_event)(CLG_(dumpmap), "D2dmr");
CLG_(append_event)(CLG_(dumpmap), "D2dmw");
CLG_(append_event)(CLG_(dumpmap), "ILmr");
CLG_(append_event)(CLG_(dumpmap), "DLmr");
CLG_(append_event)(CLG_(dumpmap), "DLmw");
CLG_(append_event)(CLG_(dumpmap), "ILdmr");
CLG_(append_event)(CLG_(dumpmap), "DLdmr");
CLG_(append_event)(CLG_(dumpmap), "DLdmw");
CLG_(append_event)(CLG_(dumpmap), "Bc");
CLG_(append_event)(CLG_(dumpmap), "Bcm");
CLG_(append_event)(CLG_(dumpmap), "Bi");

View File

@ -13,11 +13,11 @@ sed "/^For interactive control,.*$/d" |
# Remove numbers from "Collected" line
sed "s/^\(Collected *:\)[ 0-9]*$/\1/" |
# Remove numbers from I/D/L2 "refs:" lines
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I/D/LL "refs:" lines
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines
perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-hwpref=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --cacheuse=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-wb=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm callgrind.out.*

View File

@ -1,23 +1,23 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:
Branches:
Mispredicts:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate: