Change Cachegrind/Callgrind to talk about the LL (last-level) cache instead
of the L2 cache.  This is to accommodate machines with three levels of
cache.  We still only simulate two levels, the first and the last.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11404
Nicholas Nethercote 2010-10-06 22:46:31 +00:00
parent cb3fbb46d7
commit 60d9b410d4
34 changed files with 586 additions and 490 deletions
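The simulation itself stays two-level: a reference that misses in I1 or D1 falls through to the LL cache, and an LL miss counts as a memory access. A minimal standalone sketch of that control flow (hypothetical hit predicates and counts, not Valgrind's actual data structures; the real chaining lives in the CACHESIM macros of cg_sim.c, shown later in this diff):

#include <stdio.h>

typedef struct { unsigned long m1, mL; } Counters;

/* Stand-in hit predicates; the real simulator checks tag arrays. */
static int hits_L1(unsigned long addr) { return addr % 4 != 0; }
static int hits_LL(unsigned long addr) { return addr % 16 != 0; }

static void simulate_ref(unsigned long addr, Counters* c)
{
   if (hits_L1(addr)) return;   /* first-level hit: done */
   c->m1++;                     /* first-level miss: falls through to LL */
   if (hits_LL(addr)) return;   /* last-level hit */
   c->mL++;                     /* last-level miss: goes to memory */
}

int main(void)
{
   Counters c = { 0, 0 };
   unsigned long a;
   for (a = 0; a < 64; a++)
      simulate_ref(a, &c);
   printf("L1 misses: %lu, LL misses: %lu\n", c.m1, c.mL);   /* 16, 4 */
   return 0;
}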

NEWS
View File

@ -16,6 +16,20 @@ Improvements:
--threshold option has changed; this is unlikely to affect many people, if
you do use it please see the user manual for details.
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
In addition, it optionally can count the number of executed global bus events.
Both can be used for a better approximation of a "Cycle Estimation" as
derived event (you need to update the event formula in KCachegrind yourself).
- Cachegrind and Callgrind now refer to the LL (last-level) cache rather
than the L2 cache. This is to accommodate machines with three levels of
caches -- if Cachegrind/Callgrind auto-detects the cache configuration of
such a machine it will run the simulation as if the L2 cache isn't
present. This means the results are less likely to match the true result
for the machine, but Cachegrind/Callgrind's results are already only
approximate, and should not be considered authoritative. The results are
still useful for giving a general idea about a program's locality.
- Massif has a new option, --pages-as-heap, which is disabled by default.
When enabled, instead of tracking allocations at the level of heap blocks
(as allocated with malloc/new/new[]), it instead tracks memory allocations
@ -24,11 +38,6 @@ Improvements:
harder than the heap-level output, but this option is useful if you want
to account for every byte of memory used by a program.
- Callgrind now can do branch prediction simulation, similar to Cachegrind.
In addition, it optionally can count the number of executed global bus events.
Both can be used for a better approximation of a "Cycle Estimation" as
derived event (you need to update the event formula in KCachegrind yourself).
- Added new memcheck command-line option --show-possibly-lost.

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default (for Cortex-A8 ?)
*I1c = (cache_t) { 16384, 4, 64 };
*D1c = (cache_t) { 16384, 4, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
if (!all_caches_clo_defined) {
VG_(message)(Vg_DebugMsg,

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64

View File

@ -37,13 +37,13 @@
#include "cg_arch.h"
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64

View File

@ -54,9 +54,12 @@ static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
* array of pre-defined configurations for various parts of the memory
* hierarchy.
* According to Intel Processor Identification, App Note 485.
*
* If a L3 cache is found, then data for it rather than the L2
* is returned via *LLc.
*/
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int cpuid1_eax;
Int cpuid1_ignore;
@ -65,6 +68,14 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
UChar info[16];
Int i, trials;
Bool L2_found = False;
/* If we see L3 cache info, copy it into L3c. Then, at the end,
copy it into *LLc. Hence if a L3 cache is specified, *LLc will
eventually contain a description of it rather than the L2 cache.
The use of the L3c intermediary makes this process independent
of the order in which the cache specifications appear in
info[]. */
Bool L3_found = False;
cache_t L3c = { 0, 0, 0 };
if (level < 2) {
VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
@ -121,18 +132,39 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x90: case 0x96: case 0x9b:
VG_(tool_panic)("IA-64 cache detected?!");
case 0x22: case 0x23: case 0x25: case 0x29:
case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
case 0xe2: case 0xe3: case 0xe4: case 0xea: case 0xeb: case 0xec:
VG_(dmsg)("warning: L3 cache detected but ignored\n");
break;
/* L3 cache info. */
case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
case 0x25: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
case 0x29: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
case 0x46: L3c = (cache_t) { 4096, 4, 64 }; L3_found = True; break;
case 0x47: L3c = (cache_t) { 8192, 8, 64 }; L3_found = True; break;
case 0x4a: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
case 0x4b: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
case 0xd0: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
case 0xd1: L3c = (cache_t) { 1024, 4, 64 }; L3_found = True; break;
case 0xd2: L3c = (cache_t) { 2048, 4, 64 }; L3_found = True; break;
case 0xd6: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
case 0xd7: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
case 0xd8: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
case 0xdc: L3c = (cache_t) { 1536, 12, 64 }; L3_found = True; break;
case 0xdd: L3c = (cache_t) { 3072, 12, 64 }; L3_found = True; break;
case 0xde: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
case 0xe2: L3c = (cache_t) { 2048, 16, 64 }; L3_found = True; break;
case 0xe3: L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; break;
case 0xe4: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;
/* Described as "MLC" in Intel documentation */
case 0x21: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break;
case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break;
case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break;
case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break;
/* If a P6 core, this means "no L2 cache".
If a P4 core, this means "no L3 cache".
@ -141,20 +173,21 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x40:
break;
case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break;
case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break;
case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
case 0x41: *LLc = (cache_t) { 128, 4, 32 }; L2_found = True; break;
case 0x42: *LLc = (cache_t) { 256, 4, 32 }; L2_found = True; break;
case 0x43: *LLc = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *LLc = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *LLc = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
case 0x49:
if ((family == 15) && (model == 6))
/* On Xeon MP (family F, model 6), this is for L3 */
VG_(dmsg)("warning: L3 cache detected but ignored\n");
else
*L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
break;
case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
if (family == 15 && model == 6) {
/* On Xeon MP (family F, model 6), this is for L3 */
L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
} else {
*LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
}
break;
/* These are sectored, whatever that means */
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
@ -181,26 +214,24 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
break;
/* not sectored, whatever that might mean */
case 0x78: *L2c = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break;
case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break;
case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break;
case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break;
case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break;
case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break;
case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break;
case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x79: *LLc = (cache_t) { 128, 8, 64 }; L2_found = True; break;
case 0x7a: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
case 0x7b: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x7c: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
case 0x7d: *LLc = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
case 0x7e: *LLc = (cache_t) { 256, 8, 128 }; L2_found = True; break;
case 0x7f: *LLc = (cache_t) { 512, 2, 64 }; L2_found = True; break;
case 0x80: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
case 0x81: *LLc = (cache_t) { 128, 8, 32 }; L2_found = True; break;
case 0x82: *LLc = (cache_t) { 256, 8, 32 }; L2_found = True; break;
case 0x83: *LLc = (cache_t) { 512, 8, 32 }; L2_found = True; break;
case 0x84: *LLc = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
case 0x85: *LLc = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
case 0x86: *LLc = (cache_t) { 512, 4, 64 }; L2_found = True; break;
case 0x87: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
/* Ignore prefetch information */
case 0xf0: case 0xf1:
@ -213,8 +244,15 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
}
/* If we found a L3 cache, throw away the L2 data and use the L3's instead. */
if (L3_found) {
VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
*LLc = L3c;
L2_found = True;
}
if (!L2_found)
VG_(dmsg)("warning: L2 cache not installed, ignore L2 results.\n");
VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
return 0;
}
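A minimal sketch (an editorial illustration, using two descriptor codes taken from the tables above) of why staging the L3 data in the local L3c makes the outcome independent of the order in which descriptors appear in info[]: *LLc is only overwritten once the whole array has been scanned.

#include <stdio.h>

typedef struct { int size, assoc, line_size; } cache_t;

static void scan_descriptors(const unsigned char* info, int n, cache_t* LLc)
{
   cache_t L3c = { 0, 0, 0 };
   int L3_found = 0;
   int i;
   for (i = 0; i < n; i++) {
      switch (info[i]) {
         case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; break;               /* an L2 code */
         case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = 1; break; /* an L3 code */
         default: break;
      }
   }
   if (L3_found)
      *LLc = L3c;   /* L3 wins whether 0x23 came before or after 0x3c */
}

int main(void)
{
   cache_t a, b;
   unsigned char order1[] = { 0x3c, 0x23 }, order2[] = { 0x23, 0x3c };
   scan_descriptors(order1, 2, &a);
   scan_descriptors(order2, 2, &b);
   printf("%d KB == %d KB\n", a.size, b.size);   /* 1024 == 1024 */
   return 0;
}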
@ -241,14 +279,37 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
* 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
* so we detect that.
*
* Returns 0 on success, non-zero on failure.
* Returns 0 on success, non-zero on failure. As with the Intel code
* above, if a L3 cache is found, then data for it rather than the L2
* is returned via *LLc.
*/
/* A small helper */
static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
{
/* Decode a L2/L3 associativity indication. It is encoded
differently from the I1/D1 associativity. Returns 1
(direct-map) as a safe but suboptimal result for unknown
encodings. */
switch (bits_15_12 & 0xF) {
case 1: return 1; case 2: return 2;
case 4: return 4; case 6: return 8;
case 8: return 16; case 0xA: return 32;
case 0xB: return 48; case 0xC: return 64;
case 0xD: return 96; case 0xE: return 128;
case 0xF: /* fully associative */
case 0: /* L2/L3 cache or TLB is disabled */
default:
return 1;
}
}
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
UInt ext_level;
UInt dummy, model;
UInt I1i, D1i, L2i;
UInt I1i, D1i, L2i, L3i;
VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
@ -259,7 +320,7 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);
VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
@ -277,15 +338,26 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->assoc = (I1i >> 16) & 0xff;
I1c->line_size = (I1i >> 0) & 0xff;
L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
L2c->assoc = (L2i >> 12) & 0xf;
L2c->line_size = (L2i >> 0) & 0xff;
LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
LLc->line_size = (L2i >> 0) & 0xff;
if (((L3i >> 18) & 0x3fff) > 0) {
/* There's an L3 cache. Replace *LLc contents with this info. */
/* NB: the test in the if is "if L3 size > 0 ". I don't know if
this is the right way to test presence-vs-absence of L3. I
can't see any guidance on this in the AMD documentation. */
LLc->size = ((L3i >> 18) & 0x3fff) * 512;
LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
LLc->line_size = (L3i >> 0) & 0xff;
VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
}
return 0;
}
static
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int level, ret;
Char vendor_id[13];
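As a worked check of the AMD bit decoding above (an illustrative register value, not read from real hardware): the L3 size sits in EDX bits 31:18 in 512 KB units, the associativity code in bits 15:12, and the line size in bits 7:0.

#include <stdio.h>

int main(void)
{
   /* Pretend CPUID(0x80000006) returned EDX describing a 6 MB,
      16-way (code 0x8 in the L2/L3 encoding), 64 B line L3:
      (12 << 18) | (0x8 << 12) | 64. */
   unsigned int L3i = (12u << 18) | (0x8u << 12) | 64u;

   unsigned int size_kb    = ((L3i >> 18) & 0x3fff) * 512;   /* 6144 KB */
   unsigned int assoc_code = (L3i >> 12) & 0xf;   /* 0x8 -> 16-way per the helper above */
   unsigned int line_size  = L3i & 0xff;          /* 64 B */

   printf("L3: %u KB, assoc code 0x%x, %u B lines\n",
          size_kb, assoc_code, line_size);
   return 0;
}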
@ -306,10 +378,10 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
ret = Intel_cache_info(level, I1c, D1c, L2c);
ret = Intel_cache_info(level, I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
ret = AMD_cache_info(I1c, D1c, L2c);
ret = AMD_cache_info(I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
/* Total kludge. Pretend to be a VIA Nehemiah. */
@ -319,9 +391,9 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size = 64;
I1c->assoc = 4;
I1c->line_size = 16;
L2c->size = 64;
L2c->assoc = 16;
L2c->line_size = 16;
LLc->size = 64;
LLc->assoc = 16;
LLc->line_size = 16;
ret = 0;
} else {
@ -332,13 +404,13 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Successful! Convert sizes from KB to bytes */
I1c->size *= 1024;
D1c->size *= 1024;
L2c->size *= 1024;
LLc->size *= 1024;
return ret;
}
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
Int res;
@ -346,10 +418,10 @@ void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
*L2c = (cache_t) { 262144, 8, 64 };
*LLc = (cache_t) { 262144, 8, 64 };
// Then replace with any info we can get from CPUID.
res = get_caches_from_CPUID(I1c, D1c, L2c);
res = get_caches_from_CPUID(I1c, D1c, LLc);
// Warn if CPUID failed and config not completely specified from cmd line.
if (res != 0 && !all_caches_clo_defined) {

View File

@ -33,14 +33,14 @@
// For cache simulation
typedef struct {
int size; // bytes
int assoc;
int line_size; // bytes
Int size; // bytes
Int assoc;
Int line_size; // bytes
} cache_t;
// Gives the configuration of I1, D1 and L2 caches. They get overridden
// Gives the configuration of I1, D1 and LL caches. They get overridden
// by any cache configurations specified on the command line.
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined);
#endif // __CG_ARCH_H

View File

@ -77,7 +77,7 @@ typedef
struct {
ULong a; /* total # memory accesses of this kind */
ULong m1; /* misses in the first level cache */
ULong m2; /* misses in the second level cache */
ULong mL; /* misses in the last-level (LL) cache */
}
CacheCC;
@ -268,13 +268,13 @@ static LineCC* get_lineCC(Addr origAddr)
lineCC->loc.line = loc.line;
lineCC->Ir.a = 0;
lineCC->Ir.m1 = 0;
lineCC->Ir.m2 = 0;
lineCC->Ir.mL = 0;
lineCC->Dr.a = 0;
lineCC->Dr.m1 = 0;
lineCC->Dr.m2 = 0;
lineCC->Dr.mL = 0;
lineCC->Dw.a = 0;
lineCC->Dw.m1 = 0;
lineCC->Dw.m2 = 0;
lineCC->Dw.mL = 0;
lineCC->Bc.b = 0;
lineCC->Bc.mp = 0;
lineCC->Bi.b = 0;
@ -319,7 +319,7 @@ void log_1I_0D_cache_access(InstrInfo* n)
//VG_(printf)("1I_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
// n, n->instr_addr, n->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
}
@ -331,10 +331,10 @@ void log_2I_0D_cache_access(InstrInfo* n, InstrInfo* n2)
// n, n->instr_addr, n->instr_len,
// n2, n2->instr_addr, n2->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
}
@ -348,13 +348,13 @@ void log_3I_0D_cache_access(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
// n2, n2->instr_addr, n2->instr_len,
// n3, n3->instr_addr, n3->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
&n2->parent->Ir.m1, &n2->parent->Ir.m2);
&n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
cachesim_I1_doref(n3->instr_addr, n3->instr_len,
&n3->parent->Ir.m1, &n3->parent->Ir.m2);
&n3->parent->Ir.m1, &n3->parent->Ir.mL);
n3->parent->Ir.a++;
}
@ -365,11 +365,11 @@ void log_1I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dr.m1, &n->parent->Dr.m2);
&n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@ -380,11 +380,11 @@ void log_1I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
&n->parent->Ir.m1, &n->parent->Ir.m2);
&n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dw.m1, &n->parent->Dw.m2);
&n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@ -394,7 +394,7 @@ void log_0I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dr: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dr.m1, &n->parent->Dr.m2);
&n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@ -404,7 +404,7 @@ void log_0I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dw: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
&n->parent->Dw.m1, &n->parent->Dw.m2);
&n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@ -1234,7 +1234,7 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
// string otherwise.
@ -1273,7 +1273,7 @@ static Char* check_cache(cache_t* cache)
}
static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@ -1283,22 +1283,22 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
DEFINED(clo_L2_cache));
DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
checkRes = check_cache(L2c); tl_assert(!checkRes);
checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line. (Already checked in
// parse_cache_opt().)
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) >= 2) {
VG_(umsg)("Cache configuration used:\n");
@ -1306,8 +1306,8 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size, I1c->assoc, I1c->line_size);
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(umsg)(" L2: %dB, %d-way, %dB lines\n",
L2c->size, L2c->assoc, L2c->line_size);
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@ -1354,12 +1354,12 @@ static void fprint_CC_table_and_calc_totals(void)
VG_(free)(cachegrind_out_file);
}
// "desc:" lines (giving I1/D1/L2 cache configuration). The spaces after
// "desc:" lines (giving I1/D1/LL cache configuration). The spaces after
// the 2nd colon makes cg_annotate's output look nicer.
VG_(sprintf)(buf, "desc: I1 cache: %s\n"
"desc: D1 cache: %s\n"
"desc: L2 cache: %s\n",
I1.desc_line, D1.desc_line, L2.desc_line);
"desc: LL cache: %s\n",
I1.desc_line, D1.desc_line, LL.desc_line);
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
// "cmd:" line
@ -1379,11 +1379,11 @@ static void fprint_CC_table_and_calc_totals(void)
}
// "events:" line
if (clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"Bc Bcm Bi Bim\n");
}
else if (clo_cache_sim && !clo_branch_sim) {
VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"\n");
}
else if (!clo_cache_sim && clo_branch_sim) {
@ -1430,9 +1430,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
lineCC->loc.line,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
lineCC->Bc.b, lineCC->Bc.mp,
lineCC->Bi.b, lineCC->Bi.mp);
}
@ -1441,9 +1441,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu\n",
lineCC->loc.line,
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "%u %llu"
@ -1464,13 +1464,13 @@ static void fprint_CC_table_and_calc_totals(void)
// Update summary stats
Ir_total.a += lineCC->Ir.a;
Ir_total.m1 += lineCC->Ir.m1;
Ir_total.m2 += lineCC->Ir.m2;
Ir_total.mL += lineCC->Ir.mL;
Dr_total.a += lineCC->Dr.a;
Dr_total.m1 += lineCC->Dr.m1;
Dr_total.m2 += lineCC->Dr.m2;
Dr_total.mL += lineCC->Dr.mL;
Dw_total.a += lineCC->Dw.a;
Dw_total.m1 += lineCC->Dw.m1;
Dw_total.m2 += lineCC->Dw.m2;
Dw_total.mL += lineCC->Dw.mL;
Bc_total.b += lineCC->Bc.b;
Bc_total.mp += lineCC->Bc.mp;
Bi_total.b += lineCC->Bi.b;
@ -1487,9 +1487,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
Ir_total.a, Ir_total.m1, Ir_total.m2,
Dr_total.a, Dr_total.m1, Dr_total.m2,
Dw_total.a, Dw_total.m1, Dw_total.m2,
Ir_total.a, Ir_total.m1, Ir_total.mL,
Dr_total.a, Dr_total.m1, Dr_total.mL,
Dw_total.a, Dw_total.m1, Dw_total.mL,
Bc_total.b, Bc_total.mp,
Bi_total.b, Bi_total.mp);
}
@ -1498,9 +1498,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu\n",
Ir_total.a, Ir_total.m1, Ir_total.m2,
Dr_total.a, Dr_total.m1, Dr_total.m2,
Dw_total.a, Dw_total.m1, Dw_total.m2);
Ir_total.a, Ir_total.m1, Ir_total.mL,
Dr_total.a, Dr_total.m1, Dr_total.mL,
Dw_total.a, Dw_total.m1, Dw_total.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "summary:"
@ -1537,8 +1537,8 @@ static void cg_fini(Int exitcode)
CacheCC D_total;
BranchCC B_total;
ULong L2_total_m, L2_total_mr, L2_total_mw,
L2_total, L2_total_r, L2_total_w;
ULong LL_total_m, LL_total_mr, LL_total_mw,
LL_total, LL_total_r, LL_total_w;
Int l1, l2, l3;
fprint_CC_table_and_calc_totals();
@ -1565,21 +1565,21 @@ static void cg_fini(Int exitcode)
miss numbers */
if (clo_cache_sim) {
VG_(umsg)(fmt, "I1 misses: ", Ir_total.m1);
VG_(umsg)(fmt, "L2i misses: ", Ir_total.m2);
VG_(umsg)(fmt, "LLi misses: ", Ir_total.mL);
if (0 == Ir_total.a) Ir_total.a = 1;
VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("I1 miss rate: %s\n", buf1);
VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("L2i miss rate: %s\n", buf1);
VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("LLi miss rate: %s\n", buf1);
VG_(umsg)("\n");
/* D cache results. Use the D_refs.rd and D_refs.wr values to
* determine the width of columns 2 & 3. */
D_total.a = Dr_total.a + Dw_total.a;
D_total.m1 = Dr_total.m1 + Dw_total.m1;
D_total.m2 = Dr_total.m2 + Dw_total.m2;
D_total.mL = Dr_total.mL + Dw_total.mL;
/* Make format string, getting width right for numbers */
VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)\n",
@ -1589,8 +1589,8 @@ static void cg_fini(Int exitcode)
D_total.a, Dr_total.a, Dw_total.a);
VG_(umsg)(fmt, "D1 misses: ",
D_total.m1, Dr_total.m1, Dw_total.m1);
VG_(umsg)(fmt, "L2d misses: ",
D_total.m2, Dr_total.m2, Dw_total.m2);
VG_(umsg)(fmt, "LLd misses: ",
D_total.mL, Dr_total.mL, Dw_total.mL);
if (0 == D_total.a) D_total.a = 1;
if (0 == Dr_total.a) Dr_total.a = 1;
@ -1600,30 +1600,30 @@ static void cg_fini(Int exitcode)
VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("D1 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("L2d miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)( D_total.mL, D_total.a, 1, l1+1, buf1);
VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("LLd miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(umsg)("\n");
/* L2 overall results */
/* LL overall results */
L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
L2_total_r = Dr_total.m1 + Ir_total.m1;
L2_total_w = Dw_total.m1;
VG_(umsg)(fmt, "L2 refs: ",
L2_total, L2_total_r, L2_total_w);
LL_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
LL_total_r = Dr_total.m1 + Ir_total.m1;
LL_total_w = Dw_total.m1;
VG_(umsg)(fmt, "LL refs: ",
LL_total, LL_total_r, LL_total_w);
L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
L2_total_mr = Dr_total.m2 + Ir_total.m2;
L2_total_mw = Dw_total.m2;
VG_(umsg)(fmt, "L2 misses: ",
L2_total_m, L2_total_mr, L2_total_mw);
LL_total_m = Dr_total.mL + Dw_total.mL + Ir_total.mL;
LL_total_mr = Dr_total.mL + Ir_total.mL;
LL_total_mw = Dw_total.mL;
VG_(umsg)(fmt, "LL misses: ",
LL_total_m, LL_total_mr, LL_total_mw);
VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("L2 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(percentify)(LL_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
VG_(percentify)(LL_total_mw, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("LL miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
}
/* If branch profiling is enabled, show branch overall results. */
@ -1760,8 +1760,9 @@ static Bool cg_process_cmd_line_option(Char* arg)
parse_cache_opt(&clo_I1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--D1", tmp_str)
parse_cache_opt(&clo_D1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--L2", tmp_str)
parse_cache_opt(&clo_L2_cache, arg, tmp_str);
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
VG_STR_CLO(arg, "--LL", tmp_str))
parse_cache_opt(&clo_LL_cache, arg, tmp_str);
else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
else if VG_BOOL_CLO(arg, "--cache-sim", clo_cache_sim) {}
@ -1777,7 +1778,7 @@ static void cg_print_usage(void)
VG_(printf)(
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
" --cache-sim=yes|no [yes] collect cache stats?\n"
" --branch-sim=yes|no [no] collect branch prediction stats?\n"
" --cachegrind-out-file=<file> output file name [cachegrind.out.%%p]\n"
@ -1819,7 +1820,7 @@ static void cg_pre_clo_init(void)
static void cg_post_clo_init(void)
{
cache_t I1c, D1c, L2c;
cache_t I1c, D1c, LLc;
CC_table =
VG_(OSetGen_Create)(offsetof(LineCC, loc),
@ -1837,11 +1838,11 @@ static void cg_post_clo_init(void)
VG_(malloc), "cg.main.cpci.3",
VG_(free));
configure_caches(&I1c, &D1c, &L2c);
configure_caches(&I1c, &D1c, &LLc);
cachesim_I1_initcache(I1c);
cachesim_D1_initcache(D1c);
cachesim_L2_initcache(L2c);
cachesim_LL_initcache(LLc);
}
VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)

View File

@ -96,7 +96,7 @@ static void cachesim_##L##_initcache(cache_t config) \
/* bigger than its usual limit. Inlining gains around 5--10% speedup. */ \
__attribute__((always_inline)) \
static __inline__ \
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL) \
{ \
UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
@ -188,9 +188,9 @@ miss_treatment: \
return; \
}
CACHESIM(L2, (*m2)++ );
CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
CACHESIM(LL, (*mL)++ );
CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
/*--------------------------------------------------------------------*/
/*--- end cg_sim.c ---*/

View File

@ -16,33 +16,45 @@ Valgrind command line.</para>
<para>Cachegrind simulates how your program interacts with a machine's cache
hierarchy and (optionally) branch predictor. It simulates a machine with
independent first level instruction and data caches (I1 and D1), backed by a
unified second level cache (L2). This configuration is used by almost all
modern machines.</para>
independent first-level instruction and data caches (I1 and D1), backed by a
unified second-level cache (L2). This exactly matches the configuration of
many modern machines.</para>
<para>However, some modern machines have three levels of cache. For these
machines (in the cases where Cachegrind can auto-detect the cache
configuration) Cachegrind simulates the first-level and third-level caches.
The reason for this choice is that the L3 cache has the most influence on
runtime, as it masks accesses to main memory. Furthermore, the L1 caches
often have low associativity, so simulating them can detect cases where the
code interacts badly with this cache (eg. traversing a matrix column-wise
with the row length being a power of 2).</para>
<para>Therefore, Cachegrind always refers to the I1, D1 and LL (last-level)
caches.</para>
<para>
It gathers the following statistics (abbreviations used for each statistic
Cachegrind gathers the following statistics (abbreviations used for each statistic
is given in parentheses):</para>
<itemizedlist>
<listitem>
<para>I cache reads (<computeroutput>Ir</computeroutput>,
which equals the number of instructions executed),
I1 cache read misses (<computeroutput>I1mr</computeroutput>) and
L2 cache instruction read misses (<computeroutput>I1mr</computeroutput>).
LL cache instruction read misses (<computeroutput>ILmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache reads (<computeroutput>Dr</computeroutput>, which
equals the number of memory reads),
D1 cache read misses (<computeroutput>D1mr</computeroutput>), and
L2 cache data read misses (<computeroutput>D2mr</computeroutput>).
LL cache data read misses (<computeroutput>DLmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache writes (<computeroutput>Dw</computeroutput>, which equals
the number of memory writes),
D1 cache write misses (<computeroutput>D1mw</computeroutput>), and
L2 cache data write misses (<computeroutput>D2mw</computeroutput>).
LL cache data write misses (<computeroutput>DLmw</computeroutput>).
</para>
</listitem>
<listitem>
@ -59,10 +71,10 @@ is given in parentheses):</para>
<para>Note that D1 total accesses is given by
<computeroutput>D1mr</computeroutput> +
<computeroutput>D1mw</computeroutput>, and that L2 total
accesses is given by <computeroutput>I2mr</computeroutput> +
<computeroutput>D2mr</computeroutput> +
<computeroutput>D2mw</computeroutput>.
<computeroutput>D1mw</computeroutput>, and that LL total
accesses is given by <computeroutput>ILmr</computeroutput> +
<computeroutput>DLmr</computeroutput> +
<computeroutput>DLmw</computeroutput>.
</para>
<para>These statistics are presented for the entire program and for each
@ -70,7 +82,7 @@ function in the program. You can also annotate each line of source code in
the program with the counts that were caused directly by it.</para>
<para>On a modern machine, an L1 miss will typically cost
around 10 cycles, an L2 miss can cost as much as 200
around 10 cycles, an LL miss can cost as much as 200
cycles, and a mispredicted branch costs in the region of 10
to 30 cycles. Detailed cache and branch profiling can be very useful
for understanding how your program interacts with the machine and thus how
@ -118,24 +130,24 @@ summary statistics that look like this will be printed:</para>
<programlisting><![CDATA[
==31751== I refs: 27,742,716
==31751== I1 misses: 276
==31751== L2i misses: 275
==31751== LLi misses: 275
==31751== I1 miss rate: 0.0%
==31751== L2i miss rate: 0.0%
==31751== LLi miss rate: 0.0%
==31751==
==31751== D refs: 15,430,290 (10,955,517 rd + 4,474,773 wr)
==31751== D1 misses: 41,185 ( 21,905 rd + 19,280 wr)
==31751== L2d misses: 23,085 ( 3,987 rd + 19,098 wr)
==31751== LLd misses: 23,085 ( 3,987 rd + 19,098 wr)
==31751== D1 miss rate: 0.2% ( 0.1% + 0.4%)
==31751== L2d miss rate: 0.1% ( 0.0% + 0.4%)
==31751== LLd miss rate: 0.1% ( 0.0% + 0.4%)
==31751==
==31751== L2 misses: 23,360 ( 4,262 rd + 19,098 wr)
==31751== L2 miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
==31751== LL misses: 23,360 ( 4,262 rd + 19,098 wr)
==31751== LL miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
<para>Cache accesses for instruction fetches are summarised
first, giving the number of fetches made (this is the number of
instructions executed, which can be useful to know in its own
right), the number of I1 misses, and the number of L2 instruction
(<computeroutput>L2i</computeroutput>) misses.</para>
right), the number of I1 misses, and the number of LL instruction
(<computeroutput>LLi</computeroutput>) misses.</para>
<para>Cache accesses for data follow. The information is similar
to that of the instruction fetches, except that the values are
@ -144,12 +156,12 @@ also shown split between reads and writes (note each row's
<computeroutput>wr</computeroutput> values add up to the row's
total).</para>
<para>Combined instruction and data figures for the L2 cache
follow that. Note that the L2 miss rate is computed relative to the total
<para>Combined instruction and data figures for the LL cache
follow that. Note that the LL miss rate is computed relative to the total
number of memory accesses, not the number of L1 misses. I.e. it is
<computeroutput>(I2mr + D2mr + D2mw) / (Ir + Dr + Dw)</computeroutput>
<computeroutput>(ILmr + DLmr + DLmw) / (Ir + Dr + Dw)</computeroutput>
not
<computeroutput>(I2mr + D2mr + D2mw) / (I1mr + D1mr + D1mw)</computeroutput>
<computeroutput>(ILmr + DLmr + DLmw) / (I1mr + D1mr + D1mw)</computeroutput>
</para>
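As a worked check using the sample run above: LL misses are 23,360 against Ir + Dr + Dw = 27,742,716 + 10,955,517 + 4,474,773 = 43,173,006 accesses, i.e. roughly 0.05%, which the one-decimal summary prints as 0.0%. Dividing by the L1 misses instead (276 + 21,905 + 19,280 = 41,461) would give about 56%, which is why the choice of denominator is spelled out here.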
<para>Branch prediction statistics are not collected by default.
@ -208,11 +220,11 @@ wide if possible, as the output lines can be quite long.</para>
--------------------------------------------------------------------------------
I1 cache: 65536 B, 64 B, 2-way associative
D1 cache: 65536 B, 64 B, 2-way associative
L2 cache: 262144 B, 64 B, 8-way associative
LL cache: 262144 B, 64 B, 8-way associative
Command: concord vg_to_ucode.c
Events recorded: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Events shown: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Event sort order: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Threshold: 99%
Chosen for annotation:
Auto-annotation: off
@ -224,7 +236,7 @@ Auto-annotation: off
<itemizedlist>
<listitem>
<para>I1 cache, D1 cache, L2 cache: cache configuration. So
<para>I1 cache, D1 cache, LL cache: cache configuration. So
you know the configuration with which these results were
obtained.</para>
</listitem>
@ -300,7 +312,7 @@ program:</para>
<programlisting><![CDATA[
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
--------------------------------------------------------------------------------
27,742,716 276 275 10,955,517 21,905 3,987 4,474,773 19,280 19,098 PROGRAM TOTALS]]></programlisting>
@ -312,7 +324,7 @@ These are similar to the summary provided when Cachegrind finishes running.
<programlisting><![CDATA[
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw file:function
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw file:function
--------------------------------------------------------------------------------
8,821,482 5 5 2,242,702 1,621 73 1,794,230 0 0 getc.c:_IO_getc
5,222,023 4 4 2,276,334 16 12 875,959 1 1 concord.c:get_word
@ -367,7 +379,7 @@ produces the same output as above followed by an annotated version of
--------------------------------------------------------------------------------
-- User-annotated source: concord.c
--------------------------------------------------------------------------------
Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
. . . . . . . . . void init_hash_table(char *file_name, Word_Node *table[])
3 1 1 . . . 1 0 0 {
@ -687,7 +699,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
D1 read references to a number from a different file indicating L2
D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@ -746,7 +758,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
D1 read references to a number from a different file indicating L2
D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@ -810,12 +822,12 @@ this case.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.L2" xreflabel="--L2">
<varlistentry id="opt.LL" xreflabel="--LL">
<term>
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
<para>Specify the size, associativity and line size of the level 2
<para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>
@ -903,9 +915,9 @@ this case.</para>
order). Default is to use all present in the
<filename>cachegrind.out.&lt;pid&gt;</filename> file (and
use the order in the file). Useful if you want to concentrate on, for
example, I cache misses (<option>--show=I1mr,I2mr</option>), or data
read misses (<option>--show=D1mr,D2mr</option>), or L2 data misses
(<option>--show=D2mr,D2mw</option>). Best used in conjunction with
example, I cache misses (<option>--show=I1mr,ILmr</option>), or data
read misses (<option>--show=D1mr,DLmr</option>), or LL data misses
(<option>--show=DLmr,DLmw</option>). Best used in conjunction with
<option>--sort</option>.</para>
</listitem>
</varlistentry>
@ -935,9 +947,9 @@ this case.</para>
events by appending any events for the
<option>--sort</option> option with a colon
and a number (no spaces, though). E.g. if you want to see
each function that covers more than 1% of L2 read misses or 1% of L2
each function that covers more than 1% of LL read misses or 1% of LL
write misses, use this option:</para>
<para><option>--sort=D2mr:1,D2mw:1</option></para>
<para><option>--sort=DLmr:1,DLmw:1</option></para>
</listitem>
</varlistentry>
@ -1059,13 +1071,13 @@ information, but they can still be very useful for identifying
bottlenecks.</para>
<para>
After that, we have found that L2 misses are typically a much bigger source
After that, we have found that LL misses are typically a much bigger source
of slow-downs than L1 misses. So it's worth looking for any snippets of
code with high <computeroutput>D2mr</computeroutput> or
<computeroutput>D2mw</computeroutput> counts. (You can use
<option>--show=D2mr
--sort=D2mr</option> with cg_annotate to focus just on
<literal>D2mr</literal> counts, for example.) If you find any, it's still
code with high <computeroutput>DLmr</computeroutput> or
<computeroutput>DLmw</computeroutput> counts. (You can use
<option>--show=DLmr
--sort=DLmr</option> with cg_annotate to focus just on
<literal>DLmr</literal> counts, for example.) If you find any, it's still
not always easy to work out how to improve things. You need to have a
reasonable understanding of how caches work, the principles of locality, and
your program's data access patterns. Improving things may require
@ -1153,12 +1165,12 @@ follows:</para>
</listitem>
<listitem>
<para>Inclusive L2 cache: the L2 cache typically replicates all
<para>Inclusive LL cache: the LL cache typically replicates all
the entries of the L1 caches, because fetching into L1 involves
fetching into L2 first (this does not guarantee strict inclusiveness,
as lines evicted from L2 still could reside in L1). This is
fetching into LL first (this does not guarantee strict inclusiveness,
as lines evicted from LL still could reside in L1). This is
standard on Pentium chips, but AMD Opterons, Athlons and Durons
use an exclusive L2 cache that only holds
use an exclusive LL cache that only holds
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
</listitem>
@ -1172,10 +1184,10 @@ early incarnation that doesn't give any cache information, then
Cachegrind will fall back to using a default configuration (that
of a model 3/4 Athlon). Cachegrind will tell you if this
happens. You can manually specify one, two or all three levels
(I1/D1/L2) of the cache from the command line using the
(I1/D1/LL) of the cache from the command line using the
<option>--I1</option>,
<option>--D1</option> and
<option>--L2</option> options.
<option>--LL</option> options.
For cache parameters to be valid for simulation, the number
of sets (with associativity being the number of cache lines in
each set) has to be a power of two.</para>
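For instance (an illustrative invocation; ./myprog stands in for the program under test), a machine whose caches cannot be auto-detected could be described by hand as:

  valgrind --tool=cachegrind --I1=32768,8,64 --D1=32768,8,64 --LL=8388608,16,64 ./myprog

Each cache here satisfies the power-of-two constraint on the number of sets: 32768 / (8 * 64) = 64 sets for each L1 cache and 8388608 / (16 * 64) = 8192 sets for the LL cache.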
@ -1186,7 +1198,7 @@ determine the cache configuration, so you will
need to specify it with the
<option>--I1</option>,
<option>--D1</option> and
<option>--L2</option> options.</para>
<option>--LL</option> options.</para>
<para>Other noteworthy behaviour:</para>

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -7,11 +7,11 @@ $dir/../../tests/filter_stderr_basic |
# Remove "Cachegrind, ..." line and the following copyright line.
sed "/^Cachegrind, a cache and branch-prediction profiler/ , /./ d" |
# Remove numbers from I/D/L2 "refs:" lines
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I/D/LL "refs:" lines
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove CPUID warnings lines for P4s and other machines
sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm cachegrind.out.*

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -2,16 +2,16 @@
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -414,7 +414,7 @@ for "Ir and "Dr".</para>
<para>This specifies various information for this dump. For some
types, the semantic is defined, but any description type is allowed.
Unknown types should be ignored.</para>
<para>There are the types "I1 cache", "D1 cache", "L2 cache", which
<para>There are the types "I1 cache", "D1 cache", "LL cache", which
specify parameters used for the cache simulator. These are the only
types originally used by Cachegrind. Additionally, Callgrind uses
the following types: "Timerange" gives a rough range of the basic
@ -457,7 +457,7 @@ for "Ir and "Dr".</para>
<para><command>I1mr</command>: Instruction Level 1 read cache miss</para>
</listitem>
<listitem>
<para><command>I2mr</command>: Instruction Level 2 read cache miss</para>
<para><command>ILmr</command>: Instruction last-level read cache miss</para>
</listitem>
<listitem>
<para>...</para>

View File

@ -933,9 +933,9 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
<para>Specify if you want to do full cache simulation. By default,
only instruction read accesses will be counted ("Ir").
With cache simulation, further event counters are enabled:
Cache misses on instruction reads ("I1mr"/"I2mr"),
data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
Cache misses on instruction reads ("I1mr"/"ILmr"),
data read accesses ("Dr") and related cache misses ("D1mr"/"DLmr"),
data write accesses ("Dw") and related cache misses ("D1mw"/"DLmw").
For more information, see <xref linkend="cg-manual"/>.
</para>
</listitem>
@ -972,13 +972,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</term>
<listitem>
<para>Specify whether write-back behavior should be simulated, allowing
to distinguish L2 caches misses with and without write backs.
to distinguish LL caches misses with and without write backs.
The cache model of Cachegrind/Callgrind does not specify write-through
vs. write-back behavior, and this also is not relevant for the number
of generated miss counts. However, with explicit write-back simulation
it can be decided whether a miss triggers not only the loading of a new
cache line, but also if a write back of a dirty cache line had to take
place before. The new dirty miss events are I2dmr, D2dmr, and D2dmw,
place before. The new dirty miss events are ILdmr, DLdmr, and DLdmw,
for misses because of instruction read, data read, and data write,
respectively. As they produce two memory transactions, they should
account for a doubled time estimation in relation to a normal miss.
@ -1016,13 +1016,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
bad access behavior). The new counters are defined in a way such
that worse behavior results in higher cost.
AcCost1 and AcCost2 are counters showing bad temporal locality
for L1 and L2 caches, respectively. This is done by summing up
for L1 and LL caches, respectively. This is done by summing up
reciprocal values of the numbers of accesses of each cache line,
multiplied by 1000 (as only integer costs are allowed). E.g. for
a given source line with 5 read accesses, a value of 5000 AcCost
means that for every access, a new cache line was loaded and directly
evicted afterwards without further accesses. Similarly, SpLoss1/2
shows bad spatial locality for L1 and L2 caches, respectively. It
shows bad spatial locality for L1 and LL caches, respectively. It
gives the <emphasis>spatial loss</emphasis> count of bytes which
were loaded into cache but never accessed. It pinpoints at code
accessing data in a way such that cache space is wasted. This hints
@ -1059,12 +1059,12 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.L2" xreflabel="--L2">
<varlistentry id="opt.LL" xreflabel="--LL">
<term>
<option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
<option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
<para>Specify the size, associativity and line size of the level 2
<para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>

View File

@ -91,7 +91,7 @@ typedef struct {
* States of flat caches in our model.
* We use a 2-level hierarchy,
*/
static cache_t2 I1, D1, L2;
static cache_t2 I1, D1, LL;
/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
@ -123,8 +123,8 @@ static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;
static Int off_LL_AcCost = 2;
static Int off_LL_SpLoss = 3;
/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
@ -135,7 +135,7 @@ typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
/* Result of a reference into a hierarchical cache model */
typedef enum {
L1_Hit,
L2_Hit,
LL_Hit,
MemAccess,
WriteBackMemAccess } CacheModelResult;
@ -231,7 +231,7 @@ static void print_cache(cache_t2* c)
/*------------------------------------------------------------*/
/*
* Simple model: L1 & L2 Write Through
* Simple model: L1 & LL Write Through
* Does not distinguish among read and write references
*
* Simulator functions:
@ -305,7 +305,7 @@ static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -313,7 +313,7 @@ static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -323,7 +323,7 @@ CacheModelResult cachesim_D1_ref(Addr a, UChar size)
/*------------------------------------------------------------*/
/*
* More complex model: L1 Write-through, L2 Write-back
* More complex model: L1 Write-through, LL Write-back
* This needs to distinguish among read and write references.
*
* Simulator functions:
@ -412,8 +412,8 @@ static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -424,8 +424,8 @@ static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -437,14 +437,14 @@ CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) {
/* Even for a L1 hit, the write-trough L1 passes
* the write to the L2 to make the L2 line dirty.
* the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
cachesim_ref_wb( &L2, Write, a, size);
cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -479,10 +479,10 @@ void prefetch_clear(void)
* One stream can be detected per 4k page.
*/
static __inline__
void prefetch_L2_doref(Addr a)
void prefetch_LL_doref(Addr a)
{
UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
UInt block = ( a >> L2.line_size_bits);
UInt block = ( a >> LL.line_size_bits);
if (block != pf_lastblock[stream]) {
if (pf_seqblocks[stream] == 0) {
@ -494,7 +494,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]++;
if (pf_seqblocks[stream] >= 2) {
prefetch_up++;
cachesim_ref(&L2, a + 5 * L2.line_size,1);
cachesim_ref(&LL, a + 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
@ -504,7 +504,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]--;
if (pf_seqblocks[stream] <= -2) {
prefetch_down++;
cachesim_ref(&L2, a - 5 * L2.line_size,1);
cachesim_ref(&LL, a - 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
@ -519,8 +519,8 @@ static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
prefetch_LL_doref(a);
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -528,8 +528,8 @@ static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
prefetch_LL_doref(a);
if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@ -540,9 +540,9 @@ static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
prefetch_LL_doref(a);
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -553,9 +553,9 @@ static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
prefetch_L2_doref(a);
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
case Hit: return L2_Hit;
prefetch_LL_doref(a);
switch( cachesim_ref_wb( &LL, Read, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -565,17 +565,17 @@ CacheModelResult prefetch_D1_Read(Addr a, UChar size)
static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
prefetch_L2_doref(a);
prefetch_LL_doref(a);
if ( cachesim_ref( &D1, a, size) == Hit ) {
/* Even for an L1 hit, the write-through L1 passes
* the write to the L2 to make the L2 line dirty.
* the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
cachesim_ref_wb( &L2, Write, a, size);
cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
case Hit: return L2_Hit;
switch( cachesim_ref_wb( &LL, Write, a, size) ) {
case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@ -736,7 +736,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
/* Second case: word straddles two lines. */ \
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask]; \
if (tag == (set[0] & L.tag_mask)) { \
@ -809,7 +809,7 @@ block2: \
idx = (set2 * L.assoc) + tmp_tag; \
miss2 = update_##L##_use(&L, idx, \
use_mask, (a+size-1) &~ L.line_size_mask); \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
\
} else { \
VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
@ -837,13 +837,13 @@ static __inline__ unsigned int countBits(unsigned int bits)
return c;
}
static void update_L2_use(int idx, Addr memline)
static void update_LL_use(int idx, Addr memline)
{
line_loaded* loaded = &(L2.loaded[idx]);
line_use* use = &(L2.use[idx]);
int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
line_loaded* loaded = &(LL.loaded[idx]);
line_use* use = &(LL.use[idx]);
int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
idx, CLG_(bb_base) + current_ii->instr_offset, memline);
if (use->count>0) {
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
@ -852,8 +852,8 @@ static void update_L2_use(int idx, Addr memline)
CLG_(current_state).collect, loaded->use_base);
if (CLG_(current_state).collect && loaded->use_base) {
(loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
(loaded->use_base)[off_L2_SpLoss] += i;
(loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
(loaded->use_base)[off_LL_SpLoss] += i;
}
}
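
The SpLoss term above is easiest to see with concrete numbers; a hypothetical eviction, assuming a 64-byte LL line:

/* The 32-bit use mask covers the line at line_size/32 = 2 bytes per
 * bit. If only 8 of the 32 bits were ever set when the line is
 * replaced:
 *
 *   i = ((32 - countBits(mask)) * LL.line_size) >> 5
 *     = ((32 - 8) * 64) / 32
 *     = 48 bytes loaded into the line but never touched.
 *
 * AcCost is charged as 1000 / use->count, so lines that are loaded
 * often but referenced rarely show up with a high access cost. */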
@ -868,53 +868,53 @@ static void update_L2_use(int idx, Addr memline)
}
static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
UWord* set = &(L2.tags[setNo * L2.assoc]);
UWord tag = memline & L2.tag_mask;
UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
UWord* set = &(LL.tags[setNo * LL.assoc]);
UWord tag = memline & LL.tag_mask;
int i, j, idx;
UWord tmp_tag;
CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
if (tag == (set[0] & L2.tag_mask)) {
idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
if (tag == (set[0] & LL.tag_mask)) {
idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
L2.use[idx].mask, L2.use[idx].count);
return L2_Hit;
idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
LL.use[idx].mask, LL.use[idx].count);
return LL_Hit;
}
for (i = 1; i < L2.assoc; i++) {
if (tag == (set[i] & L2.tag_mask)) {
for (i = 1; i < LL.assoc; i++) {
if (tag == (set[i] & LL.tag_mask)) {
tmp_tag = set[i];
for (j = i; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tmp_tag;
idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
L2.use[idx].mask, L2.use[idx].count);
return L2_Hit;
i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
LL.use[idx].mask, LL.use[idx].count);
return LL_Hit;
}
}
/* A miss; install this tag as MRU, shuffle rest down. */
tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
for (j = L2.assoc - 1; j > 0; j--) {
tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
for (j = LL.assoc - 1; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tag | tmp_tag;
idx = (setNo * L2.assoc) + tmp_tag;
l1_loaded->dep_use = &(L2.use[idx]);
idx = (setNo * LL.assoc) + tmp_tag;
l1_loaded->dep_use = &(LL.use[idx]);
update_L2_use(idx, memline);
update_LL_use(idx, memline);
return MemAccess;
}
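
The tag shuffling above implements LRU by keeping each set ordered most-recently-used first; a hypothetical 4-way set:

/* hit on tag C:     before [A, B, C, D]  ->  after [C, A, B, D]
 * miss (new tag N): D, the least-recently-used entry, is evicted
 *                   and N becomes the MRU entry:   [N, A, B, C]
 * (the low bits of the evicted slot, tmp_tag & ~LL.tag_mask, keep
 * the slot's index so the per-line use data stays attached) */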
@ -943,7 +943,7 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
(loaded->use_base)[off_##L##_SpLoss] += c; \
\
/* FIXME (?): L1/L2 line sizes must be equal ! */ \
/* FIXME (?): L1/LL line sizes must be equal ! */ \
loaded->dep_use->mask |= use->mask; \
loaded->dep_use->count += use->count; \
} \
@ -957,8 +957,8 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
CLG_(current_state).nonskipped->skipped : \
CLG_(cost_base) + current_ii->cost_offset; \
\
if (memline == 0) return L2_Hit; \
return cacheuse_L2_access(memline, loaded); \
if (memline == 0) return LL_Hit; \
return cacheuse_LL_access(memline, loaded); \
}
UPDATE_USE(I1);
@ -991,10 +991,10 @@ void cacheuse_finish(void)
if (D1.loaded[i].use_base)
update_D1_use( &D1, i, 0,0);
if (L2.use)
for (i = 0; i < L2.sets * L2.assoc; i++)
if (L2.loaded[i].use_base)
update_L2_use(i, 0);
if (LL.use)
for (i = 0; i < LL.sets * LL.assoc; i++)
if (LL.loaded[i].use_base)
update_LL_use(i, 0);
}
@ -1020,7 +1020,7 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
c2[2]++;
// fall through
case L2_Hit:
case LL_Hit:
c1[1]++;
c2[1]++;
// fall through
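
Since the hunk only shows the middle of the switch, here is a hedged standalone reconstruction of the whole fall-through accounting (counter layout assumed: index 0 = references, 1 = L1 misses, 2 = LL misses, 3 = LL misses needing a write-back):

#include <stdio.h>

typedef enum { L1_Hit, LL_Hit, MemAccess, WriteBackMemAccess } Result;

static void inc_costs(Result r, unsigned long* c)
{
    switch (r) {
    case WriteBackMemAccess:
        c[3]++;     /* the real tool only counts this with --simulate-wb=yes */
        /* fall through */
    case MemAccess:
        c[2]++;     /* LL miss */
        /* fall through */
    case LL_Hit:
        c[1]++;     /* L1 miss */
        /* fall through */
    default:
        c[0]++;     /* every result is one reference */
    }
}

int main(void)
{
    unsigned long c[4] = { 0, 0, 0, 0 };
    inc_costs(WriteBackMemAccess, c);   /* bumps all four counters  */
    inc_costs(L1_Hit, c);               /* bumps only the ref count */
    printf("refs=%lu l1m=%lu llm=%lu wb=%lu\n", c[0], c[1], c[2], c[3]);
    return 0;   /* prints: refs=2 l1m=1 llm=1 wb=1 */
}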
@ -1036,9 +1036,9 @@ Char* cacheRes(CacheModelResult r)
{
switch(r) {
case L1_Hit: return "L1 Hit ";
case L2_Hit: return "L2 Hit ";
case MemAccess: return "L2 Miss";
case WriteBackMemAccess: return "L2 Miss (dirty)";
case LL_Hit: return "LL Hit ";
case MemAccess: return "LL Miss";
case WriteBackMemAccess: return "LL Miss (dirty)";
default:
tl_assert(0);
}
@ -1268,7 +1268,7 @@ static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
@ -1308,7 +1308,7 @@ static Char* check_cache(cache_t* cache)
}
static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@ -1317,30 +1317,30 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
DEFINED(clo_L2_cache));
DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
checkRes = check_cache(L2c); tl_assert(!checkRes);
checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line.
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) > 1) {
VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines\n",
I1c->size, I1c->assoc, I1c->line_size);
VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines\n",
L2c->size, L2c->assoc, L2c->line_size);
VG_(umsg)("Cache configuration used:\n");
VG_(umsg)(" I1: %dB, %d-way, %dB lines\n",
I1c->size, I1c->assoc, I1c->line_size);
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@ -1350,7 +1350,7 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
static void cachesim_post_clo_init(void)
{
/* Cache configurations. */
cache_t I1c, D1c, L2c;
cache_t I1c, D1c, LLc;
/* Initialize access handlers */
if (!CLG_(clo).simulate_cache) {
@ -1374,15 +1374,15 @@ static void cachesim_post_clo_init(void)
}
/* Configuration of caches only needed with real cache simulation */
configure_caches(&I1c, &D1c, &L2c);
configure_caches(&I1c, &D1c, &LLc);
I1.name = "I1";
D1.name = "D1";
L2.name = "L2";
LL.name = "LL";
cachesim_initcache(I1c, &I1);
cachesim_initcache(D1c, &D1);
cachesim_initcache(L2c, &L2);
cachesim_initcache(LLc, &LL);
/* the other cache simulators use the standard helpers
* with dispatching via simulator struct */
@ -1463,7 +1463,7 @@ void cachesim_clear(void)
{
cachesim_clearcache(&I1);
cachesim_clearcache(&D1);
cachesim_clearcache(&L2);
cachesim_clearcache(&LL);
prefetch_clear();
}
@ -1474,7 +1474,7 @@ static void cachesim_getdesc(Char* buf)
Int p;
p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
}
static
@ -1490,11 +1490,12 @@ void cachesim_print_opts(void)
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
);
}
static void parse_opt ( cache_t* cache, char* opt, Char* optval )
static void parse_opt ( cache_t* cache,
char* opt, Char* optval, UChar kind )
{
Long i1, i2, i3;
Char* endptr;
@ -1550,11 +1551,12 @@ static Bool cachesim_parse_opt(Char* arg)
}
else if VG_STR_CLO(arg, "--I1", tmp_str)
parse_opt(&clo_I1_cache, arg, tmp_str);
parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
else if VG_STR_CLO(arg, "--D1", tmp_str)
parse_opt(&clo_D1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--L2", tmp_str)
parse_opt(&clo_L2_cache, arg, tmp_str);
parse_opt(&clo_D1_cache, arg, tmp_str, '1');
else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
VG_STR_CLO(arg, "--LL", tmp_str))
parse_opt(&clo_LL_cache, arg, tmp_str, '2');
else
return False;
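
The net effect for users is that the old spelling keeps working. A minimal sketch of the alias handling (simplified; the real code goes through Valgrind's VG_STR_CLO machinery):

#include <stdio.h>
#include <string.h>

/* Both --LL= and the legacy --L2= fill the same last-level slot. */
static const char* cache_slot_for(const char* arg)
{
    if (strncmp(arg, "--I1=", 5) == 0) return "clo_I1_cache";
    if (strncmp(arg, "--D1=", 5) == 0) return "clo_D1_cache";
    if (strncmp(arg, "--LL=", 5) == 0 ||
        strncmp(arg, "--L2=", 5) == 0)  /* backwards compatibility */
        return "clo_LL_cache";
    return "unrecognized";
}

int main(void)
{
    printf("%s\n", cache_slot_for("--L2=3145728,12,64"));  /* clo_LL_cache */
    printf("%s\n", cache_slot_for("--LL=3145728,12,64"));  /* clo_LL_cache */
    return 0;
}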
@ -1613,8 +1615,8 @@ static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
FullCost total = CLG_(total_cost), D_total = 0;
ULong L2_total_m, L2_total_mr, L2_total_mw,
L2_total, L2_total_r, L2_total_w;
ULong LL_total_m, LL_total_mr, LL_total_mw,
LL_total, LL_total_r, LL_total_w;
char buf1[RESULTS_BUF_LEN],
buf2[RESULTS_BUF_LEN],
buf3[RESULTS_BUF_LEN];
@ -1632,7 +1634,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
commify(total[fullOffset(EG_IR) +2], l1, buf1);
VG_(message)(Vg_UserMsg, "L2i misses: %s\n", buf1);
VG_(message)(Vg_UserMsg, "LLi misses: %s\n", buf1);
p = 100;
@ -1645,7 +1647,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
percentify(total[fullOffset(EG_IR)+2] * 100 * p /
total[fullOffset(EG_IR)], p, l1+1, buf1);
VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
VG_(message)(Vg_UserMsg, "\n");
/* D cache results.
@ -1673,7 +1675,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
commify( D_total[2], l1, buf1);
commify(total[fullOffset(EG_DR)+2], l2, buf2);
commify(total[fullOffset(EG_DW)+2], l3, buf3);
VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)\n",
VG_(message)(Vg_UserMsg, "LLd misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
p = 10;
@ -1695,50 +1697,50 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
total[fullOffset(EG_DR)], p, l2+1, buf2);
percentify(total[fullOffset(EG_DW)+2] * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )\n",
VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
VG_(message)(Vg_UserMsg, "\n");
/* L2 overall results */
/* LL overall results */
L2_total =
LL_total =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_DW) +1] +
total[fullOffset(EG_IR) +1];
L2_total_r =
LL_total_r =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_IR) +1];
L2_total_w = total[fullOffset(EG_DW) +1];
commify(L2_total, l1, buf1);
commify(L2_total_r, l2, buf2);
commify(L2_total_w, l3, buf3);
VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)\n",
LL_total_w = total[fullOffset(EG_DW) +1];
commify(LL_total, l1, buf1);
commify(LL_total_r, l2, buf2);
commify(LL_total_w, l3, buf3);
VG_(message)(Vg_UserMsg, "LL refs: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
L2_total_m =
LL_total_m =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_DW) +2] +
total[fullOffset(EG_IR) +2];
L2_total_mr =
LL_total_mr =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_IR) +2];
L2_total_mw = total[fullOffset(EG_DW) +2];
commify(L2_total_m, l1, buf1);
commify(L2_total_mr, l2, buf2);
commify(L2_total_mw, l3, buf3);
VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)\n",
LL_total_mw = total[fullOffset(EG_DW) +2];
commify(LL_total_m, l1, buf1);
commify(LL_total_mr, l2, buf2);
commify(LL_total_mw, l3, buf3);
VG_(message)(Vg_UserMsg, "LL misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
percentify(L2_total_m * 100 * p /
percentify(LL_total_m * 100 * p /
(total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
percentify(L2_total_mr * 100 * p /
percentify(LL_total_mr * 100 * p /
(total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
p, l2+1, buf2);
percentify(L2_total_mw * 100 * p /
percentify(LL_total_mw * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )\n",
VG_(message)(Vg_UserMsg, "LL miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
}
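
Worth noting when reading these formulas: the overall LL miss rate is taken against all references (instruction plus data), not against LL refs. With hypothetical counts:

/*   I refs    = 900,000    D refs = 100,000
 *   LL misses =   5,000
 *
 *   LL miss rate = 5,000 / (900,000 + 100,000) = 0.5%
 *
 * which is why the LL rate is typically far below the L1 rates even
 * when a large share of L1 misses also miss in LL. */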
@ -1760,14 +1762,14 @@ void CLG_(init_eventsets)()
if (!CLG_(clo).simulate_cache)
CLG_(register_event_group)(EG_IR, "Ir");
else if (!clo_simulate_writeback) {
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
}
else { // clo_simulate_writeback
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
}
if (CLG_(clo).simulate_branch) {
@ -1807,12 +1809,12 @@ void CLG_(init_eventsets)()
CLG_(append_event)(CLG_(dumpmap), "I1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mw");
CLG_(append_event)(CLG_(dumpmap), "I2mr");
CLG_(append_event)(CLG_(dumpmap), "D2mr");
CLG_(append_event)(CLG_(dumpmap), "D2mw");
CLG_(append_event)(CLG_(dumpmap), "I2dmr");
CLG_(append_event)(CLG_(dumpmap), "D2dmr");
CLG_(append_event)(CLG_(dumpmap), "D2dmw");
CLG_(append_event)(CLG_(dumpmap), "ILmr");
CLG_(append_event)(CLG_(dumpmap), "DLmr");
CLG_(append_event)(CLG_(dumpmap), "DLmw");
CLG_(append_event)(CLG_(dumpmap), "ILdmr");
CLG_(append_event)(CLG_(dumpmap), "DLdmr");
CLG_(append_event)(CLG_(dumpmap), "DLdmw");
CLG_(append_event)(CLG_(dumpmap), "Bc");
CLG_(append_event)(CLG_(dumpmap), "Bcm");
CLG_(append_event)(CLG_(dumpmap), "Bi");

View File

@ -13,11 +13,11 @@ sed "/^For interactive control,.*$/d" |
# Remove numbers from "Collected" line
sed "s/^\(Collected *:\)[ 0-9]*$/\1/" |
# Remove numbers from I/D/L2 "refs:" lines
perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I/D/LL "refs:" lines
perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines
perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-hwpref=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --cacheuse=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-wb=yes
cleanup: rm callgrind.out.*

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,3 +1,3 @@
prog: ../../tests/true
vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm callgrind.out.*

View File

@ -1,23 +1,23 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:
Branches:
Mispredicts:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate:

View File

@ -1,20 +1,20 @@
Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
Collected :
I refs:
I1 misses:
L2i misses:
LLi misses:
I1 miss rate:
L2i miss rate:
LLi miss rate:
D refs:
D1 misses:
L2d misses:
LLd misses:
D1 miss rate:
L2d miss rate:
LLd miss rate:
L2 refs:
L2 misses:
L2 miss rate:
LL refs:
LL misses:
LL miss rate: