Add support for copy and ad hoc profiling to DHAT.

Nicholas Nethercote 2019-09-09 14:13:35 +10:00
parent d2d54dbcc7
commit 8c08253b89
23 changed files with 2076 additions and 844 deletions

.gitignore vendored
View File

@ -273,8 +273,10 @@
/dhat/tests/*.stdout.out
/dhat/tests/.deps
/dhat/tests/acc
/dhat/tests/ad-hoc
/dhat/tests/basic
/dhat/tests/big
/dhat/tests/copy
/dhat/tests/empty
/dhat/tests/sig
/dhat/tests/single

NEWS
View File

@ -16,6 +16,18 @@ support for X86/macOS 10.13, AMD64/macOS 10.13 and nanoMIPS/Linux.
* DHAT:
- DHAT has been extended, with two new modes of operation. The new
--mode=copy flag triggers copy profiling, which records calls to memcpy,
strcpy, and similar functions. The new --mode=ad-hoc flag triggers ad hoc
profiling, which records calls to the DHAT_AD_HOC_EVENT client request in
the new dhat/dhat.h file. This is useful for learning more about hot code
paths. See the user manual for more information about the new modes.
- Because of these changes, DHAT's file format has changed. DHAT output
files produced with earlier versions of DHAT will not work with this
version of DHAT's viewer, and DHAT output files produced with this version
of DHAT will not work with earlier versions of DHAT's viewer.
* Cachegrind:
* Callgrind:

View File

@ -92,29 +92,35 @@ SizeT VG_(malloc_effective_client_redzone_size)(void)
/*--- Useful functions ---*/
/*------------------------------------------------------------*/
void* VG_(cli_malloc) ( SizeT align, SizeT nbytes )
{
void* VG_(cli_malloc) ( SizeT align, SizeT nbytes )
{
// 'align' should be valid (ie. big enough and a power of two) by now.
// VG_(arena_memalign)() will abort if it's not.
if (VG_MIN_MALLOC_SZB == align)
return VG_(arena_malloc) ( VG_AR_CLIENT, "replacemalloc.cm.1",
nbytes );
else
return VG_(arena_memalign) ( VG_AR_CLIENT, "replacemalloc.cm.2",
return VG_(arena_malloc) ( VG_AR_CLIENT, "replacemalloc.cm.1",
nbytes );
else
return VG_(arena_memalign) ( VG_AR_CLIENT, "replacemalloc.cm.2",
align, nbytes );
}
void VG_(cli_free) ( void* p )
{
VG_(arena_free) ( VG_AR_CLIENT, p );
}
// Useful for querying user blocks.
SizeT VG_(cli_malloc_usable_size) ( void* p )
{
void* VG_(cli_realloc) ( void* ptr, SizeT nbytes )
{
return VG_(arena_realloc) ( VG_AR_CLIENT, "replacemalloc.cr.1",
ptr, nbytes );
}
void VG_(cli_free) ( void* p )
{
VG_(arena_free) ( VG_AR_CLIENT, p );
}
// Useful for querying user blocks.
SizeT VG_(cli_malloc_usable_size) ( void* p )
{
return VG_(arena_malloc_usable_size)(VG_AR_CLIENT, p);
}
}
Bool VG_(addr_is_in_block)( Addr a, Addr start, SizeT size, SizeT rz_szB )
{
return ( start - rz_szB <= a && a < start + size + rz_szB );

View File

@ -1,13 +1,14 @@
include $(top_srcdir)/Makefile.tool.am
#SUBDIRS += perf
EXTRA_DIST = docs/dh-manual.xml dh_view.html dh_view.css dh_view.js
#----------------------------------------------------------------------------
# Headers, etc
#----------------------------------------------------------------------------
pkginclude_HEADERS = \
dhat.h
# Ensure the viewer components get copied into the install tree.
dhatdir = $(pkglibexecdir)
dhat_DATA = dh_view.html dh_view.css dh_view.js
@ -21,10 +22,10 @@ if VGCONF_HAVE_PLATFORM_SEC
noinst_PROGRAMS += dhat-@VGCONF_ARCH_SEC@-@VGCONF_OS@
endif
EXP_DHAT_SOURCES_COMMON = dh_main.c
DHAT_SOURCES_COMMON = dh_main.c
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \
$(EXP_DHAT_SOURCES_COMMON)
$(DHAT_SOURCES_COMMON)
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \
@ -45,7 +46,7 @@ dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LINK = \
if VGCONF_HAVE_PLATFORM_SEC
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = \
$(EXP_DHAT_SOURCES_COMMON)
$(DHAT_SOURCES_COMMON)
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \
@ -78,11 +79,16 @@ if VGCONF_OS_IS_DARWIN
noinst_DSYMS = $(noinst_PROGRAMS)
endif
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_SOURCES =
# dh_replace_strmem.c runs on the simulated CPU, and is built with
# AM_CFLAGS_PSO_* (see $(top_srcdir)/Makefile.all.am).
VGPRELOAD_DHAT_SOURCES_COMMON = dh_replace_strmem.c
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_SOURCES = \
$(VGPRELOAD_DHAT_SOURCES_COMMON)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_CFLAGS = \
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_PRI_CAPS@)
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_PRI_CAPS@) -O2
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_DEPENDENCIES = \
$(LIBREPLACEMALLOC_@VGCONF_PLATFORM_PRI_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_LDFLAGS = \
@ -90,11 +96,12 @@ vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_LDFLAGS = \
$(LIBREPLACEMALLOC_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
if VGCONF_HAVE_PLATFORM_SEC
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_SOURCES =
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_SOURCES = \
$(VGPRELOAD_DHAT_SOURCES_COMMON)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_CFLAGS = \
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_SEC_CAPS@)
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_SEC_CAPS@) -O2
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_DEPENDENCIES = \
$(LIBREPLACEMALLOC_@VGCONF_PLATFORM_SEC_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_LDFLAGS = \

File diff suppressed because it is too large.

dhat/dh_replace_strmem.c Normal file
View File

@ -0,0 +1,41 @@
/*--------------------------------------------------------------------*/
/*--- Replacements for memcpy() et al, which run on the ---*/
/*--- simulated CPU. ---*/
/*--- dh_replace_strmem.c ---*/
/*--------------------------------------------------------------------*/
/*
This file is part of DHAT, a Valgrind tool for profiling the
heap usage of programs.
Copyright (C) 2020-2020 Nicholas Nethercote
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307, USA.
The GNU General Public License is contained in the file COPYING.
*/
#include "dhat.h"
#define RECORD_COPY(_qzz_len) \
VALGRIND_DO_CLIENT_REQUEST_STMT(_VG_USERREQ__DHAT_COPY, \
(_qzz_len), 0, 0, 0, 0)
#include "../shared/vg_replace_strmem.c"
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/

File diff suppressed because it is too large.

View File

@ -51,7 +51,7 @@ let gHeaderDiv, gTestingDiv, gMainDiv, gLegendDiv, gTimingsDiv;
let gFilename;
// The object extracted from the JSON input.
let gData;
let gData = {};
// The root of the radix tree built from gData. A radix tree is a
// space-optimized prefix tree in which each node that is the only child is
@ -64,62 +64,68 @@ let gRoot;
// - label: Used in the drop-down menu.
// - bolds: Which fields to highlight in the output.
// - cmpField: Field used to sort the radix tree.
// - enable: Function saying whether this option is enabled.
// - sig: Significance function used to determine aggregate nodes.
// - sigLabel: Significance threshold description function.
//
const gSelectData = [
{
label: "Total (bytes)",
label: () => `Total (${bytesUnit()})`,
bolds: { "totalTitle": 1, "totalBytes": 1 },
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBytes >= 0.01 * gRoot._totalBytes,
sigLabel: () => `\
total >= ${bytesAndPerc(0.01 * gRoot._totalBytes, gRoot._totalBytes)}`
},
{
isDefault: true,
label: "Total (blocks)",
label: () => `Total (${blocksUnit()})`,
bolds: { "totalTitle": 1, "totalBlocks": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBlocks >= 0.01 * gRoot._totalBlocks,
sigLabel: () => `\
total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
},
// No "Total (bytes), tiny" because it's extremely unlikely that an AP with a
// No "Total (bytes), tiny" because it's extremely unlikely that a PP with a
// tiny average size will take up a significant number of bytes.
{
label: "Total (blocks), tiny",
label: () => `Total (${blocksUnit()}), tiny`,
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgSizeBytes": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._totalAvgSizeBytes() <= 16,
sigLabel: () => `\
(total >= ${blocksAndPerc(0.005 * gRoot._totalBlocks, gRoot._totalBlocks)}) && \
(total avg size <= ${bytes(16)})`
(avg size <= ${bytes(16)})`
},
// No "Total (bytes), short-lived", because an AP with few large, short-lived
// No "Total (bytes), short-lived", because a PP with few large, short-lived
// blocks is unlikely. (In contrast, "Total (blocks), short-lived" is useful,
// because an AP with many small, short-lived blocks *is* likely.) And if
// such an AP existed, it'll probably show up in "Total (bytes), zero reads
// because a PP with many small, short-lived blocks *is* likely.) And if
// such a PP existed, it'll probably show up in "Total (bytes), zero reads
// or zero writes" or "Total (bytes), low-access" anyway, because there's
// little time for accesses in 500 instructions.
// little time for accesses in a small number of instructions.
{
label: "Total (blocks), short-lived",
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgLifetimeInstrs": 1 },
label: () => "Total (blocks), short-lived",
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgLifetime": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._totalAvgLifetimeInstrs() <= 500,
aT._totalAvgLifetimes() <= gData.tuth,
sigLabel: () => `\
(total >= ${blocksAndPerc(0.005 * gRoot._totalBlocks, gRoot._totalBlocks)}) && \
(total avg lifetime <= ${instrs(500)})`
(avg lifetime <= ${time(gData.tuth)})`
},
{
label: "Total (bytes), zero reads or zero writes",
label: () => "Total (bytes), zero reads or zero writes",
bolds: { "totalTitle": 1, "totalBytes": 1,
"readsTitle": 1, "readsBytes": 1,
"writesTitle": 1, "writesBytes": 1,
},
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBytes >= 0.005 * gRoot._totalBytes &&
(aT._readsBytes === 0 || aT._writesBytes === 0),
sigLabel: () => `\
@ -127,12 +133,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads == ${bytes(0)}) || (writes == ${bytes(0)}))`
},
{
label: "Total (blocks), zero reads or zero writes",
label: () => "Total (blocks), zero reads or zero writes",
bolds: { "totalTitle": 1, "totalBlocks": 1,
"readsTitle": 1, "readsBytes": 1,
"writesTitle": 1, "writesBytes": 1,
},
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
(aT._readsBytes === 0 || aT._writesBytes === 0),
sigLabel: () => `\
@ -140,12 +147,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads == ${bytes(0)}) || (writes == ${bytes(0)}))`
},
{
label: "Total (bytes), low-access",
label: () => "Total (bytes), low-access",
bolds: { "totalTitle": 1, "totalBytes": 1,
"readsTitle": 1, "readsAvgPerByte": 1,
"writesTitle": 1, "writesAvgPerByte": 1,
},
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBytes >= 0.005 * gRoot._totalBytes &&
aT._readsBytes !== 0 &&
aT._writesBytes !== 0 &&
@ -158,12 +166,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads <= ${perByte(0.4)}) || (writes <= ${perByte(0.4)}))`
},
{
label: "Total (blocks), low-access",
label: () => "Total (blocks), low-access",
bolds: { "totalTitle": 1, "totalBlocks": 1,
"readsTitle": 1, "readsAvgPerByte": 1,
"writesTitle": 1, "writesAvgPerByte": 1,
},
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._readsBytes !== 0 &&
aT._writesBytes !== 0 &&
@ -176,14 +185,15 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads <= ${perByte(0.4)}) || (writes <= ${perByte(0.4)}))`
},
// No "Total (avg size bytes)": not interesting.
// No "Total (avg lifetime instrs)": covered by "Total (blocks), short-lived".
// No "Total (avg lifetime)": covered by "Total (blocks), short-lived".
// No "Max (bytes)": not interesting, and unclear how to sort.
// No "Max (blocks)": not interesting, and unclear how to sort.
// No "Max (avg size bytes)": not interesting, and unclear how to sort.
{
label: "At t-gmax (bytes)",
label: () => "At t-gmax (bytes)",
bolds: { "atTGmaxTitle": 1, "atTGmaxBytes": 1 },
cmpField: "_atTGmaxBytes",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._atTGmaxBytes >= 0.01 * gRoot._atTGmaxBytes,
sigLabel: () => `\
at-t-gmax >= ${bytesAndPerc(0.01 * gRoot._atTGmaxBytes, gRoot._atTGmaxBytes)}`
@ -191,9 +201,10 @@ at-t-gmax >= ${bytesAndPerc(0.01 * gRoot._atTGmaxBytes, gRoot._atTGmaxBytes)}`
// No "At t-gmax (blocks)": not interesting.
// No "At t-gmax (avg size bytes)": not interesting.
{
label: "At t-end (bytes)",
label: () => "At t-end (bytes)",
bolds: { "atTEndTitle": 1, "atTEndBytes": 1 },
cmpField: "_atTEndBytes",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._atTEndBytes >= 0.01 * gRoot._atTEndBytes,
sigLabel: () => `\
at-t-end >= ${bytesAndPerc(0.01 * gRoot._atTEndBytes, gRoot._atTEndBytes)}`
@ -201,17 +212,19 @@ at-t-end >= ${bytesAndPerc(0.01 * gRoot._atTEndBytes, gRoot._atTEndBytes)}`
// No "At t-end (blocks)": not interesting.
// No "At t-end (avg size bytes)": not interesting.
{
label: "Reads (bytes)",
label: () => "Reads (bytes)",
bolds: { "readsTitle": 1, "readsBytes": 1 },
cmpField: "_readsBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._readsBytes >= 0.01 * gRoot._readsBytes,
sigLabel: () => `\
reads >= ${bytesAndPerc(0.01 * gRoot._readsBytes, gRoot._readsBytes)}`
},
{
label: "Reads (bytes), high-access",
label: () => "Reads (bytes), high-access",
bolds: { "readsTitle": 1, "readsBytes": 1, "readsAvgPerByte": 1 },
cmpField: "_readsBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._readsBytes >= 0.005 * gRoot._readsBytes &&
(aT._readsAvgPerByte() >= 1000 ||
aT._writesAvgPerByte() >= 1000),
@ -221,17 +234,19 @@ reads >= ${bytesAndPerc(0.01 * gRoot._readsBytes, gRoot._readsBytes)}`
},
// No "Reads (avg per byte)": covered by other access-related ones.
{
label: "Writes (bytes)",
label: () => "Writes (bytes)",
bolds: { "writesTitle": 1, "writesBytes": 1 },
cmpField: "_writesBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._writesBytes >= 0.01 * gRoot._writesBytes,
sigLabel: () => `\
writes >= ${bytesAndPerc(0.01 * gRoot._writesBytes, gRoot._writesBytes)}`
},
{
label: "Writes (bytes), high-access",
label: () => "Writes (bytes), high-access",
bolds: { "writesTitle": 1, "writesBytes": 1, "writesAvgPerByte": 1 },
cmpField: "_writesBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._writesBytes >= 0.005 * gRoot._writesBytes &&
(aT._readsAvgPerByte() >= 1000 ||
aT._writesAvgPerByte() >= 1000),
@ -304,10 +319,10 @@ function TreeNode(aKind, aFrames) {
this._totalBytes = 0;
this._totalBlocks = 0;
this._totalLifetimesInstrs = 0;
this._totalLifetimes = 0;
// These numbers only make sense for leaf nodes. Unlike total stats, which
// can be summed, _maxBytes/_maxBlocks for two APs can't be easily combined
// can be summed, _maxBytes/_maxBlocks for two PPs can't be easily combined
// because the maxes may have occurred at different times.
if (this._kind === kLeaf) {
this._maxBytes = 0;
@ -341,15 +356,20 @@ function TreeNode(aKind, aFrames) {
}
TreeNode.prototype = {
_add(aTotalBytes, aTotalBlocks, aTotalLifetimesInstrs, aMaxBytes,
_add(aTotalBytes, aTotalBlocks, aTotalLifetimes, aMaxBytes,
aMaxBlocks, aAtTGmaxBytes, aAtTGmaxBlocks, aAtTEndBytes,
aAtTEndBlocks, aReadsBytes, aWritesBytes, aAccesses) {
// We ignore this._kind, this._frames, and this._kids.
// Note: if !gData.bklt and/or !gData.bkacc, some of the fields these
// values come from will be missing in the input file, so the values will
// be `undefined`, and the fields will end up as `NaN`. But this is ok
// because we don't show them.
this._totalBytes += aTotalBytes;
this._totalBlocks += aTotalBlocks;
this._totalLifetimesInstrs += aTotalLifetimesInstrs;
this._totalLifetimes += aTotalLifetimes;
if (this._kind === kLeaf) {
// Leaf nodes should only be added to once, because DHAT currently
@ -391,9 +411,9 @@ TreeNode.prototype = {
}
},
_addAP(aAP) {
this._add(aAP.tb, aAP.tbk, aAP.tli, aAP.mb, aAP.mbk, aAP.gb, aAP.gbk,
aAP.fb, aAP.fbk, aAP.rb, aAP.wb, aAP.acc);
_addPP(aPP) {
this._add(aPP.tb, aPP.tbk, aPP.tl, aPP.mb, aPP.mbk, aPP.gb, aPP.gbk,
aPP.eb, aPP.ebk, aPP.rb, aPP.wb, aPP.acc);
},
// This is called in two cases.
@ -401,7 +421,7 @@ TreeNode.prototype = {
// cloning a node).
// - Aggregating multiple nodes.
_addNode(aT) {
this._add(aT._totalBytes, aT._totalBlocks, aT._totalLifetimesInstrs,
this._add(aT._totalBytes, aT._totalBlocks, aT._totalLifetimes,
aT._maxBytes, aT._maxBlocks, aT._atTGmaxBytes, aT._atTGmaxBlocks,
aT._atTEndBytes, aT._atTEndBlocks,
aT._readsBytes, aT._writesBytes, aT._accesses);
@ -409,7 +429,7 @@ TreeNode.prototype = {
// Split the node after the aTi'th internal frame. The inheriting kid will
// get the post-aTi frames; the new kid will get aNewFrames.
_split(aTi, aAP, aNewFrames) {
_split(aTi, aPP, aNewFrames) {
// kid1 inherits t's kind and values.
let inheritedFrames = this._frames.splice(aTi + 1);
let kid1 = new TreeNode(this._kind, inheritedFrames);
@ -420,7 +440,7 @@ TreeNode.prototype = {
// Put all remaining frames into kid2.
let kid2 = new TreeNode(kLeaf, aNewFrames);
kid2._addAP(aAP);
kid2._addPP(aPP);
// Update this.
if (this._kind === kLeaf) {
@ -432,15 +452,15 @@ TreeNode.prototype = {
delete this._maxBlocks;
}
this._kids = [kid1, kid2];
this._addAP(aAP);
this._addPP(aPP);
},
_totalAvgSizeBytes() {
return div(this._totalBytes, this._totalBlocks);
},
_totalAvgLifetimeInstrs() {
return div(this._totalLifetimesInstrs, this._totalBlocks);
_totalAvgLifetimes() {
return div(this._totalLifetimes, this._totalBlocks);
},
_maxAvgSizeBytes() {
@ -474,15 +494,15 @@ function checkFields(aObj, aFields) {
}
}
// Do basic checking of an AP read from file.
function checkAP(aAP) {
let fields = ["tb", "tbk", "tli",
"mb", "mbk",
"gb", "gbk",
"fb", "fbk",
"rb", "wb",
"fs"];
checkFields(aAP, fields);
// Do basic checking of a PP read from file.
function checkPP(aPP) {
checkFields(aPP, ["tb", "tbk", "fs"]);
if (gData.bklt) {
checkFields(aPP, ["mb", "mbk", "gb", "gbk", "eb", "ebk"]);
}
if (gData.bkacc) {
checkFields(aPP, ["rb", "wb"]);
}
}
// Access counts latch as 0xffff. Treating 0xffff as Infinity gives us exactly
@ -497,51 +517,78 @@ function normalizeAccess(aAcc) {
assert(false, "too-large access value");
}
const kExpectedFileVersion = 1;
const kExpectedFileVersion = 2;
// Build gRoot from gData.
function buildTree() {
// Check global values.
let fields = ["dhatFileVersion",
let fields = ["dhatFileVersion", "mode", "verb",
"bklt", "bkacc",
"tu", "Mtu",
"cmd", "pid",
"mi", "ei",
"aps", "ftbl"];
"te", "pps", "ftbl"];
checkFields(gData, fields);
if (gData.dhatFileVersion != kExpectedFileVersion) {
throw Error(`data file has version number ${gData.dhatFileVersion}, ` +
`expected version number ${kExpectedFileVersion}`);
throw new Error(
`data file has version number ${gData.dhatFileVersion}, ` +
`expected version number ${kExpectedFileVersion}`);
}
if (gData.bklt) {
checkFields(gData, ["tg", "tuth"]);
}
// Update sort metric labels, and disable sort metrics that aren't allowed
// for this data.
for (let [i, option] of gSelect.childNodes.entries()) {
let data = gSelectData[i];
option.label = data.label();
option.disabled = !data.enable(gData.bklt, gData.bkacc);
}
// If the selected sort metric was just disabled, switch the sort metric
// back to the default (which is never disabled).
let option = gSelect.childNodes[gSelect.selectedIndex];
if (option.disabled) {
for (let [i, data] of gSelectData.entries()) {
let option = gSelect.childNodes[i];
if (data.isDefault) {
option.selected = true;
break;
}
}
}
// Build the radix tree. Nodes are in no particular order to start with. The
// algorithm is tricky because we need to use internal frames when possible.
gRoot = new TreeNode(kLeaf, [0]); // Frame 0 is always "[root]".
for (let [i, ap] of gData.aps.entries()) {
checkAP(ap);
for (let [i, pp] of gData.pps.entries()) {
checkPP(pp);
// Decompress the run-length encoding in `acc`, if present.
if (ap.acc) {
if (pp.acc) {
let acc = [];
for (let i = 0; i < ap.acc.length; i++) {
if (ap.acc[i] < 0) {
for (let i = 0; i < pp.acc.length; i++) {
if (pp.acc[i] < 0) {
// A negative number encodes a repeat count. The following entry has
// the value to be repeated.
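// (For example, the encoded sequence [5, -3, 0, 7] decodes to
// [5, 0, 0, 0, 7].)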
let reps = -ap.acc[i++];
let val = ap.acc[i];
let reps = -pp.acc[i++];
let val = pp.acc[i];
for (let j = 0; j < reps; j++) {
acc.push(normalizeAccess(val));
}
} else {
acc.push(normalizeAccess(ap.acc[i]));
acc.push(normalizeAccess(pp.acc[i]));
}
}
ap.acc = acc;
pp.acc = acc;
}
// The first AP is a special case, because we have to build gRoot.
// The first PP is a special case, because we have to build gRoot.
if (i === 0) {
gRoot._frames.push(...ap.fs);
gRoot._addAP(ap);
gRoot._frames.push(...pp.fs);
gRoot._addPP(pp);
continue;
}
@ -553,8 +600,7 @@ function buildTree() {
// `abcd` is a frame sequence (and `-` is an empty sequence), `N` is a node
// value, and `Xs` are the node's children.
for (let [j, kidFrame] of ap.fs.entries()) {
for (let [j, kidFrame] of pp.fs.entries()) {
// Search for kidFrame among internal frames.
if (ti + 1 < t._frames.length) {
// t has an internal frame at the right index.
@ -566,7 +612,7 @@ function buildTree() {
// The internal frame doesn't match. Split the node.
//
// E.g. abcd:20-[] + abef:10 => ab:30-[cd:20-[], ef:10-[]]
t._split(ti, ap, ap.fs.slice(j));
t._split(ti, pp, pp.fs.slice(j));
done = true;
break;
}
@ -580,12 +626,12 @@ function buildTree() {
// get the leftover frames.
//
// E.g. ab:20-[] + abcd:10 => ab:30-[-:20-[], cd:10-[]]
t._split(ti, ap, ap.fs.slice(j));
t._split(ti, pp, pp.fs.slice(j));
done = true;
break;
}
t._addAP(ap);
t._addPP(pp);
// Search for the frame among the kids.
let kid;
@ -604,8 +650,8 @@ function buildTree() {
//
// E.g. ab:20-[c:10-Xs, d:10-Ys] + abef:10 =>
// ab:30-[c:10-Xs, d:10-Ys, ef:10-[]]
kid = new TreeNode(kLeaf, ap.fs.slice(j));
kid._addAP(ap);
kid = new TreeNode(kLeaf, pp.fs.slice(j));
kid._addPP(pp);
t._kids.push(kid);
done = true;
break;
@ -615,9 +661,9 @@ function buildTree() {
if (!done) {
// If we reach here, either:
// - ap's frames match an existing frame sequence, in which case we
// just need to _addAP(); or
// - ap's frames are a subsequence of an existing sequence, in which
// - pp's frames match an existing frame sequence, in which case we
// just need to _addPP(); or
// - pp's frames are a subsequence of an existing sequence, in which
// case we must split.
if (ti + 1 < t._frames.length) {
@ -625,20 +671,20 @@ function buildTree() {
// frames. Split, creating an empty node.
//
// E.g. abcd:20-Xs + ab:10 => ab:30-[cd:20-Xs, -:10-[]]
t._split(ti, ap, []);
t._split(ti, pp, []);
} else if (!t._kids) {
// This is impossible because DHAT currently produces records with
// unique locations. If we remove addresses from frames in the future
// then duplicate locations will occur, and the following code is how
// it must be handled.
throw Error(`data file contains a repeated location`);
throw new Error(`data file contains a repeated location (1)`);
// Matches an existing sequence that doesn't end in node with empty
// frames. Add the AP.
// frames. Add the PP.
//
// E.g. ab:20-[] + ab:10 => ab:30-[]
t._addAP(ap);
t._addPP(pp);
} else {
// Look for a kid with empty frames.
@ -655,14 +701,14 @@ function buildTree() {
// unique locations. If we remove addresses from frames in the future
// then duplicate locations will occur, and the following code is how
// it must be handled.
throw Error(`data file contains a repeated location`);
throw new Error(`data file contains a repeated location (2)`);
// Matches an existing sequence that ends in a node with empty
// frames. Add the AP.
// frames. Add the PP.
//
// E.g. ab:20-[c:10-Xs, -:10-[]] + ab:10 => ab:30-[c:10-Xs, -:20-[]]
t._addAP(ap);
emptyKid._addAP(ap);
t._addPP(pp);
emptyKid._addPP(pp);
} else {
// A subsequence of an existing sequence that ends at the end of t's
@ -671,14 +717,13 @@ function buildTree() {
// E.g. ab:20-[c:10-Xs, d:10-Ys] + ab:10 =>
// ab:30-[c:10-Xs, d:10-Ys, -:10-[]]
let newKid = new TreeNode(kLeaf, []);
newKid._addAP(ap);
newKid._addPP(pp);
t._kids.push(newKid);
t._addAP(ap);
t._addPP(pp);
}
}
}
}
}
@ -697,11 +742,23 @@ function perc(aNum, aDenom) {
}
function perMinstr(aN) {
return `${kDFormat.format(div(1000000 * aN, gData.ei))}/Minstr`;
return `${kDFormat.format(div(1000000 * aN, gData.te))}/${gData.Mtu}`;
}
function byteUnit() {
return gData.hasOwnProperty("bu") ? gData.bsu : "byte";
}
function bytesUnit() {
return gData.hasOwnProperty("bsu") ? gData.bsu : "bytes";
}
function blocksUnit() {
return gData.hasOwnProperty("bksu") ? gData.bksu : "blocks";
}
function bytes(aN) {
return `${kDFormat.format(aN)} bytes`;
return `${kDFormat.format(aN)} ${bytesUnit()}`;
}
function bytesAndPerc(aN, aTotalN) {
@ -713,7 +770,7 @@ function bytesAndPercAndRate(aN, aTotalN) {
}
function blocks(aN) {
return `${kDFormat.format(aN)} blocks`;
return `${kDFormat.format(aN)} ${blocksUnit()}`;
}
function blocksAndPerc(aN, aTotalN) {
@ -729,15 +786,15 @@ function avgSizeBytes(aN) {
}
function perByte(aN) {
return `${kDFormat.format(aN)}/byte`;
return `${kDFormat.format(aN)}/${byteUnit()}`;
}
function instrs(aN) {
return `${kDFormat.format(aN)} instrs`;
function time(aN) {
return `${kDFormat.format(aN)} ${gData.tu}`;
}
function avgLifetimeInstrs(aN) {
return `avg lifetime ${instrs(aN)}`;
function avgLifetime(aN) {
return `avg lifetime ${time(aN)}`;
}
function accesses(aAccesses) {
@ -817,6 +874,7 @@ function appendInvocationAndTimes(aP) {
let v, v1, v2;
v = "Invocation {\n";
v += ` Mode: ${gData.mode}\n`;
v += ` Command: ${gData.cmd}\n`;
v += ` PID: ${gData.pid}\n`;
v += "}\n\n";
@ -825,9 +883,11 @@ function appendInvocationAndTimes(aP) {
v = "Times {\n";
v1 = perc(gData.mi, gData.ei);
v += ` t-gmax: ${instrs(gData.mi)} (${v1} of program duration)\n`;
v += ` t-end: ${instrs(gData.ei)}\n`;
v1 = perc(gData.tg, gData.te);
if (gData.bklt) {
v += ` t-gmax: ${time(gData.tg)} (${v1} of program duration)\n`;
}
v += ` t-end: ${time(gData.te)}\n`;
v += "}\n\n";
@ -1017,103 +1077,109 @@ function appendTreeInner(aT, aP, aBolds, aCmp, aPc, aSig, aNodeIdNums,
let v1, v2, v3, v4, v5;
// "AP" + node ID + kid count.
// "PP" + node ID + kid count.
v1 = aNodeIdNums.join('.');
v2 = aNumSibs + 1;
v3 = kids ? `(${kids.length} children) ` : "";
fr(`AP ${v1}/${v2} ${v3}{`, true, false);
fr(`PP ${v1}/${v2} ${v3}{`, true, false);
nl(true);
// "Total".
v1 = bytesAndPercAndRate(aT._totalBytes, gRoot._totalBytes);
v2 = blocksAndPercAndRate(aT._totalBlocks, gRoot._totalBlocks);
v3 = avgSizeBytes(aT._totalAvgSizeBytes());
v4 = avgLifetimeInstrs(aT._totalAvgLifetimeInstrs());
v5 = perc(aT._totalAvgLifetimeInstrs(), gData.ei);
v4 = avgLifetime(aT._totalAvgLifetimes());
v5 = perc(aT._totalAvgLifetimes(), gData.te);
fr(" Total: ", aBolds.totalTitle);
fr(v1, aBolds.totalBytes);
fr(" in ");
fr(v2, aBolds.totalBlocks);
fr(", ", aBolds.totalAvgSizeBytes, false);
fr(v3, aBolds.totalAvgSizeBytes);
fr(", ", aBolds.totalAvgLifetimeInstrs, false);
fr(`${v4} (${v5} of program duration)`, aBolds.totalAvgLifetimeInstrs);
if (gData.bklt) {
fr(", ", aBolds.totalAvgLifetime, false);
fr(`${v4} (${v5} of program duration)`, aBolds.totalAvgLifetime);
}
nl(aBolds.totalTitle);
// "Max".
if (aT !== gRoot && aT._kind === kLeaf) {
assert(!kids, "leaf node has children");
// These percentages are relative to the local totals, not the root
// totals.
v1 = bytes(aT._maxBytes);
v2 = blocks(aT._maxBlocks);
v3 = avgSizeBytes(aT._maxAvgSizeBytes());
fr(` Max: ${v1} in ${v2}, ${v3}`);
nl();
if (gData.bklt) {
// "Max".
if (aT !== gRoot && aT._kind === kLeaf) {
assert(!kids, "leaf node has children");
// These percentages are relative to the local totals, not the root
// totals.
v1 = bytes(aT._maxBytes);
v2 = blocks(aT._maxBlocks);
v3 = avgSizeBytes(aT._maxAvgSizeBytes());
fr(` Max: ${v1} in ${v2}, ${v3}`);
nl();
}
// "At t-gmax".
v1 = bytesAndPerc(aT._atTGmaxBytes, gRoot._atTGmaxBytes);
v2 = blocksAndPerc(aT._atTGmaxBlocks, gRoot._atTGmaxBlocks);
v3 = avgSizeBytes(aT._atTGmaxAvgSizeBytes());
fr(" At t-gmax: ", aBolds.atTGmaxTitle);
fr(v1, aBolds.atTGmaxBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTGmaxTitle);
// "At t-end".
v1 = bytesAndPerc(aT._atTEndBytes, gRoot._atTEndBytes);
v2 = blocksAndPerc(aT._atTEndBlocks, gRoot._atTEndBlocks);
v3 = avgSizeBytes(aT._atTEndAvgSizeBytes());
fr(" At t-end: ", aBolds.atTEndTitle);
fr(v1, aBolds.atTEndBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTEndTitle);
}
// "At t-gmax".
v1 = bytesAndPerc(aT._atTGmaxBytes, gRoot._atTGmaxBytes);
v2 = blocksAndPerc(aT._atTGmaxBlocks, gRoot._atTGmaxBlocks);
v3 = avgSizeBytes(aT._atTGmaxAvgSizeBytes());
fr(" At t-gmax: ", aBolds.atTGmaxTitle);
fr(v1, aBolds.atTGmaxBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTGmaxTitle);
if (gData.bkacc) {
// "Reads".
v1 = bytesAndPercAndRate(aT._readsBytes, gRoot._readsBytes);
v2 = perByte(aT._readsAvgPerByte());
fr(" Reads: ", aBolds.readsTitle);
fr(v1, aBolds.readsBytes);
fr(", ", aBolds.readsBytes && aBolds.readsAvgPerByte, false);
fr(v2, aBolds.readsAvgPerByte);
nl(aBolds.readsTitle);
// "At t-end".
v1 = bytesAndPerc(aT._atTEndBytes, gRoot._atTEndBytes);
v2 = blocksAndPerc(aT._atTEndBlocks, gRoot._atTEndBlocks);
v3 = avgSizeBytes(aT._atTEndAvgSizeBytes());
fr(" At t-end: ", aBolds.atTEndTitle);
fr(v1, aBolds.atTEndBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTEndTitle);
// "Writes".
v1 = bytesAndPercAndRate(aT._writesBytes, gRoot._writesBytes);
v2 = perByte(aT._writesAvgPerByte());
fr(" Writes: ", aBolds.writesTitle);
fr(v1, aBolds.writesBytes);
fr(", ", aBolds.writesBytes && aBolds.writesAvgPerByte, false);
fr(v2, aBolds.writesAvgPerByte);
nl(aBolds.writesTitle);
// "Reads".
v1 = bytesAndPercAndRate(aT._readsBytes, gRoot._readsBytes);
v2 = perByte(aT._readsAvgPerByte());
fr(" Reads: ", aBolds.readsTitle);
fr(v1, aBolds.readsBytes);
fr(", ", aBolds.readsBytes && aBolds.readsAvgPerByte, false);
fr(v2, aBolds.readsAvgPerByte);
nl(aBolds.readsTitle);
// "Writes".
v1 = bytesAndPercAndRate(aT._writesBytes, gRoot._writesBytes);
v2 = perByte(aT._writesAvgPerByte());
fr(" Writes: ", aBolds.writesTitle);
fr(v1, aBolds.writesBytes);
fr(", ", aBolds.writesBytes && aBolds.writesAvgPerByte, false);
fr(v2, aBolds.writesAvgPerByte);
nl(aBolds.writesTitle);
// "Accesses". We show 32 per line (but not on aggregate nodes).
if (aT._accesses && aT._accesses.length > 0) {
let v = " Accesses: {";
let prevN;
for (let [i, n] of aT._accesses.entries()) {
if ((i % 32) === 0) {
fr(v);
nl();
v1 = i.toString().padStart(3, ' ');
v = ` [${v1}] `;
v += `${accesses(n)} `;
} else {
// Use a ditto mark for repeats.
v += (n === prevN && n !== 0) ? "〃 " : `${accesses(n)} `;
// "Accesses". We show 32 per line (but not on aggregate nodes).
if (aT._accesses && aT._accesses.length > 0) {
let v = " Accesses: {";
let prevN;
for (let [i, n] of aT._accesses.entries()) {
if ((i % 32) === 0) {
fr(v);
nl();
v1 = i.toString().padStart(3, ' ');
v = ` [${v1}] `;
v += `${accesses(n)} `;
} else {
// Use a ditto mark for repeats.
v += (n === prevN && n !== 0) ? "〃 " : `${accesses(n)} `;
}
prevN = n;
}
prevN = n;
}
fr(v);
nl();
fr(v);
nl();
fr(" }");
nl();
fr(" }");
nl();
}
}
// "Allocated at".
fr(" Allocated at {", true, false);
fr(` ${gData.verb} at {`, true, false);
nl(true);
if (aT._kind === kAgg) {
// Don't print ancestor frames; just print the "insignificant" frame.
@ -1219,7 +1285,7 @@ function appendTree(aP, aBolds, aCmp, aPc, aSig) {
}
function appendSignificanceThreshold(aP, aSigLabel) {
let v = `\nAP significance threshold: ${aSigLabel()}\n`;
let v = `\nPP significance threshold: ${aSigLabel()}\n`;
appendElementWithText(aP, "span", v, "threshold");
}
@ -1287,7 +1353,7 @@ function displayTree(aTRead, aTParse, aTBuild) {
// Get details relating to the chosen sort metrics.
let data = gSelectData[gSelect.selectedIndex];
let bolds = data.bolds;
let label = data.label;
let label = data.label();
let cmpField = data.cmpField;
let sig = data.sig;
let sigLabel = data.sigLabel;
@ -1397,7 +1463,7 @@ function onLoad() {
gSelect = appendElement(selectDiv, "select");
gSelect.onchange = changeSortMetric;
for (let [i, data] of gSelectData.entries()) {
let option = appendElementWithText(gSelect, "option", data.label);
let option = appendElementWithText(gSelect, "option", data.label());
option.value = i;
if (data.isDefault) {
option.selected = true;
@ -1421,13 +1487,15 @@ function onLoad() {
appendElementWithText(ul, "li", "'t-gmax': time of global heap maximum " +
"(as measured in bytes)");
appendElementWithText(ul, "li", "'t-end': time of program end");
// The file may use different units (via the `tu` and `Mtu` fields), but
// these are the standard units so mention them here.
appendElementWithText(ul, "li", "'instrs': instructions");
appendElementWithText(ul, "li", "'Minstr': mega-instruction, i.e. one " +
"million instructions");
appendElementWithText(ul, "li", "'AP': allocation point");
appendElementWithText(ul, "li", "'PP': program point");
appendElementWithText(ul, "li", "'avg': average");
appendElementWithText(ul, "li", "'-' (in accesses): zero");
appendElementWithText(ul, "li", "'∞' (in accesses): leaf AP counts max out " +
appendElementWithText(ul, "li", "'∞' (in accesses): leaf PP counts max out " +
"at 65534; larger counts are treated as " +
"infinity");
appendElementWithText(ul, "li", "'〃' (in accesses): same as previous entry");

dhat/dhat.h Normal file
View File

@ -0,0 +1,75 @@
/*
----------------------------------------------------------------
Notice that the following BSD-style license applies to this one
file (dhat.h) only. The rest of Valgrind is licensed under the
terms of the GNU General Public License, version 2, unless
otherwise indicated. See the COPYING file in the source
distribution for details.
----------------------------------------------------------------
This file is part of DHAT, a Valgrind tool for profiling the
heap usage of programs.
Copyright (C) 2020 Nicholas Nethercote. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
3. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
4. The name of the author may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------
Notice that the above BSD-style license applies to this one file
(dhat.h) only. The entire rest of Valgrind is licensed under
the terms of the GNU General Public License, version 2. See the
COPYING file in the source distribution for details.
----------------------------------------------------------------
*/
#include "valgrind.h"
typedef
enum {
VG_USERREQ__DHAT_AD_HOC_EVENT = VG_USERREQ_TOOL_BASE('D', 'H'),
// This is just for DHAT's internal use. Don't use it.
_VG_USERREQ__DHAT_COPY = VG_USERREQ_TOOL_BASE('D','H') + 256
} Vg_DHATClientRequest;
// Record an ad hoc event. The meaning of the weight argument will depend on
// what the event represents, which is up to the user. If no meaningful weight
// argument exists, just use 1.
#define DHAT_AD_HOC_EVENT(_qzz_weight) \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DHAT_AD_HOC_EVENT, \
(_qzz_weight), 0, 0, 0, 0)

View File

@ -16,15 +16,15 @@
<sect1 id="dh-manual.overview" xreflabel="Overview">
<title>Overview</title>
<para>DHAT is a tool for examining how programs use their heap
<para>DHAT is primarily a tool for examining how programs use their heap
allocations.</para>
<para>It tracks the allocated blocks, and inspects every memory access
to find which block, if any, it is to. It presents, on an allocation point
to find which block, if any, it is to. It presents, on a program point
basis, information about these blocks such as sizes, lifetimes, numbers of
reads and writes, and read and write patterns.</para>
<para>Using this information it is possible to identify allocation points with
<para>Using this information it is possible to identify program points with
the following characteristics:</para>
<itemizedlist>
@ -54,6 +54,9 @@ as instruction counts. This sounds a little odd at first, but it
makes runs repeatable in a way which is not possible if CPU time is
used.</para>
<para>DHAT also has support for copy profiling and ad hoc profiling. These are
described below.</para>
</sect1>
@ -155,11 +158,12 @@ because this can significantly reduce the size of DHAT's output files.</para>
<sect2 id="dh-output-header"><title>The Output Header</title>
<para>The first part of the output shows the program command and process ID.
For example:</para>
<para>The first part of the output shows the mode, program command and process
ID. For example:</para>
<programlisting><![CDATA[
Invocation {
Mode: heap
Command: /home/njn/moz/rust0/build/x86_64-unknown-linux-gnu/stage2/bin/rustc --crate-name tuple_stress src/main.rs
PID: 18816
}
@ -179,18 +183,19 @@ Times {
</sect2>
<sect2 id="dh-ap-tree"><title>The AP Tree</title>
<sect2 id="dh-ap-tree"><title>The PP Tree</title>
<para>The third part of the output is the largest and most interesting part,
showing the allocation point (AP) tree.</para>
showing the program point (PP) tree.</para>
<sect3 id="dh-structure"><title>Structure</title>
<para>The following image shows a screenshot of part of an AP
<para>The following image shows a screenshot of part of a PP
tree. The font is very small because this screenshot is intended to
demonstrate the high-level structure of the tree rather than the
details within the text.</para>
details within the text. (It is also slightly out-of-date, and doesn't quite
match the current output produced by DHAT's viewer.)</para>
<graphic fileref="images/dh-tree.png" scalefit="1"/>
@ -228,7 +233,7 @@ email, bug report, etc.</para>
<para>The root node looks like this:</para>
<programlisting><![CDATA[
AP 1/1 (25 children) {
PP 1/1 (25 children) {
Total: 1,355,253,987 bytes (100%, 67,454.81/Minstr) in 5,943,417 blocks (100%, 295.82/Minstr), avg size 228.03 bytes, avg lifetime 3,134,692,250.67 instrs (15.6% of program duration)
At t-gmax: 423,930,307 bytes (100%) in 1,575,682 blocks (100%), avg size 269.05 bytes
At t-end: 258,002 bytes (100%) in 2,129 blocks (100%), avg size 121.18 bytes
@ -250,11 +255,11 @@ next example will explain these in more detail.</para>
<sect3 id="dh-interior-nodes"><title>Interior Nodes</title>
<para>AP nodes further down the tree show information about a subset of
<para>PP nodes further down the tree show information about a subset of
allocations. For example:</para>
<programlisting><![CDATA[
AP 1.1/25 (2 children) {
PP 1.1/25 (2 children) {
Total: 54,533,440 bytes (4.02%, 2,714.28/Minstr) in 458,839 blocks (7.72%, 22.84/Minstr), avg size 118.85 bytes, avg lifetime 1,127,259,403.64 instrs (5.61% of program duration)
At t-gmax: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
At t-end: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
@ -288,7 +293,7 @@ stack trace that is shared by all the blocks covered by this node.</para>
<para>The <computeroutput>Total</computeroutput> line shows that this node
accounts for 4.02% of all bytes allocated during execution, and 7.72% of all
blocks. These percentages are useful for comparing the significance of
different nodes within a single profile; an AP that accounts for 10% of bytes
different nodes within a single profile; a PP that accounts for 10% of bytes
allocated is likely to be more interesting than one that accounts for
2%.</para>
@ -301,16 +306,16 @@ different workloads.</para>
average size and lifetimes of these blocks.</para>
<para>The <computeroutput>At t-gmax</computeroutput> line shows that no
blocks from this AP were alive when the global heap peak occurred. In other
blocks from this PP were alive when the global heap peak occurred. In other
words, these blocks do not contribute at all to the global heap peak.</para>
<para>The <computeroutput>At t-end</computeroutput> line shows that no blocks
from this AP were alive at shutdown. In other words, all those blocks were
from this PP were alive at shutdown. In other words, all those blocks were
explicitly freed before termination.</para>
<para>The <computeroutput>Reads</computeroutput> and
<computeroutput>Writes</computeroutput> lines show how many bytes were read
within this AP's blocks, the fraction this represents of all heap reads, and
within this PP's blocks, the fraction this represents of all heap reads, and
the read rate. Finally, it shows the read ratio, which is the number of reads
per byte. In this case the number is 0.29, which is quite low -- if no byte was
read twice, then only 29% of the allocated bytes were read, which means that at least 71%
@ -336,7 +341,7 @@ vectors and hash tables, and isn't always fixable. </para>
<para>This is a leaf node:</para>
<programlisting><![CDATA[
AP 1.1.1.1/2 {
PP 1.1.1.1/2 {
Total: 31,460,928 bytes (2.32%, 1,565.9/Minstr) in 262,171 blocks (4.41%, 13.05/Minstr), avg size 120 bytes, avg lifetime 986,406,885.05 instrs (4.91% of program duration)
Max: 16,779,136 bytes in 65,543 blocks, avg size 256 bytes
At t-gmax: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
@ -365,10 +370,10 @@ is a great-grandchild of the root; is the first grandchild of the node in the
previous example; and has no children.</para>
<para>Leaf nodes contain an additional <computeroutput>Max</computeroutput>
line, indicating the peak memory use for the blocks covered by this AP. (This
line, indicating the peak memory use for the blocks covered by this PP. (This
peak may have occurred at a time other than
<computeroutput>t-gmax</computeroutput>.) In this case, 31,460,928 bytes were
allocated from this AP, but the maximum size alive at once was 16,779,136
allocated from this PP, but the maximum size alive at once was 16,779,136
bytes.</para>
<para>Stack frames that begin with a <computeroutput>^</computeroutput> rather
@ -383,7 +388,7 @@ This also means that each node makes complete sense on its own.</para>
<sect3 id="dh-access-counts"><title>Access Counts</title>
<para>If all blocks covered by an AP node have the same size, an additional
<para>If all blocks covered by a PP node have the same size, an additional
<computeroutput>Accesses</computeroutput> field will be present. It indicates
how the reads and writes within these blocks were distributed. For
example:</para>
@ -399,7 +404,7 @@ Accesses: {
}
]]></programlisting>
<para>Every block covered by this AP was 32 bytes. Within all of those blocks,
<para>Every block covered by this PP was 32 bytes. Within all of those blocks,
byte 0 was accessed (read or written) 65,547 times, byte 1 was accessed 7
times, byte 2 was accessed 8 times, and so on.</para>
@ -425,12 +430,12 @@ layout inefficiencies.</para>
<sect3 id="aggregate-nodes"><title>Aggregate Nodes</title>
<para>The AP tree is very large and many nodes represent tiny numbers of blocks
<para>The PP tree is very large and many nodes represent tiny numbers of blocks
and bytes. Therefore, DHAT's viewer aggregates insignificant nodes like
this:</para>
<programlisting><![CDATA[
AP 1.14.2/2 {
PP 1.14.2/2 {
Total: 5,175 blocks (0.09%, 0.26/Minstr)
Allocated at {
[5 insignificant]
@ -449,15 +454,15 @@ case).</para>
<sect2 id="dh-output-footer"><title>The Output Footer</title>
<para>Below the AP tree is a line like this:</para>
<para>Below the PP tree is a line like this:</para>
<programlisting><![CDATA[
AP significance threshold: total >= 59,434.17 blocks (1%)
PP significance threshold: total >= 59,434.17 blocks (1%)
]]></programlisting>
<para>It shows the function used to determine if an AP node is significant. All
<para>It shows the function used to determine if a PP node is significant. All
nodes that don't satisfy this function are aggregated. It is occasionally
useful if you don't understand why an AP node has been aggregated. The exact
useful if you don't understand why a PP node has been aggregated. The exact
threshold depends on the sort metric (see below).</para>
<para>Finally, the bottom of the page shows a legend that explains some of the
@ -587,21 +592,21 @@ filtering, so that only nodes meeting a particular criteria are shown.</para>
<para>The values within a node that represent the chosen sort metric are shown
in bold, so they stand out.</para>
<para>Here is part of an AP node found with "Total (blocks), tiny", showing
<para>Here is part of a PP node found with "Total (blocks), tiny", showing
blocks with an average size of only 8.67 bytes:</para>
<programlisting><![CDATA[
Total: 3,407,848 bytes (0.25%, 169.62/Minstr) in 393,214 blocks (6.62%, 19.57/Minstr), avg size 8.67 bytes, avg lifetime 1,167,795,629.1 instrs (5.81% of program duration)
]]></programlisting>
<para>Here is part of an AP node found with "Total (blocks), short-lived",
<para>Here is part of a PP node found with "Total (blocks), short-lived",
showing blocks with an average lifetime of only 181.75 instructions:</para>
<programlisting><![CDATA[
Total: 23,068,584 bytes (1.7%, 1,148.19/Minstr) in 262,143 blocks (4.41%, 13.05/Minstr), avg size 88 bytes, avg lifetime 181.75 instrs (0% of program duration)
]]></programlisting>
<para>Here is an example of an AP identified with "Total (blocks), zero reads
<para>Here is an example of a PP identified with "Total (blocks), zero reads
or zero writes", showing blocks that are allocated but never touched:</para>
<programlisting><![CDATA[
@ -613,7 +618,7 @@ Reads: 0 bytes (0%, 0/Minstr), 0/byte
Writes: 0 bytes (0%, 0/Minstr), 0/byte
]]></programlisting>
<para>All the blocks identified by these APs are good candidates for
<para>All the blocks identified by these PPs are good candidates for
optimization.</para>
</sect2>
@ -648,10 +653,10 @@ increasing the current heap size by 200 bytes and then decreasing it by 100
bytes.) As a result, it can only increase the global heap peak (if indeed,
this results in a new peak) by 100 bytes.</para>
<para>Finally, the allocation point assigned to the block allocated by the
<para>Finally, the program point assigned to the block allocated by the
<computeroutput>malloc(100)</computeroutput> call is retained once the block
is reallocated. Which means that all 300 bytes are attributed to that
allocation point, and no separate allocation point is created for the
program point, and no separate program point is created for the
<computeroutput>realloc(200)</computeroutput> call. This may be surprising,
but it has one large benefit.</para>
@ -659,12 +664,84 @@ but it has one large benefit.</para>
adds data to that buffer from numerous different points in the code,
reallocating the buffer each time it gets full. (E.g. code generation in a
compiler might work this way.) With the described approach, the first heap
block and all subsequent heap blocks are attributed to the same allocation
point. While this is something of a lie -- the first allocation point isn't
actually responsible for the other allocations -- it is arguably better than
having the allocation points spread around, in a distribution
that unpredictably depends on whenever the reallocation points were
triggered.</para>
block and all subsequent heap blocks are attributed to the same program point.
While this is something of a lie -- the first program point isn't actually
responsible for the other allocations -- it is arguably better than having the
program points spread around in a distribution that unpredictably depends on
whenever the reallocations were triggered.</para>
</sect1>
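To make the attribution just described concrete, here is a minimal sketch (illustrative code, not taken from the manual or this commit):

#include <stdlib.h>

int main(void)
{
   char* p = malloc(100);   // a PP is created for this malloc() call site
   p = realloc(p, 200);     // counted as a 200-byte allocation plus a
                            // 100-byte free; all 300 allocated bytes remain
                            // attributed to the malloc() PP, and the heap
                            // peak can rise by at most 100 bytes
   free(p);
   return 0;
}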
<sect1 id="dh-manual.copy-profiling" xreflabel="Copy profiling">
<title>Copy profiling</title>
<para>If DHAT is invoked with <option>--mode=copy</option>, instead of
profiling heap operations (allocations and deallocations), it profiles copy
operations, such as <computeroutput>memcpy</computeroutput>,
<computeroutput>memmove</computeroutput>,
<computeroutput>strcpy</computeroutput>, and
<computeroutput>bcopy</computeroutput>. This is sometimes useful.</para>
<para>Here is an example PP node from this mode:</para>
<programlisting><![CDATA[
PP 1.1.2/5 (4 children) {
Total: 1,210,925 bytes (10.03%, 4,358.66/Minstr) in 112,717 blocks (35.2%, 405.72/Minstr), avg size 10.74 bytes
Copied at {
^1: 0x4842524: memmove (vg_replace_strmem.c:1289)
#2: 0x1F0A0D: copy_nonoverlapping<u8> (intrinsics.rs:1858)
#3: 0x1F0A0D: copy_from_slice<u8> (mod.rs:2524)
#4: 0x1F0A0D: spec_extend<u8> (vec.rs:2227)
#5: 0x1F0A0D: extend_from_slice<u8> (vec.rs:1619)
#6: 0x1F0A0D: push_str (string.rs:821)
#7: 0x1F0A0D: write_str (string.rs:2418)
#8: 0x1F0A0D: <&mut W as core::fmt::Write>::write_str (mod.rs:195)
}
}
]]></programlisting>
<para>It is very similar to the PP nodes for heap profiling, but with less
information, because copy profiling doesn't involve any tracking of memory
regions with lifetimes.</para>
</sect1>
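Copy profiling requires no changes to the program being profiled; a run such as valgrind --tool=dhat --mode=copy ./prog (the --mode option is added by this commit) records every call that reaches one of the intercepted functions. A minimal illustrative fragment (hypothetical code, not from the commit):

#include <string.h>

void save_row(char* dst, const char* src, size_t n)
{
   memcpy(dst, src, n);   // recorded under --mode=copy as an n-byte copy,
                          // attributed to this call stack
}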
<sect1 id="dh-manual.ad-hoc-profiling" xreflabel="Ad hoc profiling">
<title>Ad hoc profiling</title>
<para>If DHAT is invoked with <option>--mode=ad-hoc</option>, instead of
profiling heap operations (allocations and deallocations), it profiles calls to
the <computeroutput>DHAT_AD_HOC_EVENT</computeroutput> client request, which is
declared in <filename>dhat/dhat.h</filename>.</para>
<para>Here is an example PP node from this mode:</para>
<programlisting><![CDATA[
PP 1.1.1.1/2 {
Total: 30 units (17.65%, 115.97/Minstr) in 1 events (14.29%, 3.87/Minstr), avg size 30 units
Occurred at {
^1: 0x109407: g (ad-hoc.c:4)
^2: 0x109425: f (ad-hoc.c:8)
#3: 0x109497: main (ad-hoc.c:14)
}
}
]]></programlisting>
<para>This kind of profiling is useful when you know a code path is hot but you
want to know more about it.</para>
<para>For example, you might want to know which callsites of a hot function
account for most of the calls. You could put a
<computeroutput>DHAT_AD_HOC_EVENT(1);</computeroutput> call at the start of
that function.</para>
<para>Alternatively, you might want to know the typical length of a vector in a
hot location. You could put a
<computeroutput>DHAT_AD_HOC_EVENT(len);</computeroutput> call at the
appropriate location, where <computeroutput>len</computeroutput> is the length
of the vector.</para>
</sect1>
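The two suggestions above might look like this in client code (an illustrative sketch; the function names and bodies are hypothetical):

#include <stddef.h>
#include "dhat/dhat.h"

void hot_function(void)
{
   DHAT_AD_HOC_EVENT(1);    // weight 1: the PP tree then shows which call
                            // sites account for most of the calls
   /* ... the hot work ... */
}

void append_to_vector(int* vec, const int* items, size_t len)
{
   DHAT_AD_HOC_EVENT(len);  // weight = number of items: the PP tree then
                            // shows the typical lengths at each call site
   for (size_t i = 0; i < len; i++)
      vec[i] = items[i];
}

The program is then run with --mode=ad-hoc, as in the ad-hoc.c test added by this commit.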
@ -694,6 +771,17 @@ triggered.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.mode" xreflabel="--mode">
<term>
<option><![CDATA[--mode=<heap|copy|ad-hoc> [default: heap] ]]></option>
</term>
<listitem>
<para>The profiling mode: heap profiling, copy profiling, or ad hoc
profiling.
</para>
</listitem>
</varlistentry>
</variablelist>
<para>Note that stacks by default have 12 frames. This may be more than

View File

@ -5,16 +5,20 @@ dist_noinst_SCRIPTS = filter_stderr
EXTRA_DIST = \
acc.stderr.exp acc.vgtest \
ad-hoc.stderr.exp ad-hoc.vgtest \
basic.stderr.exp basic.vgtest \
big.stderr.exp big.vgtest \
copy.stderr.exp copy.vgtest \
empty.stderr.exp empty.vgtest \
sig.stderr.exp sig.vgtest \
single.stderr.exp single.vgtest
check_PROGRAMS = \
acc \
ad-hoc \
basic \
big \
copy \
empty \
sig \
single

dhat/tests/ad-hoc.c Normal file
View File

@ -0,0 +1,27 @@
#include "dhat/dhat.h"
#include <stdlib.h>
void g(void) {
DHAT_AD_HOC_EVENT(30);
}
void f(void) {
g();
DHAT_AD_HOC_EVENT(20);
g();
}
int main(void) {
f();
DHAT_AD_HOC_EVENT(10);
f();
// At one point malloc was broken with --mode=ad-hoc(!), and Valgrind was
// printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc
// was called. So check that it's basically working...
char* p = malloc(100);
p = realloc(p, 200);
free(p);
return 0;
}

View File

@ -0,0 +1 @@
Total: 170 units in 7 events

dhat/tests/ad-hoc.vgtest Normal file
View File

@ -0,0 +1,3 @@
prog: ad-hoc
vgopts: --mode=ad-hoc --dhat-out-file=dhat.out
cleanup: rm dhat.out

View File

@ -3,6 +3,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "dhat/dhat.h"
int main(void)
{
@ -24,5 +25,9 @@ int main(void)
free(c);
// totals: 3008 read, 3516 write
// Should be ignored because we're not in ad hoc mode.
DHAT_AD_HOC_EVENT(100);
return 0;
}

dhat/tests/copy.c Normal file
View File

@ -0,0 +1,60 @@
// This tests --mode=copy with various copying functions.
#define _GNU_SOURCE // For mempcpy.
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
void f(char* a, char* b, wchar_t* wa, wchar_t* wb);
void test_malloc();
int main(void) {
char a[1000];
char b[1000];
for (int i = 0; i < 1000; i++) {
a[i] = 'a';
b[i] = 'b';
}
a[999] = '\0';
b[999] = '\0';
wchar_t wa[250];
wchar_t wb[250];
for (int i = 0; i < 250; i++) {
wa[i] = 'A';
wb[i] = 'B';
}
wa[249] = '\0';
wb[249] = '\0';
for (int i = 0; i < 100; i++) {
f(a, b, wa, wb);
}
test_malloc();
return 0;
}
void f(char* a, char* b, wchar_t* wa, wchar_t* wb) {
// The memcpy is duplicated so we have 10 calls, which makes for nice round
// numbers in the totals.
memcpy (a, b, 1000); // Redirects to memmove
memcpy (a, b, 1000); // Redirects to memmove
memmove(a, b, 1000);
mempcpy(a, b, 1000);
bcopy (a, b, 1000); // Redirects to memmove
strcpy (a, b);
strncpy(a, b, 1000);
stpcpy (a, b); // Redirects to strcpy
stpncpy(a, b, 1000);
wcscpy (wa, wb);
}
void test_malloc() {
// At one point malloc was broken with --mode=copy(!), and Valgrind was
// printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc
// was called. So check that it's basically working...
char* p = malloc(100);
p = realloc(p, 200);
free(p);
}

View File

@ -0,0 +1 @@
Total: 1,000,... bytes in 1,0.. blocks

dhat/tests/copy.vgtest Normal file
View File

@ -0,0 +1,4 @@
prog: copy
vgopts: --mode=copy --dhat-out-file=dhat.out
stderr_filter: filter_copy
cleanup: rm dhat.out

dhat/tests/filter_copy Executable file
View File

@ -0,0 +1,9 @@
#! /bin/sh
# It's impossible to get exact matches for copy counts because even trivial C
# programs do a few memcpy/strcpy calls. So we allow some fuzzy matching.
# So we allow 1,000,000..1,009,999 bytes and 1,000..1,099 blocks.
./filter_stderr "$@" |
sed -e "s/1,00.,... bytes in 1,0.. blocks/1,000,... bytes in 1,0.. blocks/"

View File

@ -21,8 +21,7 @@ sed "/^ file:\/\/\// d" |
sed "/^in a web browser/ d" |
sed "/^ \// d" | # This is pretty feeble, but I don't see
# how to do better
sed "/^Scroll to the end/ d" |
sed "/^explanation of some/ d" |
sed "/^The text at the bottom/ d" |
# and remove any blank lines in the output
sed "/^[[:space:]]*$/d"

View File

@ -39,6 +39,7 @@
/* Can be called from VG_(tdict).malloc_malloc et al to do the actual
* alloc/freeing. */
extern void* VG_(cli_malloc) ( SizeT align, SizeT nbytes );
extern void* VG_(cli_realloc)( void* ptr, SizeT nbytes );
extern void VG_(cli_free) ( void* p );
// Returns the usable size of a heap-block. It's the asked-for size plus
// possibly some more due to rounding up.

View File

@ -41,7 +41,7 @@ IRSB* nl_instrument ( VgCallbackClosure* closure,
const VexArchInfo* archinfo_host,
IRType gWordTy, IRType hWordTy )
{
return bb;
return bb;
}
static void nl_fini(Int exitcode)

View File

@ -35,12 +35,13 @@
#include "pub_tool_clreq.h"
/* ---------------------------------------------------------------------
We have our own versions of these functions for two reasons:
We have our own versions of these functions for multiple reasons:
(a) it allows us to do overlap checking
(b) some of the normal versions are hyper-optimised, which fools
(b) it allows us to do copy tracking
(c) some of the normal versions are hyper-optimised, which fools
Memcheck and causes spurious value warnings. Our versions are
simpler.
(c) the glibc SSE-variants can read past the end of the input data
(d) the glibc SSE-variants can read past the end of the input data
ranges. This can cause false-positive Memcheck / Helgrind / DRD
reports.
@ -173,6 +174,15 @@ static inline void my_exit ( int x )
#ifndef RECORD_OVERLAP_ERROR
#define RECORD_OVERLAP_ERROR(s, src, dst, len) do { } while (0)
#endif
// Used for tools that record bulk copies: memcpy, strcpy, etc.
#ifndef RECORD_COPY
#define RECORD_COPY(len) do { } while (0)
#define FOR_COPY(x)
#else
#define FOR_COPY(x) x
#endif
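// As a rough sketch (hypothetical tool code; dh_handle_copy stands in for
// whatever bookkeeping the tool actually does), a copy-tracking tool defines
// RECORD_COPY before this point, e.g.
//
//   #define RECORD_COPY(len)  dh_handle_copy(len)
//
// so that each replacement function below reports the number of bytes it
// copied, while tools that leave it undefined get the empty default above.
// FOR_COPY wraps declarations (such as the src_orig locals further down) that
// are only needed when copies are being recorded.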
#ifndef VALGRIND_CHECK_VALUE_IS_DEFINED
#define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue) 1
#endif
@ -496,12 +506,14 @@ static inline void my_exit ( int x )
while (*src) *dst++ = *src++; \
*dst = 0; \
\
/* This checks for overlap after copying, unavoidable without */ \
/* This happens after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
SizeT srclen = (Addr)src-(Addr)src_orig+1; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
(Addr)dst-(Addr)dst_orig+1, \
(Addr)src-(Addr)src_orig+1)) \
srclen)) \
RECORD_OVERLAP_ERROR("strcpy", dst_orig, src_orig, 0); \
\
return dst_orig; \
@ -539,7 +551,9 @@ static inline void my_exit ( int x )
while (m < n && *src) { m++; *dst++ = *src++; } \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_orig, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("strncpy", dst, src, n); \
while (m++ < n) *dst++ = 0; /* must pad remainder with nulls */ \
\
@ -585,7 +599,9 @@ static inline void my_exit ( int x )
/* m non-nul bytes have now been copied, and m <= n-1. */ \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_orig, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("strlcpy", dst, src, n); \
/* Nul-terminate dst. */ \
if (n > 0) *dst = 0; \
@ -943,6 +959,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZZ(becTag,soname,fnname) \
( void *dst, const void *src, SizeT len ) \
{ \
RECORD_COPY(len); \
if (do_ol_check && is_overlap(dst, src, len, len)) \
RECORD_OVERLAP_ERROR("memcpy", dst, src, len); \
\
@ -1034,6 +1051,7 @@ static inline void my_exit ( int x )
MEMCPY(VG_Z_LIBC_SONAME, memcpy) /* fallback case */
MEMCPY(VG_Z_LIBC_SONAME, __GI_memcpy)
MEMCPY(VG_Z_LIBC_SONAME, __memcpy_sse2)
MEMCPY(VG_Z_LIBC_SONAME, __memcpy_avx_unaligned_erms)
MEMCPY(VG_Z_LD_SO_1, memcpy) /* ld.so.1 */
MEMCPY(VG_Z_LD64_SO_1, memcpy) /* ld64.so.1 */
/* icc9 blats these around all over the place. Not only in the main
@ -1142,10 +1160,12 @@ static inline void my_exit ( int x )
\
/* This checks for overlap after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
SizeT srclen = (Addr)src-(Addr)src_orig+1; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
(Addr)dst-(Addr)dst_orig+1, \
(Addr)src-(Addr)src_orig+1)) \
srclen)) \
RECORD_OVERLAP_ERROR("stpcpy", dst_orig, src_orig, 0); \
\
return dst; \
@ -1185,7 +1205,9 @@ static inline void my_exit ( int x )
while (m < n && *src) { m++; *dst++ = *src++; } \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_str, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_str, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("stpncpy", dst, src, n); \
dst_str = dst; \
while (m++ < n) *dst++ = 0; /* must pad remainder with nulls */ \
@ -1200,9 +1222,6 @@ static inline void my_exit ( int x )
/*---------------------- memset ----------------------*/
/* Why are we bothering to intercept this? It seems entirely
pointless. */
#define MEMSET(soname, fnname) \
void* VG_REPLACE_FUNCTION_EZZ(20210,soname,fnname) \
(void *s, Int c, SizeT n); \
@ -1301,6 +1320,7 @@ static inline void my_exit ( int x )
void VG_REPLACE_FUNCTION_EZU(20230,soname,fnname) \
(const void *srcV, void *dstV, SizeT n) \
{ \
RECORD_COPY(n); \
SizeT i; \
HChar* dst = dstV; \
const HChar* src = srcV; \
@ -1338,6 +1358,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZU(20240,soname,fnname) \
(void *dstV, const void *srcV, SizeT n, SizeT destlen) \
{ \
RECORD_COPY(n); \
SizeT i; \
HChar* dst = dstV; \
const HChar* src = srcV; \
@ -1438,12 +1459,14 @@ static inline void my_exit ( int x )
char* VG_REPLACE_FUNCTION_EZU(20270,soname,fnname) \
(char* dst, const char* src, SizeT len) \
{ \
FOR_COPY(const HChar* src_orig = src); \
HChar* ret = dst; \
if (! len) \
goto badness; \
while ((*dst++ = *src++) != '\0') \
if (--len == 0) \
goto badness; \
RECORD_COPY((Addr)src-(Addr)src_orig); \
return ret; \
badness: \
VALGRIND_PRINTF_BACKTRACE( \
@ -1474,11 +1497,13 @@ static inline void my_exit ( int x )
char* VG_REPLACE_FUNCTION_EZU(20280,soname,fnname) \
(char* dst, const char* src, SizeT len) \
{ \
FOR_COPY(const HChar* src_orig = src); \
if (! len) \
goto badness; \
while ((*dst++ = *src++) != '\0') \
if (--len == 0) \
goto badness; \
RECORD_COPY((Addr)src-(Addr)src_orig); \
return dst - 1; \
badness: \
VALGRIND_PRINTF_BACKTRACE( \
@ -1508,6 +1533,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZU(20290,soname,fnname) \
( void *dst, const void *src, SizeT len ) \
{ \
RECORD_COPY(len); \
SizeT len_saved = len; \
\
if (len == 0) \
@ -1557,15 +1583,13 @@ static inline void my_exit ( int x )
{ \
register HChar *d; \
register const HChar *s; \
\
if (dstlen < len) goto badness; \
\
if (dstlen < len) \
goto badness; \
RECORD_COPY(len); \
if (len == 0) \
return dst; \
\
if (is_overlap(dst, src, len, len)) \
RECORD_OVERLAP_ERROR("memcpy_chk", dst, src, len); \
\
if ( dst > src ) { \
d = (HChar *)dst + len - 1; \
s = (const HChar *)src + len - 1; \
@ -1977,11 +2001,14 @@ static inline void my_exit ( int x )
\
/* This checks for overlap after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
/* +4 because sizeof(wchar_t) == 4 */ \
SizeT srclen = (Addr)src-(Addr)src_orig+4; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
/* +4 because sizeof(wchar_t) == 4 */ \
(Addr)dst-(Addr)dst_orig+4, \
(Addr)src-(Addr)src_orig+4)) \
srclen)) \
RECORD_OVERLAP_ERROR("wcscpy", dst_orig, src_orig, 0); \
\
return dst_orig; \