Add support for copy and ad hoc profiling to DHAT.

Nicholas Nethercote 2019-09-09 14:13:35 +10:00
parent d2d54dbcc7
commit 8c08253b89
23 changed files with 2076 additions and 844 deletions

.gitignore vendored
View File

@ -273,8 +273,10 @@
/dhat/tests/*.stdout.out
/dhat/tests/.deps
/dhat/tests/acc
/dhat/tests/ad-hoc
/dhat/tests/basic
/dhat/tests/big
/dhat/tests/copy
/dhat/tests/empty
/dhat/tests/sig
/dhat/tests/single

NEWS
View File

@ -16,6 +16,18 @@ support for X86/macOS 10.13, AMD64/macOS 10.13 and nanoMIPS/Linux.
* DHAT:
- DHAT has been extended, with two new modes of operation. The new
--mode=copy flag triggers copy profiling, which records calls to memcpy,
strcpy, and similar functions. The new --mode=ad-hoc flag triggers ad hoc
profiling, which records calls to the DHAT_AD_HOC_EVENT client request in
the new dhat/dhat.h file. This is useful for learning more about hot code
paths. See the user manual for more information about the new modes.
- Because of these changes, DHAT's file format has changed. DHAT output
files produced with earlier versions of DHAT will not work with this
version of DHAT's viewer, and DHAT output files produced with this version
of DHAT will not work with earlier versions of DHAT's viewer.
* Cachegrind:
* Callgrind:

View File

@ -92,29 +92,35 @@ SizeT VG_(malloc_effective_client_redzone_size)(void)
/*--- Useful functions ---*/
/*------------------------------------------------------------*/
void* VG_(cli_malloc) ( SizeT align, SizeT nbytes )
{
void* VG_(cli_malloc) ( SizeT align, SizeT nbytes )
{
// 'align' should be valid (ie. big enough and a power of two) by now.
// VG_(arena_memalign)() will abort if it's not.
if (VG_MIN_MALLOC_SZB == align)
return VG_(arena_malloc) ( VG_AR_CLIENT, "replacemalloc.cm.1",
nbytes );
else
return VG_(arena_memalign) ( VG_AR_CLIENT, "replacemalloc.cm.2",
return VG_(arena_malloc) ( VG_AR_CLIENT, "replacemalloc.cm.1",
nbytes );
else
return VG_(arena_memalign) ( VG_AR_CLIENT, "replacemalloc.cm.2",
align, nbytes );
}
void VG_(cli_free) ( void* p )
{
VG_(arena_free) ( VG_AR_CLIENT, p );
}
// Useful for querying user blocks.
SizeT VG_(cli_malloc_usable_size) ( void* p )
{
void* VG_(cli_realloc) ( void* ptr, SizeT nbytes )
{
return VG_(arena_realloc) ( VG_AR_CLIENT, "replacemalloc.cr.1",
ptr, nbytes );
}
void VG_(cli_free) ( void* p )
{
VG_(arena_free) ( VG_AR_CLIENT, p );
}
// Useful for querying user blocks.
SizeT VG_(cli_malloc_usable_size) ( void* p )
{
return VG_(arena_malloc_usable_size)(VG_AR_CLIENT, p);
}
}
Bool VG_(addr_is_in_block)( Addr a, Addr start, SizeT size, SizeT rz_szB )
{
return ( start - rz_szB <= a && a < start + size + rz_szB );

View File

@ -1,13 +1,14 @@
include $(top_srcdir)/Makefile.tool.am
#SUBDIRS += perf
EXTRA_DIST = docs/dh-manual.xml dh_view.html dh_view.css dh_view.js
#----------------------------------------------------------------------------
# Headers, etc
#----------------------------------------------------------------------------
pkginclude_HEADERS = \
dhat.h
# Ensure the viewer components get copied into the install tree.
dhatdir = $(pkglibexecdir)
dhat_DATA = dh_view.html dh_view.css dh_view.js
@ -21,10 +22,10 @@ if VGCONF_HAVE_PLATFORM_SEC
noinst_PROGRAMS += dhat-@VGCONF_ARCH_SEC@-@VGCONF_OS@
endif
EXP_DHAT_SOURCES_COMMON = dh_main.c
DHAT_SOURCES_COMMON = dh_main.c
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \
$(EXP_DHAT_SOURCES_COMMON)
$(DHAT_SOURCES_COMMON)
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \
@ -45,7 +46,7 @@ dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LINK = \
if VGCONF_HAVE_PLATFORM_SEC
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = \
$(EXP_DHAT_SOURCES_COMMON)
$(DHAT_SOURCES_COMMON)
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \
@ -78,11 +79,16 @@ if VGCONF_OS_IS_DARWIN
noinst_DSYMS = $(noinst_PROGRAMS)
endif
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_SOURCES =
# dh_replace_strmem.c runs on the simulated CPU, and is built with
# AM_CFLAGS_PSO_* (see $(top_srcdir)/Makefile.all.am).
VGPRELOAD_DHAT_SOURCES_COMMON = dh_replace_strmem.c
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_SOURCES = \
$(VGPRELOAD_DHAT_SOURCES_COMMON)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_CFLAGS = \
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_PRI_CAPS@)
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_PRI_CAPS@) -O2
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_DEPENDENCIES = \
$(LIBREPLACEMALLOC_@VGCONF_PLATFORM_PRI_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_LDFLAGS = \
@ -90,11 +96,12 @@ vgpreload_dhat_@VGCONF_ARCH_PRI@_@VGCONF_OS@_so_LDFLAGS = \
$(LIBREPLACEMALLOC_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
if VGCONF_HAVE_PLATFORM_SEC
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_SOURCES =
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_SOURCES = \
$(VGPRELOAD_DHAT_SOURCES_COMMON)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_CPPFLAGS = \
$(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_CFLAGS = \
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_SEC_CAPS@)
$(AM_CFLAGS_PSO_@VGCONF_PLATFORM_SEC_CAPS@) -O2
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_DEPENDENCIES = \
$(LIBREPLACEMALLOC_@VGCONF_PLATFORM_SEC_CAPS@)
vgpreload_dhat_@VGCONF_ARCH_SEC@_@VGCONF_OS@_so_LDFLAGS = \

File diff suppressed because it is too large.

dhat/dh_replace_strmem.c Normal file
View File

@ -0,0 +1,41 @@
/*--------------------------------------------------------------------*/
/*--- Replacements for memcpy() et al, which run on the ---*/
/*--- simulated CPU. ---*/
/*--- dh_replace_strmem.c ---*/
/*--------------------------------------------------------------------*/
/*
This file is part of DHAT, a Valgrind tool for profiling the
heap usage of programs.
Copyright (C) 2020-2020 Nicholas Nethercote
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307, USA.
The GNU General Public License is contained in the file COPYING.
*/
#include "dhat.h"
#define RECORD_COPY(_qzz_len) \
VALGRIND_DO_CLIENT_REQUEST_STMT(_VG_USERREQ__DHAT_COPY, \
(_qzz_len), 0, 0, 0, 0)
#include "../shared/vg_replace_strmem.c"
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/

File diff suppressed because it is too large.

View File

@ -51,7 +51,7 @@ let gHeaderDiv, gTestingDiv, gMainDiv, gLegendDiv, gTimingsDiv;
let gFilename;
// The object extracted from the JSON input.
let gData;
let gData = {};
// The root of the radix tree built from gData. A radix tree is a
// space-optimized prefix tree in which each node that is the only child is
@ -64,62 +64,68 @@ let gRoot;
// - label: Used in the drop-down menu.
// - bolds: Which fields to highlight in the output.
// - cmpField: Field used to sort the radix tree.
// - enable: Function saying whether this option is enabled.
// - sig: Significance function used to determine aggregate nodes.
// - sigLabel: Significance threshold description function.
//
const gSelectData = [
{
label: "Total (bytes)",
label: () => `Total (${bytesUnit()})`,
bolds: { "totalTitle": 1, "totalBytes": 1 },
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBytes >= 0.01 * gRoot._totalBytes,
sigLabel: () => `\
total >= ${bytesAndPerc(0.01 * gRoot._totalBytes, gRoot._totalBytes)}`
},
{
isDefault: true,
label: "Total (blocks)",
label: () => `Total (${blocksUnit()})`,
bolds: { "totalTitle": 1, "totalBlocks": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBlocks >= 0.01 * gRoot._totalBlocks,
sigLabel: () => `\
total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
},
// No "Total (bytes), tiny" because it's extremely unlikely that an AP with a
// No "Total (bytes), tiny" because it's extremely unlikely that a PP with a
// tiny average size will take up a significant number of bytes.
{
label: "Total (blocks), tiny",
label: () => `Total (${blocksUnit()}), tiny`,
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgSizeBytes": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => true,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._totalAvgSizeBytes() <= 16,
sigLabel: () => `\
(total >= ${blocksAndPerc(0.005 * gRoot._totalBlocks, gRoot._totalBlocks)}) && \
(total avg size <= ${bytes(16)})`
(avg size <= ${bytes(16)})`
},
// No "Total (bytes), short-lived", because an AP with few large, short-lived
// No "Total (bytes), short-lived", because a PP with few large, short-lived
// blocks is unlikely. (In contrast, "Total (blocks), short-lived" is useful,
// because an AP with many small, short-lived blocks *is* likely.) And if
// such an AP existed, it'll probably show up in "Total (bytes), zero reads
// because a PP with many small, short-lived blocks *is* likely.) And if
// such a PP existed, it'll probably show up in "Total (bytes), zero reads
// or zero writes" or "Total (bytes), low-access" anyway, because there's
// little time for accesses in 500 instructions.
// little time for accesses in a small number of instructions.
{
label: "Total (blocks), short-lived",
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgLifetimeInstrs": 1 },
label: () => "Total (blocks), short-lived",
bolds: { "totalTitle": 1, "totalBlocks": 1, "totalAvgLifetime": 1 },
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._totalAvgLifetimeInstrs() <= 500,
aT._totalAvgLifetimes() <= gData.tuth,
sigLabel: () => `\
(total >= ${blocksAndPerc(0.005 * gRoot._totalBlocks, gRoot._totalBlocks)}) && \
(total avg lifetime <= ${instrs(500)})`
(avg lifetime <= ${time(gData.tuth)})`
},
{
label: "Total (bytes), zero reads or zero writes",
label: () => "Total (bytes), zero reads or zero writes",
bolds: { "totalTitle": 1, "totalBytes": 1,
"readsTitle": 1, "readsBytes": 1,
"writesTitle": 1, "writesBytes": 1,
},
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBytes >= 0.005 * gRoot._totalBytes &&
(aT._readsBytes === 0 || aT._writesBytes === 0),
sigLabel: () => `\
@ -127,12 +133,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads == ${bytes(0)}) || (writes == ${bytes(0)}))`
},
{
label: "Total (blocks), zero reads or zero writes",
label: () => "Total (blocks), zero reads or zero writes",
bolds: { "totalTitle": 1, "totalBlocks": 1,
"readsTitle": 1, "readsBytes": 1,
"writesTitle": 1, "writesBytes": 1,
},
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
(aT._readsBytes === 0 || aT._writesBytes === 0),
sigLabel: () => `\
@ -140,12 +147,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads == ${bytes(0)}) || (writes == ${bytes(0)}))`
},
{
label: "Total (bytes), low-access",
label: () => "Total (bytes), low-access",
bolds: { "totalTitle": 1, "totalBytes": 1,
"readsTitle": 1, "readsAvgPerByte": 1,
"writesTitle": 1, "writesAvgPerByte": 1,
},
cmpField: "_totalBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBytes >= 0.005 * gRoot._totalBytes &&
aT._readsBytes !== 0 &&
aT._writesBytes !== 0 &&
@ -158,12 +166,13 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads <= ${perByte(0.4)}) || (writes <= ${perByte(0.4)}))`
},
{
label: "Total (blocks), low-access",
label: () => "Total (blocks), low-access",
bolds: { "totalTitle": 1, "totalBlocks": 1,
"readsTitle": 1, "readsAvgPerByte": 1,
"writesTitle": 1, "writesAvgPerByte": 1,
},
cmpField: "_totalBlocks",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._totalBlocks >= 0.005 * gRoot._totalBlocks &&
aT._readsBytes !== 0 &&
aT._writesBytes !== 0 &&
@ -176,14 +185,15 @@ total >= ${blocksAndPerc(0.01 * gRoot._totalBlocks, gRoot._totalBlocks)}`
((reads <= ${perByte(0.4)}) || (writes <= ${perByte(0.4)}))`
},
// No "Total (avg size bytes)": not interesting.
// No "Total (avg lifetime instrs)": covered by "Total (blocks), short-lived".
// No "Total (avg lifetime)": covered by "Total (blocks), short-lived".
// No "Max (bytes)": not interesting, and unclear how to sort.
// No "Max (blocks)": not interesting, and unclear how to sort.
// No "Max (avg size bytes)": not interesting, and unclear how to sort.
{
label: "At t-gmax (bytes)",
label: () => "At t-gmax (bytes)",
bolds: { "atTGmaxTitle": 1, "atTGmaxBytes": 1 },
cmpField: "_atTGmaxBytes",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._atTGmaxBytes >= 0.01 * gRoot._atTGmaxBytes,
sigLabel: () => `\
at-t-gmax >= ${bytesAndPerc(0.01 * gRoot._atTGmaxBytes, gRoot._atTGmaxBytes)}`
@ -191,9 +201,10 @@ at-t-gmax >= ${bytesAndPerc(0.01 * gRoot._atTGmaxBytes, gRoot._atTGmaxBytes)}`
// No "At t-gmax (blocks)": not interesting.
// No "At t-gmax (avg size bytes)": not interesting.
{
label: "At t-end (bytes)",
label: () => "At t-end (bytes)",
bolds: { "atTEndTitle": 1, "atTEndBytes": 1 },
cmpField: "_atTEndBytes",
enable: (aBkLt, aBkAcc) => aBkLt,
sig: (aT) => aT._atTEndBytes >= 0.01 * gRoot._atTEndBytes,
sigLabel: () => `\
at-t-end >= ${bytesAndPerc(0.01 * gRoot._atTEndBytes, gRoot._atTEndBytes)}`
@ -201,17 +212,19 @@ at-t-end >= ${bytesAndPerc(0.01 * gRoot._atTEndBytes, gRoot._atTEndBytes)}`
// No "At t-end (blocks)": not interesting.
// No "At t-end (avg size bytes)": not interesting.
{
label: "Reads (bytes)",
label: () => "Reads (bytes)",
bolds: { "readsTitle": 1, "readsBytes": 1 },
cmpField: "_readsBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._readsBytes >= 0.01 * gRoot._readsBytes,
sigLabel: () => `\
reads >= ${bytesAndPerc(0.01 * gRoot._readsBytes, gRoot._readsBytes)}`
},
{
label: "Reads (bytes), high-access",
label: () => "Reads (bytes), high-access",
bolds: { "readsTitle": 1, "readsBytes": 1, "readsAvgPerByte": 1 },
cmpField: "_readsBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._readsBytes >= 0.005 * gRoot._readsBytes &&
(aT._readsAvgPerByte() >= 1000 ||
aT._writesAvgPerByte() >= 1000),
@ -221,17 +234,19 @@ reads >= ${bytesAndPerc(0.01 * gRoot._readsBytes, gRoot._readsBytes)}`
},
// No "Reads (avg per byte)": covered by other access-related ones.
{
label: "Writes (bytes)",
label: () => "Writes (bytes)",
bolds: { "writesTitle": 1, "writesBytes": 1 },
cmpField: "_writesBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._writesBytes >= 0.01 * gRoot._writesBytes,
sigLabel: () => `\
writes >= ${bytesAndPerc(0.01 * gRoot._writesBytes, gRoot._writesBytes)}`
},
{
label: "Writes (bytes), high-access",
label: () => "Writes (bytes), high-access",
bolds: { "writesTitle": 1, "writesBytes": 1, "writesAvgPerByte": 1 },
cmpField: "_writesBytes",
enable: (aBkLt, aBkAcc) => aBkAcc,
sig: (aT) => aT._writesBytes >= 0.005 * gRoot._writesBytes &&
(aT._readsAvgPerByte() >= 1000 ||
aT._writesAvgPerByte() >= 1000),
@ -304,10 +319,10 @@ function TreeNode(aKind, aFrames) {
this._totalBytes = 0;
this._totalBlocks = 0;
this._totalLifetimesInstrs = 0;
this._totalLifetimes = 0;
// These numbers only make sense for leaf nodes. Unlike total stats, which
// can be summed, _maxBytes/_maxBlocks for two APs can't be easily combined
// can be summed, _maxBytes/_maxBlocks for two PPs can't be easily combined
// because the maxes may have occurred at different times.
if (this._kind === kLeaf) {
this._maxBytes = 0;
@ -341,15 +356,20 @@ function TreeNode(aKind, aFrames) {
}
TreeNode.prototype = {
_add(aTotalBytes, aTotalBlocks, aTotalLifetimesInstrs, aMaxBytes,
_add(aTotalBytes, aTotalBlocks, aTotalLifetimes, aMaxBytes,
aMaxBlocks, aAtTGmaxBytes, aAtTGmaxBlocks, aAtTEndBytes,
aAtTEndBlocks, aReadsBytes, aWritesBytes, aAccesses) {
// We ignore this._kind, this._frames, and this._kids.
// Note: if !gData.bklt and/or !gData.bkacc, some of the fields these
// values come from will be missing in the input file, so the values will
// be `undefined`, and the fields will end up as `NaN`. But this is ok
// because we don't show them.
this._totalBytes += aTotalBytes;
this._totalBlocks += aTotalBlocks;
this._totalLifetimesInstrs += aTotalLifetimesInstrs;
this._totalLifetimes += aTotalLifetimes;
if (this._kind === kLeaf) {
// Leaf nodes should only be added to once, because DHAT currently
@ -391,9 +411,9 @@ TreeNode.prototype = {
}
},
_addAP(aAP) {
this._add(aAP.tb, aAP.tbk, aAP.tli, aAP.mb, aAP.mbk, aAP.gb, aAP.gbk,
aAP.fb, aAP.fbk, aAP.rb, aAP.wb, aAP.acc);
_addPP(aPP) {
this._add(aPP.tb, aPP.tbk, aPP.tl, aPP.mb, aPP.mbk, aPP.gb, aPP.gbk,
aPP.eb, aPP.ebk, aPP.rb, aPP.wb, aPP.acc);
},
// This is called in two cases.
@ -401,7 +421,7 @@ TreeNode.prototype = {
// cloning a node).
// - Aggregating multiple nodes.
_addNode(aT) {
this._add(aT._totalBytes, aT._totalBlocks, aT._totalLifetimesInstrs,
this._add(aT._totalBytes, aT._totalBlocks, aT._totalLifetimes,
aT._maxBytes, aT._maxBlocks, aT._atTGmaxBytes, aT._atTGmaxBlocks,
aT._atTEndBytes, aT._atTEndBlocks,
aT._readsBytes, aT._writesBytes, aT._accesses);
@ -409,7 +429,7 @@ TreeNode.prototype = {
// Split the node after the aTi'th internal frame. The inheriting kid will
// get the post-aTi frames; the new kid will get aNewFrames.
_split(aTi, aAP, aNewFrames) {
_split(aTi, aPP, aNewFrames) {
// kid1 inherits t's kind and values.
let inheritedFrames = this._frames.splice(aTi + 1);
let kid1 = new TreeNode(this._kind, inheritedFrames);
@ -420,7 +440,7 @@ TreeNode.prototype = {
// Put all remaining frames into kid2.
let kid2 = new TreeNode(kLeaf, aNewFrames);
kid2._addAP(aAP);
kid2._addPP(aPP);
// Update this.
if (this._kind === kLeaf) {
@ -432,15 +452,15 @@ TreeNode.prototype = {
delete this._maxBlocks;
}
this._kids = [kid1, kid2];
this._addAP(aAP);
this._addPP(aPP);
},
_totalAvgSizeBytes() {
return div(this._totalBytes, this._totalBlocks);
},
_totalAvgLifetimeInstrs() {
return div(this._totalLifetimesInstrs, this._totalBlocks);
_totalAvgLifetimes() {
return div(this._totalLifetimes, this._totalBlocks);
},
_maxAvgSizeBytes() {
@ -474,15 +494,15 @@ function checkFields(aObj, aFields) {
}
}
// Do basic checking of an AP read from file.
function checkAP(aAP) {
let fields = ["tb", "tbk", "tli",
"mb", "mbk",
"gb", "gbk",
"fb", "fbk",
"rb", "wb",
"fs"];
checkFields(aAP, fields);
// Do basic checking of a PP read from file.
function checkPP(aPP) {
checkFields(aPP, ["tb", "tbk", "fs"]);
if (gData.bklt) {
checkFields(aPP, ["mb", "mbk", "gb", "gbk", "eb", "ebk"]);
}
if (gData.bkacc) {
checkFields(aPP, ["rb", "wb"]);
}
}
// Access counts latch as 0xffff. Treating 0xffff as Infinity gives us exactly
@ -497,51 +517,78 @@ function normalizeAccess(aAcc) {
assert(false, "too-large access value");
}
const kExpectedFileVersion = 1;
const kExpectedFileVersion = 2;
// Build gRoot from gData.
function buildTree() {
// Check global values.
let fields = ["dhatFileVersion",
let fields = ["dhatFileVersion", "mode", "verb",
"bklt", "bkacc",
"tu", "Mtu",
"cmd", "pid",
"mi", "ei",
"aps", "ftbl"];
"te", "pps", "ftbl"];
checkFields(gData, fields);
if (gData.dhatFileVersion != kExpectedFileVersion) {
throw Error(`data file has version number ${gData.dhatFileVersion}, ` +
`expected version number ${kExpectedFileVersion}`);
throw new Error(
`data file has version number ${gData.dhatFileVersion}, ` +
`expected version number ${kExpectedFileVersion}`);
}
if (gData.bklt) {
checkFields(gData, ["tg", "tuth"]);
}
// Update sort metric labels, and disable sort metrics that aren't allowed
// for this data.
for (let [i, option] of gSelect.childNodes.entries()) {
let data = gSelectData[i];
option.label = data.label();
option.disabled = !data.enable(gData.bklt, gData.bkacc);
}
// If the selected sort metric was just disabled, switch the sort metric
// back to the default (which is never disabled).
let option = gSelect.childNodes[gSelect.selectedIndex];
if (option.disabled) {
for (let [i, data] of gSelectData.entries()) {
let option = gSelect.childNodes[i];
if (data.isDefault) {
option.selected = true;
break;
}
}
}
// Build the radix tree. Nodes are in no particular order to start with. The
// algorithm is tricky because we need to use internal frames when possible.
gRoot = new TreeNode(kLeaf, [0]); // Frame 0 is always "[root]".
for (let [i, ap] of gData.aps.entries()) {
checkAP(ap);
for (let [i, pp] of gData.pps.entries()) {
checkPP(pp);
// Decompress the run-length encoding in `acc`, if present.
if (ap.acc) {
if (pp.acc) {
let acc = [];
for (let i = 0; i < ap.acc.length; i++) {
if (ap.acc[i] < 0) {
for (let i = 0; i < pp.acc.length; i++) {
if (pp.acc[i] < 0) {
// A negative number encodes a repeat count. The following entry has
// the value to be repeated.
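// (For example, the encoded sequence [5, -3, 0, 7] decodes to
// [5, 0, 0, 0, 7].)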
let reps = -ap.acc[i++];
let val = ap.acc[i];
let reps = -pp.acc[i++];
let val = pp.acc[i];
for (let j = 0; j < reps; j++) {
acc.push(normalizeAccess(val));
}
} else {
acc.push(normalizeAccess(ap.acc[i]));
acc.push(normalizeAccess(pp.acc[i]));
}
}
ap.acc = acc;
pp.acc = acc;
}
// The first AP is a special case, because we have to build gRoot.
// The first PP is a special case, because we have to build gRoot.
if (i === 0) {
gRoot._frames.push(...ap.fs);
gRoot._addAP(ap);
gRoot._frames.push(...pp.fs);
gRoot._addPP(pp);
continue;
}
@ -553,8 +600,7 @@ function buildTree() {
// `abcd` is a frame sequence (and `-` is an empty sequence), `N` is a node
// value, and `Xs` are the node's children.
for (let [j, kidFrame] of ap.fs.entries()) {
for (let [j, kidFrame] of pp.fs.entries()) {
// Search for kidFrame among internal frames.
if (ti + 1 < t._frames.length) {
// t has an internal frame at the right index.
@ -566,7 +612,7 @@ function buildTree() {
// The internal frame doesn't match. Split the node.
//
// E.g. abcd:20-[] + abef:10 => ab:30-[cd:20-[], ef:10-[]]
t._split(ti, ap, ap.fs.slice(j));
t._split(ti, pp, pp.fs.slice(j));
done = true;
break;
}
@ -580,12 +626,12 @@ function buildTree() {
// get the leftover frames.
//
// E.g. ab:20-[] + abcd:10 => ab:30-[-:20-[], cd:10-[]]
t._split(ti, ap, ap.fs.slice(j));
t._split(ti, pp, pp.fs.slice(j));
done = true;
break;
}
t._addAP(ap);
t._addPP(pp);
// Search for the frame among the kids.
let kid;
@ -604,8 +650,8 @@ function buildTree() {
//
// E.g. ab:20-[c:10-Xs, d:10-Ys] + abef:10 =>
// ab:30-[c:10-Xs, d:10-Ys, ef:10-[]]
kid = new TreeNode(kLeaf, ap.fs.slice(j));
kid._addAP(ap);
kid = new TreeNode(kLeaf, pp.fs.slice(j));
kid._addPP(pp);
t._kids.push(kid);
done = true;
break;
@ -615,9 +661,9 @@ function buildTree() {
if (!done) {
// If we reach here, either:
// - ap's frames match an existing frame sequence, in which case we
// just need to _addAP(); or
// - ap's frames are a subsequence of an existing sequence, in which
// - pp's frames match an existing frame sequence, in which case we
// just need to _addPP(); or
// - pp's frames are a subsequence of an existing sequence, in which
// case we must split.
if (ti + 1 < t._frames.length) {
@ -625,20 +671,20 @@ function buildTree() {
// frames. Split, creating an empty node.
//
// E.g. abcd:20-Xs + ab:10 => ab:30-[cd:20-Xs, -:10-[]]
t._split(ti, ap, []);
t._split(ti, pp, []);
} else if (!t._kids) {
// This is impossible because DHAT currently produces records with
// unique locations. If we remove addresses from frames in the future
// then duplicate locations will occur, and the following code is how
// it must be handled.
throw Error(`data file contains a repeated location`);
throw new Error(`data file contains a repeated location (1)`);
// Matches an existing sequence that doesn't end in node with empty
// frames. Add the AP.
// frames. Add the PP.
//
// E.g. ab:20-[] + ab:10 => ab:30-[]
t._addAP(ap);
t._addPP(pp);
} else {
// Look for a kid with empty frames.
@ -655,14 +701,14 @@ function buildTree() {
// unique locations. If we remove addresses from frames in the future
// then duplicate locations will occur, and the following code is how
// it must be handled.
throw Error(`data file contains a repeated location`);
throw new Error(`data file contains a repeated location (2)`);
// Matches an existing sequence that ends in a node with empty
// frames. Add the AP.
// frames. Add the PP.
//
// E.g. ab:20-[c:10-Xs, -:10-[]] + ab:10 => ab:30-[c:10-Xs, -:20-[]]
t._addAP(ap);
emptyKid._addAP(ap);
t._addPP(pp);
emptyKid._addPP(pp);
} else {
// A subsequence of an existing sequence that ends at the end of t's
@ -671,14 +717,13 @@ function buildTree() {
// E.g. ab:20-[c:10-Xs, d:10-Ys] + ab:10 =>
// ab:30-[c:10-Xs, d:10-Ys, -:10-[]]
let newKid = new TreeNode(kLeaf, []);
newKid._addAP(ap);
newKid._addPP(pp);
t._kids.push(newKid);
t._addAP(ap);
t._addPP(pp);
}
}
}
}
}
@ -697,11 +742,23 @@ function perc(aNum, aDenom) {
}
function perMinstr(aN) {
return `${kDFormat.format(div(1000000 * aN, gData.ei))}/Minstr`;
return `${kDFormat.format(div(1000000 * aN, gData.te))}/${gData.Mtu}`;
}
function byteUnit() {
return gData.hasOwnProperty("bu") ? gData.bsu : "byte";
}
function bytesUnit() {
return gData.hasOwnProperty("bsu") ? gData.bsu : "bytes";
}
function blocksUnit() {
return gData.hasOwnProperty("bksu") ? gData.bksu : "blocks";
}
function bytes(aN) {
return `${kDFormat.format(aN)} bytes`;
return `${kDFormat.format(aN)} ${bytesUnit()}`;
}
function bytesAndPerc(aN, aTotalN) {
@ -713,7 +770,7 @@ function bytesAndPercAndRate(aN, aTotalN) {
}
function blocks(aN) {
return `${kDFormat.format(aN)} blocks`;
return `${kDFormat.format(aN)} ${blocksUnit()}`;
}
function blocksAndPerc(aN, aTotalN) {
@ -729,15 +786,15 @@ function avgSizeBytes(aN) {
}
function perByte(aN) {
return `${kDFormat.format(aN)}/byte`;
return `${kDFormat.format(aN)}/${byteUnit()}`;
}
function instrs(aN) {
return `${kDFormat.format(aN)} instrs`;
function time(aN) {
return `${kDFormat.format(aN)} ${gData.tu}`;
}
function avgLifetimeInstrs(aN) {
return `avg lifetime ${instrs(aN)}`;
function avgLifetime(aN) {
return `avg lifetime ${time(aN)}`;
}
function accesses(aAccesses) {
@ -817,6 +874,7 @@ function appendInvocationAndTimes(aP) {
let v, v1, v2;
v = "Invocation {\n";
v += ` Mode: ${gData.mode}\n`;
v += ` Command: ${gData.cmd}\n`;
v += ` PID: ${gData.pid}\n`;
v += "}\n\n";
@ -825,9 +883,11 @@ function appendInvocationAndTimes(aP) {
v = "Times {\n";
v1 = perc(gData.mi, gData.ei);
v += ` t-gmax: ${instrs(gData.mi)} (${v1} of program duration)\n`;
v += ` t-end: ${instrs(gData.ei)}\n`;
v1 = perc(gData.tg, gData.te);
if (gData.bklt) {
v += ` t-gmax: ${time(gData.tg)} (${v1} of program duration)\n`;
}
v += ` t-end: ${time(gData.te)}\n`;
v += "}\n\n";
@ -1017,103 +1077,109 @@ function appendTreeInner(aT, aP, aBolds, aCmp, aPc, aSig, aNodeIdNums,
let v1, v2, v3, v4, v5;
// "AP" + node ID + kid count.
// "PP" + node ID + kid count.
v1 = aNodeIdNums.join('.');
v2 = aNumSibs + 1;
v3 = kids ? `(${kids.length} children) ` : "";
fr(`AP ${v1}/${v2} ${v3}{`, true, false);
fr(`PP ${v1}/${v2} ${v3}{`, true, false);
nl(true);
// "Total".
v1 = bytesAndPercAndRate(aT._totalBytes, gRoot._totalBytes);
v2 = blocksAndPercAndRate(aT._totalBlocks, gRoot._totalBlocks);
v3 = avgSizeBytes(aT._totalAvgSizeBytes());
v4 = avgLifetimeInstrs(aT._totalAvgLifetimeInstrs());
v5 = perc(aT._totalAvgLifetimeInstrs(), gData.ei);
v4 = avgLifetime(aT._totalAvgLifetimes());
v5 = perc(aT._totalAvgLifetimes(), gData.te);
fr(" Total: ", aBolds.totalTitle);
fr(v1, aBolds.totalBytes);
fr(" in ");
fr(v2, aBolds.totalBlocks);
fr(", ", aBolds.totalAvgSizeBytes, false);
fr(v3, aBolds.totalAvgSizeBytes);
fr(", ", aBolds.totalAvgLifetimeInstrs, false);
fr(`${v4} (${v5} of program duration)`, aBolds.totalAvgLifetimeInstrs);
if (gData.bklt) {
fr(", ", aBolds.totalAvgLifetime, false);
fr(`${v4} (${v5} of program duration)`, aBolds.totalAvgLifetime);
}
nl(aBolds.totalTitle);
// "Max".
if (aT !== gRoot && aT._kind === kLeaf) {
assert(!kids, "leaf node has children");
// These percentages are relative to the local totals, not the root
// totals.
v1 = bytes(aT._maxBytes);
v2 = blocks(aT._maxBlocks);
v3 = avgSizeBytes(aT._maxAvgSizeBytes());
fr(` Max: ${v1} in ${v2}, ${v3}`);
nl();
if (gData.bklt) {
// "Max".
if (aT !== gRoot && aT._kind === kLeaf) {
assert(!kids, "leaf node has children");
// These percentages are relative to the local totals, not the root
// totals.
v1 = bytes(aT._maxBytes);
v2 = blocks(aT._maxBlocks);
v3 = avgSizeBytes(aT._maxAvgSizeBytes());
fr(` Max: ${v1} in ${v2}, ${v3}`);
nl();
}
// "At t-gmax".
v1 = bytesAndPerc(aT._atTGmaxBytes, gRoot._atTGmaxBytes);
v2 = blocksAndPerc(aT._atTGmaxBlocks, gRoot._atTGmaxBlocks);
v3 = avgSizeBytes(aT._atTGmaxAvgSizeBytes());
fr(" At t-gmax: ", aBolds.atTGmaxTitle);
fr(v1, aBolds.atTGmaxBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTGmaxTitle);
// "At t-end".
v1 = bytesAndPerc(aT._atTEndBytes, gRoot._atTEndBytes);
v2 = blocksAndPerc(aT._atTEndBlocks, gRoot._atTEndBlocks);
v3 = avgSizeBytes(aT._atTEndAvgSizeBytes());
fr(" At t-end: ", aBolds.atTEndTitle);
fr(v1, aBolds.atTEndBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTEndTitle);
}
// "At t-gmax".
v1 = bytesAndPerc(aT._atTGmaxBytes, gRoot._atTGmaxBytes);
v2 = blocksAndPerc(aT._atTGmaxBlocks, gRoot._atTGmaxBlocks);
v3 = avgSizeBytes(aT._atTGmaxAvgSizeBytes());
fr(" At t-gmax: ", aBolds.atTGmaxTitle);
fr(v1, aBolds.atTGmaxBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTGmaxTitle);
if (gData.bkacc) {
// "Reads".
v1 = bytesAndPercAndRate(aT._readsBytes, gRoot._readsBytes);
v2 = perByte(aT._readsAvgPerByte());
fr(" Reads: ", aBolds.readsTitle);
fr(v1, aBolds.readsBytes);
fr(", ", aBolds.readsBytes && aBolds.readsAvgPerByte, false);
fr(v2, aBolds.readsAvgPerByte);
nl(aBolds.readsTitle);
// "At t-end".
v1 = bytesAndPerc(aT._atTEndBytes, gRoot._atTEndBytes);
v2 = blocksAndPerc(aT._atTEndBlocks, gRoot._atTEndBlocks);
v3 = avgSizeBytes(aT._atTEndAvgSizeBytes());
fr(" At t-end: ", aBolds.atTEndTitle);
fr(v1, aBolds.atTEndBytes);
fr(` in ${v2}, ${v3}`);
nl(aBolds.atTEndTitle);
// "Writes".
v1 = bytesAndPercAndRate(aT._writesBytes, gRoot._writesBytes);
v2 = perByte(aT._writesAvgPerByte());
fr(" Writes: ", aBolds.writesTitle);
fr(v1, aBolds.writesBytes);
fr(", ", aBolds.writesBytes && aBolds.writesAvgPerByte, false);
fr(v2, aBolds.writesAvgPerByte);
nl(aBolds.writesTitle);
// "Reads".
v1 = bytesAndPercAndRate(aT._readsBytes, gRoot._readsBytes);
v2 = perByte(aT._readsAvgPerByte());
fr(" Reads: ", aBolds.readsTitle);
fr(v1, aBolds.readsBytes);
fr(", ", aBolds.readsBytes && aBolds.readsAvgPerByte, false);
fr(v2, aBolds.readsAvgPerByte);
nl(aBolds.readsTitle);
// "Writes".
v1 = bytesAndPercAndRate(aT._writesBytes, gRoot._writesBytes);
v2 = perByte(aT._writesAvgPerByte());
fr(" Writes: ", aBolds.writesTitle);
fr(v1, aBolds.writesBytes);
fr(", ", aBolds.writesBytes && aBolds.writesAvgPerByte, false);
fr(v2, aBolds.writesAvgPerByte);
nl(aBolds.writesTitle);
// "Accesses". We show 32 per line (but not on aggregate nodes).
if (aT._accesses && aT._accesses.length > 0) {
let v = " Accesses: {";
let prevN;
for (let [i, n] of aT._accesses.entries()) {
if ((i % 32) === 0) {
fr(v);
nl();
v1 = i.toString().padStart(3, ' ');
v = ` [${v1}] `;
v += `${accesses(n)} `;
} else {
// Use a ditto mark for repeats.
v += (n === prevN && n !== 0) ? "〃 " : `${accesses(n)} `;
// "Accesses". We show 32 per line (but not on aggregate nodes).
if (aT._accesses && aT._accesses.length > 0) {
let v = " Accesses: {";
let prevN;
for (let [i, n] of aT._accesses.entries()) {
if ((i % 32) === 0) {
fr(v);
nl();
v1 = i.toString().padStart(3, ' ');
v = ` [${v1}] `;
v += `${accesses(n)} `;
} else {
// Use a ditto mark for repeats.
v += (n === prevN && n !== 0) ? "〃 " : `${accesses(n)} `;
}
prevN = n;
}
prevN = n;
}
fr(v);
nl();
fr(v);
nl();
fr(" }");
nl();
fr(" }");
nl();
}
}
// "Allocated at".
fr(" Allocated at {", true, false);
fr(` ${gData.verb} at {`, true, false);
nl(true);
if (aT._kind === kAgg) {
// Don't print ancestor frames; just print the "insignificant" frame.
@ -1219,7 +1285,7 @@ function appendTree(aP, aBolds, aCmp, aPc, aSig) {
}
function appendSignificanceThreshold(aP, aSigLabel) {
let v = `\nAP significance threshold: ${aSigLabel()}\n`;
let v = `\nPP significance threshold: ${aSigLabel()}\n`;
appendElementWithText(aP, "span", v, "threshold");
}
@ -1287,7 +1353,7 @@ function displayTree(aTRead, aTParse, aTBuild) {
// Get details relating to the chosen sort metrics.
let data = gSelectData[gSelect.selectedIndex];
let bolds = data.bolds;
let label = data.label;
let label = data.label();
let cmpField = data.cmpField;
let sig = data.sig;
let sigLabel = data.sigLabel;
@ -1397,7 +1463,7 @@ function onLoad() {
gSelect = appendElement(selectDiv, "select");
gSelect.onchange = changeSortMetric;
for (let [i, data] of gSelectData.entries()) {
let option = appendElementWithText(gSelect, "option", data.label);
let option = appendElementWithText(gSelect, "option", data.label());
option.value = i;
if (data.isDefault) {
option.selected = true;
@ -1421,13 +1487,15 @@ function onLoad() {
appendElementWithText(ul, "li", "'t-gmax': time of global heap maximum " +
"(as measured in bytes)");
appendElementWithText(ul, "li", "'t-end': time of program end");
// The file may use different units (via the `tu` and `Mtu` fields), but
// these are the standard units so mention them here.
appendElementWithText(ul, "li", "'instrs': instructions");
appendElementWithText(ul, "li", "'Minstr': mega-instruction, i.e. one " +
"million instructions");
appendElementWithText(ul, "li", "'AP': allocation point");
appendElementWithText(ul, "li", "'PP': program point");
appendElementWithText(ul, "li", "'avg': average");
appendElementWithText(ul, "li", "'-' (in accesses): zero");
appendElementWithText(ul, "li", "'∞' (in accesses): leaf AP counts max out " +
appendElementWithText(ul, "li", "'∞' (in accesses): leaf PP counts max out " +
"at 65534; larger counts are treated as " +
"infinity");
appendElementWithText(ul, "li", "'〃' (in accesses): same as previous entry");

dhat/dhat.h Normal file
View File

@ -0,0 +1,75 @@
/*
----------------------------------------------------------------
Notice that the following BSD-style license applies to this one
file (dhat.h) only. The rest of Valgrind is licensed under the
terms of the GNU General Public License, version 2, unless
otherwise indicated. See the COPYING file in the source
distribution for details.
----------------------------------------------------------------
This file is part of DHAT, a Valgrind tool for profiling the
heap usage of programs.
Copyright (C) 2020 Nicholas Nethercote. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
3. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
4. The name of the author may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------
Notice that the above BSD-style license applies to this one file
(dhat.h) only. The entire rest of Valgrind is licensed under
the terms of the GNU General Public License, version 2. See the
COPYING file in the source distribution for details.
----------------------------------------------------------------
*/
#include "valgrind.h"
typedef
enum {
VG_USERREQ__DHAT_AD_HOC_EVENT = VG_USERREQ_TOOL_BASE('D', 'H'),
// This is just for DHAT's internal use. Don't use it.
_VG_USERREQ__DHAT_COPY = VG_USERREQ_TOOL_BASE('D','H') + 256
} Vg_DHATClientRequest;
// Record an ad hoc event. The meaning of the weight argument will depend on
// what the event represents, which is up to the user. If no meaningful weight
// argument exists, just use 1.
#define DHAT_AD_HOC_EVENT(_qzz_weight) \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DHAT_AD_HOC_EVENT, \
(_qzz_weight), 0, 0, 0, 0)

View File

@ -16,15 +16,15 @@
<sect1 id="dh-manual.overview" xreflabel="Overview">
<title>Overview</title>
<para>DHAT is a tool for examining how programs use their heap
<para>DHAT is primarily a tool for examining how programs use their heap
allocations.</para>
<para>It tracks the allocated blocks, and inspects every memory access
to find which block, if any, it is to. It presents, on an allocation point
to find which block, if any, it is to. It presents, on a program point
basis, information about these blocks such as sizes, lifetimes, numbers of
reads and writes, and read and write patterns.</para>
<para>Using this information it is possible to identify allocation points with
<para>Using this information it is possible to identify program points with
the following characteristics:</para>
<itemizedlist>
@ -54,6 +54,9 @@ as instruction counts. This sounds a little odd at first, but it
makes runs repeatable in a way which is not possible if CPU time is
used.</para>
<para>DHAT also has support for copy profiling and ad hoc profiling. These are
described below.</para>
</sect1>
@ -155,11 +158,12 @@ because this can significantly reduce the size of DHAT's output files.</para>
<sect2 id="dh-output-header"><title>The Output Header</title>
<para>The first part of the output shows the program command and process ID.
For example:</para>
<para>The first part of the output shows the mode, program command and process
ID. For example:</para>
<programlisting><![CDATA[
Invocation {
Mode: heap
Command: /home/njn/moz/rust0/build/x86_64-unknown-linux-gnu/stage2/bin/rustc --crate-name tuple_stress src/main.rs
PID: 18816
}
@ -179,18 +183,19 @@ Times {
</sect2>
<sect2 id="dh-ap-tree"><title>The AP Tree</title>
<sect2 id="dh-ap-tree"><title>The PP Tree</title>
<para>The third part of the output is the largest and most interesting part,
showing the allocation point (AP) tree.</para>
showing the program point (PP) tree.</para>
<sect3 id="dh-structure"><title>Structure</title>
<para>The following image shows a screenshot of part of an AP
<para>The following image shows a screenshot of part of a PP
tree. The font is very small because this screenshot is intended to
demonstrate the high-level structure of the tree rather than the
details within the text.</para>
details within the text. (It is also slightly out-of-date, and doesn't quite
match the current output produced by DHAT's viewer.)</para>
<graphic fileref="images/dh-tree.png" scalefit="1"/>
@ -228,7 +233,7 @@ email, bug report, etc.</para>
<para>The root node looks like this:</para>
<programlisting><![CDATA[
AP 1/1 (25 children) {
PP 1/1 (25 children) {
Total: 1,355,253,987 bytes (100%, 67,454.81/Minstr) in 5,943,417 blocks (100%, 295.82/Minstr), avg size 228.03 bytes, avg lifetime 3,134,692,250.67 instrs (15.6% of program duration)
At t-gmax: 423,930,307 bytes (100%) in 1,575,682 blocks (100%), avg size 269.05 bytes
At t-end: 258,002 bytes (100%) in 2,129 blocks (100%), avg size 121.18 bytes
@ -250,11 +255,11 @@ next example will explain these in more detail.</para>
<sect3 id="dh-interior-nodes"><title>Interior Nodes</title>
<para>AP nodes further down the tree show information about a subset of
<para>PP nodes further down the tree show information about a subset of
allocations. For example:</para>
<programlisting><![CDATA[
AP 1.1/25 (2 children) {
PP 1.1/25 (2 children) {
Total: 54,533,440 bytes (4.02%, 2,714.28/Minstr) in 458,839 blocks (7.72%, 22.84/Minstr), avg size 118.85 bytes, avg lifetime 1,127,259,403.64 instrs (5.61% of program duration)
At t-gmax: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
At t-end: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
@ -288,7 +293,7 @@ stack trace that is shared by all the blocks covered by this node.</para>
<para>The <computeroutput>Total</computeroutput> line shows that this node
accounts for 4.02% of all bytes allocated during execution, and 7.72% of all
blocks. These percentages are useful for comparing the significance of
different nodes within a single profile; an AP that accounts for 10% of bytes
different nodes within a single profile; a PP that accounts for 10% of bytes
allocated is likely to be more interesting than one that accounts for
2%.</para>
@ -301,16 +306,16 @@ different workloads.</para>
average size and lifetimes of these blocks.</para>
<para>The <computeroutput>At t-gmax</computeroutput> line shows that no
blocks from this AP were alive when the global heap peak occurred. In other
blocks from this PP were alive when the global heap peak occurred. In other
words, these blocks do not contribute at all to the global heap peak.</para>
<para>The <computeroutput>At t-end</computeroutput> line shows that no blocks
from this AP were alive at shutdown. In other words, all those blocks were
from this PP were alive at shutdown. In other words, all those blocks were
explicitly freed before termination.</para>
<para>The <computeroutput>Reads</computeroutput> and
<computeroutput>Writes</computeroutput> lines show how many bytes were read
within this AP's blocks, the fraction this represents of all heap reads, and
within this PP's blocks, the fraction this represents of all heap reads, and
the read rate. Finally, it shows the read ratio, which is the number of reads
per byte. In this case the number is 0.29, which is quite low -- if no byte was
read twice, then only 29% of the allocated bytes were read, which means that at least 71%
@ -336,7 +341,7 @@ vectors and hash tables, and isn't always fixable. </para>
<para>This is a leaf node:</para>
<programlisting><![CDATA[
AP 1.1.1.1/2 {
PP 1.1.1.1/2 {
Total: 31,460,928 bytes (2.32%, 1,565.9/Minstr) in 262,171 blocks (4.41%, 13.05/Minstr), avg size 120 bytes, avg lifetime 986,406,885.05 instrs (4.91% of program duration)
Max: 16,779,136 bytes in 65,543 blocks, avg size 256 bytes
At t-gmax: 0 bytes (0%) in 0 blocks (0%), avg size 0 bytes
@ -365,10 +370,10 @@ is a great-grandchild of the root; is the first grandchild of the node in the
previous example; and has no children.</para>
<para>Leaf nodes contain an additional <computeroutput>Max</computeroutput>
line, indicating the peak memory use for the blocks covered by this AP. (This
line, indicating the peak memory use for the blocks covered by this PP. (This
peak may have occurred at a time other than
<computeroutput>t-gmax</computeroutput>.) In this case, 31,460,928 bytes were
allocated from this AP, but the maximum size alive at once was 16,779,136
allocated from this PP, but the maximum size alive at once was 16,779,136
bytes.</para>
<para>Stack frames that begin with a <computeroutput>^</computeroutput> rather
@ -383,7 +388,7 @@ This also means that each node makes complete sense on its own.</para>
<sect3 id="dh-access-counts"><title>Access Counts</title>
<para>If all blocks covered by an AP node have the same size, an additional
<para>If all blocks covered by a PP node have the same size, an additional
<computeroutput>Accesses</computeroutput> field will be present. It indicates
how the reads and writes within these blocks were distributed. For
example:</para>
@ -399,7 +404,7 @@ Accesses: {
}
]]></programlisting>
<para>Every block covered by this AP was 32 bytes. Within all of those blocks,
<para>Every block covered by this PP was 32 bytes. Within all of those blocks,
byte 0 was accessed (read or written) 65,547 times, byte 1 was accessed 7
times, byte 2 was accessed 8 times, and so on.</para>
@ -425,12 +430,12 @@ layout inefficiencies.</para>
<sect3 id="aggregate-nodes"><title>Aggregate Nodes</title>
<para>The AP tree is very large and many nodes represent tiny numbers of blocks
<para>The PP tree is very large and many nodes represent tiny numbers of blocks
and bytes. Therefore, DHAT's viewer aggregates insignificant nodes like
this:</para>
<programlisting><![CDATA[
AP 1.14.2/2 {
PP 1.14.2/2 {
Total: 5,175 blocks (0.09%, 0.26/Minstr)
Allocated at {
[5 insignificant]
@ -449,15 +454,15 @@ case).</para>
<sect2 id="dh-output-footer"><title>The Output Footer</title>
<para>Below the AP tree is a line like this:</para>
<para>Below the PP tree is a line like this:</para>
<programlisting><![CDATA[
AP significance threshold: total >= 59,434.17 blocks (1%)
PP significance threshold: total >= 59,434.17 blocks (1%)
]]></programlisting>
<para>It shows the function used to determine if an AP node is significant. All
<para>It shows the function used to determine if a PP node is significant. All
nodes that don't satisfy this function are aggregated. It is occasionally
useful if you don't understand why an AP node has been aggregated. The exact
useful if you don't understand why a PP node has been aggregated. The exact
threshold depends on the sort metric (see below).</para>
<para>Finally, the bottom of the page shows a legend that explains some of the
@ -587,21 +592,21 @@ filtering, so that only nodes meeting a particular criteria are shown.</para>
<para>The values within a node that represent the chosen sort metric are shown
in bold, so they stand out.</para>
<para>Here is part of an AP node found with "Total (blocks), tiny", showing
<para>Here is part of a PP node found with "Total (blocks), tiny", showing
blocks with an average size of only 8.67 bytes:</para>
<programlisting><![CDATA[
Total: 3,407,848 bytes (0.25%, 169.62/Minstr) in 393,214 blocks (6.62%, 19.57/Minstr), avg size 8.67 bytes, avg lifetime 1,167,795,629.1 instrs (5.81% of program duration)
]]></programlisting>
<para>Here is part of an AP node found with "Total (blocks), short-lived",
<para>Here is part of a PP node found with "Total (blocks), short-lived",
showing blocks with an average lifetime of only 181.75 instructions:</para>
<programlisting><![CDATA[
Total: 23,068,584 bytes (1.7%, 1,148.19/Minstr) in 262,143 blocks (4.41%, 13.05/Minstr), avg size 88 bytes, avg lifetime 181.75 instrs (0% of program duration)
]]></programlisting>
<para>Here is an example of an AP identified with "Total (blocks), zero reads
<para>Here is an example of a PP identified with "Total (blocks), zero reads
or zero writes", showing blocks that are allocated but never touched:</para>
<programlisting><![CDATA[
@ -613,7 +618,7 @@ Reads: 0 bytes (0%, 0/Minstr), 0/byte
Writes: 0 bytes (0%, 0/Minstr), 0/byte
]]></programlisting>
<para>All the blocks identified by these APs are good candidates for
<para>All the blocks identified by these PPs are good candidates for
optimization.</para>
</sect2>
@ -648,10 +653,10 @@ increasing the current heap size by 200 bytes and then decreasing it by 100
bytes.) As a result, it can only increase the global heap peak (if indeed,
this results in a new peak) by 100 bytes.</para>
<para>Finally, the allocation point assigned to the block allocated by the
<para>Finally, the program point assigned to the block allocated by the
<computeroutput>malloc(100)</computeroutput> call is retained once the block
is reallocated. Which means that all 300 bytes are attributed to that
allocation point, and no separate allocation point is created for the
program point, and no separate program point is created for the
<computeroutput>realloc(200)</computeroutput> call. This may be surprising,
but it has one large benefit.</para>
@ -659,12 +664,84 @@ but it has one large benefit.</para>
adds data to that buffer from numerous different points in the code,
reallocating the buffer each time it gets full. (E.g. code generation in a
compiler might work this way.) With the described approach, the first heap
block and all subsequent heap blocks are attributed to the same allocation
point. While this is something of a lie -- the first allocation point isn't
actually responsible for the other allocations -- it is arguably better than
having the allocation points spread around, in a distribution
that unpredictably depends on whenever the reallocation points were
triggered.</para>
block and all subsequent heap blocks are attributed to the same program point.
While this is something of a lie -- the first program point isn't actually
responsible for the other allocations -- it is arguably better than having the
program points spread around in a distribution that unpredictably depends on
whenever the reallocations were triggered.</para>
</sect1>
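To make the attribution just described concrete, here is a minimal sketch (illustrative code, not taken from the manual or this commit):

#include <stdlib.h>

int main(void)
{
   char* p = malloc(100);   // a PP is created for this malloc() call site
   p = realloc(p, 200);     // counted as a 200-byte allocation plus a
                            // 100-byte free; all 300 allocated bytes remain
                            // attributed to the malloc() PP, and the heap
                            // peak can rise by at most 100 bytes
   free(p);
   return 0;
}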
<sect1 id="dh-manual.copy-profiling" xreflabel="Copy profiling">
<title>Copy profiling</title>
<para>If DHAT is invoked with <option>--mode=copy</option>, instead of
profiling heap operations (allocations and deallocations), it profiles copy
operations, such as <computeroutput>memcpy</computeroutput>,
<computeroutput>memmove</computeroutput>,
<computeroutput>strcpy</computeroutput>, and
<computeroutput>bcopy</computeroutput>. This is sometimes useful.</para>
<para>Here is an example PP node from this mode:</para>
<programlisting><![CDATA[
PP 1.1.2/5 (4 children) {
Total: 1,210,925 bytes (10.03%, 4,358.66/Minstr) in 112,717 blocks (35.2%, 405.72/Minstr), avg size 10.74 bytes
Copied at {
^1: 0x4842524: memmove (vg_replace_strmem.c:1289)
#2: 0x1F0A0D: copy_nonoverlapping<u8> (intrinsics.rs:1858)
#3: 0x1F0A0D: copy_from_slice<u8> (mod.rs:2524)
#4: 0x1F0A0D: spec_extend<u8> (vec.rs:2227)
#5: 0x1F0A0D: extend_from_slice<u8> (vec.rs:1619)
#6: 0x1F0A0D: push_str (string.rs:821)
#7: 0x1F0A0D: write_str (string.rs:2418)
#8: 0x1F0A0D: <&mut W as core::fmt::Write>::write_str (mod.rs:195)
}
}
]]></programlisting>
<para>It is very similar to the PP nodes for heap profiling, but with less
information, because copy profiling doesn't involve any tracking of memory
regions with lifetimes.</para>
</sect1>
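Copy profiling requires no changes to the program being profiled; a run such as valgrind --tool=dhat --mode=copy ./prog (the --mode option is added by this commit) records every call that reaches one of the intercepted functions. A minimal illustrative fragment (hypothetical code, not from the commit):

#include <string.h>

void save_row(char* dst, const char* src, size_t n)
{
   memcpy(dst, src, n);   // recorded under --mode=copy as an n-byte copy,
                          // attributed to this call stack
}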
<sect1 id="dh-manual.ad-hoc-profiling" xreflabel="Ad hoc profiling">
<title>Ad hoc profiling</title>
<para>If DHAT is invoked with <option>--mode=ad-hoc</option>, instead of
profiling heap operations (allocations and deallocations), it profiles calls to
the <computeroutput>DHAT_AD_HOC_EVENT</computeroutput> client request, which is
declared in <filename>dhat/dhat.h</filename>.</para>
<para>Here is an example PP node from this mode:</para>
<programlisting><![CDATA[
PP 1.1.1.1/2 {
Total: 30 units (17.65%, 115.97/Minstr) in 1 events (14.29%, 3.87/Minstr), avg size 30 units
Occurred at {
^1: 0x109407: g (ad-hoc.c:4)
^2: 0x109425: f (ad-hoc.c:8)
#3: 0x109497: main (ad-hoc.c:14)
}
}
]]></programlisting>
<para>This kind of profiling is useful when you know a code path is hot but you
want to know more about it.</para>
<para>For example, you might want to know which callsites of a hot function
account for most of the calls. You could put a
<computeroutput>DHAT_AD_HOC_EVENT(1);</computeroutput> call at the start of
that function.</para>
<para>Alternatively, you might want to know the typical length of a vector in a
hot location. You could put a
<computeroutput>DHAT_AD_HOC_EVENT(len);</computeroutput> call at the
appropriate location, where <computeroutput>len</computeroutput> is the length
of the vector.</para>
</sect1>
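The two suggestions above might look like this in client code (an illustrative sketch; the function names and bodies are hypothetical):

#include <stddef.h>
#include "dhat/dhat.h"

void hot_function(void)
{
   DHAT_AD_HOC_EVENT(1);    // weight 1: the PP tree then shows which call
                            // sites account for most of the calls
   /* ... the hot work ... */
}

void append_to_vector(int* vec, const int* items, size_t len)
{
   DHAT_AD_HOC_EVENT(len);  // weight = number of items: the PP tree then
                            // shows the typical lengths at each call site
   for (size_t i = 0; i < len; i++)
      vec[i] = items[i];
}

The program is then run with --mode=ad-hoc, as in the ad-hoc.c test added by this commit.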
@ -694,6 +771,17 @@ triggered.</para>
</listitem>
</varlistentry>
<varlistentry id="opt.mode" xreflabel="--mode">
<term>
<option><![CDATA[--mode=<heap|copy|ad-hoc> [default: heap] ]]></option>
</term>
<listitem>
<para>The profiling mode: heap profiling, copy profiling, or ad hoc
profiling.
</para>
</listitem>
</varlistentry>
</variablelist>
<para>Note that stacks by default have 12 frames. This may be more than

View File

@ -5,16 +5,20 @@ dist_noinst_SCRIPTS = filter_stderr
EXTRA_DIST = \
acc.stderr.exp acc.vgtest \
ad-hoc.stderr.exp ad-hoc.vgtest \
basic.stderr.exp basic.vgtest \
big.stderr.exp big.vgtest \
copy.stderr.exp copy.vgtest \
empty.stderr.exp empty.vgtest \
sig.stderr.exp sig.vgtest \
single.stderr.exp single.vgtest
check_PROGRAMS = \
acc \
ad-hoc \
basic \
big \
copy \
empty \
sig \
single

dhat/tests/ad-hoc.c Normal file
View File

@ -0,0 +1,27 @@
#include "dhat/dhat.h"
#include <stdlib.h>
void g(void) {
DHAT_AD_HOC_EVENT(30);
}
void f(void) {
g();
DHAT_AD_HOC_EVENT(20);
g();
}
int main(void) {
f();
DHAT_AD_HOC_EVENT(10);
f();
// At one point malloc was broken with --mode=ad-hoc(!), and Valgrind was
// printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc
// was called. So check that it's basically working...
char* p = malloc(100);
p = realloc(p, 200);
free(p);
return 0;
}

View File

@ -0,0 +1 @@
Total: 170 units in 7 events

dhat/tests/ad-hoc.vgtest Normal file
View File

@ -0,0 +1,3 @@
prog: ad-hoc
vgopts: --mode=ad-hoc --dhat-out-file=dhat.out
cleanup: rm dhat.out

View File

@ -3,6 +3,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "dhat/dhat.h"
int main(void)
{
@ -24,5 +25,9 @@ int main(void)
free(c);
// totals: 3008 read, 3516 write
// Should be ignored because we're not in ad hoc mode.
DHAT_AD_HOC_EVENT(100);
return 0;
}

dhat/tests/copy.c Normal file
View File

@ -0,0 +1,60 @@
// This tests --mode=copy with various copying functions.
#define _GNU_SOURCE // For mempcpy.
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
void f(char* a, char* b, wchar_t* wa, wchar_t* wb);
void test_malloc();
int main(void) {
char a[1000];
char b[1000];
for (int i = 0; i < 1000; i++) {
a[i] = 'a';
b[i] = 'b';
}
a[999] = '\0';
b[999] = '\0';
wchar_t wa[250];
wchar_t wb[250];
for (int i = 0; i < 250; i++) {
wa[i] = 'A';
wb[i] = 'B';
}
wa[249] = '\0';
wb[249] = '\0';
for (int i = 0; i < 100; i++) {
f(a, b, wa, wb);
}
test_malloc();
return 0;
}
void f(char* a, char* b, wchar_t* wa, wchar_t* wb) {
// The memcpy is duplicated so we have 10 calls, which makes for nice round
// numbers in the totals.
memcpy (a, b, 1000); // Redirects to memmove
memcpy (a, b, 1000); // Redirects to memmove
memmove(a, b, 1000);
mempcpy(a, b, 1000);
bcopy (a, b, 1000); // Redirects to memmove
strcpy (a, b);
strncpy(a, b, 1000);
stpcpy (a, b); // Redirects to strcpy
stpncpy(a, b, 1000);
wcscpy (wa, wb);
}
void test_malloc() {
// At one point malloc was broken with --mode=copy(!), and Valgrind was
// printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc
// was called. So check that it's basically working...
char* p = malloc(100);
p = realloc(p, 200);
free(p);
}

View File

@ -0,0 +1 @@
Total: 1,000,... bytes in 1,0.. blocks

dhat/tests/copy.vgtest Normal file
View File

@ -0,0 +1,4 @@
prog: copy
vgopts: --mode=copy --dhat-out-file=dhat.out
stderr_filter: filter_copy
cleanup: rm dhat.out

dhat/tests/filter_copy Executable file
View File

@ -0,0 +1,9 @@
#! /bin/sh
# It's impossible to get exact matches for copy counts because even trivial C
# programs do a few memcpy/strcpy calls. So we allow some fuzzy matching.
# So we allow 1,000,000..1,009,999 bytes and 1,000..1,099 blocks.
./filter_stderr "$@" |
sed -e "s/1,00.,... bytes in 1,0.. blocks/1,000,... bytes in 1,0.. blocks/"

View File

@ -21,8 +21,7 @@ sed "/^ file:\/\/\// d" |
sed "/^in a web browser/ d" |
sed "/^ \// d" | # This is pretty feeble, but I don't see
# how to do better
sed "/^Scroll to the end/ d" |
sed "/^explanation of some/ d" |
sed "/^The text at the bottom/ d" |
# and remove any blank lines in the output
sed "/^[[:space:]]*$/d"

View File

@ -39,6 +39,7 @@
/* Can be called from VG_(tdict).malloc_malloc et al to do the actual
* alloc/freeing. */
extern void* VG_(cli_malloc) ( SizeT align, SizeT nbytes );
extern void* VG_(cli_realloc)( void* ptr, SizeT nbytes );
extern void VG_(cli_free) ( void* p );
// Returns the usable size of a heap-block. It's the asked-for size plus
// possibly some more due to rounding up.

View File

@ -41,7 +41,7 @@ IRSB* nl_instrument ( VgCallbackClosure* closure,
const VexArchInfo* archinfo_host,
IRType gWordTy, IRType hWordTy )
{
return bb;
return bb;
}
static void nl_fini(Int exitcode)

View File

@ -35,12 +35,13 @@
#include "pub_tool_clreq.h"
/* ---------------------------------------------------------------------
We have our own versions of these functions for two reasons:
We have our own versions of these functions for multiple reasons:
(a) it allows us to do overlap checking
(b) some of the normal versions are hyper-optimised, which fools
(b) it allows us to do copy tracking
(c) some of the normal versions are hyper-optimised, which fools
Memcheck and causes spurious value warnings. Our versions are
simpler.
(c) the glibc SSE-variants can read past the end of the input data
(d) the glibc SSE-variants can read past the end of the input data
ranges. This can cause false-positive Memcheck / Helgrind / DRD
reports.
@ -173,6 +174,15 @@ static inline void my_exit ( int x )
#ifndef RECORD_OVERLAP_ERROR
#define RECORD_OVERLAP_ERROR(s, src, dst, len) do { } while (0)
#endif
// Used for tools that record bulk copies: memcpy, strcpy, etc.
#ifndef RECORD_COPY
#define RECORD_COPY(len) do { } while (0)
#define FOR_COPY(x)
#else
#define FOR_COPY(x) x
#endif
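// As a rough sketch (hypothetical tool code; dh_handle_copy stands in for
// whatever bookkeeping the tool actually does), a copy-tracking tool defines
// RECORD_COPY before this point, e.g.
//
//   #define RECORD_COPY(len)  dh_handle_copy(len)
//
// so that each replacement function below reports the number of bytes it
// copied, while tools that leave it undefined get the empty default above.
// FOR_COPY wraps declarations (such as the src_orig locals further down) that
// are only needed when copies are being recorded.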
#ifndef VALGRIND_CHECK_VALUE_IS_DEFINED
#define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue) 1
#endif
@ -496,12 +506,14 @@ static inline void my_exit ( int x )
while (*src) *dst++ = *src++; \
*dst = 0; \
\
/* This checks for overlap after copying, unavoidable without */ \
/* This happens after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
SizeT srclen = (Addr)src-(Addr)src_orig+1; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
(Addr)dst-(Addr)dst_orig+1, \
(Addr)src-(Addr)src_orig+1)) \
srclen)) \
RECORD_OVERLAP_ERROR("strcpy", dst_orig, src_orig, 0); \
\
return dst_orig; \
@ -539,7 +551,9 @@ static inline void my_exit ( int x )
while (m < n && *src) { m++; *dst++ = *src++; } \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_orig, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("strncpy", dst, src, n); \
while (m++ < n) *dst++ = 0; /* must pad remainder with nulls */ \
\
@ -585,7 +599,9 @@ static inline void my_exit ( int x )
/* m non-nul bytes have now been copied, and m <= n-1. */ \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_orig, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("strlcpy", dst, src, n); \
/* Nul-terminate dst. */ \
if (n > 0) *dst = 0; \
@ -943,6 +959,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZZ(becTag,soname,fnname) \
( void *dst, const void *src, SizeT len ) \
{ \
RECORD_COPY(len); \
if (do_ol_check && is_overlap(dst, src, len, len)) \
RECORD_OVERLAP_ERROR("memcpy", dst, src, len); \
\
@ -1034,6 +1051,7 @@ static inline void my_exit ( int x )
MEMCPY(VG_Z_LIBC_SONAME, memcpy) /* fallback case */
MEMCPY(VG_Z_LIBC_SONAME, __GI_memcpy)
MEMCPY(VG_Z_LIBC_SONAME, __memcpy_sse2)
MEMCPY(VG_Z_LIBC_SONAME, __memcpy_avx_unaligned_erms)
MEMCPY(VG_Z_LD_SO_1, memcpy) /* ld.so.1 */
MEMCPY(VG_Z_LD64_SO_1, memcpy) /* ld64.so.1 */
/* icc9 blats these around all over the place. Not only in the main
@ -1142,10 +1160,12 @@ static inline void my_exit ( int x )
\
/* This checks for overlap after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
SizeT srclen = (Addr)src-(Addr)src_orig+1; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
(Addr)dst-(Addr)dst_orig+1, \
(Addr)src-(Addr)src_orig+1)) \
srclen)) \
RECORD_OVERLAP_ERROR("stpcpy", dst_orig, src_orig, 0); \
\
return dst; \
@ -1185,7 +1205,9 @@ static inline void my_exit ( int x )
while (m < n && *src) { m++; *dst++ = *src++; } \
/* Check for overlap after copying; all n bytes of dst are relevant, */ \
/* but only m+1 bytes of src if terminator was found */ \
if (is_overlap(dst_str, src_orig, n, (m < n) ? m+1 : n)) \
SizeT srclen = (m < n) ? m+1 : n; \
RECORD_COPY(srclen); \
if (is_overlap(dst_str, src_orig, n, srclen)) \
RECORD_OVERLAP_ERROR("stpncpy", dst, src, n); \
dst_str = dst; \
while (m++ < n) *dst++ = 0; /* must pad remainder with nulls */ \
@ -1200,9 +1222,6 @@ static inline void my_exit ( int x )
/*---------------------- memset ----------------------*/
/* Why are we bothering to intercept this? It seems entirely
pointless. */
#define MEMSET(soname, fnname) \
void* VG_REPLACE_FUNCTION_EZZ(20210,soname,fnname) \
(void *s, Int c, SizeT n); \
@ -1301,6 +1320,7 @@ static inline void my_exit ( int x )
void VG_REPLACE_FUNCTION_EZU(20230,soname,fnname) \
(const void *srcV, void *dstV, SizeT n) \
{ \
RECORD_COPY(n); \
SizeT i; \
HChar* dst = dstV; \
const HChar* src = srcV; \
@ -1338,6 +1358,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZU(20240,soname,fnname) \
(void *dstV, const void *srcV, SizeT n, SizeT destlen) \
{ \
RECORD_COPY(n); \
SizeT i; \
HChar* dst = dstV; \
const HChar* src = srcV; \
@ -1438,12 +1459,14 @@ static inline void my_exit ( int x )
char* VG_REPLACE_FUNCTION_EZU(20270,soname,fnname) \
(char* dst, const char* src, SizeT len) \
{ \
FOR_COPY(const HChar* src_orig = src); \
HChar* ret = dst; \
if (! len) \
goto badness; \
while ((*dst++ = *src++) != '\0') \
if (--len == 0) \
goto badness; \
RECORD_COPY((Addr)src-(Addr)src_orig); \
return ret; \
badness: \
VALGRIND_PRINTF_BACKTRACE( \
@ -1474,11 +1497,13 @@ static inline void my_exit ( int x )
char* VG_REPLACE_FUNCTION_EZU(20280,soname,fnname) \
(char* dst, const char* src, SizeT len) \
{ \
FOR_COPY(const HChar* src_orig = src); \
if (! len) \
goto badness; \
while ((*dst++ = *src++) != '\0') \
if (--len == 0) \
goto badness; \
RECORD_COPY((Addr)src-(Addr)src_orig); \
return dst - 1; \
badness: \
VALGRIND_PRINTF_BACKTRACE( \
@ -1508,6 +1533,7 @@ static inline void my_exit ( int x )
void* VG_REPLACE_FUNCTION_EZU(20290,soname,fnname) \
( void *dst, const void *src, SizeT len ) \
{ \
RECORD_COPY(len); \
SizeT len_saved = len; \
\
if (len == 0) \
@ -1557,15 +1583,13 @@ static inline void my_exit ( int x )
{ \
register HChar *d; \
register const HChar *s; \
\
if (dstlen < len) goto badness; \
\
if (dstlen < len) \
goto badness; \
RECORD_COPY(len); \
if (len == 0) \
return dst; \
\
if (is_overlap(dst, src, len, len)) \
RECORD_OVERLAP_ERROR("memcpy_chk", dst, src, len); \
\
if ( dst > src ) { \
d = (HChar *)dst + len - 1; \
s = (const HChar *)src + len - 1; \
@ -1977,11 +2001,14 @@ static inline void my_exit ( int x )
\
/* This checks for overlap after copying, unavoidable without */ \
/* pre-counting length... should be ok */ \
/* +4 because sizeof(wchar_t) == 4 */ \
SizeT srclen = (Addr)src-(Addr)src_orig+4; \
RECORD_COPY(srclen); \
if (is_overlap(dst_orig, \
src_orig, \
/* +4 because sizeof(wchar_t) == 4 */ \
(Addr)dst-(Addr)dst_orig+4, \
(Addr)src-(Addr)src_orig+4)) \
srclen)) \
RECORD_OVERLAP_ERROR("wcscpy", dst_orig, src_orig, 0); \
\
return dst_orig; \