This patch regains performance in helgrind following revision 15207,
which reduced memory use by doing SecMap GC but slowed down some
workloads (typically, workloads doing a lot of malloc/free).

A significant part of the slowdown came from clearing the filter,
which was not optimised for big ranges: the filter worked byte by
byte until reaching 8-byte alignment, then 8 bytes at a time.

With the patch, the filter clear is done the following way (see the
sketch after this list):
   * the bytes up to 8-byte alignment are cleared together
   * then 8 bytes at a time, until filter-line alignment (32 bytes)
   * then 32 bytes (one filter line) at a time.
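In outline, the new clear has the following three-phase structure.
This is only a minimal sketch: clear_partial, clear_8 and clear_line
are hypothetical stand-ins for the real filter primitives (which also
have to check the per-line tags); the real code is in the diff below.

#include <stdio.h>
#include <stdint.h>

typedef uintptr_t Addr;
typedef uintptr_t UWord;

/* Hypothetical stand-ins for the real filter primitives; here they
   only trace what a real implementation would invalidate. */
static void clear_partial (Addr a, UWord n)
   { printf ("bytes  %#lx +%lu\n", (unsigned long)a, (unsigned long)n); }
static void clear_8 (Addr a)
   { printf ("8bytes %#lx\n", (unsigned long)a); }
static void clear_line (Addr a)
   { printf ("line   %#lx\n", (unsigned long)a); }

/* Three-phase structure of the new filter clear (sketch only). */
static void clear_range_sketch ( Addr a, UWord len )
{
   Addr end = a + len;
   if ((a & 7) != 0 && a < end) {
      /* leading bytes up to 8-alignment, handled in one step */
      UWord n = 8 - (a & 7);
      if (n > len) n = len;
      clear_partial (a, n);
      a += n;
   }
   while ((a & 31) != 0 && a + 8 <= end) {
      clear_8 (a);                 /* 8 bytes at a time to 32-alignment */
      a += 8;
   }
   while (a + 32 <= end) {
      clear_line (a);              /* one 32-byte filter line at a time */
      a += 32;
   }
   while (a + 8 <= end) {
      clear_8 (a);                 /* trailing 8-byte chunks */
      a += 8;
   }
   if (a < end)
      clear_partial (a, end - a);  /* trailing bytes, in one step */
}

int main ( void )
{
   clear_range_sketch (0x1003, 100);  /* e.g. an unaligned 100-byte range */
   return 0;
}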

Moreover, as the filter cache is small (1024 lines of 32 bytes),
clearing the filter for ranges bigger than 32KB was uselessly
checking the same entries several times. This is now avoided by
using a range check rather than a tag equality check.
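Concretely, the full-line part of the clear now looks like the sketch
below. The constants mirror FI_NUM_LINES/FI_LINE_SZB from the real
code, but invalidate is a hypothetical callback and this is a
paraphrase of the loop visible in the diff further down, not the
patch itself.

#include <stdint.h>

#define FI_LINE_SZB   32u     /* bytes per filter line */
#define FI_NUM_LINES  1024u   /* filter cache entries  */

typedef uintptr_t Addr;
typedef uintptr_t UWord;

/* Sketch: clear the full-line part of a range.  The old approach
   effectively asked, for each of the range's full_len/32 lines,
   "is this line's tag cached here?" (a tag equality check), which
   for ranges > 32KB revisits the same cache entries.  Instead, visit
   each of the (at most FI_NUM_LINES) entries once and ask "does this
   entry's tag fall anywhere inside the range?" (a range check). */
void clear_full_lines_sketch ( Addr tags[FI_NUM_LINES],
                               UWord start_lineno,
                               Addr start, UWord full_len,
                               void (*invalidate)(UWord lineno) )
{
   UWord nfull = full_len / FI_LINE_SZB;
   if (nfull > FI_NUM_LINES)
      nfull = FI_NUM_LINES;   /* never examine the same entry twice */
   UWord lineno = start_lineno;
   for (UWord n = 0; n < nfull; n++) {
      /* unsigned-wraparound test for start <= tag < start + full_len */
      if (tags[lineno] - start < full_len)
         invalidate (lineno);
      lineno = (lineno + 1) % FI_NUM_LINES;  /* wrap around the cache */
   }
}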

As the new filter clear is significantly more complex than the
previous simple algorithm, the old algorithm is kept and used to
cross-check the new one when CHECK_ZSM is defined as 1.
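In miniature, the cross-check works as follows. The fast path is
factored out here as a hypothetical Filter__clear_range_FAST for
readability; in the actual patch (shown below) the fast code is
inline and a RETURN macro jumps to the final comparison.

static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
{
#  if CHECK_ZSM > 0
   /* Run the trusted slow algorithm on a copy of the filter... */
   Filter fi_check = *fi;
   Filter__clear_range_SLOW (&fi_check, a, len);
#  endif

   /* ...run the fast algorithm on the real filter... */
   Filter__clear_range_FAST (fi, a, len);

#  if CHECK_ZSM > 0
   /* ...and insist on bit-identical results. */
   tl_assert (VG_(memcmp)(&fi_check, fi, sizeof fi_check) == 0);
#  endif
}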

The patch also contains a few micro-optimisations and disables
   // VG_(track_die_mem_stack)       ( evh__die_mem );
as this callback had no effect and was somewhat costly.

With this patch, almost all perf tests are back to the performance
we had before revision 15207. Some tests are still slightly slower
than before the SecMap GC (at most about 3%, e.g. memrw), and some
tests are now significantly faster (e.g. sarp). For almost all
tests, we are now faster than valgrind 3.10.1. Details below.

Regtested on x86/amd64/ppc64 (and regtested with all compile-time
checks set). I have also regtested with libreoffice and firefox
(with firefox, also with CHECK_ZSM set to 1).


Details about performance:
   hgtrace         = this patch
   trunk_untouched = trunk
   base_secmap     = trunk before SecMap GC
   valgrind 3.10.1 is included for comparison
Measured on a Core i5 @ 2.53GHz. Each line below gives the native
run time, the run time under helgrind, the resulting slowdown
factor, and the difference in % relative to hgtrace (the "-----"
baseline).


-- Running  tests in perf ----------------------------------------------
-- bigcode1 --
bigcode1 hgtrace   :0.14s  he: 2.6s (18.4x, -----)
bigcode1 trunk_untouched:0.14s  he: 2.6s (18.4x, -0.4%)
bigcode1 base_secmap:0.14s  he: 2.6s (18.6x, -1.2%)
bigcode1 valgrind-3.10.1:0.14s  he: 2.8s (19.8x, -7.8%)
-- bigcode2 --
bigcode2 hgtrace   :0.14s  he: 6.3s (44.7x, -----)
bigcode2 trunk_untouched:0.14s  he: 6.2s (44.6x,  0.2%)
bigcode2 base_secmap:0.14s  he: 6.3s (45.0x, -0.6%)
bigcode2 valgrind-3.10.1:0.14s  he: 6.6s (47.1x, -5.4%)
-- bz2 --
bz2      hgtrace   :0.64s  he:11.3s (17.7x, -----)
bz2      trunk_untouched:0.64s  he:11.7s (18.2x, -3.2%)
bz2      base_secmap:0.64s  he:11.1s (17.3x,  1.9%)
bz2      valgrind-3.10.1:0.64s  he:12.6s (19.7x,-11.3%)
-- fbench --
fbench   hgtrace   :0.29s  he: 3.4s (11.8x, -----)
fbench   trunk_untouched:0.29s  he: 3.4s (11.7x,  0.6%)
fbench   base_secmap:0.29s  he: 3.6s (12.4x, -5.0%)
fbench   valgrind-3.10.1:0.29s  he: 3.5s (12.2x, -3.5%)
-- ffbench --
ffbench  hgtrace   :0.26s  he: 9.8s (37.7x, -----)
ffbench  trunk_untouched:0.26s  he:10.0s (38.4x, -1.9%)
ffbench  base_secmap:0.26s  he: 9.8s (37.8x, -0.2%)
ffbench  valgrind-3.10.1:0.26s  he:10.0s (38.4x, -1.9%)
-- heap --
heap     hgtrace   :0.11s  he: 9.2s (84.0x, -----)
heap     trunk_untouched:0.11s  he: 9.6s (87.1x, -3.7%)
heap     base_secmap:0.11s  he: 9.0s (81.9x,  2.5%)
heap     valgrind-3.10.1:0.11s  he: 9.1s (82.9x,  1.3%)
-- heap_pdb4 --
heap_pdb4 hgtrace   :0.13s  he:10.7s (82.3x, -----)
heap_pdb4 trunk_untouched:0.13s  he:11.0s (84.8x, -3.0%)
heap_pdb4 base_secmap:0.13s  he:10.5s (80.8x,  1.8%)
heap_pdb4 valgrind-3.10.1:0.13s  he:10.6s (81.8x,  0.7%)
-- many-loss-records --
many-loss-records hgtrace   :0.01s  he: 1.5s (152.0x, -----)
many-loss-records trunk_untouched:0.01s  he: 1.6s (157.0x, -3.3%)
many-loss-records base_secmap:0.01s  he: 1.6s (158.0x, -3.9%)
many-loss-records valgrind-3.10.1:0.01s  he: 1.7s (167.0x, -9.9%)
-- many-xpts --
many-xpts hgtrace   :0.03s  he: 2.8s (91.7x, -----)
many-xpts trunk_untouched:0.03s  he: 2.8s (94.7x, -3.3%)
many-xpts base_secmap:0.03s  he: 2.8s (94.0x, -2.5%)
many-xpts valgrind-3.10.1:0.03s  he: 2.9s (97.7x, -6.5%)
-- memrw --
memrw    hgtrace   :0.06s  he: 7.3s (121.2x, -----)
memrw    trunk_untouched:0.06s  he: 7.2s (120.3x,  0.7%)
memrw    base_secmap:0.06s  he: 7.1s (117.7x,  2.9%)
memrw    valgrind-3.10.1:0.06s  he: 8.1s (135.2x,-11.6%)
-- sarp --
sarp     hgtrace   :0.02s  he: 7.6s (378.5x, -----)
sarp     trunk_untouched:0.02s  he: 8.4s (422.0x,-11.5%)
sarp     base_secmap:0.02s  he: 8.6s (431.0x,-13.9%)
sarp     valgrind-3.10.1:0.02s  he: 8.8s (442.0x,-16.8%)
-- tinycc --
tinycc   hgtrace   :0.20s  he:12.4s (62.0x, -----)
tinycc   trunk_untouched:0.20s  he:12.6s (63.2x, -1.9%)
tinycc   base_secmap:0.20s  he:12.6s (63.0x, -1.6%)
tinycc   valgrind-3.10.1:0.20s  he:12.7s (63.5x, -2.3%)
-- Finished tests in perf ----------------------------------------------

== 12 programs, 48 timings =================




git-svn-id: svn://svn.valgrind.org/valgrind/trunk@15236
Philippe Waroquiers 2015-05-15 11:41:54 +00:00
parent ba3235fb91
commit 6d0f2f35c0
2 changed files with 156 additions and 4 deletions

--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c

@@ -5568,7 +5568,11 @@ static void hg_pre_clo_init ( void )
    VG_(track_die_mem_stack_signal)( evh__die_mem );
    VG_(track_die_mem_brk)         ( evh__die_mem_munmap );
    VG_(track_die_mem_munmap)      ( evh__die_mem_munmap );
-   VG_(track_die_mem_stack)       ( evh__die_mem );
+   /* evh__die_mem calls at the end libhb_srange_noaccess_NoFX
+      which has no effect. We do not use VG_(track_die_mem_stack),
+      as this would be an expensive way to do nothing. */
+   // VG_(track_die_mem_stack)       ( evh__die_mem );
 
    // FIXME: what is this for?
    VG_(track_ban_mem_stack)       (NULL);

--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c

@@ -3568,9 +3568,12 @@ static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
    }
 }
 
-static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
+/* Only used to verify the fast Filter__clear_range */
+__attribute__((unused))
+static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
 {
    //VG_(printf)("%lu ", len);
+   tl_assert (CHECK_ZSM);
    /* slowly do part preceding 8-alignment */
    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
       Filter__clear_1byte( fi, a );
@@ -3591,6 +3594,151 @@ static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
    }
 }
 
+static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
+{
+#  if CHECK_ZSM > 0
+   /* We check the more complex algorithm below against the simple one.
+      This check is very expensive: we first do the clear the slow way
+      on a copy of the data, then do it the fast way. On RETURN, we
+      check that the two results are equal. */
+   Filter fi_check = *fi;
+   Filter__clear_range_SLOW(&fi_check, a, len);
+#  define RETURN goto check_and_return
+#  else
+#  define RETURN return
+#  endif
+
+   Addr    begtag = FI_GET_TAG(a);      /* tag of range begin */
+   Addr    end    = a + len - 1;
+   Addr    endtag = FI_GET_TAG(end);    /* tag of range end */
+
+   UWord   rlen = len;                  /* remaining length to clear */
+
+   Addr    c = a;                       /* current position being cleared */
+   UWord   clineno = FI_GET_LINENO(c);  /* current lineno being cleared */
+   FiLine* cline;                       /* current line being cleared */
+   UWord   cloff;                       /* current offset in the line, when
+                                           clearing partial lines */
+   UShort  u16;
+
+   STATIC_ASSERT (FI_LINE_SZB == 32);
+   // The code below assumes filter lines are 32 bytes.
+
+   if (LIKELY(fi->tags[clineno] == begtag)) {
+      /* LIKELY for the heavy caller VG_(unknown_SP_update). */
+      /* The first filter line matches begtag.  If c is not at the
+         filter line begin, the code below clears the filter line
+         bytes starting from c. */
+      cline = &fi->lines[clineno];
+      cloff = (c - begtag) / 8;
+
+      /* First the byte(s) needed to reach 8-alignment */
+      if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
+         /* hiB is the nr of bytes (towards higher addresses) from c
+            needed to reach 8-alignment. */
+         UWord hiB = 8 - (c & 7);
+         /* Compute the 2-bit/byte mask representing the hiB bytes
+            [c..c+hiB[: the mask is C000, F000, FC00, FF00, FFC0, FFF0
+            or FFFC for the byte range 7..7 6..7 5..7 4..7 3..7 2..7
+            1..7 respectively. */
+         UShort mask = 0xFFFF << (16 - 2*hiB);
+
+         u16 = cline->u16s[cloff];
+         if (LIKELY(rlen >= hiB)) {
+            cline->u16s[cloff] = u16 & ~mask; /* clear all hiB bytes from c */
+            rlen -= hiB;
+            c += hiB;
+            cloff += 1;
+         } else {
+            /* Only have the bits for rlen bytes. */
+            mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
+            cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c */
+            RETURN; // We have cleared all we can.
+         }
+      }
+
+      /* c is now 8-aligned. Clear 8 aligned bytes at a time,
+         till c is filter-line aligned. */
+      while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
+         cline->u16s[cloff] = 0;
+         c += 8;
+         rlen -= 8;
+         cloff += 1;
+      }
+   } else {
+      c = begtag + FI_LINE_SZB;
+      if (c > end)
+         RETURN; // We have cleared all we can.
+      rlen -= c - a;
+   }
+
+   // We have changed c, so re-establish clineno.
+   clineno = FI_GET_LINENO(c);
+
+   if (rlen >= FI_LINE_SZB) {
+      /* Here, c is filter line-aligned. Clear the filter entries that
+         overlap with the part of the range, starting at c, that is
+         made of full lines. */
+      UWord nfull = rlen / FI_LINE_SZB;
+      UWord full_len = nfull * FI_LINE_SZB;
+      rlen -= full_len;
+      if (nfull > FI_NUM_LINES)
+         nfull = FI_NUM_LINES; // no need to check the same entry twice
+
+      for (UWord n = 0; n < nfull; n++) {
+         if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
+            cline = &fi->lines[clineno];
+            cline->u16s[0] = 0;
+            cline->u16s[1] = 0;
+            cline->u16s[2] = 0;
+            cline->u16s[3] = 0;
+            STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
+         }
+         clineno++;
+         if (UNLIKELY(clineno == FI_NUM_LINES))
+            clineno = 0;
+      }
+      c += full_len;
+      clineno = FI_GET_LINENO(c);
+   }
+
+   if (CHECK_ZSM) {
+      tl_assert(VG_IS_8_ALIGNED(c));
+      tl_assert(clineno == FI_GET_LINENO(c));
+   }
+
+   /* Do the last filter line, if it was not cleared as a full filter line */
+   if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
+      cline = &fi->lines[clineno];
+      cloff = (c - endtag) / 8;
+      if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);
+
+      /* c is 8-aligned. Clear 8 aligned bytes at a time,
+         till we have less than 8 bytes. */
+      while (rlen >= 8) {
+         cline->u16s[cloff] = 0;
+         c += 8;
+         rlen -= 8;
+         cloff += 1;
+      }
+
+      /* Then the remaining byte(s) */
+      if (rlen > 0) {
+         /* nr of bytes from c to reach end */
+         UWord loB = rlen;
+         /* Compute the mask representing the loB bytes [c..c+loB[:
+            the mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF. */
+         UShort mask = 0xFFFF >> (16 - 2*loB);
+
+         u16 = cline->u16s[cloff];
+         cline->u16s[cloff] = u16 & ~mask; /* clear all loB bytes from c */
+      }
+   }
+
+#  if CHECK_ZSM > 0
+  check_and_return:
+   tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
+#  endif
+#  undef RETURN
+}
 
 /* ------ Read handlers for the filter. ------ */
@@ -6786,7 +6934,7 @@ static void zsm_sset_range_noaccess (Addr addr, SizeT len)
       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
       for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
-         if (sm->linesZ[lz].dict[0] != SVal_INVALID)
+         if (LIKELY(sm->linesZ[lz].dict[0] != SVal_INVALID))
             rcdec_LineZ(&sm->linesZ[lz]);
       }
       for (UInt lf = 0; lf < sm->linesF_size; lf++) {
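
As an aside, the 2-bit-per-byte mask tables quoted in the comments of
the new Filter__clear_range above can be checked with a tiny
standalone program (not part of the patch):

#include <stdio.h>

/* Verifies the mask values quoted in the Filter__clear_range comments:
   hiB leading bytes (to reach 8-alignment) use 0xFFFF << (16 - 2*hiB),
   loB trailing bytes use 0xFFFF >> (16 - 2*loB), 2 bits per byte. */
int main ( void )
{
   for (unsigned hiB = 1; hiB <= 7; hiB++)
      printf ("hiB=%u mask=%04X\n", hiB,
              (0xFFFFu << (16 - 2*hiB)) & 0xFFFFu);
   /* expect: C000 F000 FC00 FF00 FFC0 FFF0 FFFC */
   for (unsigned loB = 1; loB <= 7; loB++)
      printf ("loB=%u mask=%04X\n", loB,
              0xFFFFu >> (16 - 2*loB));
   /* expect: 0003 000F 003F 00FF 03FF 0FFF 3FFF */
   return 0;
}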