Bug 446103 - Memcheck: --track-origins=yes causes extreme slowdowns for large mmap/munmap.

This patch rewrites the Level 2 origin-tracking cache (ocacheL2) so that
set-address-range-permissions (SARP) operations on it, for large ranges, are
at least a factor of 2.5 x faster.  This is primarily targeted at SARPs in the
range of hundreds to thousands of megabytes.  The Level 1 origin-tracking
cache covers 64MB address space, so SARPs that fit within it are mostly
unaffected.  There are extensive comments in-line.  Changes are:

* Change the Level 2 cache from a single AVL tree (OSet) into 4096 such trees,
  selected by middle bits of the tag, hence "taking out" 12 significant bits
  of search in any given tree.

* For the OCacheLine type, use a union so as to overlay the w32 and descr
  arrays with an array of 64-bit values.  This is used to speed up cases where
  those fields are to be set to zero, or checked against zero.

* Due to the various fast-paths added by this patch, OC_BITS_PER_LINE has
  pretty much been frozen at the current value, 5.

* ocache_sarp_Set_Origins, ocache_sarp_Clear_Origins: deal with large ranges
  in 32-byte steps instead of 4-byte steps.

* MC_(helperc_b_store32), MC_(helperc_b_store16): rewrite these to be (much)
  more efficient.

* fast-return cases for VG_(OSetGen_Lookup) and VG_(OSetGen_Remove) when the
  tree is empty

* a few extra inline hints
This commit is contained in:
Julian Seward 2021-12-08 07:52:09 +01:00
parent 23de71004b
commit 8ee1656165
3 changed files with 354 additions and 91 deletions

3
NEWS
View File

@ -57,10 +57,11 @@ are not entered into bugzilla tend to get forgotten about or ignored.
446139 DRD/Helgrind with std::shared_timed_mutex::try_lock_until and try_lock_shared_until false positives
446138 DRD/Helgrind with std::timed_mutex::try_lock_until false positives
446281 Add a DRD suppression for fwrite
446103 Memcheck: `--track-origins=yes` causes extreme slowdowns for large mmap/munmap
To see details of a given bug, visit
https://bugs.kde.org/show_bug.cgi?id=XXXXXX
where XXXXXX is the bug number as listed below.
where XXXXXX is the bug number as listed above.

View File

@ -570,7 +570,7 @@ void VG_(OSetWord_Insert)(AvlTree* t, UWord val)
/*--------------------------------------------------------------------*/
// Find the *node* in t matching k, or NULL if not found.
static AvlNode* avl_lookup(const AvlTree* t, const void* k)
static inline AvlNode* avl_lookup(const AvlTree* t, const void* k)
{
Word cmpres;
AvlNode* curr = t->root;
@ -604,9 +604,10 @@ static AvlNode* avl_lookup(const AvlTree* t, const void* k)
// Find the *element* in t matching k, or NULL if not found.
void* VG_(OSetGen_Lookup)(const AvlTree* t, const void* k)
{
AvlNode* n;
vg_assert(t);
n = avl_lookup(t, k);
if (LIKELY(t->root == NULL))
return NULL;
AvlNode* n = avl_lookup(t, k);
return ( n ? elem_of_node(n) : NULL );
}
@ -767,6 +768,8 @@ static Bool avl_removeroot(AvlTree* t)
// if not present.
void* VG_(OSetGen_Remove)(AvlTree* t, const void* k)
{
if (LIKELY(t->root == NULL))
return NULL;
// Have to find the node first, then remove it.
AvlNode* n = avl_lookup(t, k);
if (n) {

View File

@ -2402,7 +2402,9 @@ static UWord stats_ocacheL1_misses = 0;
static UWord stats_ocacheL1_lossage = 0;
static UWord stats_ocacheL1_movefwds = 0;
static UWord stats__ocacheL2_refs = 0;
static UWord stats__ocacheL2_finds = 0;
static UWord stats__ocacheL2_adds = 0;
static UWord stats__ocacheL2_dels = 0;
static UWord stats__ocacheL2_misses = 0;
static UWord stats__ocacheL2_n_nodes_max = 0;
@ -2431,26 +2433,86 @@ static INLINE Bool is_valid_oc_tag ( Addr tag ) {
#define OC_MOVE_FORWARDS_EVERY_BITS 7
/* Originally (pre Dec 2021) it was the case that this code had a
parameterizable cache line size, set by changing OC_BITS_PER_LINE.
However, as a result of the speedup fixes necessitated by bug 446103, that
is no longer really the case, and much of the L1 and L2 cache code has been
tuned specifically for the case OC_BITS_PER_LINE == 5 (that is, the line
size is 32 bytes). Changing that would require a bunch of re-tuning
effort. So let's set it in stone for now. */
STATIC_ASSERT(OC_BITS_PER_LINE == 5);
STATIC_ASSERT(OC_LINES_PER_SET == 2);
/* Fundamentally we want an OCacheLine structure (see below) as follows:
struct {
Addr tag;
UInt w32 [OC_W32S_PER_LINE];
UChar descr[OC_W32S_PER_LINE];
}
However, in various places, we want to set the w32[] and descr[] arrays to
zero, or check if they are zero. This can be a very hot path (per bug
446103). So, instead, we have a union which is either those two arrays
(OCacheLine_Main) or simply an array of ULongs (OCacheLine_W64s). For the
set-zero/test-zero operations, the OCacheLine_W64s are used.
*/
// To ensure that OCacheLine.descr[] will fit in an integral number of ULongs.
STATIC_ASSERT(0 == (OC_W32S_PER_LINE % 8));
#define OC_W64S_PER_MAIN /* "MAIN" meaning "struct OCacheLine_Main" */ \
(OC_W32S_PER_LINE / 2 /* covers OCacheLine_Main.w32[] */ \
+ OC_W32S_PER_LINE / 8) /* covers OCacheLine_Main.descr[] */
STATIC_ASSERT(OC_W64S_PER_MAIN == 5);
typedef
ULong OCacheLine_W64s[OC_W64S_PER_MAIN];
typedef
struct {
UInt w32 [OC_W32S_PER_LINE];
UChar descr[OC_W32S_PER_LINE];
}
OCacheLine_Main;
STATIC_ASSERT(sizeof(OCacheLine_W64s) == sizeof(OCacheLine_Main));
typedef
struct {
Addr tag;
UInt w32[OC_W32S_PER_LINE];
UChar descr[OC_W32S_PER_LINE];
union {
OCacheLine_W64s w64s;
OCacheLine_Main main;
} u;
}
OCacheLine;
/* Classify and also sanity-check 'line'. Return 'e' (empty) if not
in use, 'n' (nonzero) if it contains at least one valid origin tag,
and 'z' if all the represented tags are zero. */
static UChar classify_OCacheLine ( OCacheLine* line )
static inline UChar classify_OCacheLine ( OCacheLine* line )
{
UWord i;
if (line->tag == 1/*invalid*/)
return 'e'; /* EMPTY */
tl_assert(is_valid_oc_tag(line->tag));
// BEGIN fast special-case of the test loop below. This will detect
// zero-ness (case 'z') for a subset of cases that the loop below will,
// hence is safe.
if (OC_W64S_PER_MAIN == 5) {
if (line->u.w64s[0] == 0
&& line->u.w64s[1] == 0 && line->u.w64s[2] == 0
&& line->u.w64s[3] == 0 && line->u.w64s[4] == 0) {
return 'z';
}
} else {
tl_assert2(0, "unsupported line size (classify_OCacheLine)");
}
// END fast special-case of the test loop below.
for (i = 0; i < OC_W32S_PER_LINE; i++) {
tl_assert(0 == ((~0xF) & line->descr[i]));
if (line->w32[i] > 0 && line->descr[i] > 0)
tl_assert(0 == ((~0xF) & line->u.main.descr[i]));
if (line->u.main.w32[i] > 0 && line->u.main.descr[i] > 0)
return 'n'; /* NONZERO - contains useful info */
}
return 'z'; /* ZERO - no useful info */
@ -2491,7 +2553,7 @@ static void init_OCache ( void )
init_ocacheL2();
}
static void moveLineForwards ( OCacheSet* set, UWord lineno )
static inline void moveLineForwards ( OCacheSet* set, UWord lineno )
{
OCacheLine tmp;
stats_ocacheL1_movefwds++;
@ -2501,11 +2563,23 @@ static void moveLineForwards ( OCacheSet* set, UWord lineno )
set->line[lineno] = tmp;
}
static void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) {
static inline void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) {
UWord i;
for (i = 0; i < OC_W32S_PER_LINE; i++) {
line->w32[i] = 0; /* NO ORIGIN */
line->descr[i] = 0; /* REALLY REALLY NO ORIGIN! */
if (OC_W32S_PER_LINE == 8) {
// BEGIN fast special-case of the loop below
tl_assert(OC_W64S_PER_MAIN == 5);
line->u.w64s[0] = 0;
line->u.w64s[1] = 0;
line->u.w64s[2] = 0;
line->u.w64s[3] = 0;
line->u.w64s[4] = 0;
// END fast special-case of the loop below
} else {
tl_assert2(0, "unsupported line size (zeroise_OCacheLine)");
for (i = 0; i < OC_W32S_PER_LINE; i++) {
line->u.main.w32[i] = 0; /* NO ORIGIN */
line->u.main.descr[i] = 0; /* REALLY REALLY NO ORIGIN! */
}
}
line->tag = tag;
}
@ -2513,7 +2587,37 @@ static void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) {
//////////////////////////////////////////////////////////////
//// OCache backing store
static OSet* ocacheL2 = NULL;
// The backing store for ocacheL1 is, conceptually, an AVL tree of lines that
// got ejected from the L1 (a "victim cache"), and which actually contain
// useful info -- that is, for which classify_OCacheLine would return 'n' and
// no other value. However, the tree can grow large, and searching/updating
// it can be hot paths. Hence we "take out" 12 significant bits of the key by
// having 4096 trees, and select one using HASH_OCACHE_TAG.
//
// What that hash function returns isn't important so long as it is a pure
// function of the tag values, and is < 4096. However, it is critical for
// performance of long SARPs. Hence the extra shift of 11 bits. This means
// each tree conceptually is assigned to contiguous sequences of 2048 lines in
// the "line address space", giving some locality of reference when scanning
// linearly through address space, as is done by a SARP. Changing that 11 to
// 0 gives terrible performance on long SARPs, presumably because each new
// line is in a different tree, hence we wind up thrashing the (CPU's) caches.
//
// On 32-bit targets, we have to be a bit careful not to shift out so many
// bits that not all 2^12 trees get used. That leads to the constraint
// (OC_BITS_PER_LINE + 11 + 12) < 32. Note that the 11 is the only thing we
// can change here. In this case we have OC_BITS_PER_LINE == 5, hence the
// inequality is (28 < 32) and so we're good.
//
// The value 11 was determined empirically from various Firefox runs. 10 or
// 12 also work pretty well.
static OSet* ocachesL2[4096];
STATIC_ASSERT((OC_BITS_PER_LINE + 11 + 12) < 32);
static inline UInt HASH_OCACHE_TAG ( Addr tag ) {
return (UInt)((tag >> (OC_BITS_PER_LINE + 11)) & 0xFFF);
}
static void* ocacheL2_malloc ( const HChar* cc, SizeT szB ) {
return VG_(malloc)(cc, szB);
@ -2527,23 +2631,26 @@ static UWord stats__ocacheL2_n_nodes = 0;
static void init_ocacheL2 ( void )
{
tl_assert(!ocacheL2);
tl_assert(sizeof(Word) == sizeof(Addr)); /* since OCacheLine.tag :: Addr */
tl_assert(0 == offsetof(OCacheLine,tag));
ocacheL2
= VG_(OSetGen_Create)( offsetof(OCacheLine,tag),
NULL, /* fast cmp */
ocacheL2_malloc, "mc.ioL2", ocacheL2_free);
for (UInt i = 0; i < 4096; i++) {
tl_assert(!ocachesL2[i]);
ocachesL2[i]
= VG_(OSetGen_Create)( offsetof(OCacheLine,tag),
NULL, /* fast cmp */
ocacheL2_malloc, "mc.ioL2", ocacheL2_free);
}
stats__ocacheL2_n_nodes = 0;
}
/* Find line with the given tag in the tree, or NULL if not found. */
static OCacheLine* ocacheL2_find_tag ( Addr tag )
static inline OCacheLine* ocacheL2_find_tag ( Addr tag )
{
OCacheLine* line;
tl_assert(is_valid_oc_tag(tag));
stats__ocacheL2_refs++;
line = VG_(OSetGen_Lookup)( ocacheL2, &tag );
stats__ocacheL2_finds++;
OSet* oset = ocachesL2[HASH_OCACHE_TAG(tag)];
line = VG_(OSetGen_Lookup)( oset, &tag );
return line;
}
@ -2553,10 +2660,11 @@ static void ocacheL2_del_tag ( Addr tag )
{
OCacheLine* line;
tl_assert(is_valid_oc_tag(tag));
stats__ocacheL2_refs++;
line = VG_(OSetGen_Remove)( ocacheL2, &tag );
stats__ocacheL2_dels++;
OSet* oset = ocachesL2[HASH_OCACHE_TAG(tag)];
line = VG_(OSetGen_Remove)( oset, &tag );
if (line) {
VG_(OSetGen_FreeNode)(ocacheL2, line);
VG_(OSetGen_FreeNode)(oset, line);
tl_assert(stats__ocacheL2_n_nodes > 0);
stats__ocacheL2_n_nodes--;
}
@ -2568,10 +2676,11 @@ static void ocacheL2_add_line ( OCacheLine* line )
{
OCacheLine* copy;
tl_assert(is_valid_oc_tag(line->tag));
copy = VG_(OSetGen_AllocNode)( ocacheL2, sizeof(OCacheLine) );
OSet* oset = ocachesL2[HASH_OCACHE_TAG(line->tag)];
copy = VG_(OSetGen_AllocNode)( oset, sizeof(OCacheLine) );
*copy = *line;
stats__ocacheL2_refs++;
VG_(OSetGen_Insert)( ocacheL2, copy );
stats__ocacheL2_adds++;
VG_(OSetGen_Insert)( oset, copy );
stats__ocacheL2_n_nodes++;
if (stats__ocacheL2_n_nodes > stats__ocacheL2_n_nodes_max)
stats__ocacheL2_n_nodes_max = stats__ocacheL2_n_nodes;
@ -2626,7 +2735,7 @@ static OCacheLine* find_OCacheLine_SLOW ( Addr a )
/* line contains zeroes. We must ensure the backing store is
updated accordingly, either by copying the line there
verbatim, or by ensuring it isn't present there. We
chosse the latter on the basis that it reduces the size of
choose the latter on the basis that it reduces the size of
the backing store. */
ocacheL2_del_tag( victim->tag );
break;
@ -2697,10 +2806,10 @@ static INLINE void set_aligned_word64_Origin_to_undef ( Addr a, UInt otag )
&& lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
}
line = find_OCacheLine( a );
line->descr[lineoff+0] = 0xF;
line->descr[lineoff+1] = 0xF;
line->w32[lineoff+0] = otag;
line->w32[lineoff+1] = otag;
line->u.main.descr[lineoff+0] = 0xF;
line->u.main.descr[lineoff+1] = 0xF;
line->u.main.w32[lineoff+0] = otag;
line->u.main.w32[lineoff+1] = otag;
}
//// END inlined, specialised version of MC_(helperc_b_store8)
}
@ -2751,8 +2860,8 @@ void make_aligned_word32_undefined_w_otag ( Addr a, UInt otag )
tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
}
line = find_OCacheLine( a );
line->descr[lineoff] = 0xF;
line->w32[lineoff] = otag;
line->u.main.descr[lineoff] = 0xF;
line->u.main.w32[lineoff] = otag;
}
//// END inlined, specialised version of MC_(helperc_b_store4)
}
@ -2788,7 +2897,7 @@ void make_aligned_word32_noaccess ( Addr a )
tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
}
line = find_OCacheLine( a );
line->descr[lineoff] = 0;
line->u.main.descr[lineoff] = 0;
}
//// END inlined, specialised version of MC_(helperc_b_store4)
}
@ -2834,10 +2943,10 @@ void make_aligned_word64_undefined_w_otag ( Addr a, UInt otag )
tl_assert(lineoff >= 0
&& lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
line = find_OCacheLine( a );
line->descr[lineoff+0] = 0xF;
line->descr[lineoff+1] = 0xF;
line->w32[lineoff+0] = otag;
line->w32[lineoff+1] = otag;
line->u.main.descr[lineoff+0] = 0xF;
line->u.main.descr[lineoff+1] = 0xF;
line->u.main.w32[lineoff+0] = otag;
line->u.main.w32[lineoff+1] = otag;
}
//// END inlined, specialised version of MC_(helperc_b_store8)
}
@ -2872,8 +2981,8 @@ void make_aligned_word64_noaccess ( Addr a )
tl_assert(lineoff >= 0
&& lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
line = find_OCacheLine( a );
line->descr[lineoff+0] = 0;
line->descr[lineoff+1] = 0;
line->u.main.descr[lineoff+0] = 0;
line->u.main.descr[lineoff+1] = 0;
}
//// END inlined, specialised version of MC_(helperc_b_store8)
}
@ -7399,7 +7508,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load1)( Addr a ) {
line = find_OCacheLine( a );
descr = line->descr[lineoff];
descr = line->u.main.descr[lineoff];
if (OC_ENABLE_ASSERTIONS) {
tl_assert(descr < 0x10);
}
@ -7407,7 +7516,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load1)( Addr a ) {
if (LIKELY(0 == (descr & (1 << byteoff)))) {
return 0;
} else {
return line->w32[lineoff];
return line->u.main.w32[lineoff];
}
}
@ -7431,7 +7540,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load2)( Addr a ) {
}
line = find_OCacheLine( a );
descr = line->descr[lineoff];
descr = line->u.main.descr[lineoff];
if (OC_ENABLE_ASSERTIONS) {
tl_assert(descr < 0x10);
}
@ -7439,7 +7548,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load2)( Addr a ) {
if (LIKELY(0 == (descr & (3 << byteoff)))) {
return 0;
} else {
return line->w32[lineoff];
return line->u.main.w32[lineoff];
}
}
@ -7462,7 +7571,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load4)( Addr a ) {
line = find_OCacheLine( a );
descr = line->descr[lineoff];
descr = line->u.main.descr[lineoff];
if (OC_ENABLE_ASSERTIONS) {
tl_assert(descr < 0x10);
}
@ -7470,7 +7579,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load4)( Addr a ) {
if (LIKELY(0 == descr)) {
return 0;
} else {
return line->w32[lineoff];
return line->u.main.w32[lineoff];
}
}
@ -7493,8 +7602,8 @@ UWord VG_REGPARM(1) MC_(helperc_b_load8)( Addr a ) {
line = find_OCacheLine( a );
descrLo = line->descr[lineoff + 0];
descrHi = line->descr[lineoff + 1];
descrLo = line->u.main.descr[lineoff + 0];
descrHi = line->u.main.descr[lineoff + 1];
descr = descrLo | descrHi;
if (OC_ENABLE_ASSERTIONS) {
tl_assert(descr < 0x10);
@ -7503,8 +7612,8 @@ UWord VG_REGPARM(1) MC_(helperc_b_load8)( Addr a ) {
if (LIKELY(0 == descr)) {
return 0; /* both 32-bit chunks are defined */
} else {
UInt oLo = descrLo == 0 ? 0 : line->w32[lineoff + 0];
UInt oHi = descrHi == 0 ? 0 : line->w32[lineoff + 1];
UInt oLo = descrLo == 0 ? 0 : line->u.main.w32[lineoff + 0];
UInt oHi = descrHi == 0 ? 0 : line->u.main.w32[lineoff + 1];
return merge_origins(oLo, oHi);
}
}
@ -7543,10 +7652,10 @@ void VG_REGPARM(2) MC_(helperc_b_store1)( Addr a, UWord d32 ) {
line = find_OCacheLine( a );
if (d32 == 0) {
line->descr[lineoff] &= ~(1 << byteoff);
line->u.main.descr[lineoff] &= ~(1 << byteoff);
} else {
line->descr[lineoff] |= (1 << byteoff);
line->w32[lineoff] = d32;
line->u.main.descr[lineoff] |= (1 << byteoff);
line->u.main.w32[lineoff] = d32;
}
}
@ -7571,10 +7680,10 @@ void VG_REGPARM(2) MC_(helperc_b_store2)( Addr a, UWord d32 ) {
line = find_OCacheLine( a );
if (d32 == 0) {
line->descr[lineoff] &= ~(3 << byteoff);
line->u.main.descr[lineoff] &= ~(3 << byteoff);
} else {
line->descr[lineoff] |= (3 << byteoff);
line->w32[lineoff] = d32;
line->u.main.descr[lineoff] |= (3 << byteoff);
line->u.main.w32[lineoff] = d32;
}
}
@ -7597,14 +7706,15 @@ void VG_REGPARM(2) MC_(helperc_b_store4)( Addr a, UWord d32 ) {
line = find_OCacheLine( a );
if (d32 == 0) {
line->descr[lineoff] = 0;
line->u.main.descr[lineoff] = 0;
} else {
line->descr[lineoff] = 0xF;
line->w32[lineoff] = d32;
line->u.main.descr[lineoff] = 0xF;
line->u.main.w32[lineoff] = d32;
}
}
void VG_REGPARM(2) MC_(helperc_b_store8)( Addr a, UWord d32 ) {
STATIC_ASSERT(OC_W32S_PER_LINE == 8);
OCacheLine* line;
UWord lineoff;
@ -7623,26 +7733,98 @@ void VG_REGPARM(2) MC_(helperc_b_store8)( Addr a, UWord d32 ) {
line = find_OCacheLine( a );
if (d32 == 0) {
line->descr[lineoff + 0] = 0;
line->descr[lineoff + 1] = 0;
line->u.main.descr[lineoff + 0] = 0;
line->u.main.descr[lineoff + 1] = 0;
} else {
line->descr[lineoff + 0] = 0xF;
line->descr[lineoff + 1] = 0xF;
line->w32[lineoff + 0] = d32;
line->w32[lineoff + 1] = d32;
line->u.main.descr[lineoff + 0] = 0xF;
line->u.main.descr[lineoff + 1] = 0xF;
line->u.main.w32[lineoff + 0] = d32;
line->u.main.w32[lineoff + 1] = d32;
}
}
void VG_REGPARM(2) MC_(helperc_b_store16)( Addr a, UWord d32 ) {
MC_(helperc_b_store8)( a + 0, d32 );
MC_(helperc_b_store8)( a + 8, d32 );
STATIC_ASSERT(OC_W32S_PER_LINE == 8);
OCacheLine* line;
UWord lineoff;
if (UNLIKELY(a & 15)) {
/* Handle misaligned case, slowly. */
MC_(helperc_b_store8)( a + 0, d32 );
MC_(helperc_b_store8)( a + 8, d32 );
return;
}
lineoff = oc_line_offset(a);
if (OC_ENABLE_ASSERTIONS) {
tl_assert(lineoff == (lineoff & 4)); /*0,4*//*since 16-aligned*/
}
line = find_OCacheLine( a );
if (d32 == 0) {
line->u.main.descr[lineoff + 0] = 0;
line->u.main.descr[lineoff + 1] = 0;
line->u.main.descr[lineoff + 2] = 0;
line->u.main.descr[lineoff + 3] = 0;
} else {
line->u.main.descr[lineoff + 0] = 0xF;
line->u.main.descr[lineoff + 1] = 0xF;
line->u.main.descr[lineoff + 2] = 0xF;
line->u.main.descr[lineoff + 3] = 0xF;
line->u.main.w32[lineoff + 0] = d32;
line->u.main.w32[lineoff + 1] = d32;
line->u.main.w32[lineoff + 2] = d32;
line->u.main.w32[lineoff + 3] = d32;
}
}
void VG_REGPARM(2) MC_(helperc_b_store32)( Addr a, UWord d32 ) {
MC_(helperc_b_store8)( a + 0, d32 );
MC_(helperc_b_store8)( a + 8, d32 );
MC_(helperc_b_store8)( a + 16, d32 );
MC_(helperc_b_store8)( a + 24, d32 );
STATIC_ASSERT(OC_W32S_PER_LINE == 8);
OCacheLine* line;
UWord lineoff;
if (UNLIKELY(a & 31)) {
/* Handle misaligned case, slowly. */
MC_(helperc_b_store16)( a + 0, d32 );
MC_(helperc_b_store16)( a + 16, d32 );
return;
}
lineoff = oc_line_offset(a);
if (OC_ENABLE_ASSERTIONS) {
tl_assert(lineoff == 0);
}
line = find_OCacheLine( a );
if (d32 == 0) {
line->u.main.descr[0] = 0;
line->u.main.descr[1] = 0;
line->u.main.descr[2] = 0;
line->u.main.descr[3] = 0;
line->u.main.descr[4] = 0;
line->u.main.descr[5] = 0;
line->u.main.descr[6] = 0;
line->u.main.descr[7] = 0;
} else {
line->u.main.descr[0] = 0xF;
line->u.main.descr[1] = 0xF;
line->u.main.descr[2] = 0xF;
line->u.main.descr[3] = 0xF;
line->u.main.descr[4] = 0xF;
line->u.main.descr[5] = 0xF;
line->u.main.descr[6] = 0xF;
line->u.main.descr[7] = 0xF;
line->u.main.w32[0] = d32;
line->u.main.w32[1] = d32;
line->u.main.w32[2] = d32;
line->u.main.w32[3] = d32;
line->u.main.w32[4] = d32;
line->u.main.w32[5] = d32;
line->u.main.w32[6] = d32;
line->u.main.w32[7] = d32;
}
}
@ -7650,6 +7832,9 @@ void VG_REGPARM(2) MC_(helperc_b_store32)( Addr a, UWord d32 ) {
/*--- Origin tracking: sarp handlers ---*/
/*--------------------------------------------*/
// We may get asked to do very large SARPs (bug 446103), hence it is important
// to process 32-byte chunks at a time when possible.
__attribute__((noinline))
static void ocache_sarp_Set_Origins ( Addr a, UWord len, UInt otag ) {
if ((a & 1) && len >= 1) {
@ -7662,9 +7847,40 @@ static void ocache_sarp_Set_Origins ( Addr a, UWord len, UInt otag ) {
a += 2;
len -= 2;
}
if (len >= 4)
tl_assert(0 == (a & 3));
while (len >= 4) {
if ((a & 4) && len >= 4) {
MC_(helperc_b_store4)( a, otag );
a += 4;
len -= 4;
}
if ((a & 8) && len >= 8) {
MC_(helperc_b_store8)( a, otag );
a += 8;
len -= 8;
}
if ((a & 16) && len >= 16) {
MC_(helperc_b_store16)( a, otag );
a += 16;
len -= 16;
}
if (len >= 32) {
tl_assert(0 == (a & 31));
while (len >= 32) {
MC_(helperc_b_store32)( a, otag );
a += 32;
len -= 32;
}
}
if (len >= 16) {
MC_(helperc_b_store16)( a, otag );
a += 16;
len -= 16;
}
if (len >= 8) {
MC_(helperc_b_store8)( a, otag );
a += 8;
len -= 8;
}
if (len >= 4) {
MC_(helperc_b_store4)( a, otag );
a += 4;
len -= 4;
@ -7694,9 +7910,40 @@ static void ocache_sarp_Clear_Origins ( Addr a, UWord len ) {
a += 2;
len -= 2;
}
if (len >= 4)
tl_assert(0 == (a & 3));
while (len >= 4) {
if ((a & 4) && len >= 4) {
MC_(helperc_b_store4)( a, 0 );
a += 4;
len -= 4;
}
if ((a & 8) && len >= 8) {
MC_(helperc_b_store8)( a, 0 );
a += 8;
len -= 8;
}
if ((a & 16) && len >= 16) {
MC_(helperc_b_store16)( a, 0 );
a += 16;
len -= 16;
}
if (len >= 32) {
tl_assert(0 == (a & 31));
while (len >= 32) {
MC_(helperc_b_store32)( a, 0 );
a += 32;
len -= 32;
}
}
if (len >= 16) {
MC_(helperc_b_store16)( a, 0 );
a += 16;
len -= 16;
}
if (len >= 8) {
MC_(helperc_b_store8)( a, 0 );
a += 8;
len -= 8;
}
if (len >= 4) {
MC_(helperc_b_store4)( a, 0 );
a += 4;
len -= 4;
@ -7846,10 +8093,14 @@ static void mc_post_clo_init ( void )
if (MC_(clo_mc_level) >= 3) {
init_OCache();
tl_assert(ocacheL1 != NULL);
tl_assert(ocacheL2 != NULL);
for (UInt i = 0; i < 4096; i++ ) {
tl_assert(ocachesL2[i] != NULL);
}
} else {
tl_assert(ocacheL1 == NULL);
tl_assert(ocacheL2 == NULL);
for (UInt i = 0; i < 4096; i++ ) {
tl_assert(ocachesL2[i] == NULL);
}
}
MC_(chunk_poolalloc) = VG_(newPA)
@ -7943,28 +8194,32 @@ static void mc_print_stats (void)
if (MC_(clo_mc_level) >= 3) {
VG_(message)(Vg_DebugMsg,
" ocacheL1: %'12lu refs %'12lu misses (%'lu lossage)\n",
" ocacheL1: %'14lu refs %'14lu misses (%'lu lossage)\n",
stats_ocacheL1_find,
stats_ocacheL1_misses,
stats_ocacheL1_lossage );
VG_(message)(Vg_DebugMsg,
" ocacheL1: %'12lu at 0 %'12lu at 1\n",
" ocacheL1: %'14lu at 0 %'14lu at 1\n",
stats_ocacheL1_find - stats_ocacheL1_misses
- stats_ocacheL1_found_at_1
- stats_ocacheL1_found_at_N,
stats_ocacheL1_found_at_1 );
VG_(message)(Vg_DebugMsg,
" ocacheL1: %'12lu at 2+ %'12lu move-fwds\n",
" ocacheL1: %'14lu at 2+ %'14lu move-fwds\n",
stats_ocacheL1_found_at_N,
stats_ocacheL1_movefwds );
VG_(message)(Vg_DebugMsg,
" ocacheL1: %'12lu sizeB %'12d useful\n",
" ocacheL1: %'14lu sizeB %'14d useful\n",
(SizeT)sizeof(OCache),
4 * OC_W32S_PER_LINE * OC_LINES_PER_SET * OC_N_SETS );
VG_(message)(Vg_DebugMsg,
" ocacheL2: %'12lu refs %'12lu misses\n",
stats__ocacheL2_refs,
" ocacheL2: %'14lu finds %'14lu misses\n",
stats__ocacheL2_finds,
stats__ocacheL2_misses );
VG_(message)(Vg_DebugMsg,
" ocacheL2: %'14lu adds %'14lu dels\n",
stats__ocacheL2_adds,
stats__ocacheL2_dels );
VG_(message)(Vg_DebugMsg,
" ocacheL2: %'9lu max nodes %'9lu curr nodes\n",
stats__ocacheL2_n_nodes_max,
@ -7974,7 +8229,9 @@ static void mc_print_stats (void)
stats__nia_cache_queries, stats__nia_cache_misses);
} else {
tl_assert(ocacheL1 == NULL);
tl_assert(ocacheL2 == NULL);
for (UInt i = 0; i < 4096; i++ ) {
tl_assert(ocachesL2[1] == NULL);
}
}
}
@ -8216,7 +8473,9 @@ static void mc_pre_clo_init(void)
if we need to, since the command line args haven't been
processed yet. Hence defer it to mc_post_clo_init. */
tl_assert(ocacheL1 == NULL);
tl_assert(ocacheL2 == NULL);
for (UInt i = 0; i < 4096; i++ ) {
tl_assert(ocachesL2[i] == NULL);
}
/* Check some important stuff. See extensive comments above
re UNALIGNED_OR_HIGH for background. */