Vectorise copy_address_range_perms for common cases. This gives a
speedup of about 40% on artificial programs which just do realloc() and
nothing else, and a speedup of about 3-4% on starting kpresenter-1.5.0
and loading a 16-slide presentation.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5880
This commit is contained in:
Julian Seward 2006-05-03 22:13:57 +00:00
parent e29f2850f6
commit 23ad3fce2f
2 changed files with 87 additions and 18 deletions

View File

@ -29,6 +29,10 @@ Post 3.1.0:
- Nick changed ExeContext gathering to not record/save extra zeroes at the
end. Saved 7% on perf/heap with --num-callers=50, and about 1% on
perf/tinycc.
- Julian vectorised copy_address_range_perms for common cases, which
gives a speedup of about 40% on artificial programs which just do
realloc() and nothing else, and a speedup of about 3-4% on starting
kpresenter-1.5.0 and loading a 16-slide presentation.
COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most

View File

@ -589,6 +589,28 @@ UChar get_vabits2 ( Addr a )
return extract_vabits2_from_vabits8(a, vabits8);
}
// Fetch the complete vabits8 byte that the secondary map holds for
// address 'a'.  The byte is located via SM_OFF(a) in the SecMap returned
// by get_secmap_for_reading(a).  No alignment check is made here; the
// name implies callers pass a 4-aligned address (TODO confirm).
//
// *** WARNING! ***
// Any time this function is called, if it is possible that any of the
// 4 2-bit fields in vabits8 are equal to VA_BITS2_PARTDEFINED, then the
// corresponding entry(s) in the sec-V-bits table must also be set!
static INLINE
UChar get_vabits8_for_aligned_word32 ( Addr a )
{
   SecMap* secmap = get_secmap_for_reading(a);
   return secmap->vabits8[SM_OFF(a)];
}
// Store 'vabits8' as the complete vabits8 byte for address 'a'.  The
// SecMap is obtained with get_secmap_for_writing(a) and the byte is
// selected with SM_OFF(a).  No alignment check is made here; the name
// implies callers pass a 4-aligned address (TODO confirm).
// NOTE(review): if any 2-bit field of 'vabits8' can be
// VA_BITS2_PARTDEFINED, the caller presumably has to update the
// matching sec-V-bits entries itself -- this function does not.
static INLINE
void set_vabits8_for_aligned_word32 ( Addr a, UChar vabits8 )
{
   SecMap* secmap = get_secmap_for_writing(a);
   secmap->vabits8[SM_OFF(a)] = vabits8;
}
// Forward declarations
// Accessors for the sec-V-bits table, keyed by address.  They are used
// (e.g. by MC_(copy_address_range_state)) whenever a byte's vabits2 is
// VA_BITS2_PARTDEFINED.  Their definitions are not visible in this
// excerpt -- presumably they appear later in the file.
static UWord get_sec_vbits8(Addr a);
static void set_sec_vbits8(Addr a, UWord vbits8);
@ -1227,35 +1249,81 @@ static void make_mem_defined_if_addressable ( Addr a, SizeT len )
/* Copy the shadow state of the 'len' bytes starting at 'src' onto the
   'len' bytes starting at 'dst'.  For every byte the 2-bit vabits2 value
   is copied, and where that value is VA_BITS2_PARTDEFINED the matching
   sec-V-bits entry is copied too.  Nothing is done when 'len' is zero or
   when 'src' == 'dst'.

   Two strategies are used:
   - Fast path: if the two ranges do not overlap and both start addresses
     are 4-aligned, state is moved one vabits8 byte (four addresses) at a
     time, with a byte-at-a-time fixup for the final 0..3 addresses.
   - Slow path: otherwise state is moved one address at a time, walking
     backwards when src < dst and forwards when src > dst so that an
     overlapping source is never overwritten before it has been read. */
void MC_(copy_address_range_state) ( Addr src, Addr dst, SizeT len )
{
   SizeT i, j;
   UChar vabits2, vabits8;
   Bool  aligned, nooverlap;

   DEBUG("MC_(copy_address_range_state)\n");
   PROF_EVENT(50, "MC_(copy_address_range_state)");

   if (len == 0 || src == dst)
      return;

   aligned = VG_IS_4_ALIGNED(src) && VG_IS_4_ALIGNED(dst);

   /* Decide overlap from the distance between the two start addresses
      rather than from src+len / dst+len, so that the test cannot be
      fooled by the sums wrapping around.  src != dst is known here. */
   nooverlap = (src < dst) ? (dst - src >= len) : (src - dst >= len);

   if (nooverlap && aligned) {

      /* Vectorised fast case, when no overlap and suitably aligned */
      /* vector loop */
      i = 0;
      while (len >= 4) {
         vabits8 = get_vabits8_for_aligned_word32( src+i );
         set_vabits8_for_aligned_word32( dst+i, vabits8 );
         if (EXPECTED_TAKEN(VA_BITS8_DEFINED == vabits8
                            || VA_BITS8_UNDEFINED == vabits8
                            || VA_BITS8_NOACCESS == vabits8)) {
            /* do nothing */
         } else {
            /* have to copy secondary map info: at least one of the
               four bytes may be partially defined */
            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+0 ))
               set_sec_vbits8( dst+i+0, get_sec_vbits8( src+i+0 ) );
            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+1 ))
               set_sec_vbits8( dst+i+1, get_sec_vbits8( src+i+1 ) );
            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+2 ))
               set_sec_vbits8( dst+i+2, get_sec_vbits8( src+i+2 ) );
            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+3 ))
               set_sec_vbits8( dst+i+3, get_sec_vbits8( src+i+3 ) );
         }
         i += 4;
         len -= 4;
      }
      /* fixup loop: the remaining 0..3 bytes, one at a time */
      while (len >= 1) {
         vabits2 = get_vabits2( src+i );
         set_vabits2( dst+i, vabits2 );
         if (VA_BITS2_PARTDEFINED == vabits2) {
            set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
         }
         i++;
         len--;
      }

   } else {

      /* We have to do things the slow way */
      if (src < dst) {
         /* dst is above src: copy from the top end downwards */
         for (i = 0, j = len-1; i < len; i++, j--) {
            PROF_EVENT(51, "MC_(copy_address_range_state)(loop)");
            vabits2 = get_vabits2( src+j );
            set_vabits2( dst+j, vabits2 );
            if (VA_BITS2_PARTDEFINED == vabits2) {
               set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) );
            }
         }
      }

      if (src > dst) {
         /* dst is below src: copy from the bottom end upwards */
         for (i = 0; i < len; i++) {
            PROF_EVENT(52, "MC_(copy_address_range_state)(loop)");
            vabits2 = get_vabits2( src+i );
            set_vabits2( dst+i, vabits2 );
            if (VA_BITS2_PARTDEFINED == vabits2) {
               set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
            }
         }
      }
   }
}
@ -4422,6 +4490,3 @@ VG_DETERMINE_INTERFACE_VERSION(mc_pre_clo_init)
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/