mirror of
https://github.com/Zenithsiz/ftmemsim-valgrind.git
synced 2026-02-03 18:13:01 +00:00
include/pub_tool_deduppoolalloc.h coregrind/pub_core_deduppoolalloc.h coregrind/m_deduppoolalloc.c and uses it (currently only) for the strings in m_debuginfo/storage.c The idea is that such ddup pool allocator will also be used for other highly duplicated information (e.g. the DiCFSI information), where significant gains can also be achieved. The dedup pool for strings also decreases significantly the memory needed by the read inline information (patch still to be committed, see bug 278972). When testing with a big executable (tacot_process), this reduces the size of the dinfo arena from trunk: 158941184/109760512 max/curr mmap'd, 156775944/107882728 max/curr, to ddup: 157892608/106614784 max/curr mmap'd, 156362160/101414712 max/curr (so 3Mb less mmap-ed once debug info is read, 1Mb less mmap-ed in peak, 6Mb less allocated once debug info is read). This is all gained due to the string which changes from: trunk: 17,434,704 in 266: di.storage.addStr.1 to ddup: 10,966,608 in 750: di.storage.addStr.1 (6.5Mb less memory used by strings) The gain in mmap-ed memory is smaller due to fragmentation. Probably one could decrease the fragmentation by using bigger size for the dedup pool, but then we would lose memory on the last allocated pool (and for small libraries, we often do not use much of a big pool block). Solution might be to increase the pool size but have a "shrink_block" operation. To be looked at in the future. In terms of performance, startup of a big executable (on an old pentium) is not influenced significantly (something like 0.1 seconds on 15 seconds startup for a big executable, on a slow pentium). The dedup pool uses a hash table. The hash function used currently is the VG_(adler32) check sum. It is reported (and visible also here) that this checksum is not a very good hash function (many collisions). To have statistics about collisions, use --stats -v -v -v As an example of the collisions, on the strings in debug info of memcheck tool on x86, one obtain: --4789-- dedupPA:di.storage.addStr.1 9983 allocs (8174 uniq) 11 pools (4820 bytes free in last pool) --4789-- nr occurences of chains of len N, N-plicated keys, N-plicated elts --4789-- N: 0 : nr chain 6975, nr keys 0, nr elts 0 --4789-- N: 1 : nr chain 3670, nr keys 6410, nr elts 8174 --4789-- N: 2 : nr chain 1070, nr keys 226, nr elts 0 --4789-- N: 3 : nr chain 304, nr keys 100, nr elts 0 --4789-- N: 4 : nr chain 104, nr keys 84, nr elts 0 --4789-- N: 5 : nr chain 72, nr keys 42, nr elts 0 --4789-- N: 6 : nr chain 44, nr keys 34, nr elts 0 --4789-- N: 7 : nr chain 18, nr keys 13, nr elts 0 --4789-- N: 8 : nr chain 17, nr keys 8, nr elts 0 --4789-- N: 9 : nr chain 4, nr keys 6, nr elts 0 --4789-- N:10 : nr chain 9, nr keys 4, nr elts 0 --4789-- N:11 : nr chain 1, nr keys 0, nr elts 0 --4789-- N:13 : nr chain 1, nr keys 1, nr elts 0 --4789-- total nr of unique chains: 12289, keys 6928, elts 8174 which shows that on 8174 different strings, we have only 6410 strings which have a unique hash value. As other examples, N:13 line shows we have 13 strings mapping to the same key. N:14 line shows we have 4 groups of 10 strings mapping to the same key, etc. So, adler32 is definitely a bad hash function. Trials have been done with another hash function, giving a much lower collision rate. So, a better (but still fast) hash function would probably be beneficial. To be looked at ... git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14029
401 lines
12 KiB
C
401 lines
12 KiB
C
|
|
/*--------------------------------------------------------------------*/
|
|
/*--- A separately-chained hash table. m_hashtable.c ---*/
|
|
/*--------------------------------------------------------------------*/
|
|
|
|
/*
|
|
This file is part of Valgrind, a dynamic binary instrumentation
|
|
framework.
|
|
|
|
Copyright (C) 2005-2013 Nicholas Nethercote
|
|
njn@valgrind.org
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License as
|
|
published by the Free Software Foundation; either version 2 of the
|
|
License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
02111-1307, USA.
|
|
|
|
The GNU General Public License is contained in the file COPYING.
|
|
*/
|
|
|
|
#include "pub_core_basics.h"
|
|
#include "pub_core_debuglog.h"
|
|
#include "pub_core_hashtable.h"
|
|
#include "pub_core_libcassert.h"
|
|
#include "pub_core_libcprint.h"
|
|
#include "pub_core_mallocfree.h"
|
|
|
|
/*--------------------------------------------------------------------*/
|
|
/*--- Declarations ---*/
|
|
/*--------------------------------------------------------------------*/
|
|
|
|
#define CHAIN_NO(key,tbl) (((UWord)(key)) % tbl->n_chains)
|
|
|
|
struct _VgHashTable {
|
|
UInt n_chains; // should be prime
|
|
UInt n_elements;
|
|
VgHashNode* iterNode; // current iterator node
|
|
UInt iterChain; // next chain to be traversed by the iterator
|
|
VgHashNode** chains; // expanding array of hash chains
|
|
Bool iterOK; // table safe to iterate over?
|
|
const HChar* name; // name of table (for debugging only)
|
|
};
|
|
|
|
#define N_HASH_PRIMES 20
|
|
|
|
static SizeT primes[N_HASH_PRIMES] = {
|
|
769UL, 1543UL, 3079UL, 6151UL,
|
|
12289UL, 24593UL, 49157UL, 98317UL,
|
|
196613UL, 393241UL, 786433UL, 1572869UL,
|
|
3145739UL, 6291469UL, 12582917UL, 25165843UL,
|
|
50331653UL, 100663319UL, 201326611UL, 402653189UL
|
|
};
|
|
|
|
/*--------------------------------------------------------------------*/
|
|
/*--- Functions ---*/
|
|
/*--------------------------------------------------------------------*/
|
|
|
|
VgHashTable VG_(HT_construct) ( const HChar* name )
|
|
{
|
|
/* Initialises to zero, ie. all entries NULL */
|
|
SizeT n_chains = primes[0];
|
|
SizeT sz = n_chains * sizeof(VgHashNode*);
|
|
VgHashTable table = VG_(calloc)("hashtable.Hc.1",
|
|
1, sizeof(struct _VgHashTable));
|
|
table->chains = VG_(calloc)("hashtable.Hc.2", 1, sz);
|
|
table->n_chains = n_chains;
|
|
table->n_elements = 0;
|
|
table->iterOK = True;
|
|
table->name = name;
|
|
vg_assert(name);
|
|
return table;
|
|
}
|
|
|
|
Int VG_(HT_count_nodes) ( VgHashTable table )
|
|
{
|
|
return table->n_elements;
|
|
}
|
|
|
|
static void resize ( VgHashTable table )
|
|
{
|
|
Int i;
|
|
SizeT sz;
|
|
SizeT old_chains = table->n_chains;
|
|
SizeT new_chains = old_chains + 1;
|
|
VgHashNode** chains;
|
|
VgHashNode * node;
|
|
|
|
/* If we've run out of primes, do nothing. */
|
|
if (old_chains == primes[N_HASH_PRIMES-1])
|
|
return;
|
|
|
|
vg_assert(old_chains >= primes[0]
|
|
&& old_chains < primes[N_HASH_PRIMES-1]);
|
|
|
|
for (i = 0; i < N_HASH_PRIMES; i++) {
|
|
if (primes[i] > new_chains) {
|
|
new_chains = primes[i];
|
|
break;
|
|
}
|
|
}
|
|
|
|
vg_assert(new_chains > old_chains);
|
|
vg_assert(new_chains > primes[0]
|
|
&& new_chains <= primes[N_HASH_PRIMES-1]);
|
|
|
|
VG_(debugLog)(
|
|
1, "hashtable",
|
|
"resizing table `%s' from %lu to %lu (total elems %lu)\n",
|
|
table->name, (UWord)old_chains, (UWord)new_chains,
|
|
(UWord)table->n_elements );
|
|
|
|
table->n_chains = new_chains;
|
|
sz = new_chains * sizeof(VgHashNode*);
|
|
chains = VG_(calloc)("hashtable.resize.1", 1, sz);
|
|
|
|
for (i = 0; i < old_chains; i++) {
|
|
node = table->chains[i];
|
|
while (node != NULL) {
|
|
VgHashNode* next = node->next;
|
|
UWord chain = CHAIN_NO(node->key, table);
|
|
node->next = chains[chain];
|
|
chains[chain] = node;
|
|
node = next;
|
|
}
|
|
}
|
|
|
|
VG_(free)(table->chains);
|
|
table->chains = chains;
|
|
}
|
|
|
|
/* Puts a new, heap allocated VgHashNode, into the VgHashTable. Prepends
|
|
the node to the appropriate chain. No duplicate key detection is done. */
|
|
void VG_(HT_add_node) ( VgHashTable table, void* vnode )
|
|
{
|
|
VgHashNode* node = (VgHashNode*)vnode;
|
|
UWord chain = CHAIN_NO(node->key, table);
|
|
node->next = table->chains[chain];
|
|
table->chains[chain] = node;
|
|
table->n_elements++;
|
|
if ( (1 * (ULong)table->n_elements) > (1 * (ULong)table->n_chains) ) {
|
|
resize(table);
|
|
}
|
|
|
|
/* Table has been modified; hence HT_Next should assert. */
|
|
table->iterOK = False;
|
|
}
|
|
|
|
/* Looks up a VgHashNode by key in the table. Returns NULL if not found. */
|
|
void* VG_(HT_lookup) ( VgHashTable table, UWord key )
|
|
{
|
|
VgHashNode* curr = table->chains[ CHAIN_NO(key, table) ];
|
|
|
|
while (curr) {
|
|
if (key == curr->key) {
|
|
return curr;
|
|
}
|
|
curr = curr->next;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Looks up a VgHashNode by node in the table. Returns NULL if not found.
|
|
GEN!!! marks the lines that differs from VG_(HT_lookup). */
|
|
void* VG_(HT_gen_lookup) ( VgHashTable table, void* node, HT_Cmp_t cmp )
|
|
{
|
|
VgHashNode* hnode = (VgHashNode*) node; // GEN!!!
|
|
VgHashNode* curr = table->chains[ CHAIN_NO(hnode->key, table) ]; // GEN!!!
|
|
|
|
while (curr) {
|
|
if (cmp (hnode, curr) == 0) { // GEN!!!
|
|
return curr;
|
|
}
|
|
curr = curr->next;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Removes a VgHashNode from the table. Returns NULL if not found. */
|
|
void* VG_(HT_remove) ( VgHashTable table, UWord key )
|
|
{
|
|
UWord chain = CHAIN_NO(key, table);
|
|
VgHashNode* curr = table->chains[chain];
|
|
VgHashNode** prev_next_ptr = &(table->chains[chain]);
|
|
|
|
/* Table has been modified; hence HT_Next should assert. */
|
|
table->iterOK = False;
|
|
|
|
while (curr) {
|
|
if (key == curr->key) {
|
|
*prev_next_ptr = curr->next;
|
|
table->n_elements--;
|
|
return curr;
|
|
}
|
|
prev_next_ptr = &(curr->next);
|
|
curr = curr->next;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Removes a VgHashNode by node from the table. Returns NULL if not found.
|
|
GEN!!! marks the lines that differs from VG_(HT_remove). */
|
|
void* VG_(HT_gen_remove) ( VgHashTable table, void* node, HT_Cmp_t cmp )
|
|
{
|
|
VgHashNode* hnode = (VgHashNode*) node; // GEN!!!
|
|
UWord chain = CHAIN_NO(hnode->key, table); // GEN!!!
|
|
VgHashNode* curr = table->chains[chain];
|
|
VgHashNode** prev_next_ptr = &(table->chains[chain]);
|
|
|
|
/* Table has been modified; hence HT_Next should assert. */
|
|
table->iterOK = False;
|
|
|
|
while (curr) {
|
|
if (cmp(hnode, curr) == 0) { // GEN!!!
|
|
*prev_next_ptr = curr->next;
|
|
table->n_elements--;
|
|
return curr;
|
|
}
|
|
prev_next_ptr = &(curr->next);
|
|
curr = curr->next;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
void VG_(HT_print_stats) ( VgHashTable table, HT_Cmp_t cmp )
|
|
{
|
|
#define MAXOCCUR 20
|
|
UInt elt_occurences[MAXOCCUR];
|
|
UInt key_occurences[MAXOCCUR];
|
|
UInt cno_occurences[MAXOCCUR];
|
|
/* Key occurence : how many ht elements have the same key.
|
|
elt_occurences : how many elements are inserted multiple time.
|
|
cno_occurences : how many chains have that length.
|
|
The last entry in these arrays collects all occurences >= MAXOCCUR-1. */
|
|
#define INCOCCUR(occur,n) (n >= MAXOCCUR ? occur[n-1]++ : occur[n]++)
|
|
UInt i;
|
|
UInt nkey, nelt, ncno;
|
|
VgHashNode *cnode, *node;
|
|
|
|
for (i = 0; i < 20; i++) {
|
|
key_occurences[i] = 0;
|
|
elt_occurences[i] = 0;
|
|
cno_occurences[i] = 0;
|
|
}
|
|
|
|
// Note that the below algorithm is quadractic in nr of elements in a chain
|
|
// but if that happens, the hash table/function is really bad and that
|
|
// should be fixed.
|
|
for (i = 0; i < table->n_chains; i++) {
|
|
ncno = 0;
|
|
for (cnode = table->chains[i]; cnode != NULL; cnode = cnode->next) {
|
|
ncno++;
|
|
|
|
nkey = 0;
|
|
// Is the same cnode->key existing before cnode ?
|
|
for (node = table->chains[i]; node != cnode; node = node->next) {
|
|
if (node->key == cnode->key)
|
|
nkey++;
|
|
}
|
|
// If cnode->key not in a previous node, count occurences of key.
|
|
if (nkey == 0) {
|
|
for (node = cnode; node != NULL; node = node->next) {
|
|
if (node->key == cnode->key)
|
|
nkey++;
|
|
}
|
|
INCOCCUR(key_occurences, nkey);
|
|
}
|
|
|
|
nelt = 0;
|
|
// Is the same cnode element existing before cnode ?
|
|
for (node = table->chains[i]; node != cnode; node = node->next) {
|
|
if (cmp) {
|
|
if ((*cmp)(node, cnode) == 0)
|
|
nelt++;
|
|
} else
|
|
if (node->key == cnode->key)
|
|
nelt++;
|
|
}
|
|
// If cnode element not in a previous node, count occurences of elt.
|
|
if (nelt == 0) {
|
|
for (node = cnode; node != NULL; node = node->next) {
|
|
if (cmp) {
|
|
if ((*cmp)(node, cnode) == 0)
|
|
nelt++;
|
|
} else
|
|
if (node->key == cnode->key)
|
|
nelt++;
|
|
}
|
|
INCOCCUR(elt_occurences, nelt);
|
|
}
|
|
}
|
|
INCOCCUR(cno_occurences, ncno);
|
|
}
|
|
|
|
VG_(message)(Vg_DebugMsg,
|
|
"nr occurences of"
|
|
" chains of len N,"
|
|
" N-plicated keys,"
|
|
" N-plicated elts\n");
|
|
nkey = nelt = ncno = 0;
|
|
for (i = 0; i < MAXOCCUR; i++) {
|
|
if (elt_occurences[i] > 0 || key_occurences[i] > 0 || cno_occurences[i] > 0)
|
|
VG_(message)(Vg_DebugMsg,
|
|
"N:%2d : nr chain %6d, nr keys %6d, nr elts %6d\n",
|
|
i, cno_occurences[i], key_occurences[i], elt_occurences[i]);
|
|
nkey += key_occurences[i];
|
|
nelt += elt_occurences[i];
|
|
ncno += cno_occurences[i];
|
|
}
|
|
VG_(message)(Vg_DebugMsg, "total nr of unique chains: %6d, keys %6d, elts %6d\n",
|
|
ncno, nkey, nelt);
|
|
}
|
|
|
|
|
|
/* Allocates a suitably-sized array, copies pointers to all the hashtable
|
|
elements into it, then returns both the array and the size of it. The
|
|
array must be freed with VG_(free).
|
|
*/
|
|
VgHashNode** VG_(HT_to_array) ( VgHashTable table, /*OUT*/ UInt* n_elems )
|
|
{
|
|
UInt i, j;
|
|
VgHashNode** arr;
|
|
VgHashNode* node;
|
|
|
|
*n_elems = table->n_elements;
|
|
if (*n_elems == 0)
|
|
return NULL;
|
|
|
|
arr = VG_(malloc)( "hashtable.Hta.1", *n_elems * sizeof(VgHashNode*) );
|
|
|
|
j = 0;
|
|
for (i = 0; i < table->n_chains; i++) {
|
|
for (node = table->chains[i]; node != NULL; node = node->next) {
|
|
arr[j++] = node;
|
|
}
|
|
}
|
|
vg_assert(j == *n_elems);
|
|
|
|
return arr;
|
|
}
|
|
|
|
void VG_(HT_ResetIter)(VgHashTable table)
|
|
{
|
|
vg_assert(table);
|
|
table->iterNode = NULL;
|
|
table->iterChain = 0;
|
|
table->iterOK = True;
|
|
}
|
|
|
|
void* VG_(HT_Next)(VgHashTable table)
|
|
{
|
|
Int i;
|
|
vg_assert(table);
|
|
/* See long comment on HT_Next prototype in pub_tool_hashtable.h.
|
|
In short if this fails, it means the caller tried to modify the
|
|
table whilst iterating over it, which is a bug. */
|
|
vg_assert(table->iterOK);
|
|
|
|
if (table->iterNode && table->iterNode->next) {
|
|
table->iterNode = table->iterNode->next;
|
|
return table->iterNode;
|
|
}
|
|
|
|
for (i = table->iterChain; i < table->n_chains; i++) {
|
|
if (table->chains[i]) {
|
|
table->iterNode = table->chains[i];
|
|
table->iterChain = i + 1; // Next chain to be traversed
|
|
return table->iterNode;
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
void VG_(HT_destruct)(VgHashTable table, void(*freenode_fn)(void*))
|
|
{
|
|
UInt i;
|
|
VgHashNode *node, *node_next;
|
|
|
|
for (i = 0; i < table->n_chains; i++) {
|
|
for (node = table->chains[i]; node != NULL; node = node_next) {
|
|
node_next = node->next;
|
|
freenode_fn(node);
|
|
}
|
|
}
|
|
VG_(free)(table->chains);
|
|
VG_(free)(table);
|
|
}
|
|
|
|
/*--------------------------------------------------------------------*/
|
|
/*--- end ---*/
|
|
/*--------------------------------------------------------------------*/
|