зеркало из https://github.com/microsoft/git.git
Merge branch 'kb/hashmap-updates'
* kb/hashmap-updates: hashmap: add string interning API hashmap: add simplified hashmap_get_from_hash() API hashmap: improve struct hashmap member documentation hashmap: factor out getting a hash code from a SHA1
This commit is contained in:
Коммит
9b1c2a3a8e
|
@ -8,11 +8,19 @@ Data Structures
|
||||||
|
|
||||||
`struct hashmap`::
|
`struct hashmap`::
|
||||||
|
|
||||||
The hash table structure.
|
The hash table structure. Members can be used as follows, but should
|
||||||
|
not be modified directly:
|
||||||
+
|
+
|
||||||
The `size` member keeps track of the total number of entries. The `cmpfn`
|
The `size` member keeps track of the total number of entries (0 means the
|
||||||
member is a function used to compare two entries for equality. The `table` and
|
hashmap is empty).
|
||||||
`tablesize` members store the hash table and its size, respectively.
|
+
|
||||||
|
`tablesize` is the allocated size of the hash table. A non-0 value indicates
|
||||||
|
that the hashmap is initialized. It may also be useful for statistical purposes
|
||||||
|
(i.e. `size / tablesize` is the current load factor).
|
||||||
|
+
|
||||||
|
`cmpfn` stores the comparison function specified in `hashmap_init()`. In
|
||||||
|
advanced scenarios, it may be useful to change this, e.g. to switch between
|
||||||
|
case-sensitive and case-insensitive lookup.
|
||||||
|
|
||||||
`struct hashmap_entry`::
|
`struct hashmap_entry`::
|
||||||
|
|
||||||
|
@ -58,6 +66,15 @@ Functions
|
||||||
+
|
+
|
||||||
`strihash` and `memihash` are case insensitive versions.
|
`strihash` and `memihash` are case insensitive versions.
|
||||||
|
|
||||||
|
`unsigned int sha1hash(const unsigned char *sha1)`::
|
||||||
|
|
||||||
|
Converts a cryptographic hash (e.g. SHA-1) into an int-sized hash code
|
||||||
|
for use in hash tables. Cryptographic hashes are supposed to have
|
||||||
|
uniform distribution, so in contrast to `memhash()`, this just copies
|
||||||
|
the first `sizeof(int)` bytes without shuffling any bits. Note that
|
||||||
|
the results will be different on big-endian and little-endian
|
||||||
|
platforms, so they should not be stored or transferred over the net.
|
||||||
|
|
||||||
`void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`::
|
`void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`::
|
||||||
|
|
||||||
Initializes a hashmap structure.
|
Initializes a hashmap structure.
|
||||||
|
@ -101,6 +118,20 @@ hashmap_entry) that has at least been initialized with the proper hash code
|
||||||
If an entry with matching hash code is found, `key` and `keydata` are passed
|
If an entry with matching hash code is found, `key` and `keydata` are passed
|
||||||
to `hashmap_cmp_fn` to decide whether the entry matches the key.
|
to `hashmap_cmp_fn` to decide whether the entry matches the key.
|
||||||
|
|
||||||
|
`void *hashmap_get_from_hash(const struct hashmap *map, unsigned int hash, const void *keydata)`::
|
||||||
|
|
||||||
|
Returns the hashmap entry for the specified hash code and key data,
|
||||||
|
or NULL if not found.
|
||||||
|
+
|
||||||
|
`map` is the hashmap structure.
|
||||||
|
+
|
||||||
|
`hash` is the hash code of the entry to look up.
|
||||||
|
+
|
||||||
|
If an entry with matching hash code is found, `keydata` is passed to
|
||||||
|
`hashmap_cmp_fn` to decide whether the entry matches the key. The
|
||||||
|
`entry_or_key` parameter points to a bogus hashmap_entry structure that
|
||||||
|
should not be used in the comparison.
|
||||||
|
|
||||||
`void *hashmap_get_next(const struct hashmap *map, const void *entry)`::
|
`void *hashmap_get_next(const struct hashmap *map, const void *entry)`::
|
||||||
|
|
||||||
Returns the next equal hashmap entry, or NULL if not found. This can be
|
Returns the next equal hashmap entry, or NULL if not found. This can be
|
||||||
|
@ -162,6 +193,21 @@ more entries.
|
||||||
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
|
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
|
||||||
and returns the first entry, if any).
|
and returns the first entry, if any).
|
||||||
|
|
||||||
|
`const char *strintern(const char *string)`::
|
||||||
|
`const void *memintern(const void *data, size_t len)`::
|
||||||
|
|
||||||
|
Returns the unique, interned version of the specified string or data,
|
||||||
|
similar to the `String.intern` API in Java and .NET, respectively.
|
||||||
|
Interned strings remain valid for the entire lifetime of the process.
|
||||||
|
+
|
||||||
|
Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
|
||||||
|
strings / data must not be modified or freed.
|
||||||
|
+
|
||||||
|
Interned strings are best used for short strings with high probability of
|
||||||
|
duplicates.
|
||||||
|
+
|
||||||
|
Uses a hashmap to store the pool of interned strings.
|
||||||
|
|
||||||
Usage example
|
Usage example
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
|
|
|
@ -56,18 +56,9 @@ static int commit_name_cmp(const struct commit_name *cn1,
|
||||||
return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled);
|
return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned int hash_sha1(const unsigned char *sha1)
|
|
||||||
{
|
|
||||||
unsigned int hash;
|
|
||||||
memcpy(&hash, sha1, sizeof(hash));
|
|
||||||
return hash;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline struct commit_name *find_commit_name(const unsigned char *peeled)
|
static inline struct commit_name *find_commit_name(const unsigned char *peeled)
|
||||||
{
|
{
|
||||||
struct commit_name key;
|
return hashmap_get_from_hash(&names, sha1hash(peeled), peeled);
|
||||||
hashmap_entry_init(&key, hash_sha1(peeled));
|
|
||||||
return hashmap_get(&names, &key, peeled);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int replace_name(struct commit_name *e,
|
static int replace_name(struct commit_name *e,
|
||||||
|
@ -114,7 +105,7 @@ static void add_to_known_names(const char *path,
|
||||||
if (!e) {
|
if (!e) {
|
||||||
e = xmalloc(sizeof(struct commit_name));
|
e = xmalloc(sizeof(struct commit_name));
|
||||||
hashcpy(e->peeled, peeled);
|
hashcpy(e->peeled, peeled);
|
||||||
hashmap_entry_init(e, hash_sha1(peeled));
|
hashmap_entry_init(e, sha1hash(peeled));
|
||||||
hashmap_add(&names, e);
|
hashmap_add(&names, e);
|
||||||
e->path = NULL;
|
e->path = NULL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,10 +8,7 @@
|
||||||
|
|
||||||
static unsigned int hash_obj(const struct object *obj, unsigned int n)
|
static unsigned int hash_obj(const struct object *obj, unsigned int n)
|
||||||
{
|
{
|
||||||
unsigned int hash;
|
return sha1hash(obj->sha1) % n;
|
||||||
|
|
||||||
memcpy(&hash, obj->sha1, sizeof(unsigned int));
|
|
||||||
return hash % n;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)
|
static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)
|
||||||
|
|
|
@ -242,14 +242,12 @@ struct file_similarity {
|
||||||
|
|
||||||
static unsigned int hash_filespec(struct diff_filespec *filespec)
|
static unsigned int hash_filespec(struct diff_filespec *filespec)
|
||||||
{
|
{
|
||||||
unsigned int hash;
|
|
||||||
if (!filespec->sha1_valid) {
|
if (!filespec->sha1_valid) {
|
||||||
if (diff_populate_filespec(filespec, 0))
|
if (diff_populate_filespec(filespec, 0))
|
||||||
return 0;
|
return 0;
|
||||||
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
|
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
|
||||||
}
|
}
|
||||||
memcpy(&hash, filespec->sha1, sizeof(hash));
|
return sha1hash(filespec->sha1);
|
||||||
return hash;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int find_identical_files(struct hashmap *srcs,
|
static int find_identical_files(struct hashmap *srcs,
|
||||||
|
@ -259,15 +257,14 @@ static int find_identical_files(struct hashmap *srcs,
|
||||||
int renames = 0;
|
int renames = 0;
|
||||||
|
|
||||||
struct diff_filespec *target = rename_dst[dst_index].two;
|
struct diff_filespec *target = rename_dst[dst_index].two;
|
||||||
struct file_similarity *p, *best, dst;
|
struct file_similarity *p, *best = NULL;
|
||||||
int i = 100, best_score = -1;
|
int i = 100, best_score = -1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find the best source match for specified destination.
|
* Find the best source match for specified destination.
|
||||||
*/
|
*/
|
||||||
best = NULL;
|
p = hashmap_get_from_hash(srcs, hash_filespec(target), NULL);
|
||||||
hashmap_entry_init(&dst, hash_filespec(target));
|
for (; p; p = hashmap_get_next(srcs, p)) {
|
||||||
for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) {
|
|
||||||
int score;
|
int score;
|
||||||
struct diff_filespec *source = p->filespec;
|
struct diff_filespec *source = p->filespec;
|
||||||
|
|
||||||
|
|
38
hashmap.c
38
hashmap.c
|
@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
|
||||||
current = iter->map->table[iter->tablepos++];
|
current = iter->map->table[iter->tablepos++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct pool_entry {
|
||||||
|
struct hashmap_entry ent;
|
||||||
|
size_t len;
|
||||||
|
unsigned char data[FLEX_ARRAY];
|
||||||
|
};
|
||||||
|
|
||||||
|
static int pool_entry_cmp(const struct pool_entry *e1,
|
||||||
|
const struct pool_entry *e2,
|
||||||
|
const unsigned char *keydata)
|
||||||
|
{
|
||||||
|
return e1->data != keydata &&
|
||||||
|
(e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
|
||||||
|
}
|
||||||
|
|
||||||
|
const void *memintern(const void *data, size_t len)
|
||||||
|
{
|
||||||
|
static struct hashmap map;
|
||||||
|
struct pool_entry key, *e;
|
||||||
|
|
||||||
|
/* initialize string pool hashmap */
|
||||||
|
if (!map.tablesize)
|
||||||
|
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
|
||||||
|
|
||||||
|
/* lookup interned string in pool */
|
||||||
|
hashmap_entry_init(&key, memhash(data, len));
|
||||||
|
key.len = len;
|
||||||
|
e = hashmap_get(&map, &key, data);
|
||||||
|
if (!e) {
|
||||||
|
/* not found: create it */
|
||||||
|
e = xmallocz(sizeof(struct pool_entry) + len);
|
||||||
|
hashmap_entry_init(e, key.ent.hash);
|
||||||
|
e->len = len;
|
||||||
|
memcpy(e->data, data, len);
|
||||||
|
hashmap_add(&map, e);
|
||||||
|
}
|
||||||
|
return e->data;
|
||||||
|
}
|
||||||
|
|
27
hashmap.h
27
hashmap.h
|
@ -13,6 +13,17 @@ extern unsigned int strihash(const char *buf);
|
||||||
extern unsigned int memhash(const void *buf, size_t len);
|
extern unsigned int memhash(const void *buf, size_t len);
|
||||||
extern unsigned int memihash(const void *buf, size_t len);
|
extern unsigned int memihash(const void *buf, size_t len);
|
||||||
|
|
||||||
|
static inline unsigned int sha1hash(const unsigned char *sha1)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Equivalent to 'return *(unsigned int *)sha1;', but safe on
|
||||||
|
* platforms that don't support unaligned reads.
|
||||||
|
*/
|
||||||
|
unsigned int hash;
|
||||||
|
memcpy(&hash, sha1, sizeof(hash));
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
/* data structures */
|
/* data structures */
|
||||||
|
|
||||||
struct hashmap_entry {
|
struct hashmap_entry {
|
||||||
|
@ -57,6 +68,14 @@ extern void *hashmap_put(struct hashmap *map, void *entry);
|
||||||
extern void *hashmap_remove(struct hashmap *map, const void *key,
|
extern void *hashmap_remove(struct hashmap *map, const void *key,
|
||||||
const void *keydata);
|
const void *keydata);
|
||||||
|
|
||||||
|
static inline void *hashmap_get_from_hash(const struct hashmap *map,
|
||||||
|
unsigned int hash, const void *keydata)
|
||||||
|
{
|
||||||
|
struct hashmap_entry key;
|
||||||
|
hashmap_entry_init(&key, hash);
|
||||||
|
return hashmap_get(map, &key, keydata);
|
||||||
|
}
|
||||||
|
|
||||||
/* hashmap_iter functions */
|
/* hashmap_iter functions */
|
||||||
|
|
||||||
extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter);
|
extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter);
|
||||||
|
@ -68,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
|
||||||
return hashmap_iter_next(iter);
|
return hashmap_iter_next(iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* string interning */
|
||||||
|
|
||||||
|
extern const void *memintern(const void *data, size_t len);
|
||||||
|
static inline const char *strintern(const char *string)
|
||||||
|
{
|
||||||
|
return memintern(string, strlen(string));
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
11
khash.h
11
khash.h
|
@ -320,19 +320,12 @@ static const double __ac_HASH_UPPER = 0.77;
|
||||||
code; \
|
code; \
|
||||||
} }
|
} }
|
||||||
|
|
||||||
static inline khint_t __kh_oid_hash(const unsigned char *oid)
|
|
||||||
{
|
|
||||||
khint_t hash;
|
|
||||||
memcpy(&hash, oid, sizeof(hash));
|
|
||||||
return hash;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0)
|
#define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0)
|
||||||
|
|
||||||
KHASH_INIT(sha1, const unsigned char *, void *, 1, __kh_oid_hash, __kh_oid_cmp)
|
KHASH_INIT(sha1, const unsigned char *, void *, 1, sha1hash, __kh_oid_cmp)
|
||||||
typedef kh_sha1_t khash_sha1;
|
typedef kh_sha1_t khash_sha1;
|
||||||
|
|
||||||
KHASH_INIT(sha1_pos, const unsigned char *, int, 1, __kh_oid_hash, __kh_oid_cmp)
|
KHASH_INIT(sha1_pos, const unsigned char *, int, 1, sha1hash, __kh_oid_cmp)
|
||||||
typedef kh_sha1_pos_t khash_sha1_pos;
|
typedef kh_sha1_pos_t khash_sha1_pos;
|
||||||
|
|
||||||
#endif /* __AC_KHASH_H */
|
#endif /* __AC_KHASH_H */
|
||||||
|
|
|
@ -213,12 +213,11 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam
|
||||||
struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase)
|
struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase)
|
||||||
{
|
{
|
||||||
struct cache_entry *ce;
|
struct cache_entry *ce;
|
||||||
struct hashmap_entry key;
|
|
||||||
|
|
||||||
lazy_init_name_hash(istate);
|
lazy_init_name_hash(istate);
|
||||||
|
|
||||||
hashmap_entry_init(&key, memihash(name, namelen));
|
ce = hashmap_get_from_hash(&istate->name_hash,
|
||||||
ce = hashmap_get(&istate->name_hash, &key, NULL);
|
memihash(name, namelen), NULL);
|
||||||
while (ce) {
|
while (ce) {
|
||||||
if (same_name(ce, name, namelen, icase))
|
if (same_name(ce, name, namelen, icase))
|
||||||
return ce;
|
return ce;
|
||||||
|
|
13
object.c
13
object.c
|
@ -50,18 +50,7 @@ int type_from_string(const char *str)
|
||||||
*/
|
*/
|
||||||
static unsigned int hash_obj(const unsigned char *sha1, unsigned int n)
|
static unsigned int hash_obj(const unsigned char *sha1, unsigned int n)
|
||||||
{
|
{
|
||||||
unsigned int hash;
|
return sha1hash(sha1) & (n - 1);
|
||||||
|
|
||||||
/*
|
|
||||||
* Since the sha1 is essentially random, we just take the
|
|
||||||
* required number of bits directly from the first
|
|
||||||
* sizeof(unsigned int) bytes of sha1. First we have to copy
|
|
||||||
* the bytes into a properly aligned integer. If we cared
|
|
||||||
* about getting consistent results across architectures, we
|
|
||||||
* would have to call ntohl() here, too.
|
|
||||||
*/
|
|
||||||
memcpy(&hash, sha1, sizeof(unsigned int));
|
|
||||||
return hash & (n - 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -7,10 +7,9 @@ static uint32_t locate_object_entry_hash(struct packing_data *pdata,
|
||||||
const unsigned char *sha1,
|
const unsigned char *sha1,
|
||||||
int *found)
|
int *found)
|
||||||
{
|
{
|
||||||
uint32_t i, hash, mask = (pdata->index_size - 1);
|
uint32_t i, mask = (pdata->index_size - 1);
|
||||||
|
|
||||||
memcpy(&hash, sha1, sizeof(uint32_t));
|
i = sha1hash(sha1) & mask;
|
||||||
i = hash & mask;
|
|
||||||
|
|
||||||
while (pdata->index[i] > 0) {
|
while (pdata->index[i] > 0) {
|
||||||
uint32_t pos = pdata->index[i] - 1;
|
uint32_t pos = pdata->index[i] - 1;
|
||||||
|
|
|
@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
|
||||||
|
|
||||||
'
|
'
|
||||||
|
|
||||||
|
test_expect_success 'string interning' '
|
||||||
|
|
||||||
|
test_hashmap "intern value1
|
||||||
|
intern Value1
|
||||||
|
intern value2
|
||||||
|
intern value2
|
||||||
|
" "value1
|
||||||
|
Value1
|
||||||
|
value2
|
||||||
|
value2"
|
||||||
|
|
||||||
|
'
|
||||||
|
|
||||||
test_done
|
test_done
|
||||||
|
|
|
@ -115,9 +115,8 @@ static void perf_hashmap(unsigned int method, unsigned int rounds)
|
||||||
|
|
||||||
for (j = 0; j < rounds; j++) {
|
for (j = 0; j < rounds; j++) {
|
||||||
for (i = 0; i < TEST_SIZE; i++) {
|
for (i = 0; i < TEST_SIZE; i++) {
|
||||||
struct hashmap_entry key;
|
hashmap_get_from_hash(&map, hashes[i],
|
||||||
hashmap_entry_init(&key, hashes[i]);
|
entries[i]->key);
|
||||||
hashmap_get(&map, &key, entries[i]->key);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,12 +198,8 @@ int main(int argc, char *argv[])
|
||||||
|
|
||||||
} else if (!strcmp("get", cmd) && l1) {
|
} else if (!strcmp("get", cmd) && l1) {
|
||||||
|
|
||||||
/* setup static key */
|
|
||||||
struct hashmap_entry key;
|
|
||||||
hashmap_entry_init(&key, hash);
|
|
||||||
|
|
||||||
/* lookup entry in hashmap */
|
/* lookup entry in hashmap */
|
||||||
entry = hashmap_get(&map, &key, p1);
|
entry = hashmap_get_from_hash(&map, hash, p1);
|
||||||
|
|
||||||
/* print result */
|
/* print result */
|
||||||
if (!entry)
|
if (!entry)
|
||||||
|
@ -239,6 +234,20 @@ int main(int argc, char *argv[])
|
||||||
/* print table sizes */
|
/* print table sizes */
|
||||||
printf("%u %u\n", map.tablesize, map.size);
|
printf("%u %u\n", map.tablesize, map.size);
|
||||||
|
|
||||||
|
} else if (!strcmp("intern", cmd) && l1) {
|
||||||
|
|
||||||
|
/* test that strintern works */
|
||||||
|
const char *i1 = strintern(p1);
|
||||||
|
const char *i2 = strintern(p1);
|
||||||
|
if (strcmp(i1, p1))
|
||||||
|
printf("strintern(%s) returns %s\n", p1, i1);
|
||||||
|
else if (i1 == p1)
|
||||||
|
printf("strintern(%s) returns input pointer\n", p1);
|
||||||
|
else if (i1 != i2)
|
||||||
|
printf("strintern(%s) != strintern(%s)", i1, i2);
|
||||||
|
else
|
||||||
|
printf("%s\n", i1);
|
||||||
|
|
||||||
} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
|
} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
|
||||||
|
|
||||||
perf_hashmap(atoi(p1), atoi(p2));
|
perf_hashmap(atoi(p1), atoi(p2));
|
||||||
|
|
Загрузка…
Ссылка в новой задаче