diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt index b977ae8bbb..ad7a5bddd2 100644 --- a/Documentation/technical/api-hashmap.txt +++ b/Documentation/technical/api-hashmap.txt @@ -8,11 +8,19 @@ Data Structures `struct hashmap`:: - The hash table structure. + The hash table structure. Members can be used as follows, but should + not be modified directly: + -The `size` member keeps track of the total number of entries. The `cmpfn` -member is a function used to compare two entries for equality. The `table` and -`tablesize` members store the hash table and its size, respectively. +The `size` member keeps track of the total number of entries (0 means the +hashmap is empty). ++ +`tablesize` is the allocated size of the hash table. A non-0 value indicates +that the hashmap is initialized. It may also be useful for statistical purposes +(i.e. `size / tablesize` is the current load factor). ++ +`cmpfn` stores the comparison function specified in `hashmap_init()`. In +advanced scenarios, it may be useful to change this, e.g. to switch between +case-sensitive and case-insensitive lookup. `struct hashmap_entry`:: @@ -58,6 +66,15 @@ Functions + `strihash` and `memihash` are case insensitive versions. +`unsigned int sha1hash(const unsigned char *sha1)`:: + + Converts a cryptographic hash (e.g. SHA-1) into an int-sized hash code + for use in hash tables. Cryptographic hashes are supposed to have + uniform distribution, so in contrast to `memhash()`, this just copies + the first `sizeof(int)` bytes without shuffling any bits. Note that + the results will be different on big-endian and little-endian + platforms, so they should not be stored or transferred over the net. + `void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`:: Initializes a hashmap structure. @@ -101,6 +118,20 @@ hashmap_entry) that has at least been initialized with the proper hash code If an entry with matching hash code is found, `key` and `keydata` are passed to `hashmap_cmp_fn` to decide whether the entry matches the key. +`void *hashmap_get_from_hash(const struct hashmap *map, unsigned int hash, const void *keydata)`:: + + Returns the hashmap entry for the specified hash code and key data, + or NULL if not found. ++ +`map` is the hashmap structure. ++ +`hash` is the hash code of the entry to look up. ++ +If an entry with matching hash code is found, `keydata` is passed to +`hashmap_cmp_fn` to decide whether the entry matches the key. The +`entry_or_key` parameter points to a bogus hashmap_entry structure that +should not be used in the comparison. + `void *hashmap_get_next(const struct hashmap *map, const void *entry)`:: Returns the next equal hashmap entry, or NULL if not found. This can be @@ -162,6 +193,21 @@ more entries. `hashmap_iter_first` is a combination of both (i.e. initializes the iterator and returns the first entry, if any). +`const char *strintern(const char *string)`:: +`const void *memintern(const void *data, size_t len)`:: + + Returns the unique, interned version of the specified string or data, + similar to the `String.intern` API in Java and .NET, respectively. + Interned strings remain valid for the entire lifetime of the process. ++ +Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned +strings / data must not be modified or freed. ++ +Interned strings are best used for short strings with high probability of +duplicates. ++ +Uses a hashmap to store the pool of interned strings. + Usage example ------------- diff --git a/builtin/describe.c b/builtin/describe.c index 24d740c8b1..ee6a3b998f 100644 --- a/builtin/describe.c +++ b/builtin/describe.c @@ -56,18 +56,9 @@ static int commit_name_cmp(const struct commit_name *cn1, return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled); } -static inline unsigned int hash_sha1(const unsigned char *sha1) -{ - unsigned int hash; - memcpy(&hash, sha1, sizeof(hash)); - return hash; -} - static inline struct commit_name *find_commit_name(const unsigned char *peeled) { - struct commit_name key; - hashmap_entry_init(&key, hash_sha1(peeled)); - return hashmap_get(&names, &key, peeled); + return hashmap_get_from_hash(&names, sha1hash(peeled), peeled); } static int replace_name(struct commit_name *e, @@ -114,7 +105,7 @@ static void add_to_known_names(const char *path, if (!e) { e = xmalloc(sizeof(struct commit_name)); hashcpy(e->peeled, peeled); - hashmap_entry_init(e, hash_sha1(peeled)); + hashmap_entry_init(e, sha1hash(peeled)); hashmap_add(&names, e); e->path = NULL; } diff --git a/decorate.c b/decorate.c index 7cb5d29a89..b2aac90c26 100644 --- a/decorate.c +++ b/decorate.c @@ -8,10 +8,7 @@ static unsigned int hash_obj(const struct object *obj, unsigned int n) { - unsigned int hash; - - memcpy(&hash, obj->sha1, sizeof(unsigned int)); - return hash % n; + return sha1hash(obj->sha1) % n; } static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration) diff --git a/diffcore-rename.c b/diffcore-rename.c index 749a35d2c2..2e44a37459 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -242,14 +242,12 @@ struct file_similarity { static unsigned int hash_filespec(struct diff_filespec *filespec) { - unsigned int hash; if (!filespec->sha1_valid) { if (diff_populate_filespec(filespec, 0)) return 0; hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1); } - memcpy(&hash, filespec->sha1, sizeof(hash)); - return hash; + return sha1hash(filespec->sha1); } static int find_identical_files(struct hashmap *srcs, @@ -259,15 +257,14 @@ static int find_identical_files(struct hashmap *srcs, int renames = 0; struct diff_filespec *target = rename_dst[dst_index].two; - struct file_similarity *p, *best, dst; + struct file_similarity *p, *best = NULL; int i = 100, best_score = -1; /* * Find the best source match for specified destination. */ - best = NULL; - hashmap_entry_init(&dst, hash_filespec(target)); - for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) { + p = hashmap_get_from_hash(srcs, hash_filespec(target), NULL); + for (; p; p = hashmap_get_next(srcs, p)) { int score; struct diff_filespec *source = p->filespec; diff --git a/hashmap.c b/hashmap.c index d1b8056d8d..f693839cb4 100644 --- a/hashmap.c +++ b/hashmap.c @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter) current = iter->map->table[iter->tablepos++]; } } + +struct pool_entry { + struct hashmap_entry ent; + size_t len; + unsigned char data[FLEX_ARRAY]; +}; + +static int pool_entry_cmp(const struct pool_entry *e1, + const struct pool_entry *e2, + const unsigned char *keydata) +{ + return e1->data != keydata && + (e1->len != e2->len || memcmp(e1->data, keydata, e1->len)); +} + +const void *memintern(const void *data, size_t len) +{ + static struct hashmap map; + struct pool_entry key, *e; + + /* initialize string pool hashmap */ + if (!map.tablesize) + hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0); + + /* lookup interned string in pool */ + hashmap_entry_init(&key, memhash(data, len)); + key.len = len; + e = hashmap_get(&map, &key, data); + if (!e) { + /* not found: create it */ + e = xmallocz(sizeof(struct pool_entry) + len); + hashmap_entry_init(e, key.ent.hash); + e->len = len; + memcpy(e->data, data, len); + hashmap_add(&map, e); + } + return e->data; +} diff --git a/hashmap.h b/hashmap.h index a816ad47b1..ab7958ae33 100644 --- a/hashmap.h +++ b/hashmap.h @@ -13,6 +13,17 @@ extern unsigned int strihash(const char *buf); extern unsigned int memhash(const void *buf, size_t len); extern unsigned int memihash(const void *buf, size_t len); +static inline unsigned int sha1hash(const unsigned char *sha1) +{ + /* + * Equivalent to 'return *(unsigned int *)sha1;', but safe on + * platforms that don't support unaligned reads. + */ + unsigned int hash; + memcpy(&hash, sha1, sizeof(hash)); + return hash; +} + /* data structures */ struct hashmap_entry { @@ -57,6 +68,14 @@ extern void *hashmap_put(struct hashmap *map, void *entry); extern void *hashmap_remove(struct hashmap *map, const void *key, const void *keydata); +static inline void *hashmap_get_from_hash(const struct hashmap *map, + unsigned int hash, const void *keydata) +{ + struct hashmap_entry key; + hashmap_entry_init(&key, hash); + return hashmap_get(map, &key, keydata); +} + /* hashmap_iter functions */ extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter); @@ -68,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map, return hashmap_iter_next(iter); } +/* string interning */ + +extern const void *memintern(const void *data, size_t len); +static inline const char *strintern(const char *string) +{ + return memintern(string, strlen(string)); +} + #endif diff --git a/khash.h b/khash.h index 57ff6038c5..06c7906549 100644 --- a/khash.h +++ b/khash.h @@ -320,19 +320,12 @@ static const double __ac_HASH_UPPER = 0.77; code; \ } } -static inline khint_t __kh_oid_hash(const unsigned char *oid) -{ - khint_t hash; - memcpy(&hash, oid, sizeof(hash)); - return hash; -} - #define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0) -KHASH_INIT(sha1, const unsigned char *, void *, 1, __kh_oid_hash, __kh_oid_cmp) +KHASH_INIT(sha1, const unsigned char *, void *, 1, sha1hash, __kh_oid_cmp) typedef kh_sha1_t khash_sha1; -KHASH_INIT(sha1_pos, const unsigned char *, int, 1, __kh_oid_hash, __kh_oid_cmp) +KHASH_INIT(sha1_pos, const unsigned char *, int, 1, sha1hash, __kh_oid_cmp) typedef kh_sha1_pos_t khash_sha1_pos; #endif /* __AC_KHASH_H */ diff --git a/name-hash.c b/name-hash.c index 49fd508317..702cd0518f 100644 --- a/name-hash.c +++ b/name-hash.c @@ -213,12 +213,11 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase) { struct cache_entry *ce; - struct hashmap_entry key; lazy_init_name_hash(istate); - hashmap_entry_init(&key, memihash(name, namelen)); - ce = hashmap_get(&istate->name_hash, &key, NULL); + ce = hashmap_get_from_hash(&istate->name_hash, + memihash(name, namelen), NULL); while (ce) { if (same_name(ce, name, namelen, icase)) return ce; diff --git a/object.c b/object.c index 9c31e9a5e0..91c7c867b9 100644 --- a/object.c +++ b/object.c @@ -50,18 +50,7 @@ int type_from_string(const char *str) */ static unsigned int hash_obj(const unsigned char *sha1, unsigned int n) { - unsigned int hash; - - /* - * Since the sha1 is essentially random, we just take the - * required number of bits directly from the first - * sizeof(unsigned int) bytes of sha1. First we have to copy - * the bytes into a properly aligned integer. If we cared - * about getting consistent results across architectures, we - * would have to call ntohl() here, too. - */ - memcpy(&hash, sha1, sizeof(unsigned int)); - return hash & (n - 1); + return sha1hash(sha1) & (n - 1); } /* diff --git a/pack-objects.c b/pack-objects.c index 4f36c32045..9992f3ecf2 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -7,10 +7,9 @@ static uint32_t locate_object_entry_hash(struct packing_data *pdata, const unsigned char *sha1, int *found) { - uint32_t i, hash, mask = (pdata->index_size - 1); + uint32_t i, mask = (pdata->index_size - 1); - memcpy(&hash, sha1, sizeof(uint32_t)); - i = hash & mask; + i = sha1hash(sha1) & mask; while (pdata->index[i] > 0) { uint32_t pos = pdata->index[i] - 1; diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh index 391e2b6492..f97c80556f 100755 --- a/t/t0011-hashmap.sh +++ b/t/t0011-hashmap.sh @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' ' ' +test_expect_success 'string interning' ' + +test_hashmap "intern value1 +intern Value1 +intern value2 +intern value2 +" "value1 +Value1 +value2 +value2" + +' + test_done diff --git a/test-hashmap.c b/test-hashmap.c index f5183fb9e8..07aa7ecdee 100644 --- a/test-hashmap.c +++ b/test-hashmap.c @@ -115,9 +115,8 @@ static void perf_hashmap(unsigned int method, unsigned int rounds) for (j = 0; j < rounds; j++) { for (i = 0; i < TEST_SIZE; i++) { - struct hashmap_entry key; - hashmap_entry_init(&key, hashes[i]); - hashmap_get(&map, &key, entries[i]->key); + hashmap_get_from_hash(&map, hashes[i], + entries[i]->key); } } @@ -199,12 +198,8 @@ int main(int argc, char *argv[]) } else if (!strcmp("get", cmd) && l1) { - /* setup static key */ - struct hashmap_entry key; - hashmap_entry_init(&key, hash); - /* lookup entry in hashmap */ - entry = hashmap_get(&map, &key, p1); + entry = hashmap_get_from_hash(&map, hash, p1); /* print result */ if (!entry) @@ -239,6 +234,20 @@ int main(int argc, char *argv[]) /* print table sizes */ printf("%u %u\n", map.tablesize, map.size); + } else if (!strcmp("intern", cmd) && l1) { + + /* test that strintern works */ + const char *i1 = strintern(p1); + const char *i2 = strintern(p1); + if (strcmp(i1, p1)) + printf("strintern(%s) returns %s\n", p1, i1); + else if (i1 == p1) + printf("strintern(%s) returns input pointer\n", p1); + else if (i1 != i2) + printf("strintern(%s) != strintern(%s)", i1, i2); + else + printf("%s\n", i1); + } else if (!strcmp("perfhashmap", cmd) && l1 && l2) { perf_hashmap(atoi(p1), atoi(p2));