这一篇博客将介绍哈希表
哈希表(hash table),又名散列表 ,通过建立键 key
与值 value
之间的映射,实现高效 的元素查询 ,向哈希表输入一个键,可在 $O(1)$ 时间内获取其对应的值
常用操作 包括初始化 、查询 和添加(删除)键值对
# Initialize an empty hash map (Python dict)
hash_map = {}

# Add key-value pairs (key -> value)
hash_map[12836] = "小哈"
hash_map[15937] = "小啰"
hash_map[16750] = "小算"
hash_map[13276] = "小法"
hash_map[10583] = "小鸭"

# Query: look up the value for a key
name = hash_map[15937]

# Delete a key-value pair
hash_map.pop(10583)

# Traverse key-value pairs
for key, value in hash_map.items():
    ...
# Traverse keys only
for key in hash_map.keys():
    ...
# Traverse values only
for value in hash_map.values():
    ...
/* Demo of the basic std::unordered_map operations.
 * NOTE(review): the original snippet was a sequence of free-floating
 * statements that cannot compile on their own; it is wrapped in a function
 * here. Returns the value retrieved for key 15937 so the lookup can be
 * verified by a caller or test. */
std::string hashMapDemo() {
    /* Initialize the hash map */
    std::unordered_map<int, std::string> hashMap;

    /* Add key-value pairs (key -> value) */
    hashMap[12836] = "小哈";
    hashMap[15937] = "小啰";
    hashMap[16750] = "小算";
    hashMap[13276] = "小法";
    hashMap[10583] = "小鸭";

    /* Query: look up the value for a key */
    std::string name = hashMap[15937];

    /* Delete a key-value pair */
    hashMap.erase(10583);

    /* Traverse key-value pairs with a range-for */
    for (const auto &kv : hashMap) {
        std::cout << kv.first << " -> " << kv.second << std::endl;
    }
    /* Traverse with an explicit iterator */
    for (auto iter = hashMap.begin(); iter != hashMap.end(); ++iter) {
        std::cout << iter->first << " -> " << iter->second << std::endl;
    }
    return name;
}
简单实现 哈希表中的每个空位称为桶 ,每个桶可存储一个键值对 。查询操作就是通过哈希函数 找到key
对应的桶,并在桶中获取value
。
哈希函数的作用是将一个较大的输入空间(所有key)映射到一个较小的输出空间(所有桶)
哈希函数的计算过程分为以下两步
key
通过某种哈希算法 hash()
计算得到哈希值
将哈希值 对桶数量 capacity
取模 ,得到对应桶的索引index
index = hash(key) % capacity
class Pair:
    """Key-value pair."""

    def __init__(self, key, value):
        # BUG FIX: the original defined `__init`, so Pair(key, value)
        # raised TypeError (object.__init__ takes no arguments).
        self.key = key
        self.value = value


class ArrayHashMap:
    """Hash map backed by a fixed-size array (no collision handling)."""

    def __init__(self):
        # Each slot ("bucket") holds at most one Pair, or None when empty.
        self._buckets = [None] * 100

    def hash_func(self, key):
        """Hash function: map a key to a bucket index."""
        return key % 100

    def get(self, key):
        """Return the value stored for key, or None when absent."""
        index = self.hash_func(key)
        pair = self._buckets[index]
        if pair is None:
            return None
        return pair.value

    def put(self, key, value):
        """Insert (or overwrite) a key-value pair."""
        pair = Pair(key, value)
        index = self.hash_func(key)
        self._buckets[index] = pair

    def remove(self, key):
        """Delete the pair stored for key (no-op when absent).

        Added for parity with the C++/C versions of this class.
        """
        index = self.hash_func(key)
        self._buckets[index] = None

    def entry_set(self):
        """Return all key-value pairs."""
        # BUG FIX: the original iterated self.buckets (nonexistent
        # attribute); the field is self._buckets.
        return [pair for pair in self._buckets if pair is not None]

    def key_set(self):
        """Return all keys."""
        # BUG FIX: same self.buckets -> self._buckets correction.
        return [pair.key for pair in self._buckets if pair is not None]

    def value_set(self):
        """Return all values."""
        return [pair.value for pair in self._buckets if pair is not None]

    def print(self):
        """Print all key-value pairs."""
        for pair in self._buckets:
            if pair is not None:
                print(pair.key, "->", pair.value)
/* Key-value pair */
struct Pair {
  public:
    int key;
    std::string value;

    Pair(int key, std::string value) {
        this->key = key;
        this->value = value;
    }
};

/* Hash map backed by a fixed-size array (no collision handling) */
class ArrayHashMap {
  private:
    // Each bucket owns at most one heap-allocated Pair (nullptr = empty).
    std::vector<Pair *> buckets;

  public:
    ArrayHashMap() {
        buckets = std::vector<Pair *>(100);
    }

    ~ArrayHashMap() {
        // Release every owned pair (delete nullptr is a no-op).
        for (const auto &bucket : buckets) {
            delete bucket;
        }
        buckets.clear();
    }

    /* Hash function: map a key to a bucket index */
    int hashFunc(int key) {
        int index = key % 100;
        return index;
    }

    /* Query: return the value stored for key, or "" when absent */
    std::string get(int key) {
        int index = hashFunc(key);
        Pair *pair = buckets[index];
        if (pair == nullptr) {
            return "";
        }
        // BUG FIX: the original wrote `pair.value` on a pointer.
        return pair->value;
    }

    /* Insert (or overwrite) a key-value pair */
    void put(int key, std::string value) {
        int index = hashFunc(key);
        // BUG FIX: free any pair already stored here; the original
        // overwrote the pointer and leaked the old Pair.
        delete buckets[index];
        buckets[index] = new Pair(key, value);
    }

    /* Delete the pair stored for key (no-op when absent) */
    void remove(int key) {
        int index = hashFunc(key);
        delete buckets[index]; // delete nullptr is a no-op
        buckets[index] = nullptr;
    }

    /* Return all key-value pairs */
    std::vector<Pair *> pairSet() {
        std::vector<Pair *> result;
        for (Pair *pair : buckets) {
            if (pair != nullptr) {
                result.push_back(pair);
            }
        }
        return result;
    }

    /* Return all keys */
    std::vector<int> keySet() {
        std::vector<int> result;
        for (Pair *pair : buckets) {
            if (pair != nullptr) {
                result.push_back(pair->key); // BUG FIX: was `pair.key`
            }
        }
        return result;
    }

    /* Return all values */
    std::vector<std::string> valueSet() {
        // BUG FIX: the original declared vector<int> here and returned it
        // from a function whose return type is vector<string>.
        std::vector<std::string> result;
        for (Pair *pair : buckets) {
            if (pair != nullptr) {
                result.push_back(pair->value);
            }
        }
        return result;
    }

    /* Print all key-value pairs */
    void print() {
        for (Pair *pair : buckets) {
            if (pair != nullptr) {
                // BUG FIX: the member is `value`, not `val`.
                std::cout << pair->key << " -> " << pair->value << std::endl;
            }
        }
    }
}; // BUG FIX: a class definition requires a trailing semicolon
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 typedef struct { int key; char * val; }Pair; typedef struct { Pair* buckets[HASHTABLE_CAPACITY]; }ArrayHashMap; ArrayHashMap* newArrayHashMap () { ArrayHashMap* hashMap = malloc (sizeof (ArrayHashMap)); return hashMap; } void deleteArrayHashMap (ArrayHashMap* hashMap) { for (int i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ free (hashMap->buckets[i]->val); free (hashMap->buckets[i]); } } free (hashMap); } void put (ArrayHashMap* hashMap, const int key, const char * value) { Pair* pair = (Pair*)malloc (sizeof (Pair)); pair ->key = key; pair ->val = (char *)malloc (strlen (value) + 1 ); strcpy (pair ->val, value); int index = hashFunc(key); hashMap->buckets[index] = pair ; } void remove (ArrayHashMap* hashMap, const int key) { int index = hashFunc(key); if (hashMap->buckets[index] != NULL ){ free (hashMap->buckets[index]->val); free (hashMap->buckets[index]); } hashMap->buckets[index] = NULL ; } void pairSet (ArrayHashMap* hashMap, MapSet* set ) { Pair * entries; int i = 0 , index = 0 ; int total = 0 ; for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ total++; } } entries = (Pair*)malloc (sizeof (Pair) * total); for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ entries[index]->key = hashMap->buckets[i]->key; entries[index]->val = (char *)malloc (strlen (hashMap->buckets[i]->val) + 1 ); strcpt(entries[index]->val, hashMap->buckets[i]->val); index++; } } set ->set = entries; set ->len = total; } void keySet (ArrayHashMap* hashMap, MapSet *set ) { int * keys; int i = 0 , index = 0 ; int total = 0 ; for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if 
(hashMap->buckets[i] != NULL ){ total++; } } keys = (int *)malloc (sizeof (int ) * total); for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ keys[index] = hashMap->buckets[i]->key; index++; } } set ->set = keys; set ->len = total; } void valueSet (ArrayHashMap* hashMap, MapSet *set ) { char ** values; int i = 0 , index = 0 ; int total = 0 ; for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ total++; } } values = (char **)malloc (sizeof (char *) * total); for (i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ values[index] = hashMap->buckets[i]->val; index++; } } set ->set = values; set ->len = total; } void print (ArrayHashMap* hashMap) { for (int i = 0 ; i < HASHTABLE_CAPACITY; i++){ if (hashMap->buckets[i] != NULL ){ printf ("%d -> %s\n" , hashMap->buckets[i].key, hashMap->buckets[i].val); } } }
哈希冲突 多个不同 key
输入对应同一个 bucket
的情况称为哈希冲突
哈希表的容量capacity
越大 ,多个key
被分配到同一个桶中的概率就越低 ,哈希冲突就越少 。因此我们可以通过扩容 哈希表来减少哈希冲突
哈希表扩容需要将所有键值对从原哈希表 迁移至新哈希表 ,并且键值对的储存位置需要通过哈希函数重新计算 ,开销很大。内置哈希表的编程语言通常会预留足够大的哈希表容量,防止频繁扩容
负载因子 是哈希表元素数量 除以桶数量 ,用于衡量哈希冲突的严重程度 ,也常用作哈希表扩容的触发条件 。如Java中,负载因子超过0.75,系统会将哈希表容量扩展为原先的2倍
解决哈希冲突最简单粗暴 的方法是:遇到哈希冲突时就进行哈希扩容,直到冲突消失。这个方法虽然解决了哈希冲突,但是效率实在太低 ,我们采用一些替代方案
改良哈希表数据结构(链式地址 、开放寻址) ,使得即使有哈希冲突发生,哈希表仍然可以正常工作
仅在必要(哈希冲突比较严重)时,才进行扩容操作
链式地址 将桶中存储的单个元素 转换为链表 ,键值对作为链表节点 ,将所有发生冲突的键值对都存储在同一链表 中
链式地址哈希表操作方法如下:
查询元素:输入key
,经过哈希函数得到桶索引index
,访问对应链表头节点,然后遍历链表对比key
查找目标键值对
添加元素:通过哈希函数访问链表头节点,然后将键值对添加到链表中
删除元素:先查询元素,然后在对应链表中将其删除
缺点:
需要节点指针,占用空间增大
查询元素效率降低,需要线性遍历链表
当链表长度很长时,可以将链表转换为AVL树 或红黑树 ,将查询操作的时间复杂度优化至 $O(\log n)$
class HashMapChaining:
    """Hash map with separate chaining: each bucket is a list of pairs.

    NOTE(review): relies on a Pair class with .key/.val attributes defined
    elsewhere in this file — confirm the attribute names match.
    """

    def __init__(self):
        self.size = 0                 # number of key-value pairs
        self.capacity = 4             # number of buckets
        self.load_thres = 2.0 / 3.0   # load-factor threshold triggering extend()
        self.extend_ratio = 2         # growth multiplier
        self.buckets = [[] for _ in range(self.capacity)]

    def hash_func(self, key):
        """Hash function: map a key to a bucket index."""
        return key % self.capacity

    def load_factor(self):
        """Current load factor = size / capacity."""
        return self.size / self.capacity

    def get(self, key):
        """Return the value stored for key, or None when absent."""
        index = self.hash_func(key)
        bucket = self.buckets[index]
        for pair in bucket:
            if pair.key == key:
                return pair.val
        return None

    def put(self, key, value):
        """Insert a key-value pair, overwriting an existing key's value."""
        if self.load_factor() > self.load_thres:
            self.extend()
        index = self.hash_func(key)
        bucket = self.buckets[index]
        for pair in bucket:
            if pair.key == key:
                pair.val = value
                return
        bucket.append(Pair(key, value))
        self.size += 1

    def remove(self, key):
        """Delete the pair stored for key (no-op when absent)."""
        index = self.hash_func(key)
        bucket = self.buckets[index]
        for pair in bucket:
            if pair.key == key:
                bucket.remove(pair)
                self.size -= 1
                break

    def extend(self):
        """Grow the bucket array by extend_ratio and rehash all pairs."""
        buckets = self.buckets
        self.capacity *= self.extend_ratio
        self.buckets = [[] for _ in range(self.capacity)]
        self.size = 0
        for bucket in buckets:
            # BUG FIX: the original inner loop iterated `buckets` (the whole
            # old table, yielding lists) instead of `bucket`, so the first
            # extension raised AttributeError.
            for pair in bucket:
                self.put(pair.key, pair.val)

    def print(self):
        """Print the table bucket by bucket."""
        for bucket in self.buckets:
            res = []
            for pair in bucket:
                res.append(str(pair.key) + " -> " + pair.val)
            print(res)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 class HashMapChaining { private : int size; int capacity; double loadThres; int extendRatio; vector<vector<Pair*>> buckets; public : HashMapChaining () : size (0 ), capacity (4 ), loadThres (2.0 / 3.0 ), extendRatio (2 ){ buckets.resize (capacity); } ~HashMapChaining (){ for (auto &bucket : buckets){ for (Pair* pair : bucket){ delete pair; } } } int hashFunc (int key) { return key % capacity; } double loadFactor () { return (double )size / (double )capacity; } string get (int key) { int index = hashFunc (key); for (Pair* pair : buckets[index]){ if (pair->key == key){ return pair->val; } } return "" ; } void put (int key, string value) { if (loadFactor () > loadThres){ extend (); } int index = hashFunc (key); for (Pair* pair : buckets[index]){ if (pair->key == key){ pair->val = val; return ; } } buckets[index].push_back (new Pair (key, value)); size++; } void remove (int key) { int index = hashFunc (key); auto &bucket = buckets[index]; for (int i = 0 ; i < bucket.size (); i++){ if (bucket[i]->key == key){ Pair* temp = bucket[i]; bucket.erase (bucket.begin () + i); delete temp; size--; return ; } } } void extend () { vector<vector<Pair*>> tempBuckets = buckets; capacity *= extendRatio; buckets.clear (); buckets.resize (capacity); size = 0 ; for (auto bucket : tempBuckets){ for (Pair* pair : bucket){ put (pair->key, pair->val); delete pair; } } } void print () { for (auto &bucket : buckets){ cout << "[" ; for (Pair *pair : bucket) { cout << pair->key << " -> " << pair->val << ", " ; } cout << "]\n" ; } } }
开放寻址 开放寻址 不引入额外的 数据结构,而是通过多次探测 来处理哈希冲突,探测方式主要有线性探测 、平方探测 、多次哈希 等
线性探测:采用固定步长 的线性搜索来进行探测
插入元素 :若发生哈希冲突,从冲突位置向后线性遍历固定步长 ,直到找到空桶
查找元素 :若发生哈希冲突,从冲突位置向后线性遍历固定步长 。找到对应元素则返回值;遇到空桶说明目标元素不在哈希表中
线性探测的缺点 :
容易产生聚集现象 :数组中连续被占用的位置越长,这些连续位置发生哈希冲突的可能性越大,从而加重聚集现象 ,形成恶性循环,效率不断降低
开放寻址的缺点 :
不能在开放寻址哈希表中直接删除元素 :删除元素会在数组内产生一个空桶,查询元素时遇到该空桶就会提前返回,导致该空桶之后的元素无法再被访问。
懒删除机制 可以解决这个问题:
不直接从哈希表中删除元素,而是用一个常量TOMBSTONE
来标记这个桶
插入元素时,TOMBSTONE
被视作空桶
查询元素时,TOMBSTONE
不被视作空桶
然而 ,懒删除可能会加速 哈希表的性能退化 ,随着时间增加,线性探测可能要跳过多个TOMBSTONE
才能找到目标元素。可以在线性探测中记录遇到的首个TOMBSTONE
的索引,并将搜索到的目标元素与该TOMBSTONE
交换位置,这样可以优化查询效率
线性探测中,可以将哈希表看作一个环形数组
class HashMapOpenAddressing:
    """Hash map with open addressing: linear probing plus lazy deletion.

    NOTE(review): relies on a Pair class with .key/.val attributes defined
    elsewhere in this file — confirm the attribute names match.
    """

    def __init__(self):
        self.size = 0                 # number of key-value pairs
        self.capacity = 4             # number of buckets
        self.load_thres = 2.0 / 3.0   # load-factor threshold triggering extend()
        self.extend_ratio = 2         # growth multiplier
        self.buckets = [None] * self.capacity
        # Sentinel marking a lazily-deleted slot.
        self.TOMBSTONE = Pair(-1, "-1")

    def hash_func(self, key):
        """Hash function: map a key to a bucket index."""
        return key % self.capacity

    def load_factor(self):
        """Current load factor = size / capacity."""
        return self.size / self.capacity

    def find_bucket(self, key):
        """Probe for key; return its slot, or the slot to insert it into.

        If the key is found after passing a tombstone, the pair is moved up
        to the first tombstone so later lookups probe a shorter distance.
        """
        # BUG FIX: the original defined this method as `find_buckets` while
        # every caller invokes `find_bucket`; the `fiirst_tombstone` typo in
        # the final return is also corrected.
        index = self.hash_func(key)
        first_tombstone = -1
        while self.buckets[index] is not None:
            if self.buckets[index].key == key:
                if first_tombstone != -1:
                    self.buckets[first_tombstone] = self.buckets[index]
                    self.buckets[index] = self.TOMBSTONE
                    return first_tombstone
                return index
            if first_tombstone == -1 and self.buckets[index] is self.TOMBSTONE:
                first_tombstone = index
            index = (index + 1) % self.capacity  # treat the array as circular
        return index if first_tombstone == -1 else first_tombstone

    def get(self, key):
        """Return the value stored for key, or None when absent."""
        index = self.find_bucket(key)
        if self.buckets[index] not in [None, self.TOMBSTONE]:
            return self.buckets[index].val
        return None

    def put(self, key, val):
        """Insert a key-value pair, overwriting an existing key's value."""
        if self.load_factor() > self.load_thres:
            self.extend()
        index = self.find_bucket(key)
        if self.buckets[index] not in [None, self.TOMBSTONE]:
            self.buckets[index].val = val
            return
        # BUG FIX: the original instantiated the typo class `Pair9`.
        self.buckets[index] = Pair(key, val)
        self.size += 1

    def remove(self, key):
        """Lazily delete the pair stored for key (no-op when absent)."""
        index = self.find_bucket(key)
        if self.buckets[index] not in [None, self.TOMBSTONE]:
            self.buckets[index] = self.TOMBSTONE
            self.size -= 1

    def extend(self):
        """Grow the bucket array and re-insert all live pairs."""
        temp_buckets = self.buckets
        # BUG FIX: the original wrote `self.capaity`, so the capacity never
        # actually grew.
        self.capacity *= self.extend_ratio
        self.buckets = [None] * self.capacity
        self.size = 0
        for pair in temp_buckets:
            if pair not in [None, self.TOMBSTONE]:
                self.put(pair.key, pair.val)

    def print(self):
        """Print every bucket: None, TOMBSTONE, or the stored pair."""
        for pair in self.buckets:
            if pair is None:
                print("None")
            elif pair is self.TOMBSTONE:
                print("TOMBSTONE")
            else:
                print(pair.key, "->", pair.val)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 class HashMapOpenAddressing { private : int size; int capacity; const double loadThres; const int extendRatio; vector<Pair*>buckets; Pair* TOMBSTONE = new Pair (-1 , "-1" ); public : HashMapOpenAddressing () : size (0 ), capacity (4 ), loadThres (2.0 / 3.0 ), extendRatio (2 ), buckets (capacity, nullptr ) {} ~HashMapOpenAddressing (){ for (Pair* pair : buckets){ if (pair != nullptr && pair != TOMBSTONE){ delete pair; } } delete TOMBSTONE; } int hashFunc (int key) { return key % capacity; } double loadFactor () { return size / capacity; } int findBucket (int key) { int index = hashFunc (key); int firstTombStone = -1 ; while (bucket[index] != nullptr ){ if (bucket[index].key == key){ if (firstTombstone != -1 ){ buckets[firstTombstone] = buckets[index]; buckets[index] = TOMBSTONE; return firstTombstone; } return index; } if (firstTombstone == -1 && buckets[index] == TOMBSTONE){ firstTombstone = index; } index = (index + 1 ) % capacity; } return firstTombstone == -1 ? 
index : firstTombstone; } string get (int key) { int index = findBuckey (key); if (buckets[index] != nullptr && buckets[index] != TOMBSTONE){ return buckets[index]->val; } return "" ; } void put (int key, string value) { if (loadFactor () > loadThres){ extend (); } int index = findBucket (key); if (buckets[index] != nullptr && buckets[index] != TOMBSTONE){ buckets[index]->val = value; return ; } buckets[index] = new Pair (key, value); size++; } void remove (int key) { int index = findBucket (key); if (buckets[index] != nullptr && buckets[index] != TOMBSTONE){ delete buckets[index]; buckets[index] = TOMBSTONE; size--; } } void extend () { vector<Pair *> tempBuckets = buckets; capacity *= extendRatio; buckets = vector <Pair*>(capacity, nullptr ); size = 0 ; for (Pair *pair : tempBuckets){ if (pair != nullptr && pair != TOMBSTONE){ put (pair->key, pair->val); delete pair; } } } void print () { for (Pair* pair : buckets){ if (pair == nullptr ) { cout << "nullptr" << endl; } else if (pair == TOMBSTONE) { cout << "TOMBSTONE" << endl; } else { cout << pair->key << " -> " << pair->val << endl; } } } }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 typedef struct { int size; int capacity; double loadThres; int extendRatio; Pair** buckets; Pair* TOMBSTONE; }HashMapOpenAddressing; HashMapOpenAddressing* newHashMapOpenAddressing () { HashMapOpenAddressing* hashMap = (HashMapOpenAddressing*)malloc (sizeof (HashMapOpenAddressing)); hashMap->size = 0 ; hashMap->capacity = 4 ; hashMap->loadThres = 2.0 / 3.0 hashMap->extendRatio = 2 ; hashMap->buckets = (Pair**)malloc (sizeof (Pair*) * hashMap->capacity); hashMap->TOMBSTONE = (Pair*)malloc (sizeof (Pair)); hashMap->TOMBSTONE->key = -1 ; hashMap->TOMBSTONE->val = "-1" ; return hashMap; } void delHashMapOpenAddressing (HashMapOpenAddressing* hashMap) { Pair* pair ; for (int i = 0 ; i < hashMap->capacity; i++){ pair = hashMap->buckets[i]; if (pair != NULL && pair != hashMap->TOMBSTONE){ free (pair ->val); free (pair ); } } free (hashMap); } int hashFunc (HashMapOpenAddressing* hashMap, int key) { return key % hashMap->capacity; } double loadFator (HashMapOpenAddressing* hashMap) { return hashMap->size / hashMap->capacity; } int findBuckets (HashMapOpenAddressing* hashMap, int key) { int index = hashFunc(hashMap, key); int firstTombstone = -1 ; while (hashMap->buckets[index] != NULL ){ if (hashMap->buckets[index]->key == key){ if (firstTombstone != -1 ){ hashMap->buckets[firstTombstone] = hashMap->buckets[index]; hashMap->buckets[index] = hashMap->TOMBSTONE; return firstTombstone; } return index; } if (firstTombstone == -1 && hashMap->buckets[index] == hashMap->TOMBSTONE){ firstTombstone = index; } index = (index + 1 ) % hashMap->capacity; } 
return firstTombstone == -1 ? index : firstTomstone; } const char * get (HashMapOpenAddressing* hashMap, int key) { int index = findBucket(hashMap, key); if (hashMap->buckets[index] != NULL && hashMap->buckets[index] != hashMap->TOMBSTONE){ return (const char *)hashMap->buckets[index]->val; } return "" ; } void put (HashMapOpenAddressing* hashMap, int key, const char * val) { if (loadFactor() > hashMap->loadThres){ extend(hashMap); } int index = findBucket(hashMap, key); if (hashMap->buckets[index] != NULL && hashMap->buckets[index] != hashMap->TOMBSTONE){ free (hashMap->buckets[index]->Val); hashMap->buckets[index]->val = (char *)malloc (strlen (val) + 1 ); strcpy (hashMap->buckets[index]->val, val); return ; } Pair* pair = (Pair*)malloc (sizeof (Pair)); pair ->key = key; pair ->val = (char *)malloc (strlen (val) + 1 ); strcpy (pair ->val, val); hashMap->buckets[index] = pair ; hashMap->size++; } void remove (HashMapOpenAddressing* hashMap, int key) { int index = findBucket(hashMap, key); if (hashMap->buckets[index] != NULL && hashMap->buckets[index] != hashMap->TOMBSTONE) { Pair *pair = hashMap->buckets[index]; free (pair ->val); free (pair ); hashMap->buckets[index] = hashMap->TOMBSTONE; hashMap->size--; } } void extend (HashMapOpenAddressing* hashMap) { Pair** tempBuckets = hashMap->buckets; int oldCapacity = hashMap->capacity; hashMap->capacity *= hashMap->extendRatio; hashMap->buckets = (Pair **)malloc (sizeof (Pair *) * hashMap->capacity); hashMap->size = 0 ; for (int i = 0 ; i < oldCapacity; i++){ if (tempBuckets[i] != NULL && tempBuckets[i] != hashMap->TOMBSTONE){ put(hashMap, tempBuckets[i]->key, tempBuckets[i]->val); free (tempBuckets[i]->val); free (tempBuckets[i]); } } free (tempBukcets); } void print (HashMapOpenAddressing *hashMap) { for (int i = 0 ; i < hashMap->capacity; i++) { Pair *pair = hashMap->buckets[i]; if (pair == NULL ) { printf ("NULL\n" ); } else if (pair == hashMap->TOMBSTONE) { printf ("TOMBSTONE\n" ); } else { printf ("%d -> %s\n" , 
pair ->key, pair ->val); } } }
平方探测
发生哈希冲突时,第 $i$ 次探测向后跳过 $i^2$ 个桶
优势:可以缓解线性探测的聚集现象,使数据分布更均匀
缺点:仍然存在聚集现象;由于步长按平方增长,可能跳过部分桶,无法探测到整个哈希表
多次哈希
使用多个哈希函数 $f_1(x)$ 、 $f_2(x)$ 、 $f_3(x)$ 、··· 进行探测
不同编程语言对哈希表的实现
Java:链式地址 ,从JDK1.8开始,当HashMap内数组长度达到64且链表长度达到8时,链表会被转换为红黑树以提升查找性能
Python 采用开放寻址。字典 dict 使用伪随机数 进行探测。
Golang 采用链式地址。Go 规定每个桶最多存储 8 个键值对,超出容量则连接一个溢出桶。当溢出桶过多时,会执行一次特殊的等量扩容操作,以确保性能。
哈希算法 开放寻址和链式地址只能保证哈希表在发生哈希冲突时可用,没有从根本上减少哈希冲突发生的次数。而哈希冲突越频繁 ,哈希表的性能就越低 。想要减小哈希冲突发生的概率,就要改进 我们的哈希算法
哈希算法的目标 当哈希算法用于哈希表 时,有以下目标:
确定性 :对于相同输入始终有相同输出 -> 可靠
效率高 :计算哈希值够快。开销越小 ,实用性越高
均匀分布 :键值对分布越均匀 ,哈希冲突概率越低
当哈希算法用于密码存储 、数据完整性检查 时,有以下额外目标:
单向性 :无法通过哈希值反推出关于输入数据的任何信息
抗碰撞性 :极其困难找到两个不同的输入,使得它们的哈希值相同
雪崩效应 :输入的微小变化应当导致输出的显著且不可预测的变化
哈希算法的设计 介绍一些简单的哈希算法
加法哈希 :将输入的字符的ASCII码相加,总和作为哈希值
def add_hash(key: str) -> int:
    """Additive hash: sum the code points of all characters, mod a prime."""
    modulus = 1000000007
    total = sum(ord(ch) for ch in key)
    return total % modulus
/* Additive hash: sum the byte values of all characters, mod a large prime. */
int addHash(std::string key) {
    const int MODULUS = 1000000007;
    long long acc = 0;
    for (unsigned char ch : key) {
        acc = (acc + static_cast<int>(ch)) % MODULUS;
    }
    return static_cast<int>(acc);
}
/* Additive hash: sum the byte values of all characters, mod a large prime. */
int addHash(char *key) {
    long long hash = 0;
    const int MODULUS = 1000000007;
    /* FIX: hoist strlen out of the loop condition; the original called it
     * on every iteration, making the function O(n^2). */
    size_t len = strlen(key);
    for (size_t i = 0; i < len; i++) {
        hash = (hash + (unsigned char)key[i]) % MODULUS;
    }
    return (int)hash;
}
乘法哈希 :利用乘法的不相关性 ,每轮乘以一个常数,将各个字符的ASCII码累积到哈希值中
def mul_hash(key: str) -> int:
    """Multiplicative hash: 31-based polynomial rolling hash, mod a prime."""
    modulus = 1000000007
    acc = 0
    for ch in key:
        acc = acc * 31 + ord(ch)
    return acc % modulus
/* Multiplicative hash: 31-based polynomial rolling hash, mod a large prime. */
int mulHash(std::string key) {
    const int MODULUS = 1000000007;
    long long acc = 0;
    for (unsigned char ch : key) {
        acc = (31 * acc + static_cast<int>(ch)) % MODULUS;
    }
    return static_cast<int>(acc);
}
/* Multiplicative hash: 31-based polynomial rolling hash, mod a large prime. */
int mulHash(char *key) {
    long long hash = 0;
    const int MODULUS = 1000000007;
    /* FIX: hoist strlen out of the loop condition; the original called it
     * on every iteration, making the function O(n^2). */
    size_t len = strlen(key);
    for (size_t i = 0; i < len; i++) {
        hash = (31 * hash + (unsigned char)key[i]) % MODULUS;
    }
    return (int)hash;
}
异或哈希 :将输入数据的每个元素通过异或 操作积累到一个哈希值中
def xor_hash(key: str) -> int:
    """XOR hash: fold all character code points together with XOR."""
    modulus = 1000000007
    acc = 0
    for ch in key:
        acc ^= ord(ch)
    return acc % modulus
/* XOR hash: fold all byte values together with XOR, then reduce modulo a
 * large prime. */
int xorHash(std::string key) {
    int hash = 0;
    const int MODULUS = 1000000007;
    for (unsigned char c : key) {
        hash ^= (int)c;
    }
    // BUG FIX: the original returned `hash & MODULUS` — a bitwise AND that
    // corrupts the result (e.g. 8 & 1000000007 == 0). The Python version
    // and the surrounding text both reduce modulo the prime.
    return hash % MODULUS;
}
/* XOR hash: fold all byte values together with XOR, then reduce modulo a
 * large prime. */
int xorHash(char *key) {
    int hash = 0;
    const int MODULUS = 1000000007;
    /* FIX: hoist strlen out of the loop condition (was O(n^2)). */
    size_t len = strlen(key);
    for (size_t i = 0; i < len; i++) {
        hash ^= (unsigned char)key[i];
    }
    /* BUG FIX: the original returned `hash & MODULUS` — a bitwise AND that
     * corrupts the result; use the modulo operator like the Python version. */
    return hash % MODULUS;
}
旋转哈希 :将每个字符的 ASCII 码累积到一个哈希值中,每次累积之前都会对哈希值进行旋转操作。
def rot_hash(key: str) -> int:
    """Rotational hash: shift-rotate the accumulator before folding in each
    character's code point, then reduce mod a prime."""
    modulus = 1000000007
    acc = 0
    for ch in key:
        acc = (acc << 4) ^ (acc >> 28) ^ ord(ch)
    return acc % modulus
/* Rotational hash: shift-rotate the accumulator before folding in each byte,
 * reducing modulo a large prime every step. */
int rotHash(std::string key) {
    const int MODULUS = 1000000007;
    long long acc = 0;
    for (unsigned char ch : key) {
        acc = ((acc << 4) ^ (acc >> 28) ^ static_cast<int>(ch)) % MODULUS;
    }
    return static_cast<int>(acc);
}
/* Rotational hash: shift-rotate the accumulator before folding in each byte,
 * reducing modulo a large prime every step. */
int rotHash(char *key) {
    long long hash = 0;
    const int MODULUS = 1000000007;
    /* FIX: hoist strlen out of the loop condition; the original called it
     * on every iteration, making the function O(n^2). */
    size_t len = strlen(key);
    for (size_t i = 0; i < len; i++) {
        hash = ((hash << 4) ^ (hash >> 28) ^ (unsigned char)key[i]) % MODULUS;
    }
    return (int)hash;
}
每种哈希算法的最后一步都对一个大质数 取模,以确保哈希值在合适的范围内,当我们使用大质数作为模数时,可以最大化地保证哈希值的均匀分布 ,因为质数不与其他数字存在公约数,可以减少因取模操作而产生的周期性模式,从而避免哈希冲突。
如果key
是随机均匀分布的,模数选质数或是合数都可以;如果key
存在某种周期性,对合数取模更容易出现聚集现象。通常选择足够大的质数作为模数,以尽可能消除周期性模式,提升哈希算法的稳健性
常见哈希算法 在实际中通常用一些标准哈希算法,如MD5、SHA-1、SHA-2、SHA3等,它们可将任意长度 的输入数据映射到恒定长度 的哈希值
数据结构的哈希值 哈希表的key
可以是整数 、小数 或字符串 等数据类型。编程语言通常会为这些数据类型提供内置的哈希算法,用于计算桶索引
// Obtain the built-in hash of an integer key via std::hash.
int num = 3;
std::size_t hashNum = std::hash<int>()(num);
在许多编程语言中,只有不可变对象 可用作哈希表的key
。但是自定义对象是可哈希的,因为对象的哈希值通常基于内存地址生成,哈希值不会随着对象内容变化而变化。Python解释器在每次启动时,都会为字符串哈希函数加入一个随机的盐(salt)值,可有效防止HashDos攻击,提升哈希算法的安全性