Introduction#
Hash tables power Python dicts, Java HashMaps, and database indexes. They provide average O(1) insert, delete, and lookup. Understanding their internals — hash functions, collision resolution, load factors, and resizing — helps you use them correctly and diagnose pathological performance.
How Hash Tables Work#
1
2
3
4
5
6
7
8
| Key → Hash Function → Index → Bucket
"alice" → hash("alice") % 8 → 3 → buckets[3]
Collision: two keys hash to the same bucket
Resolution strategies:
1. Chaining: each bucket is a linked list
2. Open addressing: probe for next empty slot (linear, quadratic, double hashing)
|
Chaining Implementation#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
| from typing import Optional
class HashTable:
    """Hash table with separate chaining for collision resolution.

    Each bucket is a list of (key, value) tuples; colliding keys share a
    bucket and are found by a linear scan of that chain.
    """

    def __init__(self, capacity: int = 8):
        self._capacity = capacity
        self._size = 0  # number of stored key/value pairs
        self._buckets: list[list[tuple]] = [[] for _ in range(capacity)]
        self._load_factor_threshold = 0.75  # resize when load reaches this

    def _hash(self, key) -> int:
        """Map *key* to a bucket index in [0, capacity)."""
        return hash(key) % self._capacity

    def set(self, key, value) -> None:
        """Insert *key* -> *value*, replacing any existing value for *key*."""
        if self._load_factor() >= self._load_factor_threshold:
            self._resize()
        idx = self._hash(key)
        bucket = self._buckets[idx]
        for i, (k, _) in enumerate(bucket):
            if k == key:
                bucket[i] = (key, value)  # update existing entry
                return
        bucket.append((key, value))  # insert new entry
        self._size += 1

    def get(self, key, default=None):
        """Return the value stored for *key*, or *default* if absent."""
        idx = self._hash(key)
        for k, v in self._buckets[idx]:
            if k == key:
                return v
        return default

    def delete(self, key) -> bool:
        """Remove *key* if present; return True if an entry was removed."""
        idx = self._hash(key)
        bucket = self._buckets[idx]
        for i, (k, _) in enumerate(bucket):
            if k == key:
                bucket.pop(i)
                self._size -= 1
                return True
        return False

    def _load_factor(self) -> float:
        return self._size / self._capacity

    def _resize(self) -> None:
        """Double capacity and re-insert all entries (indices depend on capacity)."""
        old_buckets = self._buckets
        self._capacity *= 2
        self._buckets = [[] for _ in range(self._capacity)]
        self._size = 0
        for bucket in old_buckets:
            for key, value in bucket:
                self.set(key, value)

    def __contains__(self, key) -> bool:
        # BUG FIX: the previous `self.get(key) is not None` reported False
        # for keys that legitimately store None. Search the chain for the
        # key itself instead of inspecting the returned value.
        idx = self._hash(key)
        return any(k == key for k, _ in self._buckets[idx])

    def __len__(self) -> int:
        return self._size
|
Open Addressing with Linear Probing#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
_DELETED = object()  # sentinel (tombstone) marking a deleted slot


class OpenAddressHashTable:
    """Hash table with open addressing (linear probing).

    Deleted slots are marked with a tombstone rather than emptied, so that
    probe chains passing through them are not broken.
    """

    def __init__(self, capacity: int = 8):
        self._capacity = capacity
        self._size = 0  # live entries (tombstones excluded)
        self._keys: list = [None] * capacity
        self._values: list = [None] * capacity

    def _hash(self, key) -> int:
        return hash(key) % self._capacity

    def _probe(self, key) -> int:
        """Return the slot index where *key* lives or should be stored.

        BUG FIX: the previous version returned the first tombstone it met,
        so `set` on a key located *past* a tombstone in its probe chain
        inserted a duplicate entry (stale values resurfaced after delete,
        and _size drifted). We must keep scanning past tombstones until we
        find the key or a truly empty slot; the first tombstone is only
        reused when the key turns out to be absent.
        """
        idx = self._hash(key)
        first_tombstone = -1
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                # Key is absent; prefer recycling an earlier tombstone.
                return first_tombstone if first_tombstone != -1 else idx
            if slot_key is _DELETED:
                if first_tombstone == -1:
                    first_tombstone = idx
            elif slot_key == key:
                return idx  # existing entry — update in place
            idx = (idx + 1) % self._capacity
        if first_tombstone != -1:
            return first_tombstone  # table is all tombstones/occupied, reuse one
        raise RuntimeError("Hash table is full")

    def set(self, key, value) -> None:
        # Lower threshold than chaining: clustering degrades probing fast.
        if self._size / self._capacity >= 0.5:
            self._resize()
        idx = self._probe(key)
        if self._keys[idx] is None or self._keys[idx] is _DELETED:
            self._size += 1  # new key, not an update
        self._keys[idx] = key
        self._values[idx] = value

    def get(self, key, default=None):
        """Return the value for *key*, or *default* if absent."""
        idx = self._hash(key)
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                return default  # empty slot terminates the probe chain
            if slot_key is not _DELETED and slot_key == key:
                return self._values[idx]
            idx = (idx + 1) % self._capacity
        return default

    def delete(self, key) -> bool:
        """Tombstone *key*'s slot; return True if an entry was removed."""
        idx = self._hash(key)
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                return False
            if slot_key is not _DELETED and slot_key == key:
                self._keys[idx] = _DELETED  # keep probe chains intact
                self._values[idx] = None
                self._size -= 1
                return True
            idx = (idx + 1) % self._capacity
        return False

    def _resize(self) -> None:
        """Double capacity and re-insert live entries (drops tombstones)."""
        old_keys = self._keys
        old_values = self._values
        self._capacity *= 2
        self._keys = [None] * self._capacity
        self._values = [None] * self._capacity
        self._size = 0
        for k, v in zip(old_keys, old_values):
            if k is not None and k is not _DELETED:
                self.set(k, v)
|
Hash Function Quality#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| import hashlib
# Good hash function properties:
# 1. Deterministic: same input → same output
# 2. Uniform: keys spread evenly across buckets
# 3. Fast to compute
# 4. Avalanche effect: small input change → large output change
# Python's built-in hash() is fine for most use cases
# but str/bytes hashes are randomized per process (hash randomization)
# For consistent hashing across processes (e.g., cache keys):
def stable_hash(key: str) -> int:
    """Deterministic hash of *key*, stable across processes (unlike hash())."""
    digest = hashlib.md5(key.encode()).digest()
    return int.from_bytes(digest, "big")
# FNV-1a: fast, simple, good distribution
def fnv1a_32(data: bytes) -> int:
    """32-bit FNV-1a hash: XOR each byte in, then multiply by the FNV prime."""
    fnv_prime = 0x01000193
    acc = 0x811C9DC5  # FNV-1a offset basis
    for b in data:
        acc = ((acc ^ b) * fnv_prime) & 0xFFFFFFFF  # keep 32 bits
    return acc
# Pathological case: all keys hash to the same bucket
class BadHashObject:
    """Pathological key: every instance hashes to the same value."""

    def __hash__(self):
        # Forces all instances into one bucket, degrading lookups to O(n).
        return 42
ht = HashTable()
for value in range(1000):
    # Every key lands in the same chain, so each set() is O(n).
    ht.set(BadHashObject(), value)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
| import random
import time
def benchmark_load_factor(size: int = 10000, lookups: int = 100000,
                          load_targets=(0.1, 0.3, 0.5, 0.7, 0.9, 0.99)):
    """Demonstrate the effect of load factor on lookup performance.

    BUG FIX: the original computed ``capacity`` and then benchmarked a plain
    Python dict, which manages its own load factor internally — so the
    target load factor was never actually applied. Here we build a
    fixed-capacity chained hash table so each target load factor holds.

    Args:
        size: number of entries to insert.
        lookups: number of random lookups to time per target.
        load_targets: load factors (size / capacity) to benchmark.

    Returns:
        A list of (load_target, elapsed_seconds) pairs, one per target.
    """
    results = []
    for load_target in load_targets:
        # Fixed capacity so that size / capacity == load_target.
        capacity = max(1, int(size / load_target))
        buckets = [[] for _ in range(capacity)]
        keys = [random.randint(0, 10**9) for _ in range(size)]
        for k in keys:
            bucket = buckets[hash(k) % capacity]
            for i, (bk, _) in enumerate(bucket):
                if bk == k:
                    bucket[i] = (k, k)  # duplicate random key: update
                    break
            else:
                bucket.append((k, k))
        # Measure lookup time at this load factor.
        start = time.perf_counter()
        for _ in range(lookups):
            k = random.choice(keys)
            for bk, bv in buckets[hash(k) % capacity]:
                if bk == k:
                    break
        elapsed = time.perf_counter() - start
        print(f"Load {load_target:.2f}: {elapsed*1000:.1f}ms for {lookups} lookups")
        results.append((load_target, elapsed))
    return results
# Results show near-constant time up to ~0.7 load factor
# Performance degrades significantly above 0.8 with chaining
# Open addressing degrades faster — typically resize at 0.5
|
Consistent Hashing (for Distributed Systems)#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
| import hashlib
import bisect
class ConsistentHashRing:
    """Distribute keys across nodes with minimal remapping on node changes.

    Each physical node is mapped to ``replicas`` virtual points on a hash
    ring; a key routes to the first virtual point at or after its own hash,
    wrapping around at the end of the ring.
    """

    def __init__(self, replicas: int = 150):
        self._replicas = replicas
        self._ring: dict[int, str] = {}       # virtual point -> node name
        self._sorted_keys: list[int] = []     # virtual points, kept sorted

    def _hash(self, key: str) -> int:
        digest = hashlib.md5(key.encode()).hexdigest()
        return int(digest, 16)

    def _virtual_points(self, node: str):
        # One ring position per replica of this node.
        return (self._hash(f"{node}:{i}") for i in range(self._replicas))

    def add_node(self, node: str) -> None:
        """Place all of *node*'s virtual points on the ring."""
        for vkey in self._virtual_points(node):
            self._ring[vkey] = node
            bisect.insort(self._sorted_keys, vkey)

    def remove_node(self, node: str) -> None:
        """Remove all of *node*'s virtual points from the ring."""
        for vkey in self._virtual_points(node):
            del self._ring[vkey]
            self._sorted_keys.remove(vkey)

    def get_node(self, key: str) -> str:
        """Return the node responsible for *key*; raise on an empty ring."""
        if not self._ring:
            raise ValueError("Empty ring")
        point = self._hash(key)
        pos = bisect.bisect(self._sorted_keys, point)
        if pos == len(self._sorted_keys):
            pos = 0  # wrap around to the first virtual point
        return self._ring[self._sorted_keys[pos]]
# Usage
ring = ConsistentHashRing()
for node_name in ("cache-1", "cache-2", "cache-3"):
    ring.add_node(node_name)
# Adding a 4th node only remaps ~25% of keys
ring.add_node("cache-4")
print(ring.get_node("user:12345"))  # consistently routed
|
Python Dict Internals#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
| Python dict (CPython 3.6+):
- Open addressing with compact array layout
- Key order preserved (insertion order)
- Load factor threshold: 2/3 (~0.67)
- Resize: allocates a larger table when the threshold is exceeded (CPython sizes the new table from the live-entry count rather than by simple doubling)
- Hash randomization: PYTHONHASHSEED varies per process (security feature)
Common pitfalls:
- Mutable objects (list, dict) cannot be dict keys (not hashable)
- Custom __eq__ must be paired with __hash__
- If __eq__ is defined without __hash__, __hash__ is set to None
class Point:
    """2-D point that is safe to use as a dict key (__eq__ + __hash__ pair)."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __eq__(self, other):
        # BUG FIX: comparing against a non-Point previously raised
        # AttributeError on `other.x`. Returning NotImplemented lets Python
        # try the reflected comparison and fall back to inequality.
        if not isinstance(other, Point):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __hash__(self):
        return hash((self.x, self.y))  # must be consistent with __eq__
|
Conclusion#
Hash tables provide O(1) average case for insert, lookup, and delete by trading space for speed. Load factor controls the tradeoff between memory and performance — keep it below 0.75 for chaining, 0.5 for open addressing. Collisions are inevitable; their impact depends on hash function quality and the resolution strategy. For distributed systems, consistent hashing minimizes key remapping when nodes are added or removed. Always implement __hash__ alongside __eq__ in custom classes that will be used as dictionary keys.