Hash Tables: Internals, Collision Resolution, and Performance

Hash tables power Python dicts, Java HashMaps, and database indexes. They provide average O(1) insert, delete, and lookup. Understanding their internals — hash functions, collision resolution, load factors, and resizing — helps you use them correctly and diagnose pathological performance.

Introduction#

Hash tables power Python dicts, Java HashMaps, and database indexes. They provide average O(1) insert, delete, and lookup. Understanding their internals — hash functions, collision resolution, load factors, and resizing — helps you use them correctly and diagnose pathological performance.

How Hash Tables Work#

1
2
3
4
5
6
7
8
Key → Hash Function → Index → Bucket

"alice" → hash("alice") % 8 → 3 → buckets[3]

Collision: two keys hash to the same bucket
Resolution strategies:
1. Chaining: each bucket is a linked list
2. Open addressing: probe for next empty slot (linear, quadratic, double hashing)

Chaining Implementation#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from typing import Optional

class HashTable:
    """Hash table with separate chaining for collision resolution.

    Each bucket is a Python list of (key, value) tuples; lookups scan the
    bucket linearly, so performance depends on keeping chains short by
    resizing once the load factor crosses the threshold.
    """

    # Sentinel distinguishing "key absent" from "key mapped to None".
    _MISSING = object()

    def __init__(self, capacity: int = 8):
        self._capacity = capacity
        self._size = 0
        # One chain per slot; colliding keys append to the same chain.
        self._buckets: list[list[tuple]] = [[] for _ in range(capacity)]
        self._load_factor_threshold = 0.75

    def _hash(self, key) -> int:
        """Map key to a bucket index in [0, capacity)."""
        return hash(key) % self._capacity

    def set(self, key, value) -> None:
        """Insert key -> value, or overwrite the existing entry for key."""
        if self._load_factor() >= self._load_factor_threshold:
            self._resize()

        idx = self._hash(key)
        bucket = self._buckets[idx]

        for i, (k, _) in enumerate(bucket):
            if k == key:
                bucket[i] = (key, value)  # update in place; size unchanged
                return

        bucket.append((key, value))  # new entry
        self._size += 1

    def get(self, key, default=None):
        """Return the value stored for key, or default when key is absent."""
        idx = self._hash(key)
        for k, v in self._buckets[idx]:
            if k == key:
                return v
        return default

    def delete(self, key) -> bool:
        """Remove key from the table; return True if it was present."""
        idx = self._hash(key)
        bucket = self._buckets[idx]
        for i, (k, _) in enumerate(bucket):
            if k == key:
                bucket.pop(i)
                self._size -= 1
                return True
        return False

    def _load_factor(self) -> float:
        return self._size / self._capacity

    def _resize(self) -> None:
        """Double capacity and rehash every entry (indices depend on capacity)."""
        old_buckets = self._buckets
        self._capacity *= 2
        self._buckets = [[] for _ in range(self._capacity)]
        self._size = 0  # set() below re-counts each entry

        for bucket in old_buckets:
            for key, value in bucket:
                self.set(key, value)

    def __contains__(self, key) -> bool:
        # Probe with a private sentinel default so a stored value of None
        # still counts as present (`get(key) is not None` got this wrong).
        return self.get(key, self._MISSING) is not self._MISSING

    def __len__(self) -> int:
        return self._size

Open Addressing with Linear Probing#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
_DELETED = object()  # tombstone: slot was occupied; probe chains must continue past it

class OpenAddressHashTable:
    """Hash table with open addressing (linear probing).

    Deletions leave a _DELETED tombstone instead of None so that keys
    stored further along a probe chain remain reachable.
    """

    def __init__(self, capacity: int = 8):
        self._capacity = capacity
        self._size = 0
        self._keys: list = [None] * capacity
        self._values: list = [None] * capacity

    def _hash(self, key) -> int:
        """Map key to a starting slot in [0, capacity)."""
        return hash(key) % self._capacity

    def _probe(self, key) -> int:
        """Return the slot holding key, or the slot where it should be inserted.

        Scans the full probe sequence for an existing entry before falling
        back to the first tombstone. (Stopping at the first tombstone — the
        previous behavior — could insert a duplicate of a key stored further
        along the chain, corrupting _size and resurrecting stale values.)
        """
        idx = self._hash(key)
        first_deleted = -1
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                # Key is definitely absent; reuse an earlier tombstone if any.
                return first_deleted if first_deleted >= 0 else idx
            if slot_key is _DELETED:
                if first_deleted < 0:
                    first_deleted = idx  # remember, but keep scanning for key
            elif slot_key == key:
                return idx  # existing entry: update target
            idx = (idx + 1) % self._capacity
        if first_deleted >= 0:
            return first_deleted  # table is all tombstones/entries; reuse one
        raise RuntimeError("Hash table is full")

    def set(self, key, value) -> None:
        """Insert key -> value, or overwrite the existing entry for key."""
        if self._size / self._capacity >= 0.5:  # lower threshold for open addressing
            self._resize()

        idx = self._probe(key)
        if self._keys[idx] is None or self._keys[idx] is _DELETED:
            self._size += 1  # genuinely new entry, not an update
        self._keys[idx] = key
        self._values[idx] = value

    def get(self, key, default=None):
        """Return the value stored for key, or default when key is absent."""
        idx = self._hash(key)
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                return default  # empty slot terminates the probe chain
            if slot_key is not _DELETED and slot_key == key:
                return self._values[idx]
            idx = (idx + 1) % self._capacity
        return default

    def delete(self, key) -> bool:
        """Remove key from the table; return True if it was present."""
        idx = self._hash(key)
        for _ in range(self._capacity):
            slot_key = self._keys[idx]
            if slot_key is None:
                return False
            if slot_key is not _DELETED and slot_key == key:
                self._keys[idx] = _DELETED  # mark as deleted, don't break probe chains
                self._values[idx] = None
                self._size -= 1
                return True
            idx = (idx + 1) % self._capacity
        return False

    def _resize(self) -> None:
        """Double capacity and re-insert live entries (tombstones are dropped)."""
        old_keys = self._keys
        old_values = self._values
        self._capacity *= 2
        self._keys = [None] * self._capacity
        self._values = [None] * self._capacity
        self._size = 0

        for k, v in zip(old_keys, old_values):
            if k is not None and k is not _DELETED:
                self.set(k, v)

Hash Function Quality#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import hashlib

# Good hash function properties:
# 1. Deterministic: same input → same output
# 2. Uniform: keys spread evenly across buckets
# 3. Fast to compute
# 4. Avalanche effect: small input change → large output change

# Python's built-in hash() is fine for most use cases
# but randomized per-process (hash randomization)

# For consistent hashing across processes (e.g., cache keys):
def stable_hash(key: str) -> int:
    """Process-independent hash of *key* (built-in hash() is randomized per run)."""
    digest = hashlib.md5(key.encode()).hexdigest()
    return int(digest, 16)

# FNV-1a: fast, simple, good distribution
def fnv1a_32(data: bytes) -> int:
    """32-bit FNV-1a: XOR each byte into the state, then multiply by the prime."""
    state = 0x811C9DC5  # FNV offset basis
    for b in data:
        state = ((state ^ b) * 0x01000193) & 0xFFFFFFFF  # keep to 32 bits
    return state

# Pathological case: all keys hash to the same bucket
class BadHashObject:
    """Worst-case demo: a constant hash funnels every instance into one bucket."""

    def __hash__(self):
        # Same hash for all instances -> every key collides.
        return 42

# Degenerate usage: every key hashes to the same bucket, so each insert
# scans the whole chain — 1000 inserts cost O(n^2) total.
ht = HashTable()
for i in range(1000):
    ht.set(BadHashObject(), i)  # O(n) per operation — all in one chain

Load Factor and Performance#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import random
import time

def benchmark_load_factor():
    """Demonstrate the effect of load factor on lookup performance.

    NOTE(review): a built-in dict manages its own capacity (it resizes at
    ~2/3 load), so it cannot actually be pinned at `load_target`; the loop
    illustrates the measurement methodology rather than forcing the load.
    """

    for load_target in [0.1, 0.3, 0.5, 0.7, 0.9, 0.99]:
        # Build a hash table of fixed size; timings are compared per target.
        size = 10000
        ht = {}  # use Python dict for accuracy

        keys = [random.randint(0, 10**9) for _ in range(size)]
        for k in keys:
            ht[k] = k

        # Measure lookup time over random existing keys
        start = time.perf_counter()
        for _ in range(100000):
            k = random.choice(keys)
            _ = ht[k]
        elapsed = time.perf_counter() - start

        print(f"Load {load_target:.2f}: {elapsed*1000:.1f}ms for 100k lookups")

# Results show near-constant time up to ~0.7 load factor
# Performance degrades significantly above 0.8 with chaining
# Open addressing degrades faster — typically resize at 0.5

Consistent Hashing (for Distributed Systems)#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import hashlib
import bisect

class ConsistentHashRing:
    """Distribute keys across nodes with minimal remapping on node changes.

    Each node is hashed `replicas` times onto the ring ("virtual nodes") to
    smooth out the distribution. A key is served by the first virtual node
    clockwise from the key's hash.
    """

    def __init__(self, replicas: int = 150):
        self._replicas = replicas
        self._ring: dict[int, str] = {}       # virtual-node hash -> node name
        self._sorted_keys: list[int] = []     # ring positions, kept sorted

    def _hash(self, key: str) -> int:
        """Stable hash (md5) so placement is identical across processes."""
        return int(hashlib.md5(key.encode()).hexdigest(), 16)

    def add_node(self, node: str) -> None:
        """Place `replicas` virtual nodes for `node` on the ring."""
        for i in range(self._replicas):
            virtual_key = self._hash(f"{node}:{i}")
            self._ring[virtual_key] = node
            bisect.insort(self._sorted_keys, virtual_key)

    def remove_node(self, node: str) -> None:
        """Remove all of `node`'s virtual nodes; raises KeyError if absent."""
        for i in range(self._replicas):
            virtual_key = self._hash(f"{node}:{i}")
            del self._ring[virtual_key]
            # Binary search instead of list.remove's linear scan — the list
            # is sorted, and this mirrors the bisect usage in get_node.
            idx = bisect.bisect_left(self._sorted_keys, virtual_key)
            self._sorted_keys.pop(idx)

    def get_node(self, key: str) -> str:
        """Return the node responsible for `key` (first ring position >= hash)."""
        if not self._ring:
            raise ValueError("Empty ring")
        h = self._hash(key)
        # Wrap around to position 0 when the hash lands past the last node.
        idx = bisect.bisect(self._sorted_keys, h) % len(self._sorted_keys)
        return self._ring[self._sorted_keys[idx]]

# Usage: each add_node() places 150 virtual nodes (the default) on the ring.
ring = ConsistentHashRing()
ring.add_node("cache-1")
ring.add_node("cache-2")
ring.add_node("cache-3")

# Adding a 4th node only remaps ~25% of keys
ring.add_node("cache-4")
print(ring.get_node("user:12345"))  # consistently routed

Python Dict Internals#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
Python dict (CPython 3.6+):
- Open addressing with compact array layout
- Key order preserved (insertion order)
- Load factor threshold: 2/3 (~0.67)
- Resize: doubles capacity when threshold exceeded
- Hash randomization: string hashes vary per process unless PYTHONHASHSEED is fixed (security feature)

Common pitfalls:
- Mutable objects (list, dict) cannot be dict keys (not hashable)
- Custom __eq__ must be paired with __hash__
- If __eq__ is defined without __hash__, __hash__ is set to None

class Point:
    """2D point usable as a dict key: __eq__ and __hash__ defined together.

    Note: x and y should not be mutated after the point is used as a key,
    since that would change its hash.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on `other.x`; Python then falls back to the
        # other operand's comparison (or identity).
        if not isinstance(other, Point):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __hash__(self):
        return hash((self.x, self.y))  # must be consistent with __eq__

Conclusion#

Hash tables provide O(1) average case for insert, lookup, and delete by trading space for speed. Load factor controls the tradeoff between memory and performance — keep it below 0.75 for chaining, 0.5 for open addressing. Collisions are inevitable; their impact depends on hash function quality and the resolution strategy. For distributed systems, consistent hashing minimizes key remapping when nodes are added or removed. Always implement __hash__ alongside __eq__ in custom classes that will be used as dictionary keys.

Contents