Bloom Filters: Space-Efficient Set Membership Testing

A Bloom filter answers the question "is this element in the set?" using a fraction of the memory a hash set would require. The trade-off: it may return false positives (reporting an element is in the set when it is not), but it never returns false negatives.

Introduction#

A Bloom filter answers the question “is this element in the set?” using a fraction of the memory a hash set would require. The trade-off: it may return false positives (reporting an element is in the set when it is not), but it never returns false negatives (if it says an element is not in the set, it definitely isn’t). This asymmetry makes Bloom filters useful for cache bypass optimization, duplicate detection, and database query optimization.

How It Works#

A Bloom filter is a bit array of size m. To add an element, compute k hash functions and set those bit positions to 1. To query, compute the same k hashes and check if all those positions are 1.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import math
import hashlib
from bitarray import bitarray

class BloomFilter:
    """Space-efficient probabilistic set with no false negatives.

    `contains` may report false positives at roughly `false_positive_rate`
    once `expected_items` elements have been added, but an element that was
    added is always reported as present.
    """

    def __init__(self, expected_items: int, false_positive_rate: float = 0.01):
        """Size the filter for `expected_items` at the target false-positive rate.

        Raises:
            ValueError: if `expected_items` is not positive or the rate is
                outside the open interval (0, 1).
        """
        if expected_items <= 0:
            raise ValueError("expected_items must be positive")
        if not 0.0 < false_positive_rate < 1.0:
            raise ValueError("false_positive_rate must be in (0, 1)")
        # Optimal bit array size: m = -n * ln(p) / (ln 2)^2
        self.size = self._optimal_size(expected_items, false_positive_rate)
        # Optimal number of hash functions: k = (m / n) * ln 2
        self.hash_count = self._optimal_hash_count(self.size, expected_items)
        self.bit_array = bitarray(self.size)
        self.bit_array.setall(0)

    def _optimal_size(self, n: int, p: float) -> int:
        """Return the smallest bit-array size meeting target FPR `p` for `n` items."""
        # ceil rather than int(): truncating undersizes the array (raising the
        # actual FPR), and for tiny n it could even return 0.
        return max(1, math.ceil(-n * math.log(p) / (math.log(2) ** 2)))

    def _optimal_hash_count(self, m: int, n: int) -> int:
        """Return the optimal hash-function count k = (m / n) * ln 2, at least 1."""
        # round rather than int(): truncation can yield k == 0 (e.g. m == n),
        # in which case add() sets no bits and contains() is vacuously True
        # for every query. It also gave 6, not the optimal 7, for the
        # 1M-items / 1% configuration.
        return max(1, round((m / n) * math.log(2)))

    def _hashes(self, item: str) -> list[int]:
        """Return the `hash_count` bit positions for `item`."""
        # Salting the input with the round index i yields k approximately
        # independent hash values from one digest function. MD5 is fine here:
        # this is indexing, not cryptography.
        positions = []
        for i in range(self.hash_count):
            digest = hashlib.md5(f"{i}:{item}".encode()).hexdigest()
            positions.append(int(digest, 16) % self.size)
        return positions

    def add(self, item: str) -> None:
        """Insert `item` by setting its k bit positions to 1."""
        for pos in self._hashes(item):
            self.bit_array[pos] = 1

    def contains(self, item: str) -> bool:
        """Return False if `item` is definitely absent, True if possibly present."""
        return all(self.bit_array[pos] for pos in self._hashes(item))

    @property
    def memory_bytes(self) -> int:
        """Bytes needed to store the bit array (rounded up to whole bytes)."""
        # (size + 7) // 8 rounds up; size // 8 underreports when size is not
        # a multiple of 8.
        return (self.size + 7) // 8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Usage
bf = BloomFilter(expected_items=1_000_000, false_positive_rate=0.01)

# Add items
for email in user_emails:
    bf.add(email)

def email_registered(email: str) -> bool:
    """Check membership with a Bloom-filter fast path before hitting the DB."""
    # Definitely not in the set — skip the database lookup entirely.
    if not bf.contains(email):
        return False
    # Might be in the set (possible false positive) — do the actual lookup.
    return db.exists(email)

email_registered("new@example.com")

# Memory comparison
import sys
regular_set = set(user_emails)
# NOTE: sys.getsizeof is shallow — it measures the set's hash table only,
# not the string contents, so the real gap is even larger.
print(f"Set: {sys.getsizeof(regular_set) / 1024 / 1024:.1f} MB")
print(f"Bloom filter: {bf.memory_bytes / 1024 / 1024:.1f} MB")
# Set: ~40 MB for 1M strings
# Bloom filter: ~1.2 MB (33x smaller)

False Positive Rate#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# False positive rate depends on: size, hash count, and number of elements
# At optimal settings with 1M items and 1% target FPR:
# size = 9,585,058 bits (~1.2 MB)
# hash_count = 7

# As you add more items beyond expected_items, FPR increases
bf = BloomFilter(expected_items=1000, false_positive_rate=0.01)
for item in map(str, range(5000)):  # 5x more than expected
    bf.add(item)

# FPR is now much higher than 1%
# Probe 5000 values that were never inserted; every hit is a false positive.
probes = range(5000, 10000)
hits = sum(bf.contains(str(candidate)) for candidate in probes)
false_positives = hits / 5000
print(f"Actual FPR: {false_positives:.1%}")

Redis Bloom Filter Module#

For distributed systems, Redis provides a built-in Bloom filter module:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import redis

client = redis.Redis(host="localhost", port=6379)

# BF.RESERVE key error_rate capacity — create a filter sized up front.
client.execute_command("BF.RESERVE", "processed_orders", 0.01, 10_000_000)

# BF.ADD — insert a single item.
client.execute_command("BF.ADD", "processed_orders", "order-12345")

# BF.MADD — insert several items in one round trip.
client.execute_command("BF.MADD", "processed_orders", "order-1", "order-2", "order-3")

# BF.EXISTS — membership test: 1 = possibly in set, 0 = definitely not in set.
result = client.execute_command("BF.EXISTS", "processed_orders", "order-12345")

# BF.MEXISTS — batched membership test; returns e.g. [1, 0].
results = client.execute_command("BF.MEXISTS", "processed_orders",
                                 "order-1", "order-99999")

Real-World Use Cases#

Preventing Duplicate Event Processing#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
class IdempotentProcessor:
    """Skips duplicate events using a Bloom filter in front of the database."""

    def __init__(self):
        # The Bloom filter gives a cheap "definitely new" answer; the
        # database remains the source of truth for "already processed".
        self.bloom = BloomFilter(expected_items=10_000_000, false_positive_rate=0.001)
        self.db = Database()

    def process(self, event_id: str, event: dict) -> bool:
        """Process `event` once; return False when it is a duplicate."""
        seen_before = self.bloom.contains(event_id)
        if seen_before and self.db.is_processed(event_id):
            # Confirmed duplicate — the filter hit was not a false positive.
            return False
        # Either brand new, or a Bloom false positive the database disproved.
        self.bloom.add(event_id)
        return self._do_process(event)

Cache Bypass Optimization#

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Avoid caching objects that are only requested once (cache pollution)
class SmartCache:
    """Admits a key to the in-memory cache only on its second request."""

    def __init__(self):
        # Tracks keys requested at least once; a false positive merely caches
        # a first-time key early, which is harmless.
        self.seen_once = BloomFilter(expected_items=1_000_000, false_positive_rate=0.01)
        self.cache = {}

    def get(self, key: str) -> object:
        """Return the value for `key`, caching it from the second request on."""
        try:
            return self.cache[key]
        except KeyError:
            pass

        value = db.get(key)

        if not self.seen_once.contains(key):
            # First sighting — remember it, but do not cache yet.
            self.seen_once.add(key)
        else:
            # Second sighting — promote into the cache.
            self.cache[key] = value

        return value

Limitations#

  • No deletion: you cannot remove an item from a standard Bloom filter. Counting Bloom filters support deletion but use more memory.
  • No enumeration: you cannot list elements in the filter.
  • Grows stale: once filled beyond capacity, FPR rises. Plan capacity based on expected growth.

Conclusion#

Bloom filters offer 10-100x memory reduction over hash sets for membership testing, at the cost of configurable false positive rates. Use them where false positives are acceptable but false negatives are not: cache bypass, duplicate detection, pre-filtering before expensive database lookups. Redis’s built-in BF module makes distributed Bloom filters straightforward.

Contents