Description
Back in #375, I added sorting by hash to pytest-randomly, using hashlib.md5() without much thought. I think it would be better to move to zlib.crc32 for speed: it's about 5x faster while still providing a good distribution. We don't need a 128-bit hash, 32 bits will do, and we don't need any cryptographic security from the hash (although MD5 is now broken anyway).
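For context, the hash is only used as a sort key to shuffle test items deterministically per seed. A minimal sketch of what a CRC32-based key could look like (hypothetical helper and naming on my part; the actual pytest-randomly hook is more involved):

from zlib import crc32

def _crc32_sort_key(seed: int, node_id: str) -> int:
    # Mix the seed into the item's node ID so each seed produces a
    # different, but reproducible, ordering.
    return crc32(f"{seed}::{node_id}".encode())

# Example: order a list of node IDs deterministically for a given seed.
node_ids = ["tests/test_a.py::test_one", "tests/test_b.py::test_two"]
ordered = sorted(node_ids, key=lambda nid: _crc32_sort_key(12345, nid))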
Benchmarking script
from __future__ import annotations

import time
from hashlib import md5
from zlib import crc32


def md5_hash(text: str) -> int:
    return int.from_bytes(md5(text.encode()).digest(), "big")


def crc32_hash(text: str) -> int:
    return crc32(text.encode())


def generate_test_ids(n: int) -> list[str]:
    return [f"test_{i}" for i in range(1, n + 1)]


def benchmark_speed(hash_func, test_ids: list[str], iterations: int = 100) -> float:
    """Benchmark hash function speed in nanoseconds per hash."""
    start = time.perf_counter_ns()
    for _ in range(iterations):
        for test_id in test_ids:
            hash_func(test_id)
    end = time.perf_counter_ns()
    total_hashes = len(test_ids) * iterations
    return (end - start) / total_hashes


def analyze_distribution(hashes: list[int], name: str) -> dict:
    """Analyze statistical properties of hash distribution."""
    return {
        "name": name,
        "min": min(hashes),
        "max": max(hashes),
        "range": max(hashes) - min(hashes),
        "range_usage": (max(hashes) - min(hashes))
        / (2 ** (32 if name == "CRC32" else 128)),
        "unique_count": len(set(hashes)),
        "collision_rate": 1.0 - (len(set(hashes)) / len(hashes)),
    }


def main():
    print("Hash Function Comparison: MD5 vs CRC32")
    print("=" * 50)

    # Generate test data
    test_ids = generate_test_ids(100_000)

    # Speed benchmarks
    print("Speed Benchmarks (average per hash):")
    md5_speed = benchmark_speed(md5_hash, test_ids)
    crc32_speed = benchmark_speed(crc32_hash, test_ids)
    print(f"MD5:   {md5_speed:.1f} ns")
    print(f"CRC32: {crc32_speed:.1f} ns")
    print(f"CRC32 is {md5_speed / crc32_speed:.1f}x faster than MD5")
    print()

    # Generate hashes for analysis
    md5_hashes = [md5_hash(test_id) for test_id in test_ids]
    crc32_hashes = [crc32_hash(test_id) for test_id in test_ids]

    # Statistical analysis
    md5_stats = analyze_distribution(md5_hashes, "MD5")
    crc32_stats = analyze_distribution(crc32_hashes, "CRC32")

    print("Statistical Analysis:")
    print("-" * 30)
    for key in [
        "range",
        "range_usage",
        "unique_count",
        "collision_rate",
    ]:
        print(f"{key:20}: MD5={md5_stats[key]:>12.2f}, CRC32={crc32_stats[key]:>12.2f}")


if __name__ == "__main__":
    main()

Results:
Hash Function Comparison: MD5 vs CRC32
==================================================
Speed Benchmarks (average per hash):
MD5:   434.5 ns
CRC32: 89.8 ns
CRC32 is 4.8x faster than MD5
Statistical Analysis:
------------------------------
range               : MD5=340276881851018210237546467185869717504.00, CRC32=4294733938.00
range_usage         : MD5=        1.00, CRC32=        1.00
unique_count        : MD5=   100000.00, CRC32=   100000.00
collision_rate      : MD5=        0.00, CRC32=        0.00
It seems good: both end up evenly distributing the 100k test IDs, with no collisions!
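For what it's worth, a quick birthday-bound estimate (my own back-of-envelope, assuming uniformly distributed hashes rather than these structured IDs) suggests the odd collision is possible at this scale, but since the hash is only a sort key, Python's stable sort just leaves tied items in their relative order:

import math

# Expected number of colliding pairs when hashing n items uniformly
# into 2**32 buckets: roughly n*(n-1) / (2 * 2**32).
n = 100_000
expected_pairs = n * (n - 1) / (2 * 2**32)
print(f"Expected colliding pairs for {n} uniform 32-bit hashes: {expected_pairs:.2f}")
# Prints ~1.16, so zero collisions on structured IDs like these is plausible,
# and an occasional tie in a real suite would be harmless for ordering.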