From 87d494f1faab48dea902e050a5dee7f0bccb8dab Mon Sep 17 00:00:00 2001
From: ifnesi
Date: Fri, 5 Jan 2024 08:25:17 +0000
Subject: [PATCH] Added donalm's enhancements

---
 README.md               | 17 +++++++-
 calculateAveragePypy.py | 88 ++++++++++++++++++++---------------------
 2 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index e61ee04..f44a186 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,16 @@
-From Gunnar's 1 billion rows challenge (https://github.com/gunnarmorling/1brc)
+# 1BRC: One Billion Row Challenge in Python
 
-Python implementation
+Python implementation of Gunnar's One Billion Row Challenge:
+- https://www.morling.dev/blog/one-billion-row-challenge
+- https://github.com/gunnarmorling/1brc
+
+## Performance (on a MacBook Pro M1 32GB)
+| Interpreter | Script | user | system | cpu | total (s) |
+| ----------- | ------ | ---- | ------ | --- | --------- |
+| pypy3 | calculateAveragePypy.py | 139.15s | 3.02s | 699% | 20.323 |
+| python3 | calculateAverageDuckDB.py | 186.78s | 4.21s | 806% | 23.673 |
+| pypy3 | calculateAverage.py | 284.90s | 9.12s | 749% | 39.236 |
+| pypy3 | calculateAverage.py | 286.33s | 9.57s | 746% | 39.665 |
+| python3 | calculateAverage.py | 378.54s | 6.94s | 747% | 51.544 |
+
+The file `calculateAveragePypy.py` was created by [donalm](https://github.com/donalm); it is more than 2x faster than the initial version (`calculateAverage.py`) when run under pypy3, and even beats the [DuckDB](https://duckdb.org/)-based implementation (`calculateAverageDuckDB.py`).
\ No newline at end of file
diff --git a/calculateAveragePypy.py b/calculateAveragePypy.py
index 6bf70ef..4f3aa8d 100644
--- a/calculateAveragePypy.py
+++ b/calculateAveragePypy.py
@@ -1,4 +1,4 @@
-# time python3 calculateAverage.py
+# time pypy3 calculateAveragePypy.py
 import os
 import multiprocessing as mp
@@ -58,61 +58,61 @@ def _process_file_chunk(
     file_name: str,
     chunk_start: int,
     chunk_end: int,
+    blocksize: int = 1024 * 1024,
 ) -> dict:
     """Process each file chunk in a different process"""
     result = dict()
-    blocksize = 1024 * 1024
-    fh = open(file_name, "rb")
-    byte_count = chunk_end - chunk_start
-    fh.seek(chunk_start)
-    tail = b""
-    location = None
+    with open(file_name, "r+b") as fh:
+        fh.seek(chunk_start)
 
-    while byte_count:
-        if blocksize > byte_count:
-            blocksize = byte_count
-        byte_count = byte_count - blocksize
+        tail = b""
+        location = None
+        byte_count = chunk_end - chunk_start
 
-        data = tail + fh.read(blocksize)
+        while byte_count > 0:
+            if blocksize > byte_count:
+                blocksize = byte_count
+            byte_count -= blocksize
+
+            index = 0
+            data = tail + fh.read(blocksize)
+            while data:
+                if location is None:
+                    try:
+                        semicolon = data.index(b";", index)
+                    except ValueError:
+                        tail = data[index:]
+                        break
+
+                    location = data[index:semicolon]
+                    index = semicolon + 1
 
-        index = 0
-        while data:
-            if location is None:
                 try:
-                    semicolon = data.index(b";", index)
+                    newline = data.index(b"\n", index)
                 except ValueError:
                     tail = data[index:]
                     break
 
-                location = data[index:semicolon]
-                index = semicolon + 1
-
-            try:
-                newline = data.index(b"\n", index)
-            except ValueError:
-                tail = data[index:]
-                break
-
-            value = float(data[index:newline])
-            index = newline + 1
-
-            if location not in result:
-                result[location] = [
-                    value,
-                    value,
-                    value,
-                    1,
-                ]  # min, max, sum, count
-            else:
-                if value < result[location][0]:
-                    result[location][0] = value
-                if value > result[location][1]:
-                    result[location][1] = value
-                result[location][2] += value
-                result[location][3] += 1
-
-            location = None
+                value = float(data[index:newline])
+                index = newline + 1
+
+                if location not in result:
+                    result[location] = [
+                        value,
+                        value,
+                        value,
+                        1,
+                    ]  # min, max, sum, count
+                else:
+                    if value < result[location][0]:
+                        result[location][0] = value
+                    if value > result[location][1]:
+                        result[location][1] = value
+                    result[location][2] += value
+                    result[location][3] += 1
+
+                location = None
 
     return result
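
Note (not part of the patch above): a minimal sketch of how a chunk worker such as `_process_file_chunk` is typically driven from the main process with `multiprocessing`. The `identify_chunks` helper, the `measurements.txt` file name, and the merge step mentioned in the comments are illustrative assumptions, not code taken from this repository.

# Illustrative only -- assumes calculateAveragePypy.py is importable and that
# the input file is named "measurements.txt" (both are assumptions).
import os
import multiprocessing as mp

from calculateAveragePypy import _process_file_chunk


def identify_chunks(file_name: str, workers: int) -> list:
    """Split the file into roughly equal chunks aligned to newline boundaries."""
    file_size = os.path.getsize(file_name)
    chunk_size = file_size // workers
    boundaries = [0]
    with open(file_name, "rb") as fh:
        while boundaries[-1] < file_size:
            # Jump ahead by roughly one chunk, then advance to the next newline
            # so no "station;measurement" record is split across two workers.
            fh.seek(min(boundaries[-1] + chunk_size, file_size))
            fh.readline()
            boundaries.append(min(fh.tell(), file_size))
    return [
        (file_name, start, end)
        for start, end in zip(boundaries[:-1], boundaries[1:])
        if end > start
    ]


if __name__ == "__main__":
    cpus = mp.cpu_count()
    with mp.Pool(cpus) as pool:
        # Each worker returns a dict of location -> [min, max, sum, count];
        # the per-chunk dicts still need to be merged before printing results.
        chunk_results = pool.starmap(
            _process_file_chunk, identify_chunks("measurements.txt", cpus)
        )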