From 87d494f1faab48dea902e050a5dee7f0bccb8dab Mon Sep 17 00:00:00 2001
From: ifnesi
Date: Fri, 5 Jan 2024 08:25:17 +0000
Subject: [PATCH] Added donalm's enhancements

---
 README.md               | 17 +++++++-
 calculateAveragePypy.py | 88 ++++++++++++++++++++---------------------
 2 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index e61ee04..f44a186 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,16 @@
-From Gunnar's 1 billion rows challenge (https://github.com/gunnarmorling/1brc)
+# 1BRC: One Billion Row Challenge in Python
 
-Python implementation
+Python implementation of Gunnar's One Billion Row Challenge:
+- https://www.morling.dev/blog/one-billion-row-challenge
+- https://github.com/gunnarmorling/1brc
+
+## Performance (on a MacBook Pro M1 32GB)
+| Interpreter | Script | user | system | cpu | total (s) |
+| ----------- | ------ | ---- | ------ | --- | --------- |
+| pypy3 | calculateAveragePypy.py | 139.15s | 3.02s | 699% | 20.323 |
+| python3 | calculateAverageDuckDB.py | 186.78s | 4.21s | 806% | 23.673 |
+| pypy3 | calculateAverage.py | 284.90s | 9.12s | 749% | 39.236 |
+| pypy3 | calculateAverage.py | 286.33s | 9.57s | 746% | 39.665 |
+| python3 | calculateAverage.py | 378.54s | 6.94s | 747% | 51.544 |
+
+The file `calculateAveragePypy.py` was created by [donalm](https://github.com/donalm); it is more than 2x faster than the initial version (`calculateAverage.py`) when run under pypy3, and even beats the [DuckDB](https://duckdb.org/)-based implementation (`calculateAverageDuckDB.py`).
\ No newline at end of file
diff --git a/calculateAveragePypy.py b/calculateAveragePypy.py
index 6bf70ef..4f3aa8d 100644
--- a/calculateAveragePypy.py
+++ b/calculateAveragePypy.py
@@ -1,4 +1,4 @@
-# time python3 calculateAverage.py
+# time pypy3 calculateAveragePypy.py
 import os
 import multiprocessing as mp
@@ -58,61 +58,61 @@ def _process_file_chunk(
     file_name: str,
     chunk_start: int,
     chunk_end: int,
+    blocksize: int = 1024 * 1024,
 ) -> dict:
     """Process each file chunk in a different process"""
     result = dict()
-    blocksize = 1024 * 1024
-    fh = open(file_name, "rb")
-    byte_count = chunk_end - chunk_start
-    fh.seek(chunk_start)
-    tail = b""
-    location = None
+    with open(file_name, "r+b") as fh:
+        fh.seek(chunk_start)
 
-    while byte_count:
-        if blocksize > byte_count:
-            blocksize = byte_count
-        byte_count = byte_count - blocksize
+        tail = b""
+        location = None
+        byte_count = chunk_end - chunk_start
 
-        data = tail + fh.read(blocksize)
+        while byte_count > 0:
+            if blocksize > byte_count:
+                blocksize = byte_count
+            byte_count -= blocksize
+
+            index = 0
+            data = tail + fh.read(blocksize)
+            while data:
+                if location is None:
+                    try:
+                        semicolon = data.index(b";", index)
+                    except ValueError:
+                        tail = data[index:]
+                        break
+
+                    location = data[index:semicolon]
+                    index = semicolon + 1
 
-        index = 0
-        while data:
-            if location is None:
                 try:
-                    semicolon = data.index(b";", index)
+                    newline = data.index(b"\n", index)
                 except ValueError:
                     tail = data[index:]
                     break
 
-                location = data[index:semicolon]
-                index = semicolon + 1
-
-            try:
-                newline = data.index(b"\n", index)
-            except ValueError:
-                tail = data[index:]
-                break
-
-            value = float(data[index:newline])
-            index = newline + 1
-
-            if location not in result:
-                result[location] = [
-                    value,
-                    value,
-                    value,
-                    1,
-                ]  # min, max, sum, count
-            else:
-                if value < result[location][0]:
-                    result[location][0] = value
-                if value > result[location][1]:
-                    result[location][1] = value
-                result[location][2] += value
-                result[location][3] += 1
-
-            location = None
+                value = float(data[index:newline])
+                index = newline + 1
+
+                if location not in result:
+                    result[location] = [
+                        value,
+                        value,
+                        value,
+                        1,
+                    ]  # min, max, sum, count
+                else:
+                    if value < result[location][0]:
+                        result[location][0] = value
+                    if value > result[location][1]:
+                        result[location][1] = value
+                    result[location][2] += value
+                    result[location][3] += 1
+
+                location = None
 
     return result
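
Note (not part of the patch above): a minimal sketch of how a chunk worker such as `_process_file_chunk` is typically driven from the main process with `multiprocessing`. The `identify_chunks` helper, the `measurements.txt` file name, and the merge step mentioned in the comments are illustrative assumptions, not code taken from this repository.

# Illustrative only -- assumes calculateAveragePypy.py is importable and that
# the input file is named "measurements.txt" (both are assumptions).
import os
import multiprocessing as mp

from calculateAveragePypy import _process_file_chunk


def identify_chunks(file_name: str, workers: int) -> list:
    """Split the file into roughly equal chunks aligned to newline boundaries."""
    file_size = os.path.getsize(file_name)
    chunk_size = file_size // workers
    boundaries = [0]
    with open(file_name, "rb") as fh:
        while boundaries[-1] < file_size:
            # Jump ahead by roughly one chunk, then advance to the next newline
            # so no "station;measurement" record is split across two workers.
            fh.seek(min(boundaries[-1] + chunk_size, file_size))
            fh.readline()
            boundaries.append(min(fh.tell(), file_size))
    return [
        (file_name, start, end)
        for start, end in zip(boundaries[:-1], boundaries[1:])
        if end > start
    ]


if __name__ == "__main__":
    cpus = mp.cpu_count()
    with mp.Pool(cpus) as pool:
        # Each worker returns a dict of location -> [min, max, sum, count];
        # the per-chunk dicts still need to be merged before printing results.
        chunk_results = pool.starmap(
            _process_file_chunk, identify_chunks("measurements.txt", cpus)
        )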