Skip to content

Commit

Permalink
Merge pull request #2 from fizmat/faster-create
Browse files Browse the repository at this point in the history
Faster create
  • Loading branch information
ifnesi authored Mar 25, 2024
2 parents 0d5b2ea + d9c3a85 commit 741cd19
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 31 deletions.
70 changes: 40 additions & 30 deletions createMeasurements.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import argparse
import random
import time

import numpy as np
import polars as pl
from tqdm import tqdm


class CreateMeasurement:
Expand Down Expand Up @@ -422,45 +423,53 @@ class CreateMeasurement:
("Zürich", 9.3),
)

def generateSingleMeasurement(
self,
std_dev: float = 10,
) -> tuple:
station, mean_temperature = random.choice(CreateMeasurement.STATIONS)
random_temperature = np.random.normal(mean_temperature, std_dev)
return (
station,
random_temperature,
stations = pl.DataFrame(STATIONS, ("names", "means"))

def __init__(self):
self.rng = np.random.default_rng()

def generate_batch(
self,
std_dev: float = 10,
records: int = 10_000_000
) -> pl.DataFrame:
batch = self.stations.sample(
records,
with_replacement=True,
shuffle=True,
seed=self.rng.integers(np.iinfo(np.int64).max)
)
batch = batch.with_columns(temperature=self.rng.normal(batch["means"], std_dev))
return batch.drop("means")

def generateMeasurementFile(
self,
file_name: str = "measurements.txt",
records: int = 1000000000,
sep: str = ";",
std_dev: float = 10,
def generate_measurement_file(
self,
file_name: str = "measurements.txt",
records: int = 1_000_000_000,
sep: str = ";",
std_dev: float = 10,
) -> None:
print(
f"Creating measurement file '{file_name}' with {records:,} measurements..."
)
start = time.time()
progress_report_unit = records // 100
batches = max(records // 10_000_000, 1)
batch_ends = np.linspace(0, records, batches + 1).astype(int)

with open(file_name, "w") as f:
for i in range(records):
station, measurement = self.generateSingleMeasurement(std_dev=std_dev)
f.write(f"{station}{sep}{measurement:.1f}\n")
if i > 0 and i % progress_report_unit == 0:
print(
f" - Wrote {i:,} measurements in {time.time() - start:.2f} seconds"
)
for i in tqdm(range(batches)):
from_, to = batch_ends[i], batch_ends[i + 1]
data = self.generate_batch(std_dev, to - from_)
data.write_csv(f, separator=sep, float_precision=1, include_header=False)

print(
f"Created file '{file_name}' with {i+1:,} measurements in {time.time() - start:.2f} seconds"
f"Created file '{file_name}' with {to:,} measurements in {time.time() - start:.2f} seconds"
)


if __name__ == "__main__":

def min_records(records: int) -> int:
def min_records(records: str) -> int:
try:
value = int(records)
except Exception:
Expand All @@ -473,28 +482,29 @@ def min_records(records: int) -> int:
else:
return value


parser = argparse.ArgumentParser(description="Create measurement file")
parser.add_argument(
"-o",
"--output",
dest="output",
type=str,
help=f"Measurement file name (Default is measurements.txt)",
help='Measurement file name (default is "measurements.txt")',
default="measurements.txt",
)
parser.add_argument(
"-r",
"--records",
help=f"Number of records to create (Default is 1000000000)",
help="Number of records to create (default is 1_000_000_000)",
dest="records",
type=min_records,
default=1000000000,
default=1_000_000_000,
)

args = parser.parse_args()

measurement = CreateMeasurement()
measurement.generateMeasurementFile(
measurement.generate_measurement_file(
file_name=args.output,
records=args.records,
)
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
numpy==1.24.2
numpy>=1.24.2
polars>=0.20.0
tqdm>=4.66.0

0 comments on commit 741cd19

Please sign in to comment.