
Conversation

@hagenw (Member) commented on Feb 6, 2024

Our implementation of audb.Dependencies.archives is not the fastest (as can be seen when comparing to the pyarrow.Table-only implementation in #356).
We improve it here by using pandas' .unique() instead of set(), as sketched below.
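For reference, here is a minimal sketch of the change, assuming the property deduplicates the "archive" column of the underlying pandas.DataFrame (the property code is simplified; everything except the column name is illustrative):

```python
import pandas as pd

# Toy dependency table with duplicated archive entries
df = pd.DataFrame({"archive": ["a", "b", "a", "c", "b"]})

# Before (sketch): set() iterates over every value in Python
archives_old = sorted(set(df["archive"]))

# After (sketch): .unique() deduplicates inside pandas before sorting
archives_new = sorted(df["archive"].unique().tolist())

assert archives_old == archives_new  # identical result, less Python overhead
```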

For a dependency table containing 1,000,000 files stored in different archives, we get:

| main    | this branch |
|---------|-------------|
| 0.409 s | 0.117 s     |
Benchmark code:

```python
import hashlib
import os
import pickle
import random
import string
import time

import pandas as pd

import audb
import audeer


# Dependency table with 1,000,000 entries
random.seed(1)

cache = audeer.mkdir("./cache")

# === Dependency pandas.DataFrame ===
data_cache = audeer.path(cache, "df.pkl")
num_rows = 1000000
dtypes = [str, str, int, int, str, float, str, int, int, int, str]
columns = [
    "file",
    "archive",
    "bit_depth",
    "channels",
    "checksum",
    "duration",
    "format",
    "removed",
    "sampling_rate",
    "type",
    "version",
]
if not os.path.exists(data_cache):
    records = [
        {
            "file": f"file-{n}.wav",
            "archive": f"archive-{n}",
            "bit_depth": random.choices([0, 16, 24], weights=[0.1, 0.8, 0.1])[0],
            "channels": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "checksum": hashlib.md5(
                pickle.dumps(random.choice(string.ascii_letters))
            ).hexdigest(),
            "duration": 10 * random.random(),
            "format": random.choices(["csv", "wav", "txt"], weights=[0.1, 0.8, 0.1])[0], 
            "removed": random.choices([0, 1], weights=[0.1, 0.9])[0],
            "sampling_rate": random.choices(
                [0, 16000, 44100],
                weights=[0.1, 0.8, 0.1],
            )[0],
            "type": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "version": random.choices(["1.0.0", "1.1.0"], weights=[0.2, 0.8])[0],
        }
        for n in range(num_rows)
    ]
    df = pd.DataFrame.from_records(records)
    # Cast each column to its intended dtype
    for column, dtype in zip(columns, dtypes):
        df[column] = df[column].astype(dtype)
    df.set_index("file", inplace=True)
    df.index.name = ""
    df.to_pickle(data_cache)

deps = audb.Dependencies()
deps.load(data_cache)

t = time.time()
deps.archives
print(f"Dependency.archives: {time.time() -t:.3f} s")

@hagenw marked this pull request as ready for review on February 6, 2024, 12:56
@hagenw merged commit 19baaea into main on February 6, 2024
@hagenw deleted the speed-deps-archives branch on February 6, 2024, 12:57