
Conversation

@hagenw hagenw commented Feb 6, 2024

Of all the audb.Dependencies methods, _drop() was one of the slowest compared to a pure pyarrow.Table implementation (compare #356).
This will dramatically speed up audb.publish() when several files are removed from a dataset.
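The gist of the pyarrow-based approach is to remove all requested rows in one vectorized filter instead of one index lookup per file. Below is a minimal sketch with toy data, assuming only pyarrow's public compute API; it is illustrative, not the actual diff of this PR:

import pyarrow as pa
import pyarrow.compute as pc

# Toy stand-in for a dependency table
table = pa.table(
    {
        "file": ["file-0.wav", "file-1.wav", "file-2.wav"],
        "duration": [1.0, 2.0, 3.0],
    }
)
files_to_drop = ["file-1.wav"]

# Boolean mask marking the rows to remove
mask = pc.is_in(table["file"], value_set=pa.array(files_to_drop))
# Keep everything that is not marked, in a single pass
table = table.filter(pc.invert(mask))
print(table["file"].to_pylist())  # ['file-0.wav', 'file-2.wav']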

When dropping 1000 files from a dependency table containing 1,000,000 files, we get:

main:        209.117 s
this branch:   0.115 s

BTW, the main branch could also have called audb.Dependencies._drop() with a list of files instead of a single file per call, but we don't do this. That variant results in an execution time of 0.249 s, which is still slower than the proposed solution.
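For reference, here is a hedged sketch (toy data, not the real dependency table) of the difference between those two call patterns on a pandas.DataFrame indexed by file:

import pandas as pd

df = pd.DataFrame(
    {"duration": [1.0, 2.0, 3.0]},
    index=["file-0.wav", "file-1.wav", "file-2.wav"],
)
files = ["file-0.wav", "file-1.wav"]

# One call per file, as the main branch does
df_per_file = df.copy()
for file in files:
    df_per_file = df_per_file.drop(file)

# A single call with the whole list (the 0.249 s variant)
df_per_list = df.drop(files)

assert df_per_file.equals(df_per_list)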

If we have smaller dependency tables (e.g. 1000 files) and drop 10 files, the solution proposed here measures 0.000 s.
So I don't think we need to compare those cases.

Benchmark code this branch
import hashlib
import os
import pickle
import random
import string
import time

import pandas as pd

import audb
import audeer


# Dependency table with 1,000,000 entries
random.seed(1)

cache = audeer.mkdir("./cache")

# === Dependency pandas.DataFrame ===
data_cache = audeer.path(cache, "df.pkl")
num_rows = 1000000
dtypes = [str, str, int, int, str, float, str, int, int, int, str]
columns = [ 
    "file",
    "archive",
    "bit_depth",
    "channels",
    "checksum",
    "duration",
    "format",
    "removed",
    "sampling_rate",
    "type",
    "version",
]
if not os.path.exists(data_cache):
    records = [ 
        {
            "file": f"file-{n}.wav",
            "archive": f"archive-{n}",
            "bit_depth": random.choices([0, 16, 24], weights=[0.1, 0.8, 0.1])[0],
            "channels": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "checksum": hashlib.md5(
                pickle.dumps(random.choice(string.ascii_letters))
            ).hexdigest(),
            "duration": 10 * random.random(),
            "format": random.choices(["csv", "wav", "txt"], weights=[0.1, 0.8, 0.1])[0], 
            "removed": random.choices([0, 1], weights=[0.1, 0.9])[0],
            "sampling_rate": random.choices(
                [0, 16000, 44100],
                weights=[0.1, 0.8, 0.1],
            )[0],
            "type": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "version": random.choices(["1.0.0", "1.1.0"], weights=[0.2, 0.8])[0],
        }
        for n in range(num_rows)
    ]
    df = pd.DataFrame.from_records(records)
    for column, dtype in zip(df.columns, dtypes):
        df[column] = df[column].astype(dtype)
    df.set_index("file", inplace=True)
    df.index.name = ""
    df.to_pickle(data_cache)

deps = audb.Dependencies()
deps.load(data_cache)

files = list(deps.files)[:1000]

# Measure execution time of _drop() with a list of files
t = time.time()
deps._drop(files)
print(f"Dependencies._drop(): {time.time() - t:.3f} s")
Benchmark code main
import hashlib
import os
import pickle
import random
import string
import time

import pandas as pd

import audb
import audeer


# Dependency table with 1,000,000 entries
random.seed(1)

cache = audeer.mkdir("./cache")

# === Dependency pandas.DataFrame ===
data_cache = audeer.path(cache, "df.pkl")
num_rows = 1000000
dtypes = [str, str, int, int, str, float, str, int, int, int, str]
columns = [ 
    "file",
    "archive",
    "bit_depth",
    "channels",
    "checksum",
    "duration",
    "format",
    "removed",
    "sampling_rate",
    "type",
    "version",
]
if not os.path.exists(data_cache):
    records = [ 
        {
            "file": f"file-{n}.wav",
            "archive": f"archive-{n}",
            "bit_depth": random.choices([0, 16, 24], weights=[0.1, 0.8, 0.1])[0],
            "channels": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "checksum": hashlib.md5(
                pickle.dumps(random.choice(string.ascii_letters))
            ).hexdigest(),
            "duration": 10 * random.random(),
            "format": random.choices(["csv", "wav", "txt"], weights=[0.1, 0.8, 0.1])[0], 
            "removed": random.choices([0, 1], weights=[0.1, 0.9])[0],
            "sampling_rate": random.choices(
                [0, 16000, 44100],
                weights=[0.1, 0.8, 0.1],
            )[0],
            "type": random.choices([0, 1, 2], weights=[0.1, 0.8, 0.1])[0],
            "version": random.choices(["1.0.0", "1.1.0"], weights=[0.2, 0.8])[0],
        }
        for n in range(num_rows)
    ]
    df = pd.DataFrame.from_records(records)
    for column, dtype in zip(df.columns, dtypes):
        df[column] = df[column].astype(dtype)
    df.set_index("file", inplace=True)
    df.index.name = ""
    df.to_pickle(data_cache)

deps = audb.Dependencies()
deps.load(data_cache)

files = list(deps.files)[:1000]

# Measure execution time of _drop() called once per file
t = time.time()
for file in files:
    deps._drop(file)
print(f"Dependencies._drop(): {time.time() - t:.3f} s")

@hagenw hagenw marked this pull request as ready for review February 6, 2024 12:03

codecov bot commented Feb 6, 2024

Codecov Report

All modified and coverable lines are covered by tests ✅

Comparison is base (d15facd) 100.0% compared to head (ba7110d) 100.0%.

Additional details and impacted files
Files Coverage Δ
audb/core/dependencies.py 100.0% <100.0%> (ø)
audb/core/publish.py 100.0% <100.0%> (ø)

@hagenw hagenw merged commit 998f851 into main Feb 6, 2024
@hagenw hagenw deleted the speed-deps-drop branch February 6, 2024 12:38