Skip to content

Commit

Permalink
multiproc v2 -> webdataset conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
ylabbe committed May 19, 2023
1 parent 8268c0c commit 94d06e5
Showing 1 changed file with 36 additions and 9 deletions.
45 changes: 36 additions & 9 deletions bop_toolkit_lib/dataset/convert_v2_to_webdataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import json
import multiprocessing
import pathlib
import re
import tarfile

import numpy as np
import webdataset as wds

from bop_toolkit_lib.dataset import bop_v2
Expand Down Expand Up @@ -140,15 +142,40 @@ def main():

v2_file_paths = v2_dir.glob('*')
keys = set([p.name.split('.')[0] for p in v2_file_paths])
keys = list(keys)[:1000]

convert_v2_to_webdataset(
v2_dir,
wds_dir,
keys,
0,
args.maxcount
)
keys = list(keys)

if args.shuffle:
    # A seeded RandomState keeps the shuffled key order reproducible.
    np.random.RandomState(args.seed).shuffle(keys)

if args.nprocs > 0:
    # One chunk of keys per worker process; chunk sizes differ by at most one.
    keys_splits = np.array_split(keys, args.nprocs)
    _args = []
    # NOTE(review): start_shard is presumably the index of the first shard a
    # worker writes, so consecutive offsets keep workers from colliding --
    # confirm against convert_v2_to_webdataset.
    start_shard = 0
    for keys_split in keys_splits:
        if len(keys_split) == 0:
            # More processes than keys: nothing to convert for this worker.
            continue
        _args.append(
            (
                v2_dir,
                wds_dir,
                # Each worker must receive only its own chunk (not the full
                # key list); .tolist() hands over plain Python strings, the
                # same element type as in the single-process path.
                keys_split.tolist(),
                start_shard,
                args.maxcount,
            )
        )
        # np.ceil returns a float; keep the shard offset an integer.
        n_shards = int(np.ceil(len(keys_split) / args.maxcount))
        start_shard += n_shards
    with multiprocessing.Pool(processes=args.nprocs) as pool:
        pool.starmap(
            convert_v2_to_webdataset,
            iterable=_args,
        )
else:
    # Single-process path: convert every key, starting at shard 0.
    convert_v2_to_webdataset(
        v2_dir,
        wds_dir,
        keys,
        0,
        args.maxcount,
    )
key_to_shard = make_key_to_shard_map(
wds_dir
)
Expand Down

0 comments on commit 94d06e5

Please sign in to comment.