Provides a way to train with a percentage of data. #239

Merged
merged 9 commits on Apr 12, 2022
Changes from 1 commit
wip
vturrisi committed Apr 12, 2022
commit 0445106ab3eb7670f08de7e4e268a66f74b1655f
2 changes: 2 additions & 0 deletions solo/methods/base.py
@@ -349,6 +349,8 @@ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
# uses sample indexes as labels and then gets the labels from a lookup table
# this may use more CPU memory, so just use when needed.
parser.add_argument("--encode_indexes_into_labels", action="store_true")
# percentage of data used for pretraining; leave at 0 to use all available data
parser.add_argument("--data_percent", default=0.0, type=float)

# online knn eval
parser.add_argument("--knn_eval", action="store_true")
5 changes: 5 additions & 0 deletions solo/methods/dali.py
@@ -158,6 +158,7 @@ def train_dataloader(self) -> DALIGenericIterator:

# hack to encode image indexes into the labels
self.encode_indexes_into_labels = self.extra_args["encode_indexes_into_labels"]
self.data_percent = self.extra_args["data_percent"]

# handle custom data by creating the needed pipeline
dataset = self.extra_args["dataset"]
@@ -191,6 +192,7 @@ def train_dataloader(self) -> DALIGenericIterator:
num_threads=num_workers,
no_labels=self.extra_args["no_labels"],
encode_indexes_into_labels=self.encode_indexes_into_labels,
data_percent=self.data_percent,
)
output_map = (
[f"large{i}" for i in range(self.num_large_crops)]
@@ -249,6 +251,7 @@ def train_dataloader(self) -> DALIGenericIterator:
shard_id=shard_id,
num_shards=num_shards,
num_threads=num_workers,
data_percent=self.data_percent,
)
train_loader = Wrapper(
train_pipeline,
@@ -272,6 +275,8 @@ def val_dataloader(self) -> DALIGenericIterator:
data_dir = Path(self.extra_args["data_dir"])
val_dir = Path(self.extra_args["val_dir"])

self.data_percent = self.extra_args["data_percent"]

# handle custom data by creating the needed pipeline
dataset = self.extra_args["dataset"]
if dataset in ["imagenet100", "imagenet"]:
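
Illustration only: these are the extra_args keys the dataloaders above read; the values are made-up examples, not defaults from the repo.

extra_args = {
    "dataset": "imagenet100",
    "data_dir": "/datasets",  # assumed path
    "val_dir": "val",  # assumed path
    "no_labels": False,
    "encode_indexes_into_labels": False,
    "data_percent": 0.1,  # pretrain on a stratified 10% subset
}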
95 changes: 65 additions & 30 deletions solo/utils/dali_dataloader.py
@@ -185,6 +185,7 @@ def __init__(
num_shards: int = 1,
num_threads: int = 4,
seed: int = 12,
data_percent: float = -1.0,
):
"""Initializes the pipeline for validation or linear eval training.

@@ -204,6 +205,8 @@ def __init__(
num_shards (int): total number of shards. Defaults to 1.
num_threads (int): number of threads to run in parallel. Defaults to 4.
seed (int): seed for random number generation. Defaults to 12.
data_percent (float): percentage of data to use. Use all data when set to -1.0.
Defaults to -1.0.
"""

seed += device_id
@@ -212,8 +215,28 @@ def __init__(
self.device = device
self.validation = validation

# manually load files and labels
labels = sorted(Path(entry.name) for entry in os.scandir(data_path) if entry.is_dir())
data = [
(data_path / label / file, label_idx)
for label_idx, label in enumerate(labels)
for file in sorted(os.listdir(data_path / label))
]
files, labels = map(list, zip(*data))

# sample data if needed
if data_percent > 0:
assert data_percent < 1, "Only use data_percent for values smaller than 1."

from sklearn.model_selection import train_test_split

files, _, labels, _ = train_test_split(
files, labels, train_size=data_percent, stratify=labels, random_state=42
)

self.reader = ops.readers.File(
files=files,
labels=labels,
shard_id=shard_id,
num_shards=num_shards,
shuffle_after_epoch=not self.validation,
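
A self-contained sketch of the stratified subsampling used above, with made-up file names and labels, showing that train_size keeps the requested fraction while preserving class proportions:

from sklearn.model_selection import train_test_split

files = [f"img_{i}.jpg" for i in range(100)]
labels = [i % 4 for i in range(100)]  # 4 balanced classes, 25 samples each

subset_files, _, subset_labels, _ = train_test_split(
    files, labels, train_size=0.1, stratify=labels, random_state=42
)
assert len(subset_files) == 10  # 10% of the data is kept
assert all(subset_labels.count(c) >= 2 for c in range(4))  # every class is still represented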
@@ -514,6 +537,7 @@ def __init__(
seed: int = 12,
no_labels: bool = False,
encode_indexes_into_labels: bool = False,
data_percent: float = -1.0,
):
"""Initializes the pipeline for pretraining.

@@ -535,6 +559,8 @@ def __init__(
encode_indexes_into_labels (bool, optional): uses sample indexes as labels
and then gets the labels from a lookup table. This may use more CPU memory,
so just use when needed. Defaults to False.
data_percent (float): percentage of data to use. Use all data when set to -1.0.
Defaults to -1.0.
"""

seed += device_id
@@ -548,55 +574,64 @@ def __init__(
self.device = device

data_path = Path(data_path)

# manually load files and labels
if no_labels:
    files = [data_path / f for f in sorted(os.listdir(data_path))]
    labels = [-1] * len(files)
else:
    labels = sorted(Path(entry.name) for entry in os.scandir(data_path) if entry.is_dir())

    data = [
        (data_path / label / file, label_idx)
        for label_idx, label in enumerate(labels)
        for file in sorted(os.listdir(data_path / label))
    ]
    files, labels = map(list, zip(*data))

if data_percent > 0:
    assert data_percent < 1, "Only use data_percent for values smaller than 1."

    if no_labels:
        labels = [-1] * len(files)
    else:
        labels = [l for _, l in data]

    from sklearn.model_selection import train_test_split

    files, _, labels, _ = train_test_split(
        files, labels, train_size=data_percent, stratify=labels, random_state=42
    )

if encode_indexes_into_labels:
    encoded_labels = []

    self.conversion_map = []
    for file_idx, label_idx in enumerate(labels):
        encoded_labels.append(file_idx)
        self.conversion_map.append(label_idx)

    # to assert that everything is fine
    for file_idx, label_idx in zip(encoded_labels, labels):
        assert self.conversion_map[file_idx] == label_idx

    # use the encoded labels which will be decoded later
    labels = encoded_labels

self.reader = ops.readers.File(
    files=files,
    labels=labels,
    shard_id=shard_id,
    num_shards=num_shards,
    shuffle_after_epoch=random_shuffle,
)
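
A small sketch (not from the repo) of what the conversion_map built above enables: the reader emits sample indexes as labels, and the true class labels can be recovered later with a simple lookup.

conversion_map = [0, 0, 1, 2]  # sample index -> true class label
encoded_batch = [3, 1, 2]  # "labels" coming out of the reader are sample indexes
decoded = [conversion_map[idx] for idx in encoded_batch]
assert decoded == [2, 0, 1]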

decoder_device = "mixed" if self.device == "gpu" else "cpu"
device_memory_padding = 211025920 if decoder_device == "mixed" else 0
host_memory_padding = 140544512 if decoder_device == "mixed" else 0