From 3264363c99f9a07fd48216221409955e8aa121ef Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Tue, 2 Aug 2022 22:33:04 -0700 Subject: [PATCH] Raw coco benchmark (#72) --- python/benchmarks/bench_utils.py | 48 +++++++++++++++++++ python/benchmarks/coco.py | 81 ++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 python/benchmarks/bench_utils.py create mode 100755 python/benchmarks/coco.py diff --git a/python/benchmarks/bench_utils.py b/python/benchmarks/bench_utils.py new file mode 100644 index 0000000000000..5d21e192077d3 --- /dev/null +++ b/python/benchmarks/bench_utils.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# Copyright 2022 Lance Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ---------------------------------------------------------------------------
# python/benchmarks/bench_utils.py
#
# Shared helpers for the benchmark scripts: parallel download of objects
# addressed by URI, and a wall-clock timing decorator.
# ---------------------------------------------------------------------------

from functools import wraps
import multiprocessing as mp
import time

import pandas as pd

__all__ = ["download_uris", "timeit"]


def get_bytes(uri):
    """Return the complete content of the object stored at ``uri``.

    The filesystem and the path inside it are both resolved from ``uri`` by
    ``pyarrow.fs.FileSystem.from_uri``.
    """
    # Imported lazily so that the timing helper and the pure-pandas code in
    # this listing remain importable on machines without pyarrow.
    import pyarrow.fs

    fs, key = pyarrow.fs.FileSystem.from_uri(uri)
    # Close the handle deterministically instead of leaking one per download.
    with fs.open_input_file(key) as fobj:
        return fobj.read()


def download_uris(uris: pd.Series) -> pd.Series:
    """Download every URI in ``uris`` using a pool of worker processes.

    Parameters
    ----------
    uris
        Series whose values are URIs understood by :func:`get_bytes`.

    Returns
    -------
    pd.Series
        The downloaded payloads, in the same order and carrying the same
        index as ``uris`` so the result can be assigned straight back onto
        the frame the URIs came from.
    """
    if len(uris) == 0:
        # Nothing to fetch: do not pay for spawning worker processes.
        return pd.Series([], index=uris.index, dtype=object)

    # Leave one core free, but never ask for zero workers (single-core hosts
    # would otherwise make ``Pool`` raise ``ValueError``).
    workers = max(1, mp.cpu_count() - 1)
    # ``pool.map`` has collected every result before the ``with`` block
    # exits, so tearing the pool down here cannot lose work.
    with mp.Pool(workers) as pool:
        data = pool.map(get_bytes, uris.values)
    return pd.Series(data, index=uris.index, dtype=object)


def timeit(func):
    """Decorator that prints the wall-clock duration of each call to ``func``.

    The wrapped function's return value is passed through unchanged.  The
    positional and keyword arguments of the call are included in the printed
    line.  Nothing is printed when ``func`` raises.
    """

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        print(f"Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds")
        return result

    return timeit_wrapper


# ---------------------------------------------------------------------------
# python/benchmarks/coco.py   (new file, mode 100755, `#!/usr/bin/env python3`)
#
# Benchmarks over the raw COCO 2017 dataset.  In the repository this is a
# separate script that obtains ``download_uris`` and ``timeit`` via
# ``from bench_utils import download_uris, timeit``; both are defined above
# in this listing.
# ---------------------------------------------------------------------------

import argparse
import json
import os


def _build_metadata(annotation_json, base_uri: str, split: str):
    """Join the images, annotations and categories tables of a COCO JSON.

    Returns one row per image that has at least one annotation.  The
    ``annotations`` column holds a list of dicts, one per annotation, each
    enriched with the fields of its category (for example ``name``).
    """
    annotations = pd.DataFrame(annotation_json["annotations"])
    # Rename *before* merging so the category key does not collide with the
    # annotation's own ``id`` column (which would yield ``id_x`` / ``id_y``).
    categories = pd.DataFrame(annotation_json["categories"]).rename(
        columns={"id": "category_id"}
    )
    labelled = annotations.merge(categories, on="category_id")

    # Take ``image_id`` from the merged frame itself: an inner merge may drop
    # rows (unknown category) and is not guaranteed to keep the left frame's
    # row order, so pairing the records with the pre-merge ``image_id``
    # column could attach annotations to the wrong image.
    anno_df = (
        pd.DataFrame(
            {
                "image_id": labelled["image_id"].to_numpy(),
                "annotations": labelled.drop(columns=["image_id"]).to_dict(
                    orient="records"
                ),
            }
        )
        .groupby("image_id")
        .agg(list)
    )

    images_df = pd.DataFrame(annotation_json["images"])
    images_df["split"] = split
    images_df["image_uri"] = images_df["file_name"].apply(
        lambda fname: os.path.join(base_uri, f"{split}2017", fname)
    )
    # ``image_id`` is the index level of ``anno_df`` produced by the groupby.
    return images_df.merge(anno_df, left_on="id", right_on="image_id")


def get_metadata(base_uri: str, split: str = "val"):
    """Load the COCO 2017 instance annotations of ``split`` as a DataFrame.

    Reads ``<base_uri>/annotations/instances_<split>2017.json`` through
    pyarrow's filesystem layer and returns one row per annotated image with
    the image fields plus ``split``, ``image_uri`` and ``annotations``.
    """
    import pyarrow.fs

    annotation_uri = os.path.join(base_uri, f"annotations/instances_{split}2017.json")
    fs, path = pyarrow.fs.FileSystem.from_uri(annotation_uri)
    with fs.open_input_file(path) as fobj:
        annotation_json = json.load(fobj)
    return _build_metadata(annotation_json, base_uri, split)


@timeit
def get_label_distribution(base_uri: str):
    """Mimic
    SELECT label, count(1) FROM coco_dataset GROUP BY 1
    """
    metadata = get_metadata(base_uri)
    # ``Series.explode`` takes no column name; one output row per annotation.
    labels = metadata["annotations"].explode().apply(lambda anno: anno["name"])
    return labels.value_counts()


@timeit
def get_filtered_data(url: str, klass="cat", offset=20, limit=50):
    """Mimic
    SELECT image, annotations FROM coco
    WHERE annotations.label = <klass> LIMIT <limit> OFFSET <offset>

    Returns a DataFrame with ``image_uri``, ``annotations`` and the
    downloaded ``image`` bytes for the selected window of matching images.
    """
    df = get_metadata(url)
    has_label = df["annotations"].apply(
        lambda annos: any(a["name"] == klass for a in annos)
    )
    filtered = df.loc[has_label, ["image_uri", "annotations"]]
    # Copy the window so that adding a column does not write into a view of
    # ``filtered`` (avoids pandas' SettingWithCopy warning).
    limited = filtered.iloc[offset : offset + limit].copy()
    limited["image"] = download_uris(limited["image_uri"])
    return limited


def main():
    """Parse the dataset base URI from the command line and run both benchmarks."""
    parser = argparse.ArgumentParser(description="Benchmarks on COCO dataset")
    parser.add_argument("uri", help="base uri for coco dataset")
    args = parser.parse_args()

    get_label_distribution(args.uri)
    get_filtered_data(args.uri)


if __name__ == "__main__":
    main()