update

Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu>
ray-project · stephanie-wang · Sep 5, 2023 · Aug 25, 2023 · Aug 25, 2023 · Aug 25, 2023
commit 48bf64bf6b7d164b20232fd42c11c36b01f2cfe1
diff --git a/release/nightly_tests/dataset/image_loader_microbenchmark.py b/release/nightly_tests/dataset/image_loader_microbenchmark.py
@@ -104,7 +104,7 @@ def tf_crop_and_flip(image_buffer, num_channels=3):
     # bounding box. If no box is supplied, then we assume the bounding box is
     # the entire image.
     shape = tf.shape(image_buffer)
-    if len(shape) == 4:
+    if len(shape) == num_channels + 1:
         shape = shape[1:]
 
     bbox = tf.constant(
@@ -196,18 +196,7 @@ def get_transform(to_torch_tensor):
     )
     return transform
 
-transform = torchvision.transforms.Compose(
-        [
-            torchvision.transforms.RandomResizedCrop(
-                size=DEFAULT_IMAGE_SIZE,
-                scale=(0.05, 1.0),
-                ratio=(0.75, 1.33),
-            ),
-            torchvision.transforms.RandomHorizontalFlip(),
-            # torchvision.transforms.ToTensor(),
-        ]
-    )
-
+# Capture `transform` in the map UDFs.
 transform = get_transform(False)
 
 def crop_and_flip_image(row):
@@ -229,11 +218,10 @@ def decode_image_crop_and_flip(row):
     # NUM_CHANNELS = 3
     # row["image"] = np.frombuffer(row["image"], dtype=np.uint8).reshape((NUM_CHANNELS, row["height"], row["width"])) #
     row["image"] = Image.frombytes("RGB", (row["height"], row["width"]), row["image"]) 
-    del row["width"]
-    del row["height"]
     # Convert back np to avoid storing a np.object array.
-    row["image"] = np.array(transform(row["image"]))
-    return row
+    return {
+        "image": np.array(transform(row["image"]))
+    }
 
 
 class MdsDatasource(ray.data.datasource.FileBasedDatasource):

diff --git a/release/nightly_tests/dataset/run_image_loader_microbenchmark.sh b/release/nightly_tests/dataset/run_image_loader_microbenchmark.sh
@@ -14,9 +14,6 @@ rm -rf "$MOSAIC_DIR"
 rm -rf "$TFRECORDS_DIR"
 rm -rf "$PARQUET_DIR"
 
-# Download 1GB dataset from S3 to local disk.
-aws s3 sync s3://imagenetmini1000/1gb "$DIR"
-
 # Download 1GB dataset from S3 to local disk so we can preprocess with mosaic.
 aws s3 sync s3://imagenetmini1000/1gb $DIR
 # Generated with