Skip to content

Commit

Permalink
Added script to generate videos of nuScene instances (nutonomy#514)
Browse files Browse the repository at this point in the history
For more detailed instructions please see:
https://github.com/EricWiener/nuscenes-instance-videos
  • Loading branch information
EricWiener authored Jan 8, 2021
1 parent 818eeae commit 6e0ed36
Showing 1 changed file with 384 additions and 0 deletions.
384 changes: 384 additions & 0 deletions python-sdk/nuscenes/scripts/export_instance_videos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,384 @@
# nuScenes dev-kit.
# Code written by Eric Wiener, 2020.

"""Generate videos of NuScene object instances.
See https://github.com/EricWiener/nuscenes-instance-videos for more detailed instructions.
Usage: python3 export_instance_videos.py --dataroot <path to data> --version <version> -o <output directory>
Note: you first need to generate 2D annotations with export_2d_annotations_as_json.py
"""

import argparse
from collections import defaultdict
import json
import os
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm
import pathlib
from shutil import rmtree

def convert_annotation_list_to_dict(annotation_list, categories=None, visibilities=('', '1', '2', '3', '4')):
    """Index a flat list of 2D annotations by instance, sample annotation and camera.

    The same object can appear in several cameras at once, so two records may share
    both `instance_token` and `sample_annotation_token` and differ only in the sensor
    they came from (each sensor's data has its own `sample_data_token`), e.g.:

    {'bbox_corners': [1370.30, 446.66, 1600.0, 607.45], 'category_name': 'vehicle.car',
     'filename': 'samples/CAM_FRONT/n008-2018-08-27-11-48-51-0400__CAM_FRONT__1535385095912404.jpg',
     'instance_token': '0f8696c5e7284236b29a806d3d6f3513',
     'sample_annotation_token': 'ee04de72a30e4517a366ddad89d64fef',
     'sample_data_token': '60ade2dececb46c69b114ce4c8a0bd3e', 'visibility_token': '1', ...}
    {'bbox_corners': [0.0, 446.39, 387.13, 618.03], 'category_name': 'vehicle.car',
     'filename': 'samples/CAM_FRONT_RIGHT/n008-2018-08-27-11-48-51-0400__CAM_FRONT_RIGHT__1535385095920482.jpg',
     'instance_token': '0f8696c5e7284236b29a806d3d6f3513',
     'sample_annotation_token': 'ee04de72a30e4517a366ddad89d64fef',
     'sample_data_token': '92d49452e5804d0a9724ab4161a26147', 'visibility_token': '1', ...}

    The result is therefore keyed as [instance_token][sample_annotation_token][camera_name];
    enumerating [instance_token][sample_annotation_token] yields every camera view of one box.

    Args:
        annotation_list (list): annotation dictionaries. Each must provide 'instance_token',
            'sample_annotation_token', 'category_name', 'visibility_token', 'filename' and
            'bbox_corners'. Every record is modified in place: 'camera_name' and 'bbox_area'
            are added, including on records that are filtered out afterwards.
        categories (optional): container of category names to keep. None keeps every category.
        visibilities (optional): container of visibility tokens to keep.
    Returns: a defaultdict of defaultdicts of dicts holding the kept annotation records
    """
    bbox_2d_annotations = defaultdict(lambda: defaultdict(dict))

    num_dups = 0
    for instance in annotation_list:
        instance_token = instance['instance_token']

        # A sample_annotation_token identifies one bounding box, which may show up in
        # several sample_data records (one per camera that sees it).
        sample_annotation_token = instance['sample_annotation_token']
        category = instance['category_name']
        visibility = instance['visibility_token']
        camera_name = extract_camera_key_from_filename(instance['filename'])

        # Append additional information
        instance['camera_name'] = camera_name
        instance['bbox_area'] = calculate_bb_area(instance['bbox_corners'])

        if categories is not None and category not in categories:
            continue
        if visibility not in visibilities:
            continue

        # Indexing creates the nested entries, which is fine here because the
        # record is stored under them right below.
        camera_views = bbox_2d_annotations[instance_token][sample_annotation_token]
        if camera_name in camera_views:
            num_dups += 1
            print('Duplicate instance {}, sample {}, and camera {}'.format(
                instance_token, sample_annotation_token, camera_name))

        camera_views[camera_name] = instance

    print("Number of duplicates (should be zero)", num_dups)
    return bbox_2d_annotations


def extract_camera_key_from_filename(filename):
    """Return the camera name encoded in a sample image path.

    Args:
        filename (string): the path where the sample image is stored, e.g.
            'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg'.
            The camera name is taken from the second '/'-separated component.
    Returns: the camera name, e.g. 'CAM_BACK'
    Raises:
        IndexError: if filename contains no '/' separator.
        AssertionError: if the second path component is not one of the six known camera names.
    """
    camera_name = filename.split('/')[1]

    camera_names = ('CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',
                    'CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT')

    # Raised explicitly instead of via an `assert` statement so that the check still
    # runs under `python -O`; the exception type is kept for existing callers.
    if camera_name not in camera_names:
        raise AssertionError("Invalid camera name: {} from path: {}".format(
            camera_name, filename))

    return camera_name


def calculate_bb_area(bounding_box):
    """Return the area of an axis-aligned 2D bounding box.

    Args:
        bounding_box: sequence of exactly four numbers (x min, y min, x max, y max)
    Returns: the box width multiplied by the box height
    """
    left, top, right, bottom = bounding_box
    width = right - left
    height = bottom - top
    return width * height


def get_most_visible_camera_annotation(camera_data_dict):
    """Pick the camera view that shows an annotation best.

    Views are ranked by 'visibility_token' first (compared as strings, so
    '' < '1' < '2' < '3' < '4') and by 'bbox_area' second. On a full tie the
    view encountered first wins.

    Args:
        camera_data_dict (dict): maps a camera name to that camera's annotation record, e.g.
            {
                'CAM_BACK': {'bbox_corners': [600.83, 426.38, 643.67, 476.66],
                             'category_name': 'vehicle.bus.rigid',
                             'filename': 'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg',
                             'instance_token': '9cba9cd8af85487fb010652c90d845b5',
                             'sample_annotation_token': '06b4886e79d2435c80bd23e7ac60c618',
                             'visibility_token': '4',
                             'bbox_area': ...,
                             ...},
                'CAM_FRONT': ...
            }
            Every record must contain 'visibility_token' and 'bbox_area'.
    Returns: the annotation record (the same object stored in camera_data_dict) of the best view
    Raises:
        KeyError: if no view could be selected, e.g. because camera_data_dict is empty.
    """
    best_visibility = ''
    largest_area = -1
    best_camera_token = None

    for camera_token, annotation in camera_data_dict.items():
        visibility = annotation['visibility_token']
        bbox_area = annotation['bbox_area']

        is_more_visible = visibility > best_visibility
        is_larger_at_same_visibility = visibility == best_visibility and bbox_area > largest_area
        if is_more_visible or is_larger_at_same_visibility:
            best_camera_token = camera_token
            largest_area = bbox_area
            best_visibility = visibility

    if best_camera_token is None:
        # Fail with the diagnostic itself rather than falling through to a
        # lookup with None, which only produced an unhelpful "KeyError: None".
        raise KeyError('Unable to find any good views for camera data dict: {}'.format(
            camera_data_dict))

    return camera_data_dict[best_camera_token]


def get_cropped_image_for_annotation(sample_data_annotation, data_directory, output_size):
    """Load the image an annotation refers to, crop it to the bounding box and resize it.

    Args:
        sample_data_annotation (dict): an annotation record. Only 'filename' (image path
            relative to data_directory) and 'bbox_corners' (x min, y min, x max, y max)
            are read, e.g.
            {'bbox_corners': [600.83, 426.38, 643.67, 476.66],
             'filename': 'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg',
             ...}
        data_directory (string): directory that 'filename' is relative to
        output_size (tuple): size passed to the image resize call for the cropped patch
    Returns: the cropped and resized patch as a numpy array
    """
    data_path = os.path.join(data_directory,
                             sample_data_annotation['filename'])
    bbox = sample_data_annotation['bbox_corners']

    # Open inside a context manager so the underlying file is always closed,
    # even if cropping or resizing fails. The resized patch is a separate image,
    # so it remains usable after the source image is closed.
    with Image.open(data_path) as im:
        patch = im.crop(bbox).resize(output_size)

    return np.asarray(patch)


def sort_sample_annotations_chronologically(instance_dict):
    """Order an instance's sample annotation tokens by following their prev/next links.

    Args:
        instance_dict (dict): taken by indexing bbox_2d_annotations[instance_token]. Maps each
            sample annotation token to a dict whose 'best_annotation' entry carries the
            'prev' and 'next' sample annotation tokens ('' marks the end of the chain).
    Returns: a list of sample annotation tokens, starting at the token whose 'prev' is ''
        and following 'next' links. The list stops early at the first 'next' token that has
        no 'best_annotation' in instance_dict, and is empty when no start token exists.
        instance_dict is never modified.
    """
    # Find the first sample token
    first_sample_token = None
    for sample_token, sample in instance_dict.items():
        if sample['best_annotation']['prev'] == '':
            first_sample_token = sample_token
            break

    if first_sample_token is None:
        print("Unable to find a start token")
        # Without a start there is no chain to follow; an empty sequence lets
        # callers skip the instance instead of tripping over a None token.
        return []

    # Now iterate and find a list of the sample_tokens in order
    sequential_sample_tokens = [first_sample_token]

    while True:
        current = instance_dict[sequential_sample_tokens[-1]]
        next_sample_token = current['best_annotation']['next']

        if next_sample_token == '':
            break

        # .get() rather than indexing: instance_dict may be a defaultdict, and
        # indexing an unknown token would silently insert an empty entry.
        next_sample = instance_dict.get(next_sample_token)
        if not next_sample or 'best_annotation' not in next_sample:
            # The chain points at an annotation that is not available here,
            # so the sequence ends at the last token that is.
            print("Unrecognized sample annotation token: {}".format(next_sample_token))
            break

        sequential_sample_tokens.append(next_sample_token)

    return sequential_sample_tokens


def remove_bad_samples(instance_annotation, minimum_bb_area, minimum_visibility, image_area=1600*900):
    """Removes bad samples from an instance annotation's sample sequence

    Args:
        instance_annotation (dict): an instance annotation. Must contain
            'sample_annotation_sequence' (list of sample annotation tokens) and, per token,
            a dict with a 'best_annotation' record providing 'bbox_area' and 'visibility_token'.
        minimum_bb_area (float): The minimum fraction of a frame a bounding box must take up to be used (0, 1)
        minimum_visibility (string): The minimum visibility a frame is allowed to have ('', '1', '2', '3', '4')
        image_area (int, optional): The area of an image frame. Defaults to 1600*900.
    Returns: a cleaned list of sample annotation tokens that meet requirements, in their
        original order. Tokens without a 'best_annotation' record are dropped as well.
    """
    cleaned = []

    for sample_token in instance_annotation['sample_annotation_sequence']:
        # .get() rather than indexing: instance_annotation may be a defaultdict,
        # and indexing an unknown token would insert an empty entry into it.
        sample = instance_annotation.get(sample_token)
        best_annotation = sample.get('best_annotation') if sample else None
        if best_annotation is None:
            # Nothing to measure for this token, so it cannot meet the requirements.
            continue

        area_fraction = best_annotation['bbox_area'] / image_area
        visibility = best_annotation['visibility_token']
        # Visibility tokens are compared as strings: '' < '1' < '2' < '3' < '4'.
        if area_fraction > minimum_bb_area and visibility >= minimum_visibility:
            cleaned.append(sample_token)

    return cleaned

def main(version, dataroot, output, object_categories, fps, output_size, minimum_frames, minimum_bb_area, visibility):
    """Generates video sequences of NuScene object instances over time.

    Expects the data to be organized as:
    ```
    "$dataroot"/
        samples - Sensor data for keyframes.
        sweeps  - Sensor data for intermediate frames.
        maps    - Folder for all map files: rasterized .png images and vectorized .json files.
        v1.0-*  - JSON tables that include all the meta data and annotations. Each split (trainval, test, mini) is provided in a separate folder.
                  Note that image_annotations.json should be inside this directory.
    ```

    WARNING: the output directory is deleted (with everything in it) and recreated
    before any video is written.

    Args:
        version (string): the NuScenes data version
        dataroot (string): the path to the data root directory
        output (string): the path to the output video directory
        object_categories (list): the category names to generate videos for
        fps: frames per second to use for the video
        output_size (tuple): the output dimension to resize every cropped bounding box to
        minimum_frames (int): the minimum number of frames an instance must have
        minimum_bb_area (float): the minimum fraction of a frame a bounding box must take up to be used (0, 1)
        visibility (string): the minimum visibility a frame is allowed to have ('', '1', '2', '3', '4')
    """
    print('='*20)
    print('Generating video sequences:')
    print('\t* Size: {}'.format(output_size))
    print('\t* FPS: {}'.format(fps))
    print('\t* Minimum frame count: {}'.format(minimum_frames))
    print('\t* Minimum BB area: {}'.format(minimum_bb_area))
    print('\t* Minimum visibility: {}'.format(visibility))

    # ================================ Load image annotations ======================================
    with open(os.path.join(dataroot, version, 'image_annotations.json')) as json_file:
        # A list of dictionaries
        bbox_2d_annotations_list = json.load(json_file)

    # These can be indexed with [instance_token][sample_annotation_token][camera_name] -> data about the annotation
    bbox_2d_annotations = convert_annotation_list_to_dict(
        bbox_2d_annotations_list, categories=object_categories)
    print('Number of unique vehicle instances: {}'.format(len(bbox_2d_annotations)))

    # ===== For each instance and each sample annotation, find the best camera sensor to use ======
    # The result is stored next to the per-camera records under the 'best_annotation' key.
    for instance_token in bbox_2d_annotations:
        for sample_annotation_token in bbox_2d_annotations[instance_token]:
            camera_views = bbox_2d_annotations[instance_token][sample_annotation_token]
            camera_views['best_annotation'] = get_most_visible_camera_annotation(camera_views)

    # ====== For each instance, find the correct sequence of sample annotations ====================
    # The sequence is stored next to the per-sample records under 'sample_annotation_sequence'.
    for instance_token in bbox_2d_annotations:
        instance_annotation = bbox_2d_annotations[instance_token]
        instance_annotation['sample_annotation_sequence'] = sort_sample_annotations_chronologically(
            instance_annotation)

    # ====== Remove samples from sequence that don't meet requirements =============================
    for instance_token in bbox_2d_annotations:
        instance_annotation = bbox_2d_annotations[instance_token]
        instance_annotation['sample_annotation_sequence'] = remove_bad_samples(
            instance_annotation, minimum_bb_area, visibility)

    # ====== Create videos for every instance ======================================================

    # Remove the directory if it already exists and create new one
    rmtree(output, ignore_errors=True)
    pathlib.Path(output).mkdir(parents=True, exist_ok=True)

    print("Creating videos and storing in '{}'...".format(output))
    total_videos = 0
    for instance_token in tqdm(bbox_2d_annotations):
        sample_annotation_tokens = bbox_2d_annotations[instance_token]['sample_annotation_sequence']

        if len(sample_annotation_tokens) < minimum_frames:
            continue

        video_path = os.path.join(
            output, '{}.mp4'.format(instance_token))

        # Need to use vp09 to be able to upload to certain data annotation platforms
        out = cv2.VideoWriter(
            video_path, cv2.VideoWriter_fourcc(*'vp09'), fps, output_size)

        # Release the writer even if loading or writing a frame fails, so the
        # partially written file is finalized and its handle is not leaked.
        try:
            for sample_annotation_token in sample_annotation_tokens:
                best_annotation = bbox_2d_annotations[instance_token][sample_annotation_token]['best_annotation']
                cropped_img = get_cropped_image_for_annotation(
                    best_annotation, dataroot, output_size)

                # Convert from PIL's RGB to cv2 BGR
                out.write(cropped_img[:, :, ::-1])
        finally:
            out.release()

        total_videos += 1

    print('Created {} videos ({} did not meet requirements).'.format(
        total_videos, len(bbox_2d_annotations) - total_videos))
    print('='*20)


if __name__ == "__main__":
    # Construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()

    # The three paths/identifiers below have no sensible default; requiring them makes
    # argparse report a clear usage error instead of main() failing later on a None path.
    ap.add_argument("-d", "--dataroot", type=str, required=True,
                    help="The path to the root directory where the data is stored")
    ap.add_argument("-v", "--version", type=str, required=True,
                    help="The NuScene's data version")
    ap.add_argument("-o", "--output", type=str, required=True,
                    help="The output video directory")
    ap.add_argument("-f", "--fps", type=int, default=2,
                    help="Frames per second for output video (use 2 to match speed of original data)")
    ap.add_argument("-m", "--minimum_frames", type=int, default=9,
                    help="The minimum number of frames an instance must have")
    ap.add_argument("-p", "--minimum_percentage", type=float, default=0.01,
                    help="The minimum fraction of a frame a bounding box take up to be used (0, 1)")
    ap.add_argument("--visibility", type=str, default='2',
                    help="The minimum visibility a frame is allowed ('', '1', '2', '3', '4')")
    ap.add_argument("-s", "--size", type=int, default=[112, 112], nargs=2,
                    help="Size of the output video")

    # Excludes bicycle and motorcycle by default
    vehicle_categories = ['vehicle.bus.bendy', 'vehicle.bus.rigid',
                          'vehicle.car', 'vehicle.construction', 'vehicle.trailer', 'vehicle.truck']
    ap.add_argument("-c", "--categories", nargs='+',
                    help="The categories to extract videos for", required=False, default=vehicle_categories)

    args = vars(ap.parse_args())
    # --size is parsed as a two-element list; main() receives it as a tuple.
    main(args['version'], args['dataroot'], args['output'], args['categories'],
         args['fps'], tuple(args['size']), args['minimum_frames'], args['minimum_percentage'], args["visibility"])

0 comments on commit 6e0ed36

Please sign in to comment.