# tools/dist_test.py

import argparse
import copy
import json
import os
import sys

try:
    import apex
except Exception:
    # apex is optional at import time: it is only referenced by the
    # distributed branch of main() (apex.parallel.convert_syncbn_model).
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed, while any failure
    # raised by a broken apex install is still tolerated.
    print("No APEX!")
import numpy as np
import torch
import yaml
from det3d import torchie
from det3d.datasets import build_dataloader, build_dataset
from det3d.models import build_detector
from det3d.torchie import Config
from det3d.torchie.apis import (
    batch_processor,
    build_optimizer,
    get_root_logger,
    init_dist,
    set_random_seed,
    train_detector,
)
from det3d.torchie.trainer import get_dist_info, load_checkpoint
from det3d.torchie.trainer.utils import all_gather, synchronize
from torch.nn.parallel import DistributedDataParallel
import pickle 
import time 

def save_pred(pred, root):
    """Pickle ``pred`` into ``<root>/prediction.pkl``.

    Args:
        pred: Picklable object to store.
        root: Directory that receives ``prediction.pkl``; it must already
            exist, this function does not create it.
    """
    target_path = os.path.join(root, "prediction.pkl")
    with open(target_path, "wb") as handle:
        pickle.dump(pred, handle)


def load_pred(root):
    """Load and return the object pickled in the file at ``root``.

    Despite its name, ``root`` is the path of the pickle file itself, not a
    directory.

    NOTE: unpickling can execute arbitrary code, so only pass files that
    come from a trusted source.
    """
    with open(root, "rb") as handle:
        return pickle.load(handle)

def _str2bool(value):
    """Convert a command-line token such as "true"/"False"/"0" to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args(argv=None):
    """Parse the command-line options of the test script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace holding the parsed options.

    Side effects:
        If ``LOCAL_RANK`` is not set in the environment, it is set to the
        parsed ``--local_rank`` value.
    """
    parser = argparse.ArgumentParser(description="Test a detector")
    parser.add_argument("config", help="train config file path")
    parser.add_argument("--work_dir", required=True, help="the dir to save logs and models")
    parser.add_argument(
        "--checkpoint", help="the dir to checkpoint which the model read from"
    )
    parser.add_argument(
        "--pkl", default=None, help="the dir to *.pkl file which the model read from"
    )
    parser.add_argument(
        "--openpcdet", action="store_true", help="save result.pkl following OpenPCDet"
    )
    parser.add_argument(
        "--txt_result",
        # A real string-to-bool converter, so "--txt_result False" is False.
        type=_str2bool,
        default=False,
        help="whether to save results to standard KITTI format of txt type",
    )
    parser.add_argument(
        "--gpus",
        type=int,
        default=1,
        help="number of gpus to use " "(only applicable to non-distributed training)",
    )
    parser.add_argument(
        "--launcher",
        choices=["none", "pytorch", "slurm", "mpi"],
        default="none",
        help="job launcher",
    )
    parser.add_argument("--speed_test", action="store_true")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--testset", action="store_true")

    args = parser.parse_args(argv)
    # Expose the local rank through the environment without overriding a
    # value that is already set there.
    if "LOCAL_RANK" not in os.environ:
        os.environ["LOCAL_RANK"] = str(args.local_rank)

    return args


def transform_pcdet_format(result, save_dir, class_names=None):
    """Convert predictions to a list of per-frame dicts and pickle them.

    The list is written to ``<save_dir>/result.pkl``. Each entry has the
    keys ``frame_id``, ``metadata``, ``score``, ``boxes_lidar`` and ``name``.

    Args:
        result: Mapping ``token -> prediction dict``. Each prediction dict
            must provide ``metadata['pcdet_token']['context_name']`` and the
            tensors ``scores``, ``box3d_lidar`` and ``label_preds``. The
            token's stem must end in ``_<integer>``; that integer becomes
            the zero-padded frame index in ``frame_id``.
        save_dir: Directory that receives ``result.pkl``.
        class_names: Optional sequence mapping label index to class name.
            Defaults to ``['Vehicle', 'Pedestrian', 'Cyclist']``.

    The input tensors are left untouched; box edits are done on a copy.
    """
    if class_names is None:
        class_names = ['Vehicle', 'Pedestrian', 'Cyclist']

    new_results = []
    for token, result_dict in result.items():
        pcdet_meta = result_dict['metadata']['pcdet_token']
        frame_index = int(token.split('.')[0].split('_')[-1])
        frame_id = f"segment-{pcdet_meta['context_name']}_with_camera_labels_{frame_index:03d}"

        # ``Tensor.numpy()`` shares memory with a CPU tensor, so copy before
        # editing in place; otherwise the caller's predictions are modified.
        boxes = result_dict['box3d_lidar'].cpu().numpy().copy()
        # Swap columns 3 and 4 and remap the last column.
        # NOTE(review): presumably this converts the box size order and the
        # heading convention to OpenPCDet's - confirm against both codebases.
        boxes[:, [3, 4]] = boxes[:, [4, 3]]
        boxes[:, -1] = -boxes[:, -1] - np.pi / 2

        new_results.append({
            'frame_id': frame_id,
            'metadata': pcdet_meta,
            'score': result_dict['scores'].cpu().numpy(),
            'boxes_lidar': boxes,
            'name': np.array([class_names[int(ele)] for ele in result_dict['label_preds']]),
        })

    with open(os.path.join(save_dir, 'result.pkl'), 'wb') as f:
        pickle.dump(new_results, f)

def main():
    """Run inference (or load cached predictions) and evaluate/export them.

    Flow:
      1. Parse CLI arguments and load the config.
      2. Initialise the NCCL process group when ``WORLD_SIZE`` > 1.
      3. Build the model and the val/test dataset with its data loader.
      4. If ``--pkl`` is not given: load the checkpoint, run the model over
         the data loader, gather the detections from all ranks and, on
         rank 0, save them to ``<work_dir>/prediction.pkl``. Other ranks
         return after the gather. If ``--pkl`` is given, the predictions
         are loaded from that file instead.
      5. Either export the predictions with ``transform_pcdet_format``
         (``--openpcdet``) or run ``dataset.evaluation`` and print results.

    Raises:
        ValueError: neither ``--checkpoint`` nor ``--pkl`` was given.
        NotImplementedError: ``--txt_result`` was requested (KITTI txt
            output is no longer supported).
    """
    # Uncomment for reproducible runs:
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)

    args = parse_args()

    # Fail early, before building the model and dataset, instead of with an
    # obscure error inside load_checkpoint.
    if args.pkl is None and args.checkpoint is None:
        raise ValueError("either --checkpoint or --pkl must be provided")

    cfg = Config.fromfile(args.config)
    cfg.local_rank = args.local_rank

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir

    distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.gpus = torch.distributed.get_world_size()
    else:
        cfg.gpus = args.gpus

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info("Distributed testing: {}".format(distributed))
    logger.info(f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    if args.testset:
        print("Use Test Set")
        dataset = build_dataset(cfg.data.test)
    else:
        print("Use Val Set")
        dataset = build_dataset(cfg.data.val)

    # Speed tests always run one sample per batch.
    batch_size = 1 if args.speed_test else cfg.data.samples_per_gpu
    data_loader = build_dataloader(
        dataset,
        batch_size=batch_size,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    if args.pkl is None:
        load_checkpoint(model, args.checkpoint, map_location="cpu")

        # put model on gpus
        if distributed:
            # NOTE: requires apex; if the optional import at the top of the
            # file failed, this line raises NameError.
            model = apex.parallel.convert_syncbn_model(model)
            model = DistributedDataParallel(
                model.cuda(cfg.local_rank),
                device_ids=[cfg.local_rank],
                output_device=cfg.local_rank,
                # broadcast_buffers=False,
                find_unused_parameters=True,
            )
        else:
            # model = fuse_bn_recursively(model)
            model = model.cuda()

        model.eval()

        logger.info(f"work dir: {args.work_dir}")
        # Only rank 0 shows a progress bar.
        prog_bar = None
        if args.local_rank == 0:
            prog_bar = torchie.ProgressBar(len(data_loader.dataset) // cfg.gpus)

        detections = {}
        cpu_device = torch.device("cpu")

        # Time the middle third of the run. The window is expressed in
        # batch indices of this rank's loader, because that is what the
        # loop counter below enumerates.
        num_batches = len(data_loader)
        timing_first = num_batches // 3
        timing_last = num_batches * 2 // 3
        time_start = None
        time_end = None

        for i, data_batch in enumerate(data_loader):

            if i == timing_first:
                torch.cuda.synchronize()
                time_start = time.time()

            if i == timing_last:
                torch.cuda.synchronize()
                time_end = time.time()

            with torch.no_grad():
                outputs = batch_processor(
                    model, data_batch, train_mode=False, local_rank=args.local_rank,
                )

            for output in outputs:
                token = output["metadata"]["token"]
                # Move every non-metadata entry to the CPU so the gathered
                # results do not keep GPU memory alive.
                for k, v in output.items():
                    if k != "metadata":
                        output[k] = v.to(cpu_device)
                detections[token] = output
                if prog_bar is not None:
                    prog_bar.update()

        synchronize()

        all_predictions = all_gather(detections)

        # Batches inside the window are assumed to be full, so the window
        # covers (timing_last - timing_first) * batch_size frames.
        timed_frames = (timing_last - timing_first) * batch_size
        if timed_frames > 0 and time_start is not None and time_end is not None:
            print("\n Total time per frame: ", (time_end - time_start) / timed_frames)
        else:
            print("\n Total time per frame: not measured (too few batches)")

        if args.local_rank != 0:
            return

        predictions = {}
        for p in all_predictions:
            predictions.update(p)

        os.makedirs(args.work_dir, exist_ok=True)

        save_pred(predictions, args.work_dir)

    else:
        predictions = load_pred(args.pkl)
        # The export/evaluation below writes into work_dir.
        os.makedirs(args.work_dir, exist_ok=True)

    if args.openpcdet:
        transform_pcdet_format(predictions, args.work_dir)
        print("Evaluation Done")
    else:
        result_dict, _ = dataset.evaluation(copy.deepcopy(predictions), output_dir=args.work_dir, testset=args.testset)

        if result_dict is not None:
            for k, v in result_dict["results"].items():
                print(f"Evaluation {k}: {v}")

    if args.txt_result:
        # Raise explicitly instead of ``assert False``, which is stripped
        # when Python runs with -O.
        raise NotImplementedError("No longer support kitti")

# Script entry point: run the test pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()