
Inference speed issue #9951

@bgyooPtr

Description

Search before asking

  • I have searched the YOLOv5 issues and found no similar bug report.

YOLOv5 Component

Detection, Integrations

Bug

When running real-time prediction with a Realsense D415, inference speed varies heavily with the camera FPS: the higher the camera FPS, the slower the inference. I don't understand why this happens, since frame grabbing runs in its own thread.

  • fps: 30
    avg time: 10.963139772415161 ms
  • fps: 60
    avg time: 24.456851720809937 ms
  • fps: 90
    avg time: 35.522178649902344 ms
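
Note that the timing loop below measures wall-clock time around a host-side call, so GPU work and GIL contention with the grabber thread both land in the same number. A minimal sketch that isolates the forward pass by synchronizing before and after (timed_forward is a hypothetical helper, not part of YOLOv5):

import time
import torch

def timed_forward(model, im):
    # Synchronize so the GPU has actually finished before reading the clock;
    # without this, the measurement also absorbs whatever else blocks the host.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.time()
    pred = model(im)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return pred, (time.time() - t0) * 1000  # ms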

Environment

  • detect: weights=['runs/train/exp2/weights/best.pt'], data=data/coco128.yaml, imgsz=[500, 500], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1
  • YOLOv5 🚀 v6.2-211-g32a9218 Python-3.7.13 torch-1.8.1 CUDA:0 (NVIDIA TITAN RTX, 24217MiB)

Fusing layers...
Model summary: 157 layers, 1760518 parameters, 0 gradients, 4.1 GFLOPs
WARNING ⚠️ --img-size [500, 500] must be multiple of max stride 32, updating to [512, 512]
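
The update from [500, 500] to [512, 512] comes from rounding each dimension up to the nearest multiple of the model's max stride (32), as YOLOv5's make_divisible does; a minimal sketch of that rounding:

import math

def make_divisible(x, divisor=32):
    # round x up to the nearest multiple of divisor (the model's max stride)
    return math.ceil(x / divisor) * divisor

print([make_divisible(s) for s in (500, 500)])  # -> [512, 512]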

Minimal Reproducible Example

import threading
import time

import cv2
import numpy as np
import pyrealsense2 as rs
import torch

from models.common import DetectMultiBackend
from utils.augmentations import letterbox
from utils.general import LOGGER, check_img_size, non_max_suppression
from utils.torch_utils import select_device


class LoadStreams1:
    # YOLOv5-style stream loader adapted for a Realsense D415 color + depth stream
    def __init__(
        self,
        img_size=640,
        stride=32,
        auto=True,
        transforms=None,
        vid_stride=1,
        fps=30,
    ):
        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
        self.img_size = img_size
        self.stride = stride
        self.vid_stride = vid_stride  # video frame-rate stride
        self.img, self.depth, self.fps, self.frame, self.thread = (
            None,
            None,
            0,
            0,
            None,
        )

        self.pipeline = rs.pipeline()
        config = rs.config()
        config.enable_stream(rs.stream.color, 848, 480, rs.format.rgb8, fps)
        config.enable_stream(rs.stream.depth, 848, 480, rs.format.z16, fps)
        self.align_to_color = rs.align(rs.stream.color)

        self.prof = self.pipeline.start(config)
        device = self.prof.get_device().first_depth_sensor()
        preset_range = device.get_option_range(rs.option.visual_preset)
        for i in range(int(preset_range.max)):
            visual_preset = device.get_option_value_description(
                rs.option.visual_preset, i
            )
            print("%02d: %s" % (i, visual_preset))
            if visual_preset == "High Accuracy":
                device.set_option(rs.option.visual_preset, i)
                print(":: set preset to High Accuracy")

        # warm-up: loop until one valid frame has seeded self.img and self.depth
        for i in range(60):
            # ret, frames = self.pipeline.wait_for_frames()
            ret, frames = self.pipeline.try_wait_for_frames()
            if not ret:
                print("warm up failed")
                continue
            aligned_frames = self.align_to_color.process(frames)
            color_frame = aligned_frames.get_color_frame()
            rgb_image = np.asanyarray(color_frame.get_data())
            bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
            self.img = bgr_image

            depth_frame = aligned_frames.get_depth_frame()
            depth_data = np.array(depth_frame.data)
            self.depth = depth_data

            break  # one good frame is enough

        self.thread = threading.Thread(
            target=self.update, args=(), daemon=True
        )
        self.thread.start()
        while True:
            if self.thread.is_alive():
                break
            time.sleep(0.1)

        # check for common shapes
        s = np.stack(
            [
                letterbox(self.img, img_size, stride=stride, auto=auto)[0].shape
            ]
        )
        self.rect = (
            np.unique(s, axis=0).shape[0] == 1
        )  # rect inference if all shapes equal
        self.auto = auto and self.rect
        self.transforms = transforms  # optional
        if not self.rect:
            LOGGER.warning(
                "WARNING ⚠️ Stream shapes differ. For optimal performance supply similarly-shaped streams."
            )

    def update(self):
        while True:
            ret, frames = self.pipeline.try_wait_for_frames()
            if not ret:
                LOGGER.warning(
                    "WARNING ⚠️ Video stream unresponsive, please check your IP camera connection."
                )
                break
            aligned_frames = self.align_to_color.process(frames)
            color_frame = aligned_frames.get_color_frame()
            rgb_image = np.asanyarray(color_frame.get_data())
            bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

            depth_frame = aligned_frames.get_depth_frame()
            depth_data = np.array(depth_frame.data)

            self.img = bgr_image
            self.depth = depth_data
            time.sleep(0.0)  # sleep(0) yields the GIL only momentarily

    def __iter__(self):
        self.count = -1
        return self

    def __next__(self):
        self.count += 1
        if not self.thread.is_alive() or cv2.waitKey(1) == ord(
            "q"
        ):  # q to quit
            cv2.destroyAllWindows()
            raise StopIteration

        im0 = self.img.copy()
        depthu = None
        if isinstance(self.depth, np.ndarray):
            depthu = self.depth.copy()
        im = np.stack(
            [
                letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]
            ]
        )  # resize
        im = im[..., ::-1].transpose((0, 3, 1, 2))  # BGR to RGB, BHWC to BCHW
        im = np.ascontiguousarray(im)  # contiguous

        return im, im0, depthu

def run(
    weights="runs/train/exp2/weights/best.pt",  # arguments from the Environment section above
    data="data/coco128.yaml",
    imgsz=(500, 500),
    conf_thres=0.25,
    iou_thres=0.45,
    max_det=1000,
    device="",
    classes=None,
    agnostic_nms=False,
    augment=False,
    half=False,
    dnn=False,
    vid_stride=1,
):
    # Load model
    device = select_device(device)
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
    stride, names, pt = model.stride, model.names, model.pt
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Dataloader
    bs = 1  # batch_size

    # Run inference
    model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  # warmup
    dataset = LoadStreams1(
        img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride, fps=90
    )
    times = []
    for _, im0s, depth in dataset:
        if len(times) == 1000:
            break
        s_time = time.time()
        img0 = letterbox(im0s, imgsz, stride=stride, auto=pt)[0]
        img = img0.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
        img = np.ascontiguousarray(img)
        im = torch.from_numpy(img).to(model.device)
        im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:
            im = im[None]  # expand for batch dim

        # Inference
        pred = model(im, augment=augment, visualize=False)

        # NMS
        pred = non_max_suppression(
            pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det
        )

        e_time = (time.time() - s_time) * 1000  # ms
        times.append(e_time)
        print("elapsed time (ms): ", e_time)
    print("avg time (ms): ", np.mean(times))

Additional

No response

Are you willing to submit a PR?

  • Yes I'd like to help by submitting a PR!
