
Commit 8aff9d6

Merge branch 'dp/hpe_update' of https://github.com/druzhkov-paul/open_model_zoo into dp/hpe_update
2 parents 5400bff + 2116b47

20 files changed: +1447 -1033 lines

demos/python_demos/human_pose_estimation_demo/README.md

Lines changed: 19 additions & 19 deletions
````diff
@@ -32,42 +32,42 @@ python3 human_pose_estimation.py -h
 ```
 The command yields the following usage message:
 ```
-usage: human_pose_estimation.py [-h] -m MODEL -i INPUT [-d DEVICE]
-                                [-t PROB_THRESHOLD] [-r]
-                                [-nireq NUM_INFER_REQUESTS]
+usage: human_pose_estimation.py [-h] -i INPUT -m MODEL --type {ae,openpose}
+                                [--tsize TSIZE] [-t PROB_THRESHOLD] [-r]
+                                [-d DEVICE] [-nireq NUM_INFER_REQUESTS]
                                 [-nstreams NUM_STREAMS]
                                 [-nthreads NUM_THREADS] [-loop LOOP]
                                 [-no_show] [-u UTILIZATION_MONITORS]

 Options:
   -h, --help            Show this help message and exit.
-  -m MODEL, --model MODEL
-                        Required. Path to an .xml file with a trained model.
   -i INPUT, --input INPUT
                         Required. Path to an image, video file or a numeric
                         camera ID.
-  -d DEVICE, --device DEVICE
-                        Optional. Specify the target device to infer on; CPU,
-                        GPU, FPGA, HDDL or MYRIAD is acceptable. The sample
-                        will look for a suitable plugin for device specified.
-                        Default value is CPU.
+  -m MODEL, --model MODEL
+                        Required. Path to an .xml file with a trained model.
+  --type {ae,openpose}  Required. Type of the network, either "ae" for
+                        Associative Embedding or "openpose" for OpenPose.
+  --tsize TSIZE         Optional. Target input size. This demo implements
+                        image pre-processing pipeline that is common to human
+                        pose estimation approaches. Image is resize first to
+                        some target size and then the network is reshaped to
+                        fit the input image shape. By default target image
+                        size is determined based on the input shape from IR.
+                        Alternatively it can be manually set via this
+                        parameter. Note that for OpenPose-like nets image is
+                        resized to a predefined height, which is the target
+                        size in this case. For Associative Embedding-like nets
+                        target size is the length of a short image side.
   -t PROB_THRESHOLD, --prob_threshold PROB_THRESHOLD
                         Optional. Probability threshold for poses filtering.
   -r, --raw_output_message
                         Optional. Output inference results raw values showing.
-                        Required. Path to an .xml file with a trained model.
-  -i INPUT, --input INPUT
-                        Required. Path to an image, video file or a numeric
-                        camera ID.
   -d DEVICE, --device DEVICE
                         Optional. Specify the target device to infer on; CPU,
                         GPU, FPGA, HDDL or MYRIAD is acceptable. The sample
                         will look for a suitable plugin for device specified.
                         Default value is CPU.
-  -t PROB_THRESHOLD, --prob_threshold PROB_THRESHOLD
-                        Optional. Probability threshold for poses filtering.
-  -r, --raw_output_message
-                        Optional. Output inference results raw values showing.
   -nireq NUM_INFER_REQUESTS, --num_infer_requests NUM_INFER_REQUESTS
                         Optional. Number of infer requests
   -nstreams NUM_STREAMS, --num_streams NUM_STREAMS
@@ -87,7 +87,7 @@ Options:
 ```

 Running the application with the empty list of options yields the short usage message and an error message.
-You can use the following command to do inference on GPU with a pre-trained human pose estimation model:
+You can use the following command to do inference on CPU with a pre-trained human pose estimation model:
 ```
 python3 human_pose_estimation.py -i <path_to_video>/inputVideo.mp4 -m <path_to_model>/hpe.xml -d CPU
 ```
````
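
The new `--tsize` description above amounts to a small piece of scaling logic. A minimal sketch of that logic, assuming OpenCV-style frames; the helper names here are illustrative, not taken from the demo:

```python
import cv2


def target_scale(height, width, target_size, net_type):
    # Per the help text: OpenPose-like nets are resized to a predefined
    # height; Associative Embedding-like nets fix the short image side.
    if net_type == 'openpose':
        return target_size / height
    return target_size / min(height, width)


def preprocess(frame, target_size, net_type):
    h, w = frame.shape[:2]
    scale = target_scale(h, w, target_size, net_type)
    resized = cv2.resize(frame, None, fx=scale, fy=scale)
    # The demo then reshapes the network to the resized image's shape;
    # that step is omitted here.
    return resized
```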

demos/python_demos/human_pose_estimation_demo/human_pose_estimation.py

Lines changed: 20 additions & 11 deletions
```diff
@@ -42,18 +42,29 @@ def build_argparser():
     parser = ArgumentParser(add_help=False)
     args = parser.add_argument_group('Options')
     args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.')
-    args.add_argument('-m', '--model', help='Required. Path to an .xml file with a trained model.',
-                      required=True, type=str)
     args.add_argument('-i', '--input', help='Required. Path to an image, video file or a numeric camera ID.',
                       required=True, type=str)
+    args.add_argument('-m', '--model', help='Required. Path to an .xml file with a trained model.',
+                      required=True, type=str)
+    args.add_argument('--type', choices=('ae', 'openpose'), required=True, type=str,
+                      help='Required. Type of the network, either "ae" for Associative Embedding '
+                           'or "openpose" for OpenPose.')
+    args.add_argument('--tsize', default=None, type=int,
+                      help='Optional. Target input size. This demo implements image pre-processing pipeline '
+                           'that is common to human pose estimation approaches. Image is resize first to some '
+                           'target size and then the network is reshaped to fit the input image shape. '
+                           'By default target image size is determined based on the input shape from IR. '
+                           'Alternatively it can be manually set via this parameter. Note that for OpenPose-like '
+                           'nets image is resized to a predefined height, which is the target size in this case. '
+                           'For Associative Embedding-like nets target size is the length of a short image side.')
+    args.add_argument('-t', '--prob_threshold', help='Optional. Probability threshold for poses filtering.',
+                      default=0.1, type=float)
+    args.add_argument('-r', '--raw_output_message', help='Optional. Output inference results raw values showing.',
+                      default=False, action='store_true')
     args.add_argument('-d', '--device',
                       help='Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is '
                            'acceptable. The sample will look for a suitable plugin for device specified. '
                            'Default value is CPU.', default='CPU', type=str)
-    args.add_argument('-t', '--prob_threshold', help='Optional. Probability threshold for poses filtering.',
-                      default=0.5, type=float)
-    args.add_argument('-r', '--raw_output_message', help='Optional. Output inference results raw values showing.',
-                      default=False, action='store_true')
     args.add_argument('-nireq', '--num_infer_requests', help='Optional. Number of infer requests',
                       default=1, type=int)
     args.add_argument('-nstreams', '--num_streams',
@@ -68,8 +79,6 @@ def build_argparser():
     args.add_argument('-no_show', '--no_show', help="Optional. Don't show output", action='store_true')
     args.add_argument('-u', '--utilization_monitors', default='', type=str,
                       help='Optional. List of monitors to show initially.')
-
-    args.add_argument('--type', default='ae', choices=('ae', 'openpose'), type=str, help='Optional.')
     return parser


```
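
A quick sanity check of the reworked parser above; this assumes `build_argparser` is importable from the demo script and uses placeholder paths:

```python
from human_pose_estimation import build_argparser  # assumed import path

args = build_argparser().parse_args(
    ['-i', 'input.mp4', '-m', 'hpe.xml', '--type', 'ae'])
assert args.type == 'ae'           # --type is now required
assert args.tsize is None          # --tsize stays optional
assert args.prob_threshold == 0.1  # new default, lowered from 0.5
```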

```diff
@@ -140,12 +149,12 @@ def main():

     hpes = {
         Modes.USER_SPECIFIED:
-            HPE(ie, args.model, device=args.device, plugin_config=config_user_specified,
+            HPE(ie, args.model, target_size=args.tsize, device=args.device, plugin_config=config_user_specified,
                 results=completed_request_results, max_num_requests=args.num_infer_requests,
                 caught_exceptions=exceptions),
         Modes.MIN_LATENCY:
-            HPE(ie, args.model, device=args.device.split(':')[-1].split(',')[0], plugin_config=config_min_latency,
-                results=completed_request_results, max_num_requests=1,
+            HPE(ie, args.model, target_size=args.tsize, device=args.device.split(':')[-1].split(',')[0],
+                plugin_config=config_min_latency, results=completed_request_results, max_num_requests=1,
                 caught_exceptions=exceptions)
     }
```
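
The MIN_LATENCY branch above reduces a possibly compound device string to a single device. A small illustration of what `args.device.split(':')[-1].split(',')[0]` evaluates to (the example strings are hypothetical inputs):

```python
def first_device(device):
    # Strip a "MULTI:"/"HETERO:" prefix, then take the first listed device.
    return device.split(':')[-1].split(',')[0]

assert first_device('CPU') == 'CPU'
assert first_device('MULTI:CPU,GPU') == 'CPU'
assert first_device('HETERO:GPU,CPU') == 'GPU'
```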

Lines changed: 14 additions & 214 deletions
```diff
@@ -1,218 +1,18 @@
-import numpy as np
-from openvino.inference_engine import IECore
-from scipy.optimize import linear_sum_assignment
+"""
+Copyright (C) 2020 Intel Corporation

+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-class AssociativeEmbeddingDecoder:
+    http://www.apache.org/licenses/LICENSE-2.0

-    def __init__(self, num_joints, max_num_people, detection_threshold, use_detection_val,
-                 ignore_too_much, tag_threshold, adjust=True, refine=True, delta=0.0, joints_order=None):
-        self.num_joints = num_joints
-        self.max_num_people = max_num_people
-        self.detection_threshold = detection_threshold
-        self.tag_threshold = tag_threshold
-        self.use_detection_val = use_detection_val
-        self.ignore_too_much = ignore_too_much
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""

-        if self.num_joints == 17 and joints_order is None:
-            self.joint_order = (0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16)
-        else:
-            self.joint_order = list(np.arange(self.num_joints))
-
-        self.do_adjust = adjust
-        self.do_refine = refine
-        self.delta = delta
-
-    def match(self, tag_k, loc_k, val_k):
-        return list(map(self._match_by_tag, zip(tag_k, loc_k, val_k)))
-
-    def _max_match(self, scores):
-        r, c = linear_sum_assignment(scores)
-        tmp = np.stack((r, c), axis=1)
-        return tmp
-
-    def _match_by_tag(self, inp):
-        tag_k, loc_k, val_k = inp
-
-        embd_size = tag_k.shape[2]
-
-        class Pose:
-            def __init__(self, num_joints, tag_size=1):
-                self.num_joints = num_joints
-                self.tag_size = tag_size
-                self.pose = np.zeros((num_joints, 2 + 1 + tag_size), dtype=np.float32)
-                self.pose_tag = np.zeros(tag_size, dtype=np.float32)
-                self.valid_points_num = 0
-
-            def add(self, idx, joint, tag):
-                self.pose[idx] = joint
-                self.pose_tag = (self.pose_tag * self.valid_points_num) + tag
-                self.valid_points_num += 1
-                self.pose_tag /= self.valid_points_num
-
-            @property
-            def tag(self):
-                if self.valid_points_num > 0:
-                    return self.pose_tag
-                else:
-                    return None
-
-        all_joints = np.concatenate((loc_k, val_k[..., None], tag_k), -1)
-
-        poses = []
-        for idx in self.joint_order:
-            tags = tag_k[idx]
-            joints = all_joints[idx]
-            mask = joints[:, 2] > self.detection_threshold
-            tags = tags[mask]
-            joints = joints[mask]
-
-            if joints.shape[0] == 0:
-                continue
-
-            if len(poses) == 0:
-                for tag, joint in zip(tags, joints):
-                    pose = Pose(self.num_joints, embd_size)
-                    pose.add(idx, joint, tag)
-                    poses.append(pose)
-            else:
-                if self.ignore_too_much and len(poses) == self.max_num_people:
-                    continue
-                poses_tags = np.stack([p.tag for p in poses], axis=0)
-
-                diff = tags[:, None] - poses_tags[None, :]
-                diff_normed = np.linalg.norm(diff, ord=2, axis=2)
-                diff_saved = np.copy(diff_normed)
-
-                if self.use_detection_val:
-                    diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3]
-
-                num_added = diff.shape[0]
-                num_grouped = diff.shape[1]
-
-                if num_added > num_grouped:
-                    diff_normed = np.concatenate(
-                        (diff_normed,
-                         np.zeros((num_added, num_added - num_grouped),
-                                  dtype=np.float32) + 1e10),
-                        axis=1)
-
-                pairs = self._max_match(diff_normed)
-                for row, col in pairs:
-                    if row < num_added and col < num_grouped and diff_saved[row][col] < self.tag_threshold:
-                        poses[col].add(idx, joints[row], tags[row])
-                    else:
-                        pose = Pose(self.num_joints, embd_size)
-                        pose.add(idx, joints[row], tags[row])
-                        poses.append(pose)
-
-        if len(poses):
-            ans = np.stack([p.pose for p in poses]).astype(np.float32)
-            tags = np.stack([p.tag for p in poses]).astype(np.float32)
-        else:
-            ans = np.empty((0, self.num_joints, 2 + 1 + embd_size), dtype=np.float32)
-            tags = np.empty((0, embd_size), dtype=np.float32)
-        return ans, tags
-
-    def top_k(self, heatmaps, tags):
-        N, K, H, W = heatmaps.shape
-        heatmaps = heatmaps.reshape(N, K, -1)
-        ind = heatmaps.argpartition(-self.max_num_people, axis=2)[:, :, -self.max_num_people:]
-        val_k = np.take_along_axis(heatmaps, ind, axis=2)
-        subind = np.argsort(-val_k, axis=2)
-        ind = np.take_along_axis(ind, subind, axis=2)
-        val_k = np.take_along_axis(val_k, subind, axis=2)
-
-        tags = tags.reshape(N, K, W * H, -1)
-        tag_k = [np.take_along_axis(tags[..., i], ind, axis=2) for i in range(tags.shape[3])]
-        tag_k = np.stack(tag_k, axis=3)
-
-        x = ind % W
-        y = ind // W
-        ind_k = np.stack((x, y), axis=3)
-
-        ans = {'tag_k': tag_k, 'loc_k': ind_k, 'val_k': val_k}
-        return ans
-
-    def adjust(self, ans, heatmaps):
-        H, W = heatmaps.shape[-2:]
-        for n, people in enumerate(ans):
-            for person in people:
-                for k, joint in enumerate(person):
-                    heatmap = heatmaps[n, k]
-                    px = int(joint[0])
-                    py = int(joint[1])
-                    if 1 < px < W - 1 and 1 < py < H - 1:
-                        diff = np.array([
-                            heatmap[py, px + 1] - heatmap[py, px - 1],
-                            heatmap[py + 1, px] - heatmap[py - 1, px]
-                        ])
-                        joint[:2] += np.sign(diff) * .25
-        return ans
-
-    def refine(self, heatmap, tag, keypoints, pose_tag=None):
-        K, H, W = heatmap.shape
-        if len(tag.shape) == 3:
-            tag = tag[..., None]
-
-        if pose_tag is not None:
-            prev_tag = pose_tag
-        else:
-            tags = []
-            for i in range(K):
-                if keypoints[i, 2] > 0:
-                    x, y = keypoints[i][:2].astype(int)
-                    tags.append(tag[i, y, x])
-            prev_tag = np.mean(tags, axis=0)
-
-        # Allocate the buffer for tags similarity matrix.
-        tag_copy = np.empty_like(tag[0, ..., 0])
-        for i, (_heatmap, _tag) in enumerate(zip(heatmap, tag)):
-            if keypoints[i, 2] > 0:
-                continue
-            tag_copy[...] = _tag[..., 0]
-            diff = tag_copy
-            diff -= prev_tag
-            np.abs(diff, out=diff)
-            np.floor(diff + 0.5, out=diff)
-            diff -= _heatmap
-            idx = diff.argmin()
-            y, x = np.divmod(idx, _heatmap.shape[-1])
-
-            # detection score at maximum position
-            val = _heatmap[y, x]
-
-            if val > 0:
-                keypoints[i, :3] = x, y, val
-                if 1 < x < W - 1 and 1 < y < H - 1:
-                    diff = np.array([
-                        _heatmap[y, x + 1] - _heatmap[y, x - 1],
-                        _heatmap[y + 1, x] - _heatmap[y - 1, x]
-                    ])
-                    keypoints[i, :2] += np.sign(diff) * .25
-
-        return keypoints
-
-    def __call__(self, heatmaps, tags, nms_heatmaps=None):
-        ans = self.match(**self.top_k(nms_heatmaps, tags))
-        ans, ans_tags = map(list, zip(*ans))
-
-        if self.do_adjust:
-            ans = self.adjust(ans, heatmaps)
-
-        if self.delta != 0.0:
-            for people in ans:
-                for person in people:
-                    for joint in person:
-                        joint[:2] += self.delta
-
-        ans = ans[0]
-        scores = np.asarray([i[:, 2].mean() for i in ans])
-
-        if self.do_refine:
-            heatmap_numpy = heatmaps[0]
-            tag_numpy = tags[0]
-            for i in range(len(ans)):
-                ans[i] = self.refine(heatmap_numpy, tag_numpy, ans[i], ans_tags[0][i])
-
-        return ans, scores
+from .decoder_ae import AssociativeEmbeddingDecoder
+from .decoder_openpose import OpenPoseDecoder
```
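
The decoder itself is unchanged by this commit, only relocated to `decoder_ae.py`. For orientation, a minimal driving sketch inferred from the removed code's signatures; the flat import path, shapes, and random inputs are illustrative only (the demo feeds NMS-suppressed heatmaps rather than reusing the raw ones):

```python
import numpy as np
from decoder_ae import AssociativeEmbeddingDecoder  # assumed import path

decoder = AssociativeEmbeddingDecoder(
    num_joints=17, max_num_people=30, detection_threshold=0.1,
    use_detection_val=True, ignore_too_much=False, tag_threshold=1.0)

# Shapes follow top_k(): batch N=1, K=17 keypoints, H x W heatmaps;
# tags carries one embedding channel per keypoint.
heatmaps = np.random.rand(1, 17, 128, 128).astype(np.float32)
tags = np.random.rand(1, 17, 128, 128).astype(np.float32)

poses, scores = decoder(heatmaps, tags, nms_heatmaps=heatmaps)
print(poses.shape)   # (num_people, 17, 4): x, y, score, tag
print(scores.shape)  # (num_people,)
```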
