Add hopenet model

theSoenke · Jul 14, 2019 · ab870e5 · ab870e5
1 parent 7059d31
commit ab870e5
Show file tree

Hide file tree

Showing 6 changed files with 245 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/README.md b/README.md
@@ -1 +1,5 @@
-# headpose
+# Head Pose Estimation
+
+PyTorch implementation of Hopenet from [Fine-Grained Head Pose Estimation Without Keypoints](https://arxiv.org/abs/1710.00925). Example usage:
+
+    python3 head_pose.py --checkpoint hopenet_robust_alpha1.pkl --image face.png
diff --git a/head_pose.py b/head_pose.py
@@ -0,0 +1,70 @@
+import argparse
+
+import cv2
+import matplotlib.pyplot as plt
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+from hopenet import Hopenet
+from PIL import Image
+from torchvision import transforms
+from visualization import draw_pose
+
+
+class HeadPose():
+    """Head pose estimator built on a pretrained Hopenet model.
+
+    The network emits one set of bin logits per angle. Each set is turned
+    into a continuous angle in degrees by taking the softmax-weighted
+    expectation over the bin indices.
+    """
+
+    def __init__(self, checkpoint, transform=None):
+        """Create the model and load its weights.
+
+        Args:
+            checkpoint: Path or file-like object readable by ``torch.load``
+                that contains a state dict for ``Hopenet``.
+            transform: Optional callable that turns one input image into a
+                3-channel image tensor. When omitted, images are resized and
+                center-cropped to 224 pixels, converted to a tensor and
+                normalized with the mean/std listed below.
+        """
+        self.transform = transform
+        if self.transform is None:
+            self.transform = transforms.Compose([
+                transforms.Resize(224),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+            ])
+
+        num_bins = 66
+        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        # Bin indices 0..num_bins-1, used to compute the expected bin.
+        self.idx_tensor = torch.arange(num_bins, dtype=torch.float32, device=self.device)
+        self.model = Hopenet()
+        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
+        state_dict = torch.load(checkpoint, map_location=self.device)
+        # strict=False tolerates key mismatches between checkpoint and model.
+        # NOTE(review): this also hides genuinely missing weights - confirm
+        # the checkpoint really matches this architecture.
+        self.model.load_state_dict(state_dict, strict=False)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def _prepare(self, image):
+        """Load ``image`` if it is a path, then apply ``self.transform``."""
+        if isinstance(image, str):
+            with Image.open(image) as img_file:
+                # Force 3 channels so grayscale/RGBA files fit the model input.
+                image = img_file.convert("RGB")
+        return self.transform(image)
+
+    def _to_degrees(self, logits):
+        """Convert bin logits of shape (batch, num_bins) to angles in degrees."""
+        probs = F.softmax(logits, dim=1)
+        # Expected bin index, scaled by 3 and shifted by -99 to get degrees.
+        return torch.sum(probs * self.idx_tensor, dim=1) * 3 - 99
+
+    @torch.no_grad()
+    def predict(self, image):
+        """Estimate yaw, pitch and roll for one image or a list of images.
+
+        Args:
+            image: A file path, a single image accepted by the transform, or
+                a non-empty list of paths/images (processed as one batch).
+
+        Returns:
+            A ``(yaw, pitch, roll)`` tuple in degrees. For a single input
+            each entry is a float; for a list input each entry is a list of
+            floats with one value per image.
+
+        Raises:
+            ValueError: If ``image`` is an empty list.
+        """
+        is_batch = isinstance(image, list)
+        if is_batch:
+            if not image:
+                raise ValueError("image list must not be empty")
+            # Stack the per-image tensors; a plain list has no .to().
+            batch = torch.stack([self._prepare(img) for img in image])
+        else:
+            batch = self._prepare(image).unsqueeze(dim=0)
+
+        batch = batch.to(self.device)
+        yaw, pitch, roll = (self._to_degrees(logits) for logits in self.model(batch))
+
+        if is_batch:
+            return yaw.tolist(), pitch.tolist(), roll.tolist()
+        return yaw.item(), pitch.item(), roll.item()
+
+
+if __name__ == "__main__":
+    # Command-line demo: predict the pose for one image and display the
+    # image with the pose axes drawn on top.
+    parser = argparse.ArgumentParser(description="Estimate head pose for a single face image")
+    # Both arguments are mandatory; without them the failure would only
+    # surface later as an obscure loading error.
+    parser.add_argument('--checkpoint', type=str, required=True,
+                        help="path to the Hopenet checkpoint")
+    parser.add_argument('--image', type=str, required=True,
+                        help="path to the face image")
+    args = parser.parse_args()
+
+    head_pose = HeadPose(checkpoint=args.checkpoint)
+    yaw, pitch, roll = head_pose.predict(args.image)
+
+    print("Yaw: %f" % yaw)
+    print("Pitch: %f" % pitch)
+    print("Roll: %f" % roll)
+
+    img = cv2.imread(args.image)
+    if img is None:
+        # cv2.imread signals an unreadable file by returning None.
+        raise SystemExit("Could not read image: %s" % args.image)
+    img = draw_pose(img, yaw, pitch, roll, tdx=200, tdy=200, size=100)
+    # OpenCV stores pixels as BGR while matplotlib expects RGB; convert so
+    # the axis colours are displayed as drawn.
+    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+    plt.show()
diff --git a/hopenet.py b/hopenet.py
@@ -0,0 +1,131 @@
+import math
+
+import torch.nn as nn
+import torchvision
+
+
+class Hopenet(nn.Module):
+    """ResNet backbone with three linear heads for yaw, pitch and roll.
+
+    Each head produces ``num_bins`` logits from the pooled backbone features.
+    """
+
+    def __init__(self, resnet=50, num_bins=66):
+        """Build the backbone and the three output heads.
+
+        Args:
+            resnet: Backbone depth. Only ``50`` is supported.
+            num_bins: Number of logits produced by each head.
+
+        Raises:
+            ValueError: If ``resnet`` is not a supported depth.
+        """
+        super().__init__()
+        # Channel count feeding the next residual stage; updated by _make_layer.
+        self.inplanes = 64
+
+        if resnet == 50:
+            block = torchvision.models.resnet.Bottleneck
+            layers = [3, 4, 6, 3]
+        else:
+            # A bare string cannot be raised; use a proper exception type.
+            raise ValueError("ResNet not supported: %r" % (resnet,))
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+        # A 224x224 input reaches this point as a 7x7 map, which the pool
+        # collapses to 1x1 before the linear heads.
+        self.avgpool = nn.AvgPool2d(7)
+        self.fc_yaw = nn.Linear(512 * block.expansion, num_bins)
+        self.fc_pitch = nn.Linear(512 * block.expansion, num_bins)
+        self.fc_roll = nn.Linear(512 * block.expansion, num_bins)
+
+        # Re-initialize convolution and normalization layers.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Build one residual stage made of ``blocks`` instances of ``block``.
+
+        The first block applies ``stride`` and, when the shape changes, a
+        1x1 convolution + batch norm on the shortcut. ``self.inplanes`` is
+        updated to the stage's output channel count.
+        """
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = [block(self.inplanes, planes, stride, downsample)]
+        self.inplanes = planes * block.expansion
+        layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return the ``(yaw, pitch, roll)`` logits for a batch of images."""
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        # Flatten everything except the batch dimension for the linear heads.
+        x = x.view(x.size(0), -1)
+        pre_yaw = self.fc_yaw(x)
+        pre_pitch = self.fc_pitch(x)
+        pre_roll = self.fc_roll(x)
+
+        return pre_yaw, pre_pitch, pre_roll
+
+
+class ResNet(nn.Module):
+    """ResNet trunk followed by a single linear layer, ``fc_angles``.
+
+    The original note on this class describes it as a ResNet for regression
+    of 3 Euler angles; the output width is whatever ``num_classes`` is.
+    """
+
+    def __init__(self, block, layers, num_classes=1000):
+        """Assemble the stem, four residual stages, pooling and output layer.
+
+        Args:
+            block: Residual block class. It must expose an ``expansion``
+                attribute and accept ``(inplanes, planes, stride, downsample)``.
+            layers: Number of blocks in each of the four stages.
+            num_classes: Number of outputs of ``fc_angles``.
+        """
+        self.inplanes = 64
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+        self.avgpool = nn.AvgPool2d(7)
+        self.fc_angles = nn.Linear(512 * block.expansion, num_classes)
+
+        # Batch norm starts as identity; convolutions get a zero-mean normal
+        # whose std is derived from kernel area times output channels.
+        for module in self.modules():
+            if isinstance(module, nn.BatchNorm2d):
+                module.weight.data.fill_(1)
+                module.bias.data.zero_()
+            elif isinstance(module, nn.Conv2d):
+                kernel_h, kernel_w = module.kernel_size
+                fan_out = kernel_h * kernel_w * module.out_channels
+                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``."""
+        out_channels = planes * block.expansion
+
+        # The shortcut needs a 1x1 projection whenever the first block
+        # changes the spatial size or the channel count.
+        if stride == 1 and self.inplanes == out_channels:
+            downsample = None
+        else:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, out_channels,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_channels),
+            )
+
+        stage = [block(self.inplanes, planes, stride, downsample)]
+        self.inplanes = out_channels
+        stage += [block(self.inplanes, planes) for _ in range(1, blocks)]
+
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        """Run the network and return the ``fc_angles`` output."""
+        pipeline = (
+            self.conv1, self.bn1, self.relu, self.maxpool,
+            self.layer1, self.layer2, self.layer3, self.layer4,
+            self.avgpool,
+        )
+        for stage in pipeline:
+            x = stage(x)
+
+        # Flatten everything except the batch dimension.
+        features = x.view(x.size(0), -1)
+        return self.fc_angles(features)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+torch==1.1.0
+torchvision==0.3.0
+opencv-python
+pillow
+matplotlib
diff --git a/visualization.py b/visualization.py
@@ -0,0 +1,34 @@
+from math import cos, sin
+
+import cv2
+import numpy as np
+
+
+def draw_pose(img, yaw, pitch, roll, tdx=None, tdy=None, size=100):
+    """Draw the three head-pose axes onto ``img`` and return it.
+
+    The lines are drawn directly on ``img`` with ``cv2.line``, so the
+    caller's array is modified.
+
+    Args:
+        img: Image array to draw on; ``img.shape[:2]`` is read as
+            ``(height, width)``.
+        yaw: Yaw angle in degrees (negated internally before projection).
+        pitch: Pitch angle in degrees.
+        roll: Roll angle in degrees.
+        tdx: X coordinate of the axes origin in pixels.
+        tdy: Y coordinate of the axes origin in pixels. If either ``tdx``
+            or ``tdy`` is ``None``, both fall back to the image center.
+        size: Length of each axis in pixels before projection.
+
+    Returns:
+        The same ``img`` object with the axes drawn on it.
+    """
+    # Degrees to radians.
+    pitch = pitch * np.pi / 180
+    yaw = -(yaw * np.pi / 180)
+    roll = roll * np.pi / 180
+
+    # Identity check: "== None" is evaluated element-wise on array-like
+    # arguments and cannot be used as a condition.
+    if tdx is None or tdy is None:
+        height, width = img.shape[:2]
+        tdx = width / 2
+        tdy = height / 2
+
+    # X-Axis pointing to right. drawn in red
+    x1 = size * (cos(yaw) * cos(roll)) + tdx
+    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy
+
+    # Y-Axis | drawn in green
+    #        v
+    x2 = size * (-cos(yaw) * sin(roll)) + tdx
+    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy
+
+    # Z-Axis (out of the screen) drawn in blue
+    x3 = size * (sin(yaw)) + tdx
+    y3 = size * (-cos(yaw) * sin(pitch)) + tdy
+
+    # cv2 needs integer pixel coordinates.
+    origin = (int(tdx), int(tdy))
+    cv2.line(img, origin, (int(x1), int(y1)), (0, 0, 255), 3)
+    cv2.line(img, origin, (int(x2), int(y2)), (0, 255, 0), 3)
+    cv2.line(img, origin, (int(x3), int(y3)), (255, 0, 0), 2)
+
+    return img