# Hand Gesture Recognizer
import copy
import math

import cv2
import numpy as np
import mediapipe as mp
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

# Project directory (currently unused)
path = r"C:\Users\47637\Downloads\HW of DL\DL_Final\Hand Gesture Recognition Mediapipe"

# Initialize MediaPipe hand tracking (single hand, 70% detection confidence)
mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.7)
mpDraw = mp.solutions.drawing_utils

transform = transforms.ToTensor()
def calc_bounding_rect(image, landmarks):
    """Compute a padded bounding box around the detected hand landmarks."""
    image_width, image_height = image.shape[1], image.shape[0]
    landmark_array = np.empty((0, 2), int)
    for landmark in landmarks.landmark:
        landmark_x = min(int(landmark.x * image_width), image_width - 1)
        landmark_y = min(int(landmark.y * image_height), image_height - 1)
        landmark_array = np.append(landmark_array, [np.array((landmark_x, landmark_y))], axis=0)
    x, y, w, h = cv2.boundingRect(landmark_array)
    # Pad the box by 20 px on each side, clamped to the image borders
    return [max(0, x - 20), max(0, y - 20),
            min(x + w + 20, image_width - 1), min(y + h + 20, image_height - 1)]
def reshape(a, l):
    """Downsample image `a` to l x l by block-averaging each channel."""
    newimage = np.ndarray(shape=(l, l, 3))
    height = np.shape(a)[0]
    width = np.shape(a)[1]
    spaceh = math.floor(height / l)
    spacew = math.floor(width / l)
    for k in range(3):
        for i in range(l):
            for j in range(l):
                newimage[i, j, k] = np.average(
                    a[i * spaceh:(i + 1) * spaceh, j * spacew:(j + 1) * spacew, k])
    return newimage
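
# A faster alternative (a sketch, not what the model was trained with):
# cv2.resize with INTER_AREA does area-averaged downsampling, which closely
# approximates the block averaging above, so results may differ slightly.
def reshape_fast(a, l):
    # Assumes a 3-channel image; output shape matches reshape()'s (l, l, 3)
    return cv2.resize(a, (l, l), interpolation=cv2.INTER_AREA)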
# Load the gesture recognizer model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # 16 * 5 * 5 = 400 conv features + 42 landmark values (21 points x 2) = 442
        self.fc1 = nn.Linear(442, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 8)

    def forward(self, x, lm):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = torch.cat((x, lm), 1)  # append the flattened landmarks
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def predict(self, x, lm):
        # Accepts a single unbatched image (C, H, W); adds the batch dimension
        return self.forward(x.unsqueeze(0), lm)

model = Net()
# model.load_state_dict(torch.load("./MyNet.pth"))  # Model 1
model.load_state_dict(torch.load('DL_model.pt'))  # Model 2 (responds faster and is more accurate)
model.eval()
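
# Quick shape sanity check (a hedged sketch with dummy inputs, not real data):
# a 3x28x28 image plus a 1x42 landmark row should yield 8 gesture logits.
with torch.no_grad():
    _logits = model.predict(torch.zeros(3, 28, 28), torch.zeros(1, 42))
    assert _logits.shape == (1, 8)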
# Load class names
classNames = ["STOP","GOOD","YES","LOVE YOU","NO WAY","OKAY","GIMME A SECOND","CALL ME LATER"]
# Initialize the webcam
cap = cv2.VideoCapture(0)
while True:
    # Read each frame from the webcam
    ret, frame = cap.read()
    if not ret:
        break
    frame_height, frame_width, _ = frame.shape

    # Flip the frame horizontally for a mirror view
    frame = cv2.flip(frame, 1)
    debug_image = copy.deepcopy(frame)
    framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Get hand landmark prediction
    result = hands.process(framergb)
    className = ''

    # Post-process the result
    if result.multi_hand_landmarks:
        landmarks = []
        for handslms in result.multi_hand_landmarks:
            brect = calc_bounding_rect(debug_image, handslms)
            for lm in handslms.landmark:
                # Convert normalized landmark coordinates to pixel coordinates
                lmx = int(lm.x * frame_width)
                lmy = int(lm.y * frame_height)
                landmarks.append([lmx, lmy])

            # Draw landmarks on the frame
            mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

            # Flatten the 21 (x, y) landmarks into a 1x42 tensor
            flatLM = torch.tensor(np.float32(np.reshape(landmarks, (1, -1))))

            # Crop the hand region and downsample it to 28x28
            cropped = framergb[brect[1]:brect[3], brect[0]:brect[2], :]
            reshaped = np.uint8(reshape(cropped, 28))

            # Predict the gesture
            prediction = model.predict(transform(reshaped), flatLM)
            classID = np.argmax(prediction.detach().numpy())
            className = classNames[classID]

    # Show the prediction on the frame
    cv2.putText(frame, className, (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                1, (0, 0, 255), 2, cv2.LINE_AA)

    # Show the final output
    cv2.imshow("Output", frame)
    if cv2.waitKey(1) == ord('q'):
        break

# Release the webcam and destroy all active windows
cap.release()
cv2.destroyAllWindows()