-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvideo_caption.py
124 lines (106 loc) · 4.87 KB
/
video_caption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import cv2
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
from openai import OpenAI
import os
import base64
import requests
# Narration instructions for the vision model: asks for newline-separated,
# one-clause-per-line descriptions of what a clip's frames show.
# NOTE(review): nothing in this file's visible functions reads PROMPT directly;
# presumably callers pass it as the `prompt` argument -- confirm.
# The text is sent verbatim at runtime, so it is kept as written; only the stray
# fourth quote that made the string start with a literal "'" has been removed.
PROMPT = '''
Given a sequence of image which are selected frames from a clip within a video. You are tasked to provide
a comprehensive list of narrations describing what is happening in that clip. You are output should consists
of a list of sentencce describing what the segments of the clips captures and can tell you about the video. For example
if several segments describe someone opening a jar and pouring the content it into a bowl. Your output should be
of the form: "person A opens the jar.\nPerson A places the lid and the open jar on the side of the table.\nPerson A grabs
a bowl from the cabinet behind him.\nPerson A places the bowl onto the counter.\nPerson A pour the content of the jar into
the bowl". Each line of narration should be a independent clause that highlight the subject that performs an action and the
action performed by the subject. Note if they are multiple suject, you may use person A or B. When you see an action is performed
with something but are unsure what the thing is, you may refer to it as #unsure. When outputting your replies, you do not
need to output any description to what you are going to output, your chain of thought, or any report of anomalies; plain text out is fine.
'''
class Video:
    """Wrapper around ``cv2.VideoCapture`` for grabbing frames by timestamp.

    Attributes set by ``__init__``:
        path: The path the capture was opened with.
        video: The ``cv2.VideoCapture`` handle (``None`` if it could not be created).
        fps: Frames per second as reported by OpenCV, truncated to ``int``.
        total_frame: Frame count as reported by OpenCV.
        total_time: Duration in seconds (``0.0`` when the metadata is unusable).
    """

    def __init__(self, path):
        """Open the video at *path* and cache its frame rate, frame count and duration.

        The constructor does not raise on an unreadable video; it prints a
        message and leaves ``total_time`` at ``0.0`` so that ``save_frames``
        later rejects every range with a ``ValueError`` instead of failing on
        a missing attribute.

        Args:
            path: Path (or anything ``cv2.VideoCapture`` accepts) of the video.
        """
        self.path = path
        # Defaults first, so the instance is always fully initialised.
        self.video = None
        self.fps = 0
        self.total_frame = 0
        self.total_time = 0.0
        try:
            self.video = cv2.VideoCapture(self.path)
            raw_fps = self.video.get(cv2.CAP_PROP_FPS)
            self.fps = int(raw_fps)
            self.total_frame = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
        except Exception as exc:  # best-effort constructor: report, don't raise
            print(f"An error has occurred while opening {path!r}: {exc}")
            return
        if not self.video.isOpened() or raw_fps <= 0:
            print(f"An error has occurred: could not read video metadata from {path!r}")
            return
        # Use the untruncated frame rate: dividing by int(29.97) == 29 would
        # overstate the duration (and int() of a sub-1 fps value would be 0).
        self.total_time = self.total_frame / raw_fps

    def _read_frame(self, timestamp):
        """Seek to *timestamp* (seconds) and return the decoded frame, or ``None``.

        Prints a message when no frame can be read at that position.
        """
        self.video.set(cv2.CAP_PROP_POS_MSEC, timestamp * 1000)
        ret, frame = self.video.read()
        if not ret:
            print("Failed to retrieve frame at the specified time.")
            return None
        return frame

    @staticmethod
    def _frame_timestamps(start_t, end_t, k):
        """Return *k* evenly spaced timestamps starting at *start_t*, before *end_t*.

        The step is kept fractional so short ranges do not collapse onto a
        single timestamp. A whole-number step is converted to ``int`` so that
        integer inputs keep producing integer timestamps (and therefore file
        names such as ``5.jpeg`` rather than ``5.0.jpeg``).
        """
        step = (end_t - start_t) / k
        if float(step).is_integer():
            step = int(step)
        return [round(start_t + i * step, 2) for i in range(k)]

    def get_frame(self, timestamp, save=False, path=None):
        """Fetch the frame at *timestamp* seconds and return or save it.

        Args:
            timestamp: Position in the video, in seconds.
            save: When false, the frame is returned. When true, the frame is
                written to ``<path>/<timestamp>.jpeg`` and ``None`` is returned.
            path: Target directory; required when *save* is true.

        Returns:
            The frame as returned by ``VideoCapture.read`` when *save* is
            false and the read succeeded; otherwise ``None``.
        """
        frame = self._read_frame(timestamp)
        if frame is None:
            return None
        if not save:
            return frame
        try:
            # imwrite signals most failures by returning False, not by raising.
            written = cv2.imwrite(os.path.join(path, f"{timestamp}.jpeg"), frame)
        except Exception:  # saving is best-effort (e.g. path=None, bad extension)
            written = False
        if not written:
            print("Failed to save")
        return None

    def show_frame(self, timestamp):
        """Display the frame at *timestamp* seconds with matplotlib."""
        frame = self._read_frame(timestamp)
        if frame is None:
            return
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        plt.axis("off")
        plt.show()

    def save_frames(self, start_t, end_t, k, path):
        """Save *k* evenly spaced frames from ``[start_t, end_t)`` into *path*.

        Args:
            start_t: Range start, in seconds.
            end_t: Range end, in seconds; must be greater than *start_t* and
                not beyond ``total_time``.
            k: Number of frames to save; truncated to ``int``, must be >= 1.
            path: Existing directory that receives ``<timestamp>.jpeg`` files.

        Raises:
            ValueError: If the time range is empty or out of bounds, or *k*
                is smaller than 1.
        """
        if end_t <= start_t or end_t > self.total_time:
            raise ValueError(
                f"ERROR: total time: {self.total_time} start time: {start_t} end time: {end_t}"
            )
        frame_count = int(k)  # callers may pass a float duration as k
        if frame_count < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        print(f"Total Time: {self.total_time}, start time {start_t}, end time {end_t}")
        timestamps = self._frame_timestamps(start_t, end_t, frame_count)
        for ts in tqdm(timestamps, desc=f"Saving {frame_count} frames to {path}", leave=False):
            self.get_frame(ts, save=True, path=path)

    def del_video(self):
        """Release the underlying capture handle, if one was created."""
        if self.video is not None:
            self.video.release()
def video_to_clip(annotation, video_path, save_path):
    """Dump sampled frames for every annotated clip of a video.

    For clip ``i`` a directory ``<save_path>/_<i>`` is created and frames
    between the clip's ``start_time`` and ``end_time`` are written into it,
    one frame per whole second of clip length (at least one).

    Args:
        annotation: Int-indexable collection of mappings that each provide
            ``'start_time'`` and ``'end_time'`` (seconds).
        video_path: Path of the source video.
        save_path: Directory under which the per-clip folders are created.

    Raises:
        ValueError: Propagated from ``Video.save_frames`` when a clip's time
            range is empty or beyond the end of the video.
    """
    video = Video(video_path)
    try:
        # Index access (rather than iteration) keeps the original contract for
        # any container that supports len() and integer indexing.
        for i in tqdm(range(len(annotation)), "Processing Clips"):
            clip = annotation[i]
            start_t = clip['start_time']
            end_t = clip['end_time']
            clip_dir = os.path.join(save_path, f"_{i}")
            # Tolerate re-runs and a not-yet-existing save_path.
            os.makedirs(clip_dir, exist_ok=True)
            # The frame count must be an int; a float duration would otherwise
            # reach range() inside save_frames and raise TypeError.
            frame_count = max(1, int(end_t - start_t))
            video.save_frames(start_t, end_t, frame_count, clip_dir)
    finally:
        # Always release the capture handle, even when a clip fails.
        video.del_video()
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
def multi_image_inference(prompt, image_list, key, *, model="gpt-4o",
                          max_tokens=1105, timeout=300):
    """Send a text prompt plus several images to a chat-completions endpoint.

    Each image is read from disk, base64-encoded and attached as a
    ``data:image/jpeg`` URL after the text prompt in a single user message.

    Args:
        prompt: Instruction text placed first in the message content.
        image_list: Iterable of image file paths to attach, in order.
        key: Mapping with ``'api_key'`` (bearer token) and ``'api_base'``
            (base URL that ``/chat/completions`` is appended to).
        model: Model name sent in the request payload.
        max_tokens: Completion token limit sent in the request payload.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot hang the caller forever.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        'Content-Type': 'application/json',
        "Authorization": f"Bearer {key['api_key']}",
    }
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"},
        }
        for path in tqdm(image_list, desc="Encoding Frames", leave=False)
    )
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": content}],
        "max_tokens": max_tokens,
    }
    # Strip a trailing slash so the URL never contains "//chat/completions".
    endpoint = f"{key['api_base'].rstrip('/')}/chat/completions"
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    # Surface API failures (auth, rate limit, ...) as an HTTP error instead of
    # an opaque KeyError on the missing 'choices' field.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']
def video_inference(prompt, clip_folder_path, key):
    """Run ``multi_image_inference`` once per element of *clip_folder_path*.

    Args:
        prompt: Instruction text forwarded unchanged to every request.
        clip_folder_path: Iterable whose elements are each forwarded as the
            ``image_list`` argument (so, despite the name, each element is
            presumably a list of frame paths for one clip -- confirm with callers).
        key: Credentials mapping forwarded unchanged to every request.

    Returns:
        A list with one model reply per element, in iteration order.
    """
    narrations = []
    for frame_paths in tqdm(clip_folder_path):
        reply = multi_image_inference(prompt, frame_paths, key)
        narrations.append(reply)
    return narrations