-
Notifications
You must be signed in to change notification settings - Fork 20
/
story_loader.py
151 lines (120 loc) · 4.95 KB
/
story_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""MovieQA - Story Understanding Benchmark.
Story loaders for reading plots, subtitles, DVS, etc.
http://movieqa.cs.toronto.edu/
Release: v1.0
Date: 18 Nov 2015
"""
import os
import re
import pysrt
import config
# Base directory that story file paths are joined onto (via os.path.join)
# before they are checked for existence and read.
PKG = config.PACKAGE_DIRECTORY
# NOTE: patterns are raw strings so that "\d" and "\{" reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.

# "<digits>-<digits>" index at the very start of a DVS line.
dvs_rep = re.compile(r'^\d+-\d+')
# Shortest "{...}" span inside a DVS line.
dvs_cur = re.compile(r'\{.+?\}')
# Shortest "<...>" span, i.e. markup tags embedded in subtitle/DVS text.
quote_matches = re.compile(r'<.+?>')
class StoryLoader(object):
    """Data loader class.

    Reads the textual story sources of movies (plots, split plots, subtitles,
    DVS and scripts) and returns each story as a list of strings. The public
    entry point is ``load_story``.
    """

    def __init__(self):
        # Story types accepted by load_story().
        self.available_types = ['plot', 'split_plot', 'subtitle', 'dvs', 'script']

    def _check_exists(self, filename):
        """Check that the story file exists and is OK to load.

        Args:
          filename: Path of the story file.

        Returns:
          True if the path exists, False otherwise.
        """
        return os.path.exists(filename)

    def _read_plot(self, plot_filename):
        """Read a plot synopsis file.

        Also used to read split plots, where each line contains one sentence
        of the plot, and scripts.

        Args:
          plot_filename: Path of the text file to read.

        Returns:
          List of the file's lines with surrounding whitespace stripped and
          empty lines dropped.
        """
        with open(plot_filename) as f:
            stripped_lines = (line.strip() for line in f)
            return [line for line in stripped_lines if line]

    def _read_srt_lines(self, srt_filename):
        """Read an SRT file and return its text lines without <...> tags.

        Args:
          srt_filename: Path of the SRT file, decoded as iso-8859-1.

        Returns:
          List of whitespace-stripped text lines (empty lines are kept).
        """
        srt = pysrt.open(srt_filename, encoding='iso-8859-1')
        return [quote_matches.sub('', line.strip()).strip()
                for line in srt.text.split('\n')]

    def _read_subtitle(self, subtitle_filename):
        """Read the subtitle file and output dialogs.

        Consecutive non-empty subtitle lines are merged into one dialog until
        a line ends with one of ``. ! ? : )``; the next line then starts a new
        dialog.

        Args:
          subtitle_filename: Path of the SRT subtitle file.

        Returns:
          List of dialogs, each a single space-joined string.
        """
        dialog_endings = ('.', '!', '?', ':', ')')

        dialogs = []
        start_new_dialog = True
        for line in self._read_srt_lines(subtitle_filename):
            if not line:  # Get rid of empty lines
                continue

            if start_new_dialog:
                dialogs.append([line])
            else:
                dialogs[-1].append(line)

            # The ending of the current line decides where the next line goes.
            start_new_dialog = line.endswith(dialog_endings)

        return [' '.join(parts) for parts in dialogs]

    def _read_dvs(self, dvs_filename):
        """Read a DVS file.

        Removes the leading DVS index (``<digits>-<digits>``) and any ``{...}``
        spans from every line.

        Args:
          dvs_filename: Path of the SRT-formatted DVS file.

        Returns:
          List of cleaned text lines (empty lines are kept).
        """
        cleaned = []
        for line in self._read_srt_lines(dvs_filename):
            line = dvs_rep.sub('', line).strip()
            cleaned.append(dvs_cur.sub('', line).strip())
        return cleaned

    def load_story(self, movies_map, story_type='plot'):
        """Load story files for given set of movies.

        Movies that have no file registered for the requested story type, or
        whose file does not exist on disk, are silently skipped.

        Args:
          movies_map: Dictionary of movie named tuples.
          story_type: 'plot', 'split_plot', 'subtitle', 'dvs', 'script'.

        Returns:
          story: Story for each movie indexed by imdb_key.

        Raises:
          ValueError: If input story type is not supported, or if no story
            could be loaded for any movie.
        """
        # Reader used for each story type.
        # NOTE(review): 'dvs' has always been read with _read_subtitle, and
        # _read_dvs is not called from this class. Kept as-is because switching
        # would change the returned data; confirm which one is intended.
        readers = {
            'plot': self._read_plot,
            'split_plot': self._read_plot,
            'subtitle': self._read_subtitle,
            'dvs': self._read_subtitle,
            'script': self._read_plot,
        }
        read_story = readers.get(story_type)
        if read_story is None:
            # Validate up front so a bad type is reported even for an empty map.
            raise ValueError('Unsupported story type!')

        story = {}
        # items() (not iteritems()) so this runs on both Python 2 and 3.
        for imdb_key, movie in movies_map.items():
            if story_type == 'split_plot':
                # Split plots live at a fixed location derived from the key.
                relative_path = 'story/split_plot/' + imdb_key + '.split.wiki'
            else:
                # The other types store their path on movie.text.<story_type>.
                relative_path = getattr(movie.text, story_type)
                if not relative_path:
                    continue

            filename = os.path.join(PKG, relative_path)
            if not self._check_exists(filename):
                continue

            story[imdb_key] = read_story(filename)

        if not story:
            raise ValueError('Story returned empty!')

        return story