-
Notifications
You must be signed in to change notification settings - Fork 0
/
makePair.m
executable file
·145 lines (127 loc) · 4.64 KB
/
makePair.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
function tmplist = makePair()
%MAKEPAIR Build image/caption pairs for one piece of content.
%   tmplist = makePair() runs the 'configure' script, parses the SMI
%   subtitle file of the content named by the global CONTENT_NAME, writes
%   one caption line per subtitle entry to
%   <content_path_prefix>/Preprocessing/captions/<content_name>_caption.txt
%   and saves an N-by-2 cell array PAIR ({image, words} per row) to
%   <pair_path>/pair_<content_name>.mat.
%
%   The variables pair_path, smi_path, content_path_prefix and image_path
%   are not defined here; presumably they are created by the 'configure'
%   script - confirm against that script.
%
%   Returns:
%     tmplist - a 10000-by-1 vector of zeros (never modified here).
%
%   Raises:
%     An error when the caption file cannot be opened for writing; errors
%     from the helper functions, imread and save propagate unchanged.

    run 'configure';
    tmplist = zeros(10000, 1);

    %% make directory for saving
    % Guarded so that re-runs do not emit "directory already exists" warnings.
    if ~exist(pair_path, 'dir')
        mkdir(pair_path);
    end

    global content_name;

    %% load smi for this content
    smi_file_path = [smi_path '\' content_name '.smi'];
    text_info = extractTextFromSMI(smi_file_path);

    %% caption directory and file name
    caption_dir = [content_path_prefix '/Preprocessing/captions'];
    if ~exist(caption_dir, 'dir')
        mkdir(caption_dir);
    end
    caption_name = sprintf('%s/%s_caption.txt', caption_dir, content_name);
    file_id = fopen(caption_name, 'w');
    if file_id == -1
        error('makePair:captionOpenFailed', ...
              'Cannot open caption file for writing: %s', caption_name);
    end
    % Close the caption file when the function exits, including when
    % imread or save throws part-way through.
    close_caption = onCleanup(@() fclose(file_id));

    %% get number of pairs
    pair_num = size(text_info, 2);

    %% make space for the pairs
    pair = [];

    %% insert text and img
    img_common_path = [image_path '/'];
    for j = 1:pair_num
        %% erase bracketed annotations
        text = removeBrace(text_info(j).text);
        split_str = splitStn(text);

        %% uncontract words
        str = [];
        for k = 1:size(split_str, 2)
            str = [str, unContract(split_str{1, k})'];
        end

        %% remove functional words
        str = removeFuncitonalWord(str);

        %% bind a collection of words
        str = bindWord(str);

        %% make caption (a line is written even when no words remain)
        for k = 1:size(str, 2)
            fprintf(file_id, '%s ', str{k});
        end
        fprintf(file_id, '\n');

        %% make pair
        if ~isempty(str)
            img_path = [img_common_path content_name ...
                        sprintf('_%d', text_info(1, j).frame) '.bmp'];
            img = imread(img_path);
            pair = [pair; {img}, {str}];
        end
    end

    %% save img and text
    % Plain concatenation: the path must not be used as an sprintf format
    % string, otherwise '\' and '%' inside it are read as escape sequences.
    pair_name = [pair_path '/pair_' content_name '.mat'];
    save(pair_name, 'pair', '-v7.3');
end
%% function for erasing a [~] from input text
function outStn = removeBrace(inStn)
%REMOVEBRACE Blank out [ ... ] spans in a character vector.
%   outStn = removeBrace(inStn) pairs the i-th '[' with the i-th ']' and
%   overwrites every character from the opening to the closing bracket
%   (both included) with a space, so the result has the same length as
%   the input.
%
%   Brackets without a partner are left untouched: only as many pairs as
%   there are of the scarcer bracket are processed, so an unbalanced '['
%   no longer indexes past the end of the closing-bracket list.

    open_pos  = find(inStn == '[');
    close_pos = find(inStn == ']');
    pair_count = min(numel(open_pos), numel(close_pos));

    outStn = inStn;
    for p = 1:pair_count
        % An empty range (']' before its '[') leaves the text unchanged.
        outStn(open_pos(p):close_pos(p)) = ' ';
    end
end
%% function for extracting text from smi
function TextInfo = extractTextFromSMI(smi_filename)
%EXTRACTTEXTFROMSMI Read caption text and start values from an SMI file.
%   TextInfo = extractTextFromSMI(smi_filename) reads the file line by
%   line, cuts it at every <sync ...> tag and returns a 1-by-N struct
%   array with the fields
%     frame - number parsed from the start=... attribute of the sync tag
%     text  - lower-cased caption text with markup and punctuation removed
%   Everything before the first sync tag is discarded. A 0-by-0 struct
%   array with the same fields is returned when no caption is found.
%
%   Raises:
%     An error ('No such a file.') when smi_filename is not an existing
%     file.

    TextInfo = struct('frame', {}, 'text', {});

    if exist(smi_filename, 'file') ~= 2, error('No such a file.'); end
    strs = textread(smi_filename,'%s', 'delimiter', '\n');
    line_count = length(strs);
    if line_count == 0, return; end

    % Pre-fill so that empty lines behave like lines without a sync tag
    % and with empty text, instead of leaving unassigned cells behind.
    matchstr = cell(1, line_count);
    splitstr = repmat({{''}}, 1, line_count);
    for i=1:line_count
        one_line = strs{i};
        if isempty(one_line), continue; end
        [matchstr{i}, splitstr{i}] = regexpi(one_line,'<sync [^>\n]*>','match', 'split');
    end

    % Walk the lines and gather one text chunk per sync tag. Lines without
    % a sync tag are appended to the current chunk, joined with '<br>'.
    i=2; k=1;
    comp_str{1} = splitstr{1};
    while i < length(strs)
        while isempty(matchstr{i}) && i < length(strs)
            comp_str{k} = strcat(comp_str{k}, '<br>', splitstr{i});
            i=i+1;
        end
        % Text in front of the sync tag still belongs to the current chunk.
        comp_str{k} = strcat(comp_str{k}, '<br>', splitstr{i}{1});
        if i==length(strs), break;
        else
            % Open a new chunk: remember the start value and the text that
            % follows the first sync tag on this line.
            k=k+1;
            frame_str{k} = regexpi(matchstr{i},'(?<=sync [^>\n]*start[ ]*=[ ]*)\d+','match');
            comp_str{k} = splitstr{i}{2};
            i=i+1;
        end
    end
    % Chunk 1 is whatever preceded the first sync tag; it is not a caption.
    comp_str{1} = ' ';

    % Ordered clean-up rules: {pattern, replacement}. Order matters (e.g.
    % '...' has to be removed before single dots are turned into spaces).
    rules = { ...
        '<br>',      ' '; ...
        '<BR>',      ' '; ...
        '��',        '';  ... % NOTE(review): literal looks mis-encoded - confirm the intended characters
        '-lt',       '';  ...
        '-',         ' '; ...
        '"',         '';  ...
        '\?',        ' '; ... % escaped: a bare '?' is a quantifier, not a literal
        '!',         ' '; ...
        '~',         ' '; ...
        '\.\.\.',    '';  ... % every dot escaped: '\...' also matched a dot plus ANY two characters
        '\.',        ' '; ... % no dot survives this rule, so no later dot rules are needed
        '\,',        ' '; ...
        '������',    '';  ... % NOTE(review): literal looks mis-encoded - confirm the intended characters
        ' ',         ' '; ... % NOTE(review): replaces a space with a space as written - confirm intent
        '''ll',      ' will'; ...
        '''m',       ' am'; ...
        '''re',      ' are'; ...
        '''ve',      ' have'; ...
        'n''t',      ' not'; ...
        '<[^\n]*?>|[ ]*&[ \w;]*|<!--[^\e]*-->', '' ...
    };

    refined_str = cell(1, length(comp_str));
    for k=1:length(comp_str)
        cleaned = comp_str{k};
        for r = 1:size(rules, 1)
            cleaned = regexprep(cleaned, rules{r, 1}, rules{r, 2});
        end
        refined_str{k} = lower(cleaned);
    end

    % Emit one entry per non-empty chunk, skipping chunk 1.
    i=1;
    for k=2:length(comp_str)
        if isempty(refined_str{k}), continue; end
        TextInfo(i).frame = str2double(frame_str{k}{1});
        if iscell(refined_str{k})
            % strcat yields a 1-by-1 cell when one of its inputs was a cell.
            TextInfo(i).text = refined_str{k}{1};
        else
            TextInfo(i).text = refined_str{k};
        end
        i=i+1;
    end
end