Skip to content

Commit 4411d07

Browse files
committed
ttml, vtt
1 parent abe3b56 commit 4411d07

File tree

9 files changed

+164
-91
lines changed

9 files changed

+164
-91
lines changed

.gitignore

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11

2-
input/
3-
debug/
4-
output/
2+
input/*
3+
debug/*
4+
output/*
5+
6+
!input/.keep
7+
!debug/.keep
8+
!output/.keep
59

610
# Created by https://www.gitignore.io/api/node,code,linux,macos,textmate
711
# Edit at https://www.gitignore.io/?templates=node,code,linux,macos,textmate

.prettierrc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
singleQuote: true
2+
trailingComma: es5
3+
printWidth: 120
4+
endOfLine: lf

Makefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
.PHONY: output clean
2+
.DEFAULT_GOAL := output
3+
4+
output: input/subs.info.json input/auto-subs.info.json
5+
node src/convert
26

37
input/subs.info.json:
48
youtube-dl --continue --retries 5 --write-info-json \
@@ -40,9 +44,6 @@ input/auto-subs.info.json:
4044
--output "input/auto-subs.%(ext)s" \
4145
"https://www.youtube.com/watch?v=hB7aGnfLB-8"
4246

43-
output: input/subs.info.json input/auto-subs.info.json
44-
node src/convert
45-
4647
clean:
4748
rm -f ./input/*
4849
rm -f ./debug/*

debug/.keep

Whitespace-only changes.

input/.keep

Whitespace-only changes.

output/.keep

Whitespace-only changes.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"license": "ISC",
1111
"dependencies": {
1212
"shortid": "^2.2.15",
13+
"vtt.js": "^0.13.0",
1314
"xml2js": "^0.4.22"
1415
}
1516
}

src/convert.js

Lines changed: 143 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
const fs = require("fs");
2-
const xml2js = require("xml2js");
3-
const shortid = require("shortid");
1+
const fs = require('fs');
2+
const xml2js = require('xml2js');
3+
const vtt = require('vtt.js');
4+
const shortid = require('shortid');
45

5-
const { id: ytid, subtitles } = require("../input/subs.info.json");
6-
const { automatic_captions } = require("../input/auto-subs.info.json");
6+
const { id: ytid, subtitles } = require('../input/subs.info.json');
7+
const { automatic_captions } = require('../input/auto-subs.info.json');
78

89
const DEBUG = true;
910

@@ -16,10 +17,32 @@ const generateID = () => {
1617
return id;
1718
};
1819

20+
/**
 * Fills in timing data that a subtitle format did not provide.
 *
 * For every paragraph:
 *  - a missing (falsy) `end` is taken from the `start` of the paragraph that follows it;
 *  - missing or empty `words` are synthesised by splitting `text` on single spaces and
 *    spreading the paragraph's duration evenly over the resulting pieces.
 *
 * Neither the input array nor its paragraph objects are modified; fresh objects are returned.
 *
 * @param {Array<Object>} paragraphs - paragraphs in playback order; each has `start` and
 *   optionally `end`, `text` and `words`.
 * @returns {Array<Object>} the paragraphs in the same order, each carrying `end` and `words`.
 */
const moreMagic = paragraphs =>
  paragraphs.map((paragraph, position) => {
    const following = paragraphs[position + 1];
    const end = !paragraph.end && following ? following.start : paragraph.end;

    // Word timings supplied by the source format win over synthesised ones.
    if (paragraph.words && paragraph.words.length > 0) return { ...paragraph, end };

    // The final paragraph may have no known end. Treat it as zero-length so the
    // synthesised word timings stay numeric instead of turning into NaN.
    const duration = Number.isFinite(end) ? end - paragraph.start : 0;
    const pieces = typeof paragraph.text === 'string' ? paragraph.text.split(' ') : [];

    const words = [];
    // Character position of the current piece inside `text` (pieces are separated by one space).
    let offset = 0;
    for (const [index, text] of pieces.entries()) {
      words.push({
        id: generateID(),
        start: paragraph.start + Math.floor((index * duration) / pieces.length),
        end: paragraph.start + Math.floor(((index + 1) * duration) / pieces.length),
        offset,
        length: text.length,
        text,
      });
      offset += text.length + 1;
    }

    return { ...paragraph, end, words };
  });
41+
1942
const main = () => {
2043
const queue = [
21-
["subs", subtitles],
22-
["auto-subs", automatic_captions]
44+
['subs', subtitles],
45+
['auto-subs', automatic_captions],
2346
]
2447
.reduce(
2548
(acc, [type, subs]) => [
@@ -31,128 +54,163 @@ const main = () => {
3154
lang,
3255
ext,
3356
type,
34-
file: `${type}.${lang}.${ext}`
35-
}))
57+
file: `${type}.${lang}.${ext}`,
58+
})),
3659
],
3760
[]
38-
)
61+
),
3962
],
4063
[]
4164
)
4265
.filter(({ file }) => fs.existsSync(`./input/${file}`));
43-
// .filter(({ lang }) => lang === "ro");
66+
// .filter(({ lang }) => lang === 'ro');
4467
// .slice(0, 1);
4568

46-
console.log(JSON.stringify(queue, null, 2));
69+
DEBUG && console.log(JSON.stringify(queue, null, 2));
4770

4871
queue.forEach(({ lang, type, ext, file }) => {
4972
switch (ext) {
50-
case "srv3":
73+
case 'srv3':
5174
convertSRV3(file, lang, type);
5275
break;
53-
76+
case 'ttml':
77+
convertTTML(file, lang, type);
78+
break;
79+
case 'vtt':
80+
convertVTT(file, lang, type);
81+
break;
5482
default:
5583
console.warn(`Unknown format ${ext}`);
5684
break;
5785
}
5886
});
5987
};
6088

61-
const convertSRV3 = async (file, lang, type) => {
62-
const data = await xml2js.parseStringPromise(
63-
fs.readFileSync(`./input/${file}`, { encoding: "utf-8" }),
89+
/**
 * Converts a WebVTT subtitle file from `./input` into a transcript JSON file in `./output`.
 *
 * Each cue becomes one paragraph with a start time in whole milliseconds. Paragraph ends and
 * per-word timings are filled in by `moreMagic`; only the final cue carries its own `end`,
 * because there is no following cue to derive it from.
 *
 * @param {string} file - file name inside `./input` (also used to name the debug/output files).
 * @param {string} lang - language code stored on the transcript.
 * @param {string} type - subtitle kind; accepted for signature parity with the other converters, unused here.
 * @returns {Promise<void>} resolves once the output file has been written.
 */
const convertVTT = async (file, lang, type) => {
  // NOTE(review): presumably vtt.js reads `navigator.userAgent` and needs this shim under Node — confirm.
  // Only install the shim when the runtime does not already provide a `navigator` global.
  if (typeof global.navigator === 'undefined') global.navigator = { userAgent: '' };

  const cues = [];
  const parser = new vtt.WebVTT.Parser(
    {
      VTTCue: vtt.VTTCue,
      VTTRegion: vtt.VTTRegion,
    },
    vtt.WebVTT.StringDecoder()
  );
  parser.oncue = cue => cues.push(cue);

  parser.parse(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }));
  parser.flush();

  if (DEBUG) fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(cues, null, 2), 'utf8');

  // Seconds -> whole milliseconds; rounding removes float noise such as 1000.9999999999999.
  const toMilliseconds = seconds => Math.round(parseFloat(seconds) * 1e3);
  const lastIndex = cues.length - 1;

  const paragraphs = moreMagic(
    cues.map(({ startTime, endTime, text }, index) => {
      const paragraph = {
        id: generateID(),
        start: toMilliseconds(startTime),
        text: text.trim(),
      };

      if (index === lastIndex) {
        const end = toMilliseconds(endTime);
        if (Number.isFinite(end)) paragraph.end = end;
      }

      return paragraph;
    })
  );

  if (DEBUG) fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
130+
131+
/**
 * Converts a TTML subtitle file from `./input` into a transcript JSON file in `./output`.
 *
 * Every `<p>` under `tt > body > div` becomes one paragraph whose start is the `begin`
 * attribute converted to whole milliseconds. Paragraph ends and per-word timings are filled
 * in by `moreMagic`; only the final paragraph takes its own `end` attribute (when present),
 * because there is no following paragraph to derive it from.
 *
 * @param {string} file - file name inside `./input` (also used to name the debug/output files).
 * @param {string} lang - language code stored on the transcript.
 * @param {string} type - subtitle kind; accepted for signature parity with the other converters, unused here.
 * @returns {Promise<void>} resolves once the output file has been written.
 */
const convertTTML = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  if (DEBUG) fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  // `hh:mm:ss(.fraction)` -> whole milliseconds. The parts are converted to numbers first:
  // adding the raw `ss` string would concatenate ("60" + "02.5") instead of summing.
  const toMilliseconds = clock => {
    const [hh, mm, ss] = clock.split(':').map(Number);
    return Math.round((hh * 3600 + mm * 60 + ss) * 1e3);
  };

  const entries = data.tt.body[0].div[0].p;
  const lastIndex = entries.length - 1;

  const paragraphs = moreMagic(
    // `text` is absent when a <p> has no character data; default it so later splitting cannot throw.
    entries.map(({ attrs: { begin, end }, text = '' }, index) => {
      const paragraph = {
        id: generateID(),
        start: toMilliseconds(begin),
        text,
      };

      if (index === lastIndex && typeof end === 'string') {
        const endMs = toMilliseconds(end);
        if (Number.isFinite(endMs)) paragraph.end = endMs;
      }

      return paragraph;
    })
  );

  if (DEBUG) fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
164+
165+
/**
 * Converts a YouTube SRV3 subtitle file from `./input` into a transcript JSON file in `./output`.
 *
 * Every `<p>` under `timedtext > body` that has `<s>` children or text becomes one paragraph.
 * `<s>` children become words: start is the paragraph `t` plus the word's own `t`, and end is
 * that start plus the word's `ac`. Paragraphs without words get `end: null` and are completed
 * by `moreMagic`.
 *
 * @param {string} file - file name inside `./input` (also used to name the debug/output files).
 * @param {string} lang - language code stored on the transcript.
 * @param {string} type - subtitle kind; accepted for signature parity with the other converters, unused here.
 * @returns {Promise<void>} resolves once the output file has been written.
 */
const convertSRV3 = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  if (DEBUG) fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  // xml2js hands back an attribute-less, text-only element as a bare string and omits
  // `attrs` when there are no attributes; bring every <s> entry to one shape.
  const normalizeSegment = segment => (typeof segment === 'string' ? { text: segment } : segment);

  const paragraphs = moreMagic(
    data.timedtext.body[0].p
      .filter(({ s, text }) => !!s || !!text)
      .map(({ attrs: { t }, s = [], text }) => {
        const start = parseInt(t, 10);

        const words = [];
        // Character position of the current word inside the space-joined paragraph text.
        let offset = 0;
        for (const segment of s) {
          const { attrs: { t: delay = 0, ac = 0 } = {}, text: wordText = '' } = normalizeSegment(segment);
          const wordStart = start + parseInt(delay, 10);

          words.push({
            id: generateID(),
            start: wordStart,
            end: wordStart + parseInt(ac, 10),
            offset,
            length: wordText.length,
            text: wordText,
          });
          offset += wordText.length + 1;
        }

        return {
          id: generateID(),
          start,
          end: words.length > 0 ? words[words.length - 1].end : null,
          text: text ? text : words.map(({ text }) => text).join(' '),
          words,
        };
      })
  );

  if (DEBUG) fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
157215

158216
main();

yarn.lock

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@ util.promisify@~1.0.0:
134134
define-properties "^1.1.2"
135135
object.getownpropertydescriptors "^2.0.3"
136136

137+
vtt.js@^0.13.0:
138+
version "0.13.0"
139+
resolved "https://registry.yarnpkg.com/vtt.js/-/vtt.js-0.13.0.tgz#955c667b34d5325b2012cb9e8ba9bad6e0b11ff8"
140+
integrity sha1-lVxmezTVMlsgEsuei6m61uCxH/g=
141+
137142
xml2js@^0.4.22:
138143
version "0.4.22"
139144
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.22.tgz#4fa2d846ec803237de86f30aa9b5f70b6600de02"

0 commit comments

Comments
 (0)