1
- const fs = require ( "fs" ) ;
2
- const xml2js = require ( "xml2js" ) ;
3
- const shortid = require ( "shortid" ) ;
1
+ const fs = require ( 'fs' ) ;
2
+ const xml2js = require ( 'xml2js' ) ;
3
+ const vtt = require ( 'vtt.js' ) ;
4
+ const shortid = require ( 'shortid' ) ;
4
5
5
- const { id : ytid , subtitles } = require ( " ../input/subs.info.json" ) ;
6
- const { automatic_captions } = require ( " ../input/auto-subs.info.json" ) ;
6
+ const { id : ytid , subtitles } = require ( ' ../input/subs.info.json' ) ;
7
+ const { automatic_captions } = require ( ' ../input/auto-subs.info.json' ) ;
7
8
8
9
// Debug switch: when true, main() logs the resolved queue and each converter
// writes its intermediate parsed/paragraph JSON under ./debug/.
const DEBUG = true ;
9
10
@@ -16,10 +17,32 @@ const generateID = () => {
16
17
return id ;
17
18
} ;
18
19
20
/**
 * Build evenly timed word entries for a paragraph that has none.
 *
 * The text is split on single spaces and the paragraph's time span is divided
 * equally between the resulting tokens. `offset` is the token's character
 * index inside the paragraph text.
 *
 * @param {{start: number, end?: ?number, text?: string}} p - Paragraph to split.
 * @param {function(): string} makeID - Produces the id of each word.
 * @returns {Array<Object>} One `{id, start, end, offset, length, text}` per token.
 */
const spreadWords = (p, makeID) => {
  // A paragraph without text yields a single empty token instead of throwing.
  const tokens = (typeof p.text === 'string' ? p.text : '').split(' ');

  // Without a usable end the span is unknown: an undefined end would turn every
  // timing into NaN and a null end would make them run backwards to 0, so the
  // words are collapsed onto the paragraph start instead.
  const span = p.end == null ? NaN : p.end - p.start;
  const duration = Number.isFinite(span) ? span : 0;

  // Running character position; replaces re-joining all previous words per token.
  let offset = 0;
  return tokens.map((text, index) => {
    const word = {
      id: makeID(),
      start: p.start + Math.floor((index * duration) / tokens.length),
      end: p.start + Math.floor(((index + 1) * duration) / tokens.length),
      offset,
      length: text.length,
      text,
    };
    offset += text.length + 1; // +1 for the separating space
    return word;
  });
};

/**
 * Fill in missing paragraph end times and word lists.
 *
 * Paragraphs are visited from last to first so that a paragraph with no end
 * can borrow the start of the paragraph that follows it. Paragraphs with no
 * (or empty) `words` get synthetic, evenly spaced words. The paragraph objects
 * themselves are updated in place, but the input array is left in its
 * original order (it is no longer reversed as a side effect).
 *
 * @param {Array<Object>} paragraphs - Paragraphs in chronological order.
 * @param {function(): string} [makeID=generateID] - Id factory for generated words.
 * @returns {Array<Object>} A new array holding the same paragraph objects, in order.
 */
const moreMagic = (paragraphs, makeID = generateID) => {
  const result = new Array(paragraphs.length);
  let following = null;

  for (let i = paragraphs.length - 1; i >= 0; i -= 1) {
    const p = paragraphs[i];
    if (!p.end && following !== null) p.end = following.start;
    if (!p.words || p.words.length === 0) p.words = spreadWords(p, makeID);
    result[i] = p;
    following = p;
  }

  return result;
};
41
+
19
42
const main = ( ) => {
20
43
const queue = [
21
- [ " subs" , subtitles ] ,
22
- [ " auto-subs" , automatic_captions ]
44
+ [ ' subs' , subtitles ] ,
45
+ [ ' auto-subs' , automatic_captions ] ,
23
46
]
24
47
. reduce (
25
48
( acc , [ type , subs ] ) => [
@@ -31,128 +54,163 @@ const main = () => {
31
54
lang,
32
55
ext,
33
56
type,
34
- file : `${ type } .${ lang } .${ ext } `
35
- } ) )
57
+ file : `${ type } .${ lang } .${ ext } ` ,
58
+ } ) ) ,
36
59
] ,
37
60
[ ]
38
- )
61
+ ) ,
39
62
] ,
40
63
[ ]
41
64
)
42
65
. filter ( ( { file } ) => fs . existsSync ( `./input/${ file } ` ) ) ;
43
- // .filter(({ lang }) => lang === "ro" );
66
+ // .filter(({ lang }) => lang === 'ro' );
44
67
// .slice(0, 1);
45
68
46
- console . log ( JSON . stringify ( queue , null , 2 ) ) ;
69
+ DEBUG && console . log ( JSON . stringify ( queue , null , 2 ) ) ;
47
70
48
71
queue . forEach ( ( { lang, type, ext, file } ) => {
49
72
switch ( ext ) {
50
- case " srv3" :
73
+ case ' srv3' :
51
74
convertSRV3 ( file , lang , type ) ;
52
75
break ;
53
-
76
+ case 'ttml' :
77
+ convertTTML ( file , lang , type ) ;
78
+ break ;
79
+ case 'vtt' :
80
+ convertVTT ( file , lang , type ) ;
81
+ break ;
54
82
default :
55
83
console . warn ( `Unknown format ${ ext } ` ) ;
56
84
break ;
57
85
}
58
86
} ) ;
59
87
} ;
60
88
61
- const convertSRV3 = async ( file , lang , type ) => {
62
- const data = await xml2js . parseStringPromise (
63
- fs . readFileSync ( `./input/${ file } ` , { encoding : "utf-8" } ) ,
89
/**
 * Convert a WebVTT file from ./input into a transcript JSON file in ./output.
 *
 * Every cue becomes one paragraph whose start is the cue start in whole
 * milliseconds; end times and words are filled in by moreMagic. When DEBUG is
 * set, the raw cues and the resulting paragraphs are also written to ./debug.
 *
 * @param {string} file - File name inside ./input (also used for the output names).
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Subtitle kind; currently unused by this converter.
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
const convertVTT = async (file, lang, type) => {
  // NOTE(review): presumably vtt.js reads navigator.userAgent, which may not
  // exist under Node - confirm. Only define it when absent so an existing
  // global is not clobbered.
  if (typeof global.navigator === 'undefined') {
    global.navigator = { userAgent: '' };
  }

  const cues = [];

  const parser = new vtt.WebVTT.Parser(
    {
      VTTCue: vtt.VTTCue,
      VTTRegion: vtt.VTTRegion,
    },
    vtt.WebVTT.StringDecoder()
  );

  parser.oncue = cue => cues.push(cue);
  // Report malformed input instead of silently dropping it.
  parser.onparsingerror = error => console.warn(`Error while parsing ${file}`, error);

  parser.parse(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }));
  parser.flush();

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(cues, null, 2), 'utf8');

  // NOTE(review): the cue's endTime is not used; each paragraph's end is taken
  // from the next cue's start by moreMagic - confirm this is intended.
  const paragraphs = moreMagic(
    cues.map(({ startTime, text }) => ({
      id: generateID(),
      // Round so that seconds -> milliseconds never leaves float artefacts
      // (e.g. 0.07 * 1e3 === 70.00000000000001).
      start: Math.round(parseFloat(startTime) * 1e3),
      text: text.trim(),
    }))
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
130
+
131
/**
 * Convert a TTML file from ./input into a transcript JSON file in ./output.
 *
 * Every `<p>` of the first `<div>` in the body that carries text becomes one
 * paragraph whose start is its `begin` attribute in whole milliseconds; end
 * times and words are filled in by moreMagic. When DEBUG is set, the parsed
 * XML and the resulting paragraphs are also written to ./debug.
 *
 * @param {string} file - File name inside ./input (also used for the output names).
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Subtitle kind; currently unused by this converter.
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
const convertTTML = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  const paragraphs = moreMagic(
    data.tt.body[0].div[0].p
      // Skip paragraphs without text (as convertSRV3 does); they have nothing
      // to split into words.
      .filter(({ text }) => !!text)
      .map(({ attrs: { begin }, text }) => {
        // The parts must be converted to numbers before adding: with string
        // parts `hh * 3600 + mm * 60 + ss` concatenates the seconds onto the
        // sum, e.g. "00:01:02.500" became 6002500 ms instead of 62500 ms.
        // NOTE(review): only the hh:mm:ss(.fff) form is handled; any other
        // time expression yields NaN - confirm the inputs never use one.
        const [hh, mm, ss] = begin.split(':').map(Number);
        const start = Math.round((hh * 3600 + mm * 60 + ss) * 1e3);

        return {
          id: generateID(),
          start,
          text,
        };
      })
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
164
+
165
/**
 * Convert an SRV3 (timedtext XML) file from ./input into a transcript JSON
 * file in ./output.
 *
 * Every `<p>` that has `<s>` children or text becomes one paragraph. Its `<s>`
 * children become words timed relative to the paragraph start; paragraphs
 * without words get their end and words from moreMagic. When DEBUG is set, the
 * parsed XML and the resulting paragraphs are also written to ./debug.
 *
 * @param {string} file - File name inside ./input (also used for the output names).
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Subtitle kind; currently unused by this converter.
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
const convertSRV3 = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  const paragraphs = moreMagic(
    data.timedtext.body[0].p
      .filter(({ s, text }) => !!s || !!text)
      .map(({ attrs: { t }, s = [], text }) => {
        const start = parseInt(t, 10);

        // Running character position of the next word inside the joined text.
        let offset = 0;
        const words = s.map(segment => {
          // An <s> without attributes is parsed either as a bare string or as
          // an object lacking `attrs`; normalise both so destructuring cannot throw.
          const { attrs: { t: delta = 0, ac = 0 } = {}, text: wordText = '' } =
            typeof segment === 'string' ? { text: segment } : segment;

          // NOTE(review): `ac` is used as the word's duration here - confirm
          // against the srv3 format that this is what the attribute means.
          const wordStart = start + parseInt(delta, 10);
          const word = {
            id: generateID(),
            start: wordStart,
            end: wordStart + parseInt(ac, 10),
            offset,
            length: wordText.length,
            text: wordText,
          };
          offset += wordText.length + 1; // +1 for the joining space
          return word;
        });

        return {
          id: generateID(),
          start,
          // null marks "unknown"; moreMagic then borrows the next paragraph's start.
          end: words.length > 0 ? words[words.length - 1].end : null,
          text: text ? text : words.map(({ text }) => text).join(' '),
          words,
        };
      })
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};
157
215
158
216
main ( ) ;
0 commit comments