|
| 1 | +// Copyright 2013 The Flutter Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style license that can be |
| 3 | +// found in the LICENSE file. |
| 4 | + |
| 5 | +import 'dart:convert'; |
| 6 | + |
| 7 | +import 'package:html/dom.dart'; |
| 8 | + |
| 9 | +import 'closed_caption_file.dart'; |
| 10 | +import 'package:html/parser.dart' as html_parser; |
| 11 | + |
/// A [ClosedCaptionFile] whose captions are parsed from WebVTT-formatted text.
///
/// See: https://en.wikipedia.org/wiki/WebVTT
class WebVTTCaptionFile extends ClosedCaptionFile {
  /// Creates a caption file by parsing [fileContents] as WebVTT.
  ///
  /// Parsing happens once, in the initializer list, so [captions] always
  /// returns the same list instance.
  /// * See: https://en.wikipedia.org/wiki/WebVTT
  WebVTTCaptionFile(String fileContents)
      : _captions = _parseCaptionsFromWebVTTString(fileContents);

  // Result of parsing the contents handed to the constructor.
  final List<Caption> _captions;

  @override
  List<Caption> get captions {
    return _captions;
  }
}
| 26 | + |
/// Parses the WebVTT-formatted [file] into a list of [Caption]s.
///
/// The file is first split into blank-line-separated blocks by
/// [_readWebVTTFile]. A block is skipped when:
///
///  * it has fewer than two lines (a lone header or timing line, no text),
///  * the first word of its first line is one of the metadata keywords
///    `HEADER`, `NOTE`, `REGION` or `WEBVTT`, or
///  * no valid timing line can be parsed from it.
///
/// A cue may start with an optional identifier line. When that identifier is
/// an integer it becomes the caption number; otherwise numbering continues
/// from the previous caption.
List<Caption> _parseCaptionsFromWebVTTString(String file) {
  final List<Caption> captions = <Caption>[];

  // Block keywords that do not introduce a cue and must be ignored.
  const Set<String> metadata = <String>{'HEADER', 'NOTE', 'REGION', 'WEBVTT'};

  int captionNumber = 1;
  for (final List<String> captionLines in _readWebVTTFile(file)) {
    // `captionLines` holds every non-blank line of one block, e.g.
    //   [
    //     '00:00.000 --> 01:24.000 align:center',
    //     'Introduction',
    //   ]
    // A block with a single line has a header or a timing line but no text.
    if (captionLines.length < 2) {
      continue;
    }

    // Skip metadata blocks.
    final String metadataType = captionLines[0].split(' ')[0];
    if (metadata.contains(metadataType)) {
      continue;
    }

    // Decide whether the cue has an identifier line by checking whether the
    // first line is already a timing line. Counting lines is not enough: a cue
    // without an identifier but with multi-line text also has more than two
    // lines, and must not have its first text line mistaken for the timing.
    final _CaptionRange? firstLineRange =
        _CaptionRange.fromWebVTTString(captionLines[0]);
    final bool hasHeader = firstLineRange == null;

    if (hasHeader) {
      // Identifier + timing line + at least one line of text are required.
      if (captionLines.length < 3) {
        continue;
      }
      final int? parsedCaptionNumber = int.tryParse(captionLines[0]);
      if (parsedCaptionNumber != null) {
        captionNumber = parsedCaptionNumber;
      }
    }

    final _CaptionRange? captionRange = hasHeader
        ? _CaptionRange.fromWebVTTString(captionLines[1])
        : firstLineRange;
    if (captionRange == null) {
      continue;
    }

    final String text = captionLines.sublist(hasHeader ? 2 : 1).join('\n');

    // TODO(cyanglaz): Handle special syntax in VTT captions.
    // https://github.com/flutter/flutter/issues/90007.
    final String textWithoutFormat = _extractTextFromHtml(text);

    captions.add(
      Caption(
        number: captionNumber,
        start: captionRange.start,
        end: captionRange.end,
        text: textWithoutFormat,
      ),
    );
    captionNumber++;
  }

  return captions;
}
| 83 | + |
/// The start and end times of one caption, as read from a cue timing line.
class _CaptionRange {
  _CaptionRange(this.start, this.end);

  /// Time at which the caption starts.
  final Duration start;

  /// Time at which the caption ends.
  final Duration end;

  /// Parses a timing line from a VTT file, for example:
  ///
  ///     00:09.000 --> 00:11.000
  ///
  /// Returns `null` when [line] does not contain two timestamps separated by
  /// the arrow, or when either timestamp fails to parse.
  static _CaptionRange? fromWebVTTString(String line) {
    final RegExp timingPattern =
        RegExp('$_webVTTTimeStamp$_webVTTArrow$_webVTTTimeStamp');

    if (timingPattern.hasMatch(line)) {
      // Everything after the arrow (including any trailing text on the line)
      // is handed to the timestamp parser as the end time.
      final List<String> parts = line.split(_webVTTArrow);
      final Duration? rangeStart = _parseWebVTTTimestamp(parts[0]);
      final Duration? rangeEnd = _parseWebVTTTimestamp(parts[1]);

      if (rangeStart != null && rangeEnd != null) {
        return _CaptionRange(rangeStart, rangeEnd);
      }
    }

    return null;
  }
}
| 113 | + |
// Returns the plain text of `htmlString` with markup removed.
//
// The input is parsed as HTML, the text of the resulting body is parsed as
// HTML a second time, and the text of that second document's root element is
// returned. An empty string is returned when either step yields no element.
String _extractTextFromHtml(String htmlString) {
  final Element? firstPassBody = html_parser.parse(htmlString).body;
  if (firstPassBody == null) {
    return '';
  }

  final Document secondPass = html_parser.parse(firstPassBody.text);
  final Element? root = secondPass.documentElement;
  if (root == null) {
    return '';
  }
  return root.text;
}
| 123 | + |
// Parses a time stamp in a VTT file into a Duration.
//
// Returns `null` if `timestampString` is in an invalid format. This function
// never throws on malformed input: every numeric component is parsed with
// `int.tryParse`, because the format check below is not anchored and therefore
// does not guarantee that each component consists only of digits.
//
// For example:
//
// _parseWebVTTTimestamp('00:01:08.430')
// returns
// Duration(hours: 0, minutes: 1, seconds: 8, milliseconds: 430)
Duration? _parseWebVTTTimestamp(String timestampString) {
  if (!RegExp(_webVTTTimeStamp).hasMatch(timestampString)) {
    return null;
  }

  // The pattern requires a '.', so there are always at least two sections.
  final List<String> dotSections = timestampString.split('.');
  final List<String> timeComponents = dotSections[0].split(':');

  // Validating and parsing the `timestampString`, invalid format will result this method
  // to return `null`. See https://www.w3.org/TR/webvtt1/#webvtt-timestamp for valid
  // WebVTT timestamp format.
  if (timeComponents.length > 3 || timeComponents.length < 2) {
    return null;
  }

  int hours = 0;
  if (timeComponents.length == 3) {
    final String hourString = timeComponents.removeAt(0);
    if (hourString.length < 2) {
      return null;
    }
    final int? parsedHours = int.tryParse(hourString);
    if (parsedHours == null || parsedHours < 0) {
      return null;
    }
    hours = parsedHours;
  }

  final int? minutes = int.tryParse(timeComponents.removeAt(0));
  if (minutes == null || minutes < 0 || minutes > 59) {
    return null;
  }

  final int? seconds = int.tryParse(timeComponents.removeAt(0));
  if (seconds == null || seconds < 0 || seconds > 59) {
    return null;
  }

  // The section after the '.' holds the milliseconds, optionally followed by
  // space-separated cue settings (e.g. `000 align:center`).
  final List<String> millisecondsAndStyles = dotSections[1].split(' ');

  // TODO(cyanglaz): Handle caption styles.
  // https://github.com/flutter/flutter/issues/90009.
  // ```dart
  // if (millisecondsAndStyles.length > 1) {
  //   List<String> styles = millisecondsAndStyles.sublist(1);
  // }
  // ```
  // For a better readable code style, style parsing should happen before
  // calling this method. See: https://github.com/flutter/plugins/pull/2878/files#r713381134.
  final int? milliseconds = int.tryParse(millisecondsAndStyles[0]);
  if (milliseconds == null) {
    return null;
  }

  return Duration(
    hours: hours,
    minutes: minutes,
    seconds: seconds,
    milliseconds: milliseconds,
  );
}
| 184 | + |
// Splits the contents of a VTT file into groups of lines, one group per
// blank-line-separated block (normally one caption each).
//
// A group is closed at every blank line and at the final line of the file, so
// consecutive blank lines produce empty groups. Blank lines themselves are
// never stored in a group.
List<List<String>> _readWebVTTFile(String file) {
  final List<String> lines = const LineSplitter().convert(file);
  final int lastIndex = lines.length - 1;

  final List<List<String>> groups = <List<String>>[];
  List<String> pending = <String>[];

  for (int index = 0; index < lines.length; index++) {
    final String line = lines[index];
    final bool blank = line.trim().isEmpty;

    if (!blank) {
      pending.add(line);
    }

    // Keep accumulating until a separator or the end of the file is reached.
    if (!blank && index != lastIndex) {
      continue;
    }

    groups.add(pending);
    pending = <String>[];
  }

  return groups;
}
| 209 | + |
// Separator between the start and end timestamps on a cue timing line.
const String _webVTTArrow = ' --> ';

// Regular-expression source for a single VTT timestamp, such as `00:09.000`
// or `00:01:08.430`. The pattern carries no `^`/`$` anchors, so it matches a
// timestamp found anywhere inside a longer string.
const String _webVTTTimeStamp = r'(\d+):(\d{2})(:\d{2})?\.(\d{3})';
0 commit comments