From fe416b437c28cd6cf383248b90005a2d516549f2 Mon Sep 17 00:00:00 2001
From: Stephen Macke
Date: Fri, 29 Jan 2021 22:33:25 -0800
Subject: [PATCH] misc sync improvements (see HISTORY.rst for 0.4.11)

---
 HISTORY.rst                      |  7 ++++
 ffsubsync/constants.py           |  1 +
 ffsubsync/ffsubsync.py           | 25 +++++++-----
 ffsubsync/speech_transformers.py | 65 ++++++++++++++++++--------------
 requirements-dev.txt             |  1 +
 test-data                        |  2 +-
 6 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/HISTORY.rst b/HISTORY.rst
index af2231c..6f8555f 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -179,3 +179,10 @@ History
 * Filter out metadata in subtitles when extracting speech;
 * Add experimental --golden-section-search over framerate ratio (off by default);
 * Try to improve sync by inferring framerate ratio based on relative duration of synced vs unsynced;
+
+0.4.11 (2021-01-29)
+-------------------
+* Misc sync improvements:
+    * Have webrtcvad use '0' as the non speech label instead of 0.5;
+    * Allow the vad non speech label to be specified via the --non-speech-label command line parameter;
+    * Don't try to infer framerate ratio based on length between first and last speech frames for non-subtitle speech detection;
diff --git a/ffsubsync/constants.py b/ffsubsync/constants.py
index 0e4d927..ef4a026 100644
--- a/ffsubsync/constants.py
+++ b/ffsubsync/constants.py
@@ -6,6 +6,7 @@
 FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
 
 DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
 DEFAULT_ENCODING = 'infer'
 DEFAULT_MAX_SUBTITLE_SECONDS = 10
 DEFAULT_START_SECONDS = 0
diff --git a/ffsubsync/ffsubsync.py b/ffsubsync/ffsubsync.py
index 1d3e5ec..9a79cd9 100755
--- a/ffsubsync/ffsubsync.py
+++ b/ffsubsync/ffsubsync.py
@@ -116,7 +116,7 @@ def try_sync(args, reference_pipe, result):
                 continue
             else:
                 srt_pipe.fit(srtin)
-        if not args.skip_infer_framerate_ratio:
+        if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
             inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
             logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
             srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
@@ -185,7 +185,7 @@ def make_reference_pipe(args):
         if args.vad is not None:
             logger.warning('Vad specified, but reference was not a movie')
         return Pipeline([
-            ('deserialize', DeserializeSpeechTransformer())
+            ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
         ])
     else:
         vad = args.vad or DEFAULT_VAD
@@ -195,14 +195,17 @@ def make_reference_pipe(args):
         if ref_stream is not None and not ref_stream.startswith('0:'):
             ref_stream = '0:' + ref_stream
         return Pipeline([
-            ('speech_extract', VideoSpeechTransformer(vad=vad,
-                                                      sample_rate=SAMPLE_RATE,
-                                                      frame_rate=args.frame_rate,
-                                                      start_seconds=args.start_seconds,
-                                                      ffmpeg_path=args.ffmpeg_path,
-                                                      ref_stream=ref_stream,
-                                                      vlc_mode=args.vlc_mode,
-                                                      gui_mode=args.gui_mode))
+            ('speech_extract', VideoSpeechTransformer(
+                vad=vad,
+                sample_rate=SAMPLE_RATE,
+                frame_rate=args.frame_rate,
+                non_speech_label=args.non_speech_label,
+                start_seconds=args.start_seconds,
+                ffmpeg_path=args.ffmpeg_path,
+                ref_stream=ref_stream,
+                vlc_mode=args.vlc_mode,
+                gui_mode=args.gui_mode
+            )),
         ])
 
 
@@ -392,6 +395,8 @@ def add_cli_only_args(parser):
                         help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
     parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
                         help='If set, do not try to infer framerate ratio based on duration ratio.')
+    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
     parser.add_argument('--output-encoding', default='utf-8',
                         help='What encoding to use for writing output subtitles '
                              '(default=utf-8). Can indicate "same" to use same '
diff --git a/ffsubsync/speech_transformers.py b/ffsubsync/speech_transformers.py
index 15ac6bf..5ab7f33 100644
--- a/ffsubsync/speech_transformers.py
+++ b/ffsubsync/speech_transformers.py
@@ -59,7 +59,7 @@ def subpipe_maker(framerate_ratio):
     return subpipe_maker(scale_factor)
 
 
-def _make_auditok_detector(sample_rate, frame_rate):
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
     try:
         from auditok import \
             BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -76,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
     bytes_per_frame = 2
     frames_per_window = frame_rate // sample_rate
     validator = AudioEnergyValidator(
-        sample_width=bytes_per_frame, energy_threshold=50)
+        sample_width=bytes_per_frame, energy_threshold=50
+    )
     tokenizer = StreamTokenizer(
-        validator=validator, min_length=0.2*sample_rate,
-        max_length=int(5*sample_rate),
-        max_continuous_silence=0.25*sample_rate)
+        validator=validator,
+        min_length=0.2 * sample_rate,
+        max_length=int(5 * sample_rate),
+        max_continuous_silence=0.25 * sample_rate
+    )
 
     def _detect(asegment):
-        asource = BufferAudioSource(data_buffer=asegment,
-                                    sampling_rate=frame_rate,
-                                    sample_width=bytes_per_frame,
-                                    channels=1)
+        asource = BufferAudioSource(
+            data_buffer=asegment,
+            sampling_rate=frame_rate,
+            sample_width=bytes_per_frame,
+            channels=1
+        )
         ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
         ads.open()
         tokens = tokenizer.tokenize(ads)
-        length = (len(asegment)//bytes_per_frame
-                  + frames_per_window - 1)//frames_per_window
-        media_bstring = np.zeros(length+1, dtype=int)
+        length = (
+            len(asegment)//bytes_per_frame + frames_per_window - 1
+        ) // frames_per_window
+        media_bstring = np.zeros(length + 1)
         for token in tokens:
-            media_bstring[token[1]] += 1
-            media_bstring[token[2]+1] -= 1
-        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+            media_bstring[token[1]] = 1.
+            media_bstring[token[2] + 1] = non_speech_label - 1.
+        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
     return _detect
 
 
-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
     import webrtcvad
     vad = webrtcvad.Vad()
     vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
@@ -123,7 +129,7 @@ def _detect(asegment):
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
-            media_bstring.append(1. if is_speech else 0.5)
+            media_bstring.append(1. if is_speech else non_speech_label)
         return np.array(media_bstring)
 
     return _detect
@@ -141,20 +147,23 @@ def num_frames(self):
         return self.end_frame_ - self.start_frame_
 
     def fit_boundaries(self, speech_frames):
-        nz = np.nonzero(speech_frames)[0]
+        nz = np.nonzero(speech_frames > 0.5)[0]
         if len(nz) > 0:
             self.start_frame_ = np.min(nz)
             self.end_frame_ = np.max(nz)
         return self
 
 
-class VideoSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0,
-                 ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+class VideoSpeechTransformer(TransformerMixin):
+    def __init__(
+        self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+        ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+    ):
         super(VideoSpeechTransformer, self).__init__()
         self.vad = vad
         self.sample_rate = sample_rate
         self.frame_rate = frame_rate
+        self._non_speech_label = non_speech_label
         self.start_seconds = start_seconds
         self.ffmpeg_path = ffmpeg_path
         self.ref_stream = ref_stream
@@ -197,7 +206,6 @@ def try_fit_using_embedded_subs(self, fname):
         # use longest set of embedded subs
         subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
         self.video_speech_results_ = subs_to_use.subtitle_speech_results_
-        self.fit_boundaries(self.video_speech_results_)
 
     def fit(self, fname, *_):
         if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -216,9 +224,9 @@ def fit(self, fname, *_):
             logger.warning(e)
             total_duration = None
         if 'webrtc' in self.vad:
-            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         elif 'auditok' in self.vad:
-            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         else:
             raise ValueError('unknown vad: %s' % self.vad)
         media_bstring = []
@@ -284,7 +292,6 @@ def redirect_stderr(enter_result=None):
                 'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
             )
         self.video_speech_results_ = np.concatenate(media_bstring)
-        self.fit_boundaries(self.video_speech_results_)
         return self
 
     def transform(self, *_):
@@ -300,6 +307,7 @@ def transform(self, *_):
 }
 
 
+# TODO: need way better metadata detector
 def _is_metadata(content, is_beginning_or_end):
     content = content.strip()
     if len(content) == 0:
@@ -348,9 +356,10 @@ def transform(self, *_):
         return self.subtitle_speech_results_
 
 
-class DeserializeSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self):
+class DeserializeSpeechTransformer(TransformerMixin):
+    def __init__(self, non_speech_label):
         super(DeserializeSpeechTransformer, self).__init__()
+        self._non_speech_label = non_speech_label
         self.deserialized_speech_results_ = None
 
     def fit(self, fname, *_):
@@ -361,8 +370,8 @@ def fit(self, fname, *_):
             else:
                 raise ValueError('could not find "speech" array in '
                                  'serialized file; only contains: %s' % speech.files)
+        speech[speech < 1.] = self._non_speech_label
         self.deserialized_speech_results_ = speech
-        self.fit_boundaries(self.deserialized_speech_results_)
         return self
 
     def transform(self, *_):
diff --git a/requirements-dev.txt b/requirements-dev.txt
index dc85fdb..bd3a266 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,7 @@
 # testing
 flake8
 pytest
+pytest-cov
 pyyaml;python_version!="3.4"
 twine;python_version!="3.4"
 versioneer
diff --git a/test-data b/test-data
index 9ed7ed6..cb3c897 160000
--- a/test-data
+++ b/test-data
@@ -1 +1 @@
-Subproject commit 9ed7ed60ff6a85caf04a3ce7d959a1cb62fca1d0
+Subproject commit cb3c89741db871d095d4da067a065bcf0821f1c6
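
Note on the webrtcvad change: below is a minimal sketch (not part of the patch;
the frame decisions are hypothetical, for illustration only) of how the detector's
output changes under this commit. Frames classified as non-speech are now labeled
with the configurable non_speech_label (default 0.) rather than the previously
hard-coded "not sure" value of 0.5:

    import numpy as np

    # Hypothetical per-frame VAD decisions (True = speech), for illustration only.
    frame_decisions = [True, True, False, False, True]

    def label_frames(decisions, non_speech_label=0.):
        # Mirrors the patched logic: speech frames get 1.; non-speech frames
        # get the configurable label instead of the old hard-coded 0.5.
        return np.array([1. if is_speech else non_speech_label
                         for is_speech in decisions])

    print(label_frames(frame_decisions))       # [1. 1. 0. 0. 1.]
    print(label_frames(frame_decisions, 0.5))  # [1.  1.  0.5 0.5 1. ]  (pre-0.4.11 behavior)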
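
Similarly, a sketch of the token-boundary bookkeeping in the patched auditok
detector: each token's start frame is set to 1. and the frame after its end to
non_speech_label - 1., so the cumulative sum is 1. inside tokens and
non_speech_label just after them, and the clip keeps labels in [0, 1]. The token
tuples here are hypothetical stand-ins (token[1] = start frame, token[2] = end
frame, as the patch indexes them):

    import numpy as np

    def tokens_to_speech_frames(tokens, length, non_speech_label=0.):
        # tokens: (data, start_frame, end_frame) tuples, per auditok's tokenizer.
        media_bstring = np.zeros(length + 1)
        for token in tokens:
            media_bstring[token[1]] = 1.
            media_bstring[token[2] + 1] = non_speech_label - 1.
        # Cumulative sum yields 1. inside tokens; clipping keeps labels in [0, 1].
        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)

    # Two hypothetical speech tokens spanning frames 2-4 and 7-8 of a 10-frame clip.
    tokens = [(None, 2, 4), (None, 7, 8)]
    print(tokens_to_speech_frames(tokens, 10))
    # [0. 0. 1. 1. 1. 0. 0. 1. 1. 0.]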