misc sync improvements (see HISTORY.rst for 0.4.11)
smacke committed Jan 30, 2021
1 parent e33f117 commit fe416b4
Showing 6 changed files with 62 additions and 39 deletions.
7 changes: 7 additions & 0 deletions HISTORY.rst
@@ -179,3 +179,10 @@ History
 * Filter out metadata in subtitles when extracting speech;
 * Add experimental --golden-section-search over framerate ratio (off by default);
 * Try to improve sync by inferring framerate ratio based on relative duration of synced vs unsynced;
+
+0.4.11 (2021-01-29)
+-------------------
+* Misc sync improvements:
+    * Have webrtcvad use 0 as the non-speech label instead of 0.5;
+    * Allow the VAD non-speech label to be specified via the --non-speech-label command-line parameter;
+    * Don't try to infer framerate ratio based on length between first and last speech frames for non-subtitle speech detection;
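
The first two bullets amount to one change in the VAD detectors: frames classified as non-speech used to be hard-coded to 0.5 ("not sure") and are now assigned a configurable label defaulting to 0. A minimal sketch of the idea, simplified from the real detector in speech_transformers.py below (the helper name make_detector and the windowing details are illustrative, not the project's exact code):

import numpy as np
import webrtcvad

def make_detector(frame_rate, non_speech_label=0.0, window_ms=10):
    # webrtcvad accepts 8/16/32/48 kHz 16-bit mono PCM in 10/20/30 ms windows.
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # most aggressive non-speech pruning
    bytes_per_window = 2 * frame_rate * window_ms // 1000

    def detect(pcm_bytes):
        labels = []
        for start in range(0, len(pcm_bytes), bytes_per_window):
            window = pcm_bytes[start:start + bytes_per_window]
            if len(window) < bytes_per_window:
                break
            try:
                is_speech = vad.is_speech(window, frame_rate)
            except Exception:
                is_speech = False  # treat VAD failures as non-speech
            labels.append(1.0 if is_speech else non_speech_label)
        return np.array(labels)

    return detect

Passing --non-speech-label 0.5 on the command line (e.g. ffsubsync ref.mkv -i in.srt -o out.srt --non-speech-label 0.5) restores the previous behavior.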
1 change: 1 addition & 0 deletions ffsubsync/constants.py
@@ -6,6 +6,7 @@
 FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]

 DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
 DEFAULT_ENCODING = 'infer'
 DEFAULT_MAX_SUBTITLE_SECONDS = 10
 DEFAULT_START_SECONDS = 0
25 changes: 15 additions & 10 deletions ffsubsync/ffsubsync.py
@@ -116,7 +116,7 @@ def try_sync(args, reference_pipe, result):
                 continue
             else:
                 srt_pipe.fit(srtin)
-        if not args.skip_infer_framerate_ratio:
+        if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
             inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
             logger.info('inferred framerate ratio: %.3f' % inferred_framerate_ratio_from_length)
             srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
@@ -185,7 +185,7 @@ def make_reference_pipe(args):
         if args.vad is not None:
             logger.warning('Vad specified, but reference was not a movie')
         return Pipeline([
-            ('deserialize', DeserializeSpeechTransformer())
+            ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
         ])
     else:
         vad = args.vad or DEFAULT_VAD
@@ -195,14 +195,17 @@
         if ref_stream is not None and not ref_stream.startswith('0:'):
             ref_stream = '0:' + ref_stream
         return Pipeline([
-            ('speech_extract', VideoSpeechTransformer(vad=vad,
-                                                      sample_rate=SAMPLE_RATE,
-                                                      frame_rate=args.frame_rate,
-                                                      start_seconds=args.start_seconds,
-                                                      ffmpeg_path=args.ffmpeg_path,
-                                                      ref_stream=ref_stream,
-                                                      vlc_mode=args.vlc_mode,
-                                                      gui_mode=args.gui_mode))
+            ('speech_extract', VideoSpeechTransformer(
+                vad=vad,
+                sample_rate=SAMPLE_RATE,
+                frame_rate=args.frame_rate,
+                non_speech_label=args.non_speech_label,
+                start_seconds=args.start_seconds,
+                ffmpeg_path=args.ffmpeg_path,
+                ref_stream=ref_stream,
+                vlc_mode=args.vlc_mode,
+                gui_mode=args.gui_mode
+            )),
         ])


@@ -392,6 +395,8 @@ def add_cli_only_args(parser):
                         help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
     parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
                         help='If set, do not try to infer framerate ratio based on duration ratio.')
+    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
     parser.add_argument('--output-encoding', default='utf-8',
                         help='What encoding to use for writing output subtitles '
                              '(default=utf-8). Can indicate "same" to use same '
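A note on the hasattr guard added to try_sync above: VideoSpeechTransformer and DeserializeSpeechTransformer no longer mix in ComputeSpeechFrameBoundariesMixin (see speech_transformers.py below), so the final step of the reference pipeline exposes num_frames only when the reference speech came from subtitles. A simplified sketch of the pattern under that assumption (the function name is illustrative, not the project's exact code):

def maybe_infer_framerate_ratio(reference_pipe, srt_pipe):
    # pipe[-1] is the final (fitted) transformer of an sklearn Pipeline.
    ref_final, srt_final = reference_pipe[-1], srt_pipe[-1]
    if not hasattr(ref_final, 'num_frames'):
        # webrtcvad/auditok speech extraction no longer tracks first/last
        # speech-frame boundaries, so duration-based inference is skipped.
        return None
    # Subtitle pipelines still track boundaries, so num_frames is available.
    return float(ref_final.num_frames) / srt_final.num_frames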
65 changes: 37 additions & 28 deletions ffsubsync/speech_transformers.py
@@ -59,7 +59,7 @@ def subpipe_maker(framerate_ratio):
     return subpipe_maker(scale_factor)


-def _make_auditok_detector(sample_rate, frame_rate):
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
     try:
         from auditok import \
             BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -76,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
     bytes_per_frame = 2
     frames_per_window = frame_rate // sample_rate
     validator = AudioEnergyValidator(
-        sample_width=bytes_per_frame, energy_threshold=50)
+        sample_width=bytes_per_frame, energy_threshold=50
+    )
     tokenizer = StreamTokenizer(
-        validator=validator, min_length=0.2*sample_rate,
-        max_length=int(5*sample_rate),
-        max_continuous_silence=0.25*sample_rate)
+        validator=validator,
+        min_length=0.2 * sample_rate,
+        max_length=int(5 * sample_rate),
+        max_continuous_silence=0.25 * sample_rate
+    )

     def _detect(asegment):
-        asource = BufferAudioSource(data_buffer=asegment,
-                                    sampling_rate=frame_rate,
-                                    sample_width=bytes_per_frame,
-                                    channels=1)
+        asource = BufferAudioSource(
+            data_buffer=asegment,
+            sampling_rate=frame_rate,
+            sample_width=bytes_per_frame,
+            channels=1
+        )
         ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
         ads.open()
         tokens = tokenizer.tokenize(ads)
-        length = (len(asegment)//bytes_per_frame
-                  + frames_per_window - 1)//frames_per_window
-        media_bstring = np.zeros(length+1, dtype=int)
+        length = (
+            len(asegment)//bytes_per_frame + frames_per_window - 1
+        ) // frames_per_window
+        media_bstring = np.zeros(length + 1)
         for token in tokens:
-            media_bstring[token[1]] += 1
-            media_bstring[token[2]+1] -= 1
-        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+            media_bstring[token[1]] = 1.
+            media_bstring[token[2] + 1] = non_speech_label - 1.
+        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
     return _detect


-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
     import webrtcvad
     vad = webrtcvad.Vad()
     vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
@@ -123,7 +129,7 @@ def _detect(asegment):
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
-            media_bstring.append(1. if is_speech else 0.5)
+            media_bstring.append(1. if is_speech else non_speech_label)
         return np.array(media_bstring)

     return _detect
@@ -141,20 +147,23 @@ def num_frames(self):
         return self.end_frame_ - self.start_frame_

     def fit_boundaries(self, speech_frames):
-        nz = np.nonzero(speech_frames)[0]
+        nz = np.nonzero(speech_frames > 0.5)[0]
         if len(nz) > 0:
             self.start_frame_ = np.min(nz)
             self.end_frame_ = np.max(nz)
         return self


-class VideoSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0,
-                 ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+class VideoSpeechTransformer(TransformerMixin):
+    def __init__(
+        self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+        ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+    ):
         super(VideoSpeechTransformer, self).__init__()
         self.vad = vad
         self.sample_rate = sample_rate
         self.frame_rate = frame_rate
+        self._non_speech_label = non_speech_label
         self.start_seconds = start_seconds
         self.ffmpeg_path = ffmpeg_path
         self.ref_stream = ref_stream
@@ -197,7 +206,6 @@ def try_fit_using_embedded_subs(self, fname):
         # use longest set of embedded subs
         subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
         self.video_speech_results_ = subs_to_use.subtitle_speech_results_
-        self.fit_boundaries(self.video_speech_results_)

     def fit(self, fname, *_):
         if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -216,9 +224,9 @@ def fit(self, fname, *_):
             logger.warning(e)
             total_duration = None
         if 'webrtc' in self.vad:
-            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         elif 'auditok' in self.vad:
-            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         else:
             raise ValueError('unknown vad: %s' % self.vad)
         media_bstring = []
@@ -284,7 +292,6 @@ def redirect_stderr(enter_result=None):
                 'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
            )
         self.video_speech_results_ = np.concatenate(media_bstring)
-        self.fit_boundaries(self.video_speech_results_)
         return self

     def transform(self, *_):
@@ -300,6 +307,7 @@ def transform(self, *_):
         }


+# TODO: need way better metadata detector
 def _is_metadata(content, is_beginning_or_end):
     content = content.strip()
     if len(content) == 0:
@@ -348,9 +356,10 @@ def transform(self, *_):
         return self.subtitle_speech_results_


-class DeserializeSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self):
+class DeserializeSpeechTransformer(TransformerMixin):
+    def __init__(self, non_speech_label):
         super(DeserializeSpeechTransformer, self).__init__()
+        self._non_speech_label = non_speech_label
         self.deserialized_speech_results_ = None

     def fit(self, fname, *_):
@@ -361,8 +370,8 @@ def fit(self, fname, *_):
         else:
             raise ValueError('could not find "speech" array in '
                              'serialized file; only contains: %s' % speech.files)
+        speech[speech < 1.] = self._non_speech_label
         self.deserialized_speech_results_ = speech
-        self.fit_boundaries(self.deserialized_speech_results_)
         return self

     def transform(self, *_):
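The reworked auditok detector swaps the old interval counting (+= 1 at token start, -= 1 past token end) for direct assignments: the frame at token[1] gets 1. and the frame at token[2] + 1 gets non_speech_label - 1., so the cumulative sum is 1 inside each detected token, falls back toward the non-speech label between tokens, and np.clip bounds everything to [0, 1]. A worked example with made-up token boundaries and the default label of 0:

import numpy as np

non_speech_label = 0.
length = 10
# Two hypothetical speech tokens covering frames 2-4 and 7-8; auditok tokens
# are (data, start_frame, end_frame) tuples, and only the boundaries matter.
tokens = [(None, 2, 4), (None, 7, 8)]

media_bstring = np.zeros(length + 1)
for token in tokens:
    media_bstring[token[1]] = 1.                         # jump to "speech"
    media_bstring[token[2] + 1] = non_speech_label - 1.  # fall back after the token

print(np.clip(np.cumsum(media_bstring)[:-1], 0., 1.))
# [0. 0. 1. 1. 1. 0. 0. 1. 1. 0.]

Relatedly, fit_boundaries now thresholds at > 0.5 so that only confidently-speech frames define the first/last boundaries, and DeserializeSpeechTransformer applies the label on load (speech[speech < 1.] = non_speech_label), keeping serialized speech arrays consistent with freshly detected ones.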
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,6 +1,7 @@
 # testing
 flake8
 pytest
+pytest-cov
 pyyaml;python_version!="3.4"
 twine;python_version!="3.4"
 versioneer
2 changes: 1 addition & 1 deletion test-data (submodule pointer update)
