Skip to content

Commit

Permalink
audio: Fix timestamps outputted by AudioOpusEncoder
Browse files Browse the repository at this point in the history
Make AudioOpusEncoder output correct timestamps even if the inputs'
lengths are not aligned to 60 ms.

Bug: 1166554
Change-Id: If7b48d5d7a135d6a10471ca4fe4647643486263d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2644917
Commit-Queue: Eugene Zemtsov <eugene@chromium.org>
Reviewed-by: Dan Sanders <sandersd@chromium.org>
Cr-Commit-Position: refs/heads/master@{#846577}
  • Loading branch information
Djuffin authored and Chromium LUCI CQ committed Jan 24, 2021
1 parent e2f8934 commit 4c64dac
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 52 deletions.
170 changes: 137 additions & 33 deletions media/audio/audio_encoders_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <vector>

#include "base/bind.h"
#include "base/test/bind.h"
#include "base/time/time.h"
#include "media/audio/audio_opus_encoder.h"
#include "media/audio/audio_pcm_encoder.h"
Expand All @@ -24,9 +25,6 @@ namespace {

constexpr int kAudioSampleRate = 48000;

constexpr base::TimeDelta kBufferDuration =
base::TimeDelta::FromMilliseconds(10);

// This is the preferred opus buffer duration (60 ms), which corresponds to a
// value of 2880 frames per buffer (|kOpusFramesPerBuffer|).
constexpr base::TimeDelta kOpusBufferDuration =
Expand All @@ -36,33 +34,28 @@ constexpr int kOpusFramesPerBuffer = kOpusBufferDuration.InMicroseconds() *
base::Time::kMicrosecondsPerSecond;

struct TestAudioParams {
const media::AudioParameters::Format format;
const media::ChannelLayout channel_layout;
const AudioParameters::Format format;
const ChannelLayout channel_layout;
const int sample_rate;
};

constexpr TestAudioParams kTestAudioParams[] = {
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
media::CHANNEL_LAYOUT_STEREO, kAudioSampleRate},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_STEREO,
kAudioSampleRate},
// Change to mono:
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_MONO,
kAudioSampleRate},
// Different sampling rate as well:
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
24000},
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
media::CHANNEL_LAYOUT_STEREO, 8000},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_MONO, 24000},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_STEREO, 8000},
// Using a non-default Opus sampling rate (48, 24, 16, 12, or 8 kHz).
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
22050},
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
media::CHANNEL_LAYOUT_STEREO, 44100},
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
media::CHANNEL_LAYOUT_STEREO, 96000},
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY, media::CHANNEL_LAYOUT_MONO,
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_MONO, 22050},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_STEREO, 44100},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_STEREO, 96000},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_MONO,
kAudioSampleRate},
{AudioParameters::AUDIO_PCM_LOW_LATENCY, CHANNEL_LAYOUT_STEREO,
kAudioSampleRate},
{media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
media::CHANNEL_LAYOUT_STEREO, kAudioSampleRate},
};

} // namespace
Expand Down Expand Up @@ -90,24 +83,28 @@ class AudioEncodersTest : public ::testing::TestWithParam<TestAudioParams> {
encode_callback_count_ = 0;
}

// Produces audio data that corresponds to a |kBufferDuration| and the
// Produces audio data that corresponds to a |buffer_duration| and the
// sample rate of the current |input_params_|. The produced data is sent to
// |encoder_| to be encoded, and the number of frames generated is returned.
int ProduceAudioAndEncode() {
int ProduceAudioAndEncode(
base::TimeTicks timestamp = base::TimeTicks::Now()) {
DCHECK(encoder_);
const int num_frames =
input_params_.sample_rate() * kBufferDuration.InSecondsF();
current_audio_bus_ =
media::AudioBus::Create(input_params_.channels(), num_frames);
const auto capture_time = base::TimeTicks::Now();
input_params_.sample_rate() * buffer_duration.InSecondsF();
base::TimeTicks capture_time = timestamp + buffer_duration;
current_audio_bus_ = AudioBus::Create(input_params_.channels(), num_frames);
audio_source_.OnMoreData(base::TimeDelta(), capture_time, 0,
current_audio_bus_.get());
encoder_->EncodeAudio(*current_audio_bus_, capture_time);
return num_frames;
}

// Used to verify we get no errors.
void OnErrorCallback(Status error) { FAIL() << error.message(); }
void OnErrorCallback(Status error) {
DCHECK(!error.is_ok());
FAIL() << error.message();
NOTREACHED();
}

// Used as the callback of the PCM encoder.
void VerifyPcmEncoding(EncodedAudioBuffer output) {
Expand Down Expand Up @@ -143,22 +140,24 @@ class AudioEncodersTest : public ::testing::TestWithParam<TestAudioParams> {
kOpusFramesPerBuffer, 0));
}

private:
protected:
// The input params as initialized from the test's parameter.
const AudioParameters input_params_;

// The audio source used to fill in the data of the |current_audio_bus_|.
media::SineWaveAudioSource audio_source_;
SineWaveAudioSource audio_source_;

// The encoder the test is verifying.
std::unique_ptr<AudioEncoder> encoder_;

// The audio bus that was most recently generated and sent to the |encoder_|
// by ProduceAudioAndEncode().
std::unique_ptr<media::AudioBus> current_audio_bus_;
std::unique_ptr<AudioBus> current_audio_bus_;

// The number of encoder callbacks received.
int encode_callback_count_ = 0;

base::TimeDelta buffer_duration = base::TimeDelta::FromMilliseconds(10);
};

TEST_P(AudioEncodersTest, PcmEncoder) {
Expand All @@ -176,6 +175,108 @@ TEST_P(AudioEncodersTest, PcmEncoder) {
EXPECT_EQ(kCount, encode_callback_count());
}

TEST_P(AudioEncodersTest, OpusTimestamps) {
constexpr int kCount = 12;
for (base::TimeDelta duration :
{kOpusBufferDuration * 10, kOpusBufferDuration,
kOpusBufferDuration * 2 / 3}) {
buffer_duration = duration;
size_t expected_outputs = (buffer_duration * kCount) / kOpusBufferDuration;
base::TimeTicks current_timestamp;
std::vector<base::TimeTicks> timestamps;

auto output_cb = base::BindLambdaForTesting([&](EncodedAudioBuffer output) {
timestamps.push_back(output.timestamp);
});

SetEncoder(std::make_unique<AudioOpusEncoder>(
input_params(), std::move(output_cb),
base::BindRepeating(&AudioEncodersTest::OnErrorCallback,
base::Unretained(this)),
/*opus_bitrate=*/0));

for (int i = 0; i < kCount; ++i) {
ProduceAudioAndEncode(current_timestamp);
current_timestamp += buffer_duration;
}
encoder()->Flush();
EXPECT_EQ(expected_outputs, timestamps.size());

current_timestamp = base::TimeTicks();
for (auto& ts : timestamps) {
EXPECT_EQ(current_timestamp, ts);
current_timestamp += kOpusBufferDuration;
}
}
}

// Check how Opus encoder reacts to breaks in continuity of incoming sound.
// Capture times are expected to be exactly buffer durations apart,
// but the encoder should be ready to handle situations when it's not the case.
TEST_P(AudioEncodersTest, OpusTimeContinuityBreak) {
base::TimeTicks current_timestamp;
base::TimeDelta small_gap = base::TimeDelta::FromMicroseconds(500);
base::TimeDelta large_gap = base::TimeDelta::FromMicroseconds(1500);
std::vector<base::TimeTicks> timestamps;

auto output_cb = base::BindLambdaForTesting([&](EncodedAudioBuffer output) {
timestamps.push_back(output.timestamp);
});

SetEncoder(std::make_unique<AudioOpusEncoder>(
input_params(), std::move(output_cb),
base::BindRepeating(&AudioEncodersTest::OnErrorCallback,
base::Unretained(this)),
/*opus_bitrate=*/0));

// Encode first normal buffer and immediately get an output for it.
buffer_duration = kOpusBufferDuration;
auto ts0 = current_timestamp;
ProduceAudioAndEncode(current_timestamp);
current_timestamp += buffer_duration;
EXPECT_EQ(1u, timestamps.size());
EXPECT_EQ(ts0, timestamps[0]);

// Add another buffer which is too small and will be buffered
buffer_duration = kOpusBufferDuration / 2;
auto ts1 = current_timestamp;
ProduceAudioAndEncode(current_timestamp);
current_timestamp += buffer_duration;
EXPECT_EQ(1u, timestamps.size());

// Add another large buffer after a large gap, 2 outputs are expected
// because large gap should trigger a flush.
current_timestamp += large_gap;
buffer_duration = kOpusBufferDuration;
auto ts2 = current_timestamp;
ProduceAudioAndEncode(current_timestamp);
current_timestamp += buffer_duration;
EXPECT_EQ(3u, timestamps.size());
EXPECT_EQ(ts1, timestamps[1]);
EXPECT_EQ(ts2, timestamps[2]);

// Add another buffer which is too small and will be buffered
buffer_duration = kOpusBufferDuration / 2;
auto ts3 = current_timestamp;
ProduceAudioAndEncode(current_timestamp);
current_timestamp += buffer_duration;
EXPECT_EQ(3u, timestamps.size());

// Add a small gap and a large buffer, only one output is expected because
// small gap doesn't trigger a flush.
// Small gap itself is not counted in output timestamps.
auto ts4 = current_timestamp + kOpusBufferDuration / 2;
current_timestamp += small_gap;
buffer_duration = kOpusBufferDuration;
ProduceAudioAndEncode(current_timestamp);
EXPECT_EQ(4u, timestamps.size());
EXPECT_EQ(ts3, timestamps[3]);

encoder()->Flush();
EXPECT_EQ(5u, timestamps.size());
EXPECT_EQ(ts4, timestamps[4]);
}

TEST_P(AudioEncodersTest, OpusEncoder) {
int error;
OpusDecoder* opus_decoder =
Expand All @@ -195,8 +296,11 @@ TEST_P(AudioEncodersTest, OpusEncoder) {
const int frames_in_60_ms =
kOpusBufferDuration.InSecondsF() * input_params().sample_rate();
int total_frames = 0;
while (total_frames < frames_in_60_ms)
total_frames += ProduceAudioAndEncode();
base::TimeTicks time;
while (total_frames < frames_in_60_ms) {
time += buffer_duration;
total_frames += ProduceAudioAndEncode(time);
}

EXPECT_EQ(1, encode_callback_count());

Expand Down
31 changes: 26 additions & 5 deletions media/audio/audio_opus_encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/stringprintf.h"
#include "media/base/audio_timestamp_helper.h"
#include "media/base/status.h"
#include "media/base/status_codes.h"

Expand Down Expand Up @@ -218,6 +219,23 @@ void AudioOpusEncoder::EncodeAudioImpl(const AudioBus& audio_bus,
if (!opus_encoder_)
return;

// If |capture_time| - |audio_bus|'s duration is not equal to the expected
// timestamp for the next audio sample, it means there is a gap/overlap;
// if it's big enough we flush and start anew, otherwise we ignore it.
auto capture_ts = ComputeTimestamp(audio_bus.frames(), capture_time);
auto end_of_existing_buffer_ts =
next_timestamp_ +
AudioTimestampHelper::FramesToTime(fifo_.queued_frames(),
audio_input_params().sample_rate());
base::TimeDelta gap = (capture_ts - end_of_existing_buffer_ts).magnitude();
constexpr base::TimeDelta max_gap = base::TimeDelta::FromMilliseconds(1);
if (gap > max_gap) {
DLOG(ERROR) << "Large gap in sound. Forced flush. "
<< "Gap/overlap duration: " << gap;
FlushImpl();
next_timestamp_ = capture_ts;
}

// The |fifo_| won't trigger OnFifoOutput() until we have enough frames
// suitable for the converter.
fifo_.Push(audio_bus);
Expand Down Expand Up @@ -251,11 +269,14 @@ void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
converted_params_.frames_per_buffer(), &encoded_data,
&encoded_data_size)) {
DCHECK_GT(encoded_data_size, 1u);
encode_callback().Run(EncodedAudioBuffer(
converted_params_, std::move(encoded_data), encoded_data_size,
ComputeTimestamp(
number_of_flushed_frames_.value_or(output_bus.frames()),
last_capture_time())));
auto reported_duration = AudioTimestampHelper::FramesToTime(
number_of_flushed_frames_.value_or(output_bus.frames()),
audio_input_params().sample_rate());

encode_callback().Run(
EncodedAudioBuffer(converted_params_, std::move(encoded_data),
encoded_data_size, next_timestamp_));
next_timestamp_ += reported_duration;
}
}

Expand Down
3 changes: 3 additions & 0 deletions media/audio/audio_opus_encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ class MEDIA_EXPORT AudioOpusEncoder : public AudioEncoder {
// this will be the number of flushed frames, which is used to compute the
// timestamp provided in the output |EncodedAudioBuffer|.
base::Optional<int> number_of_flushed_frames_;

// Timestamp that should be reported by the next call to encode_callback().
base::TimeTicks next_timestamp_;
};

} // namespace media
Expand Down
2 changes: 1 addition & 1 deletion media/base/audio_encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void AudioEncoder::EncodeAudio(const AudioBus& audio_bus,
!last_capture_time_.is_null() &&
((capture_time - last_capture_time_).InSecondsF() >
1.5f * audio_bus.frames() / audio_input_params().sample_rate()))
<< "Possibly frames were skipped, which may result in inaccuarate "
<< "Possibly frames were skipped, which may result in inaccurate "
"timestamp calculation.";

last_capture_time_ = capture_time;
Expand Down
16 changes: 11 additions & 5 deletions third_party/blink/renderer/modules/webcodecs/audio_encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "media/audio/audio_opus_encoder.h"
#include "media/base/audio_parameters.h"
#include "media/base/audio_timestamp_helper.h"
#include "third_party/blink/public/mojom/web_feature/web_feature.mojom-blink.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_audio_encoder_config.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_audio_frame_init.h"
Expand Down Expand Up @@ -81,11 +82,16 @@ void AudioEncoder::ProcessEncode(Request* request) {
DCHECK_GT(requested_encodes_, 0);

auto* frame = request->frame.Release();

base::TimeTicks time =
base::TimeTicks() + base::TimeDelta::FromMicroseconds(frame->timestamp());

auto* buffer = frame->buffer();

// Converting time at the beginning of the frame (aka timestamp) into
// time at the end of the frame (aka capture time) that is expected by
// media::AudioEncoder.
base::TimeTicks capture_time =
base::TimeTicks() +
base::TimeDelta::FromMicroseconds(frame->timestamp()) +
media::AudioTimestampHelper::FramesToTime(buffer->length(),
active_config_->sample_rate);
DCHECK(buffer);
{
auto audio_bus = media::AudioBus::CreateWrapper(buffer->numberOfChannels());
Expand All @@ -95,7 +101,7 @@ void AudioEncoder::ProcessEncode(Request* request) {
audio_bus->SetChannelData(channel, data);
}
audio_bus->set_frames(buffer->length());
media_encoder_->EncodeAudio(*audio_bus, time);
media_encoder_->EncodeAudio(*audio_bus, capture_time);
}

frame->close();
Expand Down
Loading

0 comments on commit 4c64dac

Please sign in to comment.