While encoding disregard audio buffer timestamps except after Flush()

The audio encoder treats incoming frames as continuous sound signal and disregard timestamps even if it appears that there are gaps and overlaps in the audio signal. Test: updated unit test, wc-audio.glitch.me sounds good Bug: 1195189 Change-Id: I005398d1f06662ef455d05c8442a4f00ae2d82dd Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2801287 Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Reviewed-by: Thomas Guilbert <tguilbert@chromium.org> Commit-Queue: Eugene Zemtsov <eugene@chromium.org> Cr-Commit-Position: refs/heads/master@{#869125}
FairyWorld · Apr 5, 2021 · 5ff3c9b · 5ff3c9b
1 parent cb44300
commit 5ff3c9b
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 109 deletions.
diff --git a/media/audio/audio_encoders_unittest.cc b/media/audio/audio_encoders_unittest.cc
@@ -170,7 +170,8 @@ TEST_P(AudioEncodersTest, OpusTimestamps) {
 
     current_timestamp = base::TimeTicks();
     for (auto& ts : timestamps) {
-      EXPECT_EQ(current_timestamp, ts);
+      auto drift = (current_timestamp - ts).magnitude();
+      EXPECT_LE(drift, base::TimeDelta::FromMicroseconds(1));
       current_timestamp += kOpusBufferDuration;
     }
   }
@@ -209,12 +210,16 @@ TEST_P(AudioEncodersTest, OpusExtraData) {
 }
 
 // Check how Opus encoder reacts to breaks in continuity of incoming sound.
-// Capture times are expected to be exactly buffer durations apart,
-// but the encoder should be ready to handle situations when it's not the case.
+// Under normal circumstances capture times are expected to be exactly
+// a buffer's duration apart, but if they are not, the encoder just ignores
+// incoming capture times. In other words the only capture times that matter
+// are
+//   1. timestamp of the first encoded buffer
+//   2. timestamps of buffers coming immediately after Flush() calls.
 TEST_P(AudioEncodersTest, OpusTimeContinuityBreak) {
-  base::TimeTicks current_timestamp;
-  base::TimeDelta small_gap = base::TimeDelta::FromMicroseconds(500);
-  base::TimeDelta large_gap = base::TimeDelta::FromMicroseconds(1500);
+  base::TimeTicks current_timestamp = base::TimeTicks::Now();
+  base::TimeDelta gap = base::TimeDelta::FromMicroseconds(1500);
+  buffer_duration_ = kOpusBufferDuration;
   std::vector<base::TimeTicks> timestamps;
 
   auto output_cb =
@@ -225,55 +230,42 @@ TEST_P(AudioEncodersTest, OpusTimeContinuityBreak) {
   SetupEncoder(std::move(output_cb));
 
   // Encode first normal buffer and immediately get an output for it.
-  buffer_duration_ = kOpusBufferDuration;
   auto ts0 = current_timestamp;
   ProduceAudioAndEncode(current_timestamp);
   current_timestamp += buffer_duration_;
   EXPECT_EQ(1u, timestamps.size());
   EXPECT_EQ(ts0, timestamps[0]);
 
-  // Add another buffer which is too small and will be buffered
-  buffer_duration_ = kOpusBufferDuration / 2;
+  // Encode another buffer after a large gap, output timestamp should
+  // disregard the gap.
   auto ts1 = current_timestamp;
+  current_timestamp += gap;
   ProduceAudioAndEncode(current_timestamp);
   current_timestamp += buffer_duration_;
-  EXPECT_EQ(1u, timestamps.size());
-
-  // Add another large buffer after a large gap, 2 outputs are expected
-  // because large gap should trigger a flush.
-  current_timestamp += large_gap;
-  buffer_duration_ = kOpusBufferDuration;
-  auto ts2 = current_timestamp;
-  ProduceAudioAndEncode(current_timestamp);
-  current_timestamp += buffer_duration_;
-  EXPECT_EQ(3u, timestamps.size());
+  EXPECT_EQ(2u, timestamps.size());
   EXPECT_EQ(ts1, timestamps[1]);
-  EXPECT_EQ(ts2, timestamps[2]);
 
-  // Add another buffer which is too small and will be buffered
-  buffer_duration_ = kOpusBufferDuration / 2;
-  auto ts3 = current_timestamp;
+  // Another buffer without a gap.
+  auto ts2 = ts1 + buffer_duration_;
   ProduceAudioAndEncode(current_timestamp);
-  current_timestamp += buffer_duration_;
   EXPECT_EQ(3u, timestamps.size());
-
-  // Add a small gap and a large buffer, only one output is expected because
-  // small gap doesn't trigger a flush.
-  // Small gap itself is not counted in output timestamps.
-  auto ts4 = current_timestamp + kOpusBufferDuration / 2;
-  current_timestamp += small_gap;
-  buffer_duration_ = kOpusBufferDuration;
-  ProduceAudioAndEncode(current_timestamp);
-  EXPECT_EQ(4u, timestamps.size());
-  EXPECT_EQ(ts3, timestamps[3]);
+  EXPECT_EQ(ts2, timestamps[2]);
 
   encoder()->Flush(base::BindOnce([](Status error) {
     if (!error.is_ok())
       FAIL() << error.message();
   }));
   RunLoop();
-  EXPECT_EQ(5u, timestamps.size());
-  EXPECT_EQ(ts4, timestamps[4]);
+
+  // Reset output timestamp after Flush(), the encoder should start producing
+  // timestamps from new base 0.
+  current_timestamp = base::TimeTicks();
+
+  auto ts3 = current_timestamp;
+  ProduceAudioAndEncode(current_timestamp);
+  current_timestamp += buffer_duration_;
+  EXPECT_EQ(4u, timestamps.size());
+  EXPECT_EQ(ts3, timestamps[3]);
 }
 
 TEST_P(AudioEncodersTest, FullCycleEncodeDecode) {

diff --git a/media/audio/audio_opus_encoder.cc b/media/audio/audio_opus_encoder.cc
@@ -11,10 +11,10 @@
 #include "base/numerics/checked_math.h"
 #include "base/stl_util.h"
 #include "base/strings/stringprintf.h"
-#include "media/base/audio_timestamp_helper.h"
 #include "media/base/bind_to_current_loop.h"
 #include "media/base/status.h"
 #include "media/base/status_codes.h"
+#include "media/base/timestamp_constants.h"
 
 namespace media {
 
@@ -132,6 +132,8 @@ void AudioOpusEncoder::Initialize(const Options& options,
   converter_ =
       std::make_unique<AudioConverter>(input_params_, converted_params_,
                                        /*disable_fifo=*/false);
+  timestamp_tracker_ =
+      std::make_unique<AudioTimestampHelper>(converted_params_.sample_rate());
   fifo_ = std::make_unique<AudioPushFifo>(base::BindRepeating(
       &AudioOpusEncoder::OnFifoOutput, base::Unretained(this)));
   converted_audio_bus_ = AudioBus::Create(
@@ -207,6 +209,7 @@ void AudioOpusEncoder::Encode(std::unique_ptr<AudioBus> audio_bus,
   DCHECK_EQ(audio_bus->channels(), input_params_.channels());
   DCHECK(!capture_time.is_null());
   DCHECK(!done_cb.is_null());
+  DCHECK(timestamp_tracker_);
 
   current_done_cb_ = BindToCurrentLoop(std::move(done_cb));
   if (!opus_encoder_) {
@@ -215,32 +218,11 @@ void AudioOpusEncoder::Encode(std::unique_ptr<AudioBus> audio_bus,
     return;
   }
 
-  DLOG_IF(ERROR, !last_capture_time_.is_null() &&
-                     ((capture_time - last_capture_time_).InSecondsF() >
-                      1.5f * audio_bus->frames() / input_params_.sample_rate()))
-      << "Possibly frames were skipped, which may result in inaccurate "
-         "timestamp calculation.";
-
-  last_capture_time_ = capture_time;
-
-  // If |capture_time| - |audio_bus|'s duration is not equal to the expected
-  // timestamp for the next audio sample. It means there is a gap/overlap,
-  // if it's big enough we flash and start anew, otherwise we ignore it.
-  auto capture_ts = ComputeTimestamp(audio_bus->frames(), capture_time);
-  auto existing_buffer_duration = AudioTimestampHelper::FramesToTime(
-      fifo_->queued_frames(), input_params_.sample_rate());
-  auto end_of_existing_buffer_ts = next_timestamp_ + existing_buffer_duration;
-  base::TimeDelta gap = (capture_ts - end_of_existing_buffer_ts).magnitude();
-  constexpr base::TimeDelta max_gap = base::TimeDelta::FromMilliseconds(1);
-  if (gap > max_gap) {
-    DLOG(ERROR) << "Large gap in sound. Forced flush."
-                << " Gap/overlap duration: " << gap
-                << " capture_ts: " << capture_ts
-                << " next_timestamp_: " << next_timestamp_
-                << " existing_buffer_duration: " << existing_buffer_duration
-                << " end_of_existing_buffer_ts: " << end_of_existing_buffer_ts;
-    FlushInternal();
-    next_timestamp_ = capture_ts;
+  if (timestamp_tracker_->base_timestamp() == kNoTimestamp) {
+    auto capture_ts = capture_time - base::TimeTicks() -
+                      AudioTimestampHelper::FramesToTime(
+                          audio_bus->frames(), input_params_.sample_rate());
+    timestamp_tracker_->SetBaseTimestamp(capture_ts);
   }
 
   // The |fifo_| won't trigger OnFifoOutput() until we have enough frames
@@ -263,24 +245,15 @@ void AudioOpusEncoder::Flush(StatusCB done_cb) {
   }
 
   current_done_cb_ = std::move(done_cb);
-  FlushInternal();
+  fifo_->Flush();
+  timestamp_tracker_->SetBaseTimestamp(kNoTimestamp);
   if (!current_done_cb_.is_null()) {
     // Is |current_done_cb_| is null, it means OnFifoOutput() has already
     // reported an error.
     std::move(current_done_cb_).Run(OkStatus());
   }
 }
 
-void AudioOpusEncoder::FlushInternal() {
-  // This is needed to correctly compute the timestamp, since the number of
-  // frames of |output_bus| provided to OnFifoOutput() will always be equal to
-  // the full frames_per_buffer(), as the fifo's Flush() will pad the remaining
-  // empty frames with zeros.
-  number_of_flushed_frames_ = fifo_->queued_frames();
-  fifo_->Flush();
-  number_of_flushed_frames_.reset();
-}
-
 void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
                                     int frame_delay) {
   // Provides input to the converter from |output_bus| within this scope only.
@@ -300,10 +273,6 @@ void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
     return;
   }
 
-  auto encoded_duration = AudioTimestampHelper::FramesToTime(
-      number_of_flushed_frames_.value_or(output_bus.frames()),
-      input_params_.sample_rate());
-
   size_t encoded_data_size = result;
   // If |result| in {0,1}, do nothing; the documentation says that a return
   // value of zero or one means the packet does not need to be transmitted.
@@ -313,19 +282,12 @@ void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
       desc = PrepareExtraData();
       need_to_emit_extra_data_ = false;
     }
-    output_cb_.Run(
-        EncodedAudioBuffer(converted_params_, std::move(encoded_data),
-                           encoded_data_size, next_timestamp_),
-        desc);
+    auto ts = base::TimeTicks() + timestamp_tracker_->GetTimestamp();
+    EncodedAudioBuffer encoded_buffer(
+        converted_params_, std::move(encoded_data), encoded_data_size, ts);
+    output_cb_.Run(std::move(encoded_buffer), desc);
   }
-  next_timestamp_ += encoded_duration;
-}
-
-base::TimeTicks AudioOpusEncoder::ComputeTimestamp(
-    int num_frames,
-    base::TimeTicks capture_time) const {
-  return capture_time - AudioTimestampHelper::FramesToTime(
-                            num_frames, input_params_.sample_rate());
+  timestamp_tracker_->AddFrames(converted_params_.frames_per_buffer());
 }
 
 // Creates and returns the libopus encoder instance. Returns nullptr if the

diff --git a/media/audio/audio_opus_encoder.h b/media/audio/audio_opus_encoder.h
@@ -13,6 +13,7 @@
 #include "media/base/audio_converter.h"
 #include "media/base/audio_encoder.h"
 #include "media/base/audio_push_fifo.h"
+#include "media/base/audio_timestamp_helper.h"
 #include "third_party/opus/src/include/opus.h"
 
 namespace media {
@@ -46,18 +47,10 @@ class MEDIA_EXPORT AudioOpusEncoder : public AudioEncoder {
   // buffered. Calls libopus to do actual encoding.
   void OnFifoOutput(const AudioBus& output_bus, int frame_delay);
 
-  void FlushInternal();
-
   CodecDescription PrepareExtraData();
 
   StatusOr<OwnedOpusEncoder> CreateOpusEncoder();
 
-  // Computes the timestamp of an AudioBus which has |num_frames| and was
-  // captured at |capture_time|. This timestamp is the capture time of the first
-  // sample in that AudioBus.
-  base::TimeTicks ComputeTimestamp(int num_frames,
-                                   base::TimeTicks capture_time) const;
-
   AudioParameters input_params_;
 
   // Output parameters after audio conversion. This may differ from the input
@@ -82,17 +75,8 @@ class MEDIA_EXPORT AudioOpusEncoder : public AudioEncoder {
   // encoder fails.
   OwnedOpusEncoder opus_encoder_;
 
-  // The capture time of the most recent |audio_bus| delivered to
-  // EncodeAudio().
-  base::TimeTicks last_capture_time_;
-
-  // If FlushImpl() was called while |fifo_| has some frames but not full yet,
-  // this will be the number of flushed frames, which is used to compute the
-  // timestamp provided in the output |EncodedAudioBuffer|.
-  base::Optional<int> number_of_flushed_frames_;
-
-  // Timestamp that should be reported by the next call of |output_callback_|
-  base::TimeTicks next_timestamp_;
+  // Keeps track of the timestamps for the each |output_callback_|
+  std::unique_ptr<AudioTimestampHelper> timestamp_tracker_;
 
   // Callback for reporting completion and status of the current Flush() or
   // Encoder()