From 5ff3c9b040c579bdfbac606709ce9c4106a1d537 Mon Sep 17 00:00:00 2001
From: Eugene Zemtsov <eugene@chromium.org>
Date: Mon, 5 Apr 2021 08:41:54 +0000
Subject: [PATCH] While encoding disregard audio buffer timestamps except after
 Flush()

The audio encoder treats incoming frames as continuous sound signal and
disregard timestamps even if it appears that there are gaps and overlaps
in the audio signal.

Test: updated unit test, wc-audio.glitch.me sounds good
Bug: 1195189
Change-Id: I005398d1f06662ef455d05c8442a4f00ae2d82dd
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2801287
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Reviewed-by: Thomas Guilbert <tguilbert@chromium.org>
Commit-Queue: Eugene Zemtsov <eugene@chromium.org>
Cr-Commit-Position: refs/heads/master@{#869125}
---
 media/audio/audio_encoders_unittest.cc | 64 +++++++++++------------
 media/audio/audio_opus_encoder.cc      | 70 ++++++--------------------
 media/audio/audio_opus_encoder.h       | 22 ++------
 3 files changed, 47 insertions(+), 109 deletions(-)

diff --git a/media/audio/audio_encoders_unittest.cc b/media/audio/audio_encoders_unittest.cc
index 50a4278c159f4e..b8b0fde973392a 100644
--- a/media/audio/audio_encoders_unittest.cc
+++ b/media/audio/audio_encoders_unittest.cc
@@ -170,7 +170,8 @@ TEST_P(AudioEncodersTest, OpusTimestamps) {
 
     current_timestamp = base::TimeTicks();
     for (auto& ts : timestamps) {
-      EXPECT_EQ(current_timestamp, ts);
+      auto drift = (current_timestamp - ts).magnitude();
+      EXPECT_LE(drift, base::TimeDelta::FromMicroseconds(1));
       current_timestamp += kOpusBufferDuration;
     }
   }
@@ -209,12 +210,16 @@ TEST_P(AudioEncodersTest, OpusExtraData) {
 }
 
 // Check how Opus encoder reacts to breaks in continuity of incoming sound.
-// Capture times are expected to be exactly buffer durations apart,
-// but the encoder should be ready to handle situations when it's not the case.
+// Under normal circumstances capture times are expected to be exactly
+// a buffer's duration apart, but if they are not, the encoder just ignores
+// incoming capture times. In other words the only capture times that matter
+// are
+//   1. timestamp of the first encoded buffer
+//   2. timestamps of buffers coming immediately after Flush() calls.
 TEST_P(AudioEncodersTest, OpusTimeContinuityBreak) {
-  base::TimeTicks current_timestamp;
-  base::TimeDelta small_gap = base::TimeDelta::FromMicroseconds(500);
-  base::TimeDelta large_gap = base::TimeDelta::FromMicroseconds(1500);
+  base::TimeTicks current_timestamp = base::TimeTicks::Now();
+  base::TimeDelta gap = base::TimeDelta::FromMicroseconds(1500);
+  buffer_duration_ = kOpusBufferDuration;
   std::vector<base::TimeTicks> timestamps;
 
   auto output_cb =
@@ -225,55 +230,42 @@ TEST_P(AudioEncodersTest, OpusTimeContinuityBreak) {
   SetupEncoder(std::move(output_cb));
 
   // Encode first normal buffer and immediately get an output for it.
-  buffer_duration_ = kOpusBufferDuration;
   auto ts0 = current_timestamp;
   ProduceAudioAndEncode(current_timestamp);
   current_timestamp += buffer_duration_;
   EXPECT_EQ(1u, timestamps.size());
   EXPECT_EQ(ts0, timestamps[0]);
 
-  // Add another buffer which is too small and will be buffered
-  buffer_duration_ = kOpusBufferDuration / 2;
+  // Encode another buffer after a large gap, output timestamp should
+  // disregard the gap.
   auto ts1 = current_timestamp;
+  current_timestamp += gap;
   ProduceAudioAndEncode(current_timestamp);
   current_timestamp += buffer_duration_;
-  EXPECT_EQ(1u, timestamps.size());
-
-  // Add another large buffer after a large gap, 2 outputs are expected
-  // because large gap should trigger a flush.
-  current_timestamp += large_gap;
-  buffer_duration_ = kOpusBufferDuration;
-  auto ts2 = current_timestamp;
-  ProduceAudioAndEncode(current_timestamp);
-  current_timestamp += buffer_duration_;
-  EXPECT_EQ(3u, timestamps.size());
+  EXPECT_EQ(2u, timestamps.size());
   EXPECT_EQ(ts1, timestamps[1]);
-  EXPECT_EQ(ts2, timestamps[2]);
 
-  // Add another buffer which is too small and will be buffered
-  buffer_duration_ = kOpusBufferDuration / 2;
-  auto ts3 = current_timestamp;
+  // Another buffer without a gap.
+  auto ts2 = ts1 + buffer_duration_;
   ProduceAudioAndEncode(current_timestamp);
-  current_timestamp += buffer_duration_;
   EXPECT_EQ(3u, timestamps.size());
-
-  // Add a small gap and a large buffer, only one output is expected because
-  // small gap doesn't trigger a flush.
-  // Small gap itself is not counted in output timestamps.
-  auto ts4 = current_timestamp + kOpusBufferDuration / 2;
-  current_timestamp += small_gap;
-  buffer_duration_ = kOpusBufferDuration;
-  ProduceAudioAndEncode(current_timestamp);
-  EXPECT_EQ(4u, timestamps.size());
-  EXPECT_EQ(ts3, timestamps[3]);
+  EXPECT_EQ(ts2, timestamps[2]);
 
   encoder()->Flush(base::BindOnce([](Status error) {
     if (!error.is_ok())
       FAIL() << error.message();
   }));
   RunLoop();
-  EXPECT_EQ(5u, timestamps.size());
-  EXPECT_EQ(ts4, timestamps[4]);
+
+  // Reset output timestamp after Flush(), the encoder should start producing
+  // timestamps from new base 0.
+  current_timestamp = base::TimeTicks();
+
+  auto ts3 = current_timestamp;
+  ProduceAudioAndEncode(current_timestamp);
+  current_timestamp += buffer_duration_;
+  EXPECT_EQ(4u, timestamps.size());
+  EXPECT_EQ(ts3, timestamps[3]);
 }
 
 TEST_P(AudioEncodersTest, FullCycleEncodeDecode) {
diff --git a/media/audio/audio_opus_encoder.cc b/media/audio/audio_opus_encoder.cc
index 9156280d8d665b..4e7b56950a14fb 100644
--- a/media/audio/audio_opus_encoder.cc
+++ b/media/audio/audio_opus_encoder.cc
@@ -11,10 +11,10 @@
 #include "base/numerics/checked_math.h"
 #include "base/stl_util.h"
 #include "base/strings/stringprintf.h"
-#include "media/base/audio_timestamp_helper.h"
 #include "media/base/bind_to_current_loop.h"
 #include "media/base/status.h"
 #include "media/base/status_codes.h"
+#include "media/base/timestamp_constants.h"
 
 namespace media {
 
@@ -132,6 +132,8 @@ void AudioOpusEncoder::Initialize(const Options& options,
   converter_ =
       std::make_unique<AudioConverter>(input_params_, converted_params_,
                                        /*disable_fifo=*/false);
+  timestamp_tracker_ =
+      std::make_unique<AudioTimestampHelper>(converted_params_.sample_rate());
   fifo_ = std::make_unique<AudioPushFifo>(base::BindRepeating(
       &AudioOpusEncoder::OnFifoOutput, base::Unretained(this)));
   converted_audio_bus_ = AudioBus::Create(
@@ -207,6 +209,7 @@ void AudioOpusEncoder::Encode(std::unique_ptr<AudioBus> audio_bus,
   DCHECK_EQ(audio_bus->channels(), input_params_.channels());
   DCHECK(!capture_time.is_null());
   DCHECK(!done_cb.is_null());
+  DCHECK(timestamp_tracker_);
 
   current_done_cb_ = BindToCurrentLoop(std::move(done_cb));
   if (!opus_encoder_) {
@@ -215,32 +218,11 @@ void AudioOpusEncoder::Encode(std::unique_ptr<AudioBus> audio_bus,
     return;
   }
 
-  DLOG_IF(ERROR, !last_capture_time_.is_null() &&
-                     ((capture_time - last_capture_time_).InSecondsF() >
-                      1.5f * audio_bus->frames() / input_params_.sample_rate()))
-      << "Possibly frames were skipped, which may result in inaccurate "
-         "timestamp calculation.";
-
-  last_capture_time_ = capture_time;
-
-  // If |capture_time| - |audio_bus|'s duration is not equal to the expected
-  // timestamp for the next audio sample. It means there is a gap/overlap,
-  // if it's big enough we flash and start anew, otherwise we ignore it.
-  auto capture_ts = ComputeTimestamp(audio_bus->frames(), capture_time);
-  auto existing_buffer_duration = AudioTimestampHelper::FramesToTime(
-      fifo_->queued_frames(), input_params_.sample_rate());
-  auto end_of_existing_buffer_ts = next_timestamp_ + existing_buffer_duration;
-  base::TimeDelta gap = (capture_ts - end_of_existing_buffer_ts).magnitude();
-  constexpr base::TimeDelta max_gap = base::TimeDelta::FromMilliseconds(1);
-  if (gap > max_gap) {
-    DLOG(ERROR) << "Large gap in sound. Forced flush."
-                << " Gap/overlap duration: " << gap
-                << " capture_ts: " << capture_ts
-                << " next_timestamp_: " << next_timestamp_
-                << " existing_buffer_duration: " << existing_buffer_duration
-                << " end_of_existing_buffer_ts: " << end_of_existing_buffer_ts;
-    FlushInternal();
-    next_timestamp_ = capture_ts;
+  if (timestamp_tracker_->base_timestamp() == kNoTimestamp) {
+    auto capture_ts = capture_time - base::TimeTicks() -
+                      AudioTimestampHelper::FramesToTime(
+                          audio_bus->frames(), input_params_.sample_rate());
+    timestamp_tracker_->SetBaseTimestamp(capture_ts);
   }
 
   // The |fifo_| won't trigger OnFifoOutput() until we have enough frames
@@ -263,7 +245,8 @@ void AudioOpusEncoder::Flush(StatusCB done_cb) {
   }
 
   current_done_cb_ = std::move(done_cb);
-  FlushInternal();
+  fifo_->Flush();
+  timestamp_tracker_->SetBaseTimestamp(kNoTimestamp);
   if (!current_done_cb_.is_null()) {
     // Is |current_done_cb_| is null, it means OnFifoOutput() has already
     // reported an error.
@@ -271,16 +254,6 @@ void AudioOpusEncoder::Flush(StatusCB done_cb) {
   }
 }
 
-void AudioOpusEncoder::FlushInternal() {
-  // This is needed to correctly compute the timestamp, since the number of
-  // frames of |output_bus| provided to OnFifoOutput() will always be equal to
-  // the full frames_per_buffer(), as the fifo's Flush() will pad the remaining
-  // empty frames with zeros.
-  number_of_flushed_frames_ = fifo_->queued_frames();
-  fifo_->Flush();
-  number_of_flushed_frames_.reset();
-}
-
 void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
                                     int frame_delay) {
   // Provides input to the converter from |output_bus| within this scope only.
@@ -300,10 +273,6 @@ void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
     return;
   }
 
-  auto encoded_duration = AudioTimestampHelper::FramesToTime(
-      number_of_flushed_frames_.value_or(output_bus.frames()),
-      input_params_.sample_rate());
-
   size_t encoded_data_size = result;
   // If |result| in {0,1}, do nothing; the documentation says that a return
   // value of zero or one means the packet does not need to be transmitted.
@@ -313,19 +282,12 @@ void AudioOpusEncoder::OnFifoOutput(const AudioBus& output_bus,
       desc = PrepareExtraData();
       need_to_emit_extra_data_ = false;
     }
-    output_cb_.Run(
-        EncodedAudioBuffer(converted_params_, std::move(encoded_data),
-                           encoded_data_size, next_timestamp_),
-        desc);
+    auto ts = base::TimeTicks() + timestamp_tracker_->GetTimestamp();
+    EncodedAudioBuffer encoded_buffer(
+        converted_params_, std::move(encoded_data), encoded_data_size, ts);
+    output_cb_.Run(std::move(encoded_buffer), desc);
   }
-  next_timestamp_ += encoded_duration;
-}
-
-base::TimeTicks AudioOpusEncoder::ComputeTimestamp(
-    int num_frames,
-    base::TimeTicks capture_time) const {
-  return capture_time - AudioTimestampHelper::FramesToTime(
-                            num_frames, input_params_.sample_rate());
+  timestamp_tracker_->AddFrames(converted_params_.frames_per_buffer());
 }
 
 // Creates and returns the libopus encoder instance. Returns nullptr if the
diff --git a/media/audio/audio_opus_encoder.h b/media/audio/audio_opus_encoder.h
index 4b82020dc1d47c..e37dd1555d310c 100644
--- a/media/audio/audio_opus_encoder.h
+++ b/media/audio/audio_opus_encoder.h
@@ -13,6 +13,7 @@
 #include "media/base/audio_converter.h"
 #include "media/base/audio_encoder.h"
 #include "media/base/audio_push_fifo.h"
+#include "media/base/audio_timestamp_helper.h"
 #include "third_party/opus/src/include/opus.h"
 
 namespace media {
@@ -46,18 +47,10 @@ class MEDIA_EXPORT AudioOpusEncoder : public AudioEncoder {
   // buffered. Calls libopus to do actual encoding.
   void OnFifoOutput(const AudioBus& output_bus, int frame_delay);
 
-  void FlushInternal();
-
   CodecDescription PrepareExtraData();
 
   StatusOr<OwnedOpusEncoder> CreateOpusEncoder();
 
-  // Computes the timestamp of an AudioBus which has |num_frames| and was
-  // captured at |capture_time|. This timestamp is the capture time of the first
-  // sample in that AudioBus.
-  base::TimeTicks ComputeTimestamp(int num_frames,
-                                   base::TimeTicks capture_time) const;
-
   AudioParameters input_params_;
 
   // Output parameters after audio conversion. This may differ from the input
@@ -82,17 +75,8 @@ class MEDIA_EXPORT AudioOpusEncoder : public AudioEncoder {
   // encoder fails.
   OwnedOpusEncoder opus_encoder_;
 
-  // The capture time of the most recent |audio_bus| delivered to
-  // EncodeAudio().
-  base::TimeTicks last_capture_time_;
-
-  // If FlushImpl() was called while |fifo_| has some frames but not full yet,
-  // this will be the number of flushed frames, which is used to compute the
-  // timestamp provided in the output |EncodedAudioBuffer|.
-  base::Optional<int> number_of_flushed_frames_;
-
-  // Timestamp that should be reported by the next call of |output_callback_|
-  base::TimeTicks next_timestamp_;
+  // Keeps track of the timestamps for the each |output_callback_|
+  std::unique_ptr<AudioTimestampHelper> timestamp_tracker_;
 
   // Callback for reporting completion and status of the current Flush() or
   // Encoder()