MSE: Add support for xHE-AAC nonkeyframe buffering

Updates FrameProcessor to: 1. Understand if it is currently handling a codec for which nonkeyframes are supported (specifically, only AAC with xHE-AAC profile in this CL.) 2. Enforce parse failure if a supported audio nonkeyframe sequence does not increase in PTS monotonically (as a major simplification to related buffering logic, and coherent with the supported underlying codec's restriction on decode order matching presentation order.) 3. Disallows an audio nonkeyframe from becoming a possible audio preroll buffer in HandlePartialAppendWindowTrimming. Updates SourceBufferStream and SourceBufferRange to: * Trust the upstream FrameProcessor to enforce (1) and (2), above. In particular, SourceBufferRange::GetBuffersInRange() and SourceBufferStream::TrimSpliceOverlap() no longer require all audio frames to be keyframes. This simplifies tracking whether or not each buffered frame is allowed to be a nonkeyframe (e.g. if it's AAC:xHE-AAC), since changeType uses the same SourceBufferStream and SourceBufferRanges across the bytestream or codec type changes. * Trivially make TrimSpliceOverlap a no-op if the beginning of the new buffers is a nonkeyframe. Otherwise, it would be infeasible to determine if the new buffers are coherently decodable when fed to the decoder after a possibly end-overlapped, previously buffered, frame. Includes (and tests) a further simplification: if an audio keyframe is fully outside the append window and the next continuous frame processed is a nonkeyframe, that previous audio keyframe is not used as preroll; it's dropped instead, and a keyframe at least partially within the append window is required to continue buffering. If this is an oversimplification, later CLs can attempt to more precisely handle this case. TEST=Includes new FrameProcessorTests. Manually tested with a private stream on Android P+ that seeking in a mixed keyframe/nonkeyframe MP4 xHE-AAC stream no longer caused decoder to emit logs about silence. BUG=1079034,1081952 Change-Id: I41010079041a14dca90e23660350d4cf2469bf2f Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2186205 Commit-Queue: Matthew Wolenetz <wolenetz@chromium.org> Reviewed-by: Dale Curtis <dalecurtis@chromium.org> Cr-Commit-Position: refs/heads/master@{#772400}
marcosholgado · May 27, 2020 · f1efbd8 · f1efbd8
1 parent 6f8683e
commit f1efbd8
Show file tree

Hide file tree

Showing 7 changed files with 533 additions and 38 deletions.
diff --git a/media/base/test_helpers.h b/media/base/test_helpers.h
@@ -328,7 +328,14 @@ MATCHER_P2(AudioNonKeyframe, pts_microseconds, dts_microseconds, "") {
                base::NumberToString(pts_microseconds) + "us and DTS " +
                base::NumberToString(dts_microseconds) +
                "us indicated the frame is not a random access point (key "
-               "frame). All audio frames are expected to be key frames.");
+               "frame). All audio frames are expected to be key frames for "
+               "the current audio codec.");
+}
+
+MATCHER(AudioNonKeyframeOutOfOrder, "") {
+  return CONTAINS_STRING(arg,
+                         "Dependent audio frame with invalid decreasing "
+                         "presentation timestamp detected.");
 }
 
 MATCHER_P2(SkippingSpliceAtOrBefore,
@@ -451,6 +458,20 @@ MATCHER_P3(DroppedFrameCheckAppendWindow,
                       base::NumberToString(append_window_end_us) + "us");
 }
 
+MATCHER_P3(DroppedAppendWindowUnusedPreroll,
+           pts_us,
+           delta_us,
+           next_pts_us,
+           "") {
+  return CONTAINS_STRING(
+      arg,
+      "Partial append window trimming dropping unused audio preroll buffer "
+      "with PTS " +
+          base::NumberToString(pts_us) + "us that ends too far (" +
+          base::NumberToString(delta_us) + "us) from next buffer with PTS " +
+          base::NumberToString(next_pts_us) + "us");
+}
+
 }  // namespace media
 
 #endif  // MEDIA_BASE_TEST_HELPERS_H_
diff --git a/media/filters/frame_processor.cc b/media/filters/frame_processor.cc
@@ -564,6 +564,9 @@ void FrameProcessor::OnPossibleAudioConfigUpdate(
   current_audio_config_ = config;
   sample_duration_ = base::TimeDelta::FromSecondsD(
       1.0 / current_audio_config_.samples_per_second());
+  has_dependent_audio_frames_ =
+      current_audio_config_.profile() == AudioCodecProfile::kXHE_AAC;
+  last_audio_pts_for_nonkeyframe_monotonicity_check_ = kNoTimestamp;
 }
 
 MseTrackBuffer* FrameProcessor::FindTrack(StreamParser::TrackId id) {
@@ -602,7 +605,7 @@ bool FrameProcessor::HandlePartialAppendWindowTrimming(
     scoped_refptr<StreamParserBuffer> buffer) {
   DCHECK(buffer->duration() >= base::TimeDelta());
   DCHECK_EQ(DemuxerStream::AUDIO, buffer->type());
-  DCHECK(buffer->is_key_frame());
+  DCHECK(has_dependent_audio_frames_ || buffer->is_key_frame());
 
   const base::TimeDelta frame_end_timestamp =
       buffer->timestamp() + buffer->duration();
@@ -611,7 +614,13 @@ bool FrameProcessor::HandlePartialAppendWindowTrimming(
   // for the first buffer which overlaps |append_window_start|.
   if (buffer->timestamp() < append_window_start &&
       frame_end_timestamp <= append_window_start) {
-    audio_preroll_buffer_ = std::move(buffer);
+    // But if the buffer is not a keyframe, do not use it for preroll, nor use
+    // any previous preroll buffer for simplicity here.
+    if (has_dependent_audio_frames_ && !buffer->is_key_frame()) {
+      audio_preroll_buffer_.reset();
+    } else {
+      audio_preroll_buffer_ = std::move(buffer);
+    }
     return false;
   }
 
@@ -708,6 +717,37 @@ bool FrameProcessor::HandlePartialAppendWindowTrimming(
   return processed_buffer;
 }
 
+bool FrameProcessor::CheckAudioPresentationOrder(
+    const StreamParserBuffer& frame,
+    bool track_buffer_needs_random_access_point) {
+  DCHECK_EQ(DemuxerStream::AUDIO, frame.type());
+  DCHECK(has_dependent_audio_frames_);
+  if (frame.is_key_frame()) {
+    // Audio keyframes trivially succeed here. They start a new PTS baseline for
+    // the purpose of the checks in this method.
+    last_audio_pts_for_nonkeyframe_monotonicity_check_ = frame.timestamp();
+    return true;
+  }
+  if (track_buffer_needs_random_access_point) {
+    // This nonkeyframe trivially succeeds here, though it will not be buffered
+    // later in the caller since a keyframe is required first.
+    last_audio_pts_for_nonkeyframe_monotonicity_check_ = kNoTimestamp;
+    return true;
+  }
+
+  // We're not waiting for a random access point, so we must have a valid PTS
+  // baseline.
+  DCHECK_NE(kNoTimestamp, last_audio_pts_for_nonkeyframe_monotonicity_check_);
+
+  if (frame.timestamp() >= last_audio_pts_for_nonkeyframe_monotonicity_check_) {
+    last_audio_pts_for_nonkeyframe_monotonicity_check_ = frame.timestamp();
+    return true;
+  }
+
+  last_audio_pts_for_nonkeyframe_monotonicity_check_ = kNoTimestamp;
+  return false;  // Caller should fail parse in this case.
+}
+
 bool FrameProcessor::ProcessFrame(scoped_refptr<StreamParserBuffer> frame,
                                   base::TimeDelta append_window_start,
                                   base::TimeDelta append_window_end,
@@ -744,14 +784,16 @@ bool FrameProcessor::ProcessFrame(scoped_refptr<StreamParserBuffer> frame,
     // assumption that all audio coded frames are key frames. Metadata in the
     // bytestream may not indicate that, so we need to enforce that assumption
     // here with a warning log.
-    if (frame->type() == DemuxerStream::AUDIO && !frame->is_key_frame()) {
+    if (frame->type() == DemuxerStream::AUDIO && !has_dependent_audio_frames_ &&
+        !frame->is_key_frame()) {
       LIMITED_MEDIA_LOG(DEBUG, media_log_, num_audio_non_keyframe_warnings_,
                         kMaxAudioNonKeyframeWarnings)
           << "Bytestream with audio frame PTS "
           << presentation_timestamp.InMicroseconds() << "us and DTS "
           << decode_timestamp.InMicroseconds()
           << "us indicated the frame is not a random access point (key frame). "
-             "All audio frames are expected to be key frames.";
+             "All audio frames are expected to be key frames for the current "
+             "audio codec.";
       frame->set_is_key_frame(true);
     }
 
@@ -906,8 +948,18 @@ bool FrameProcessor::ProcessFrame(scoped_refptr<StreamParserBuffer> frame,
     frame->set_timestamp(presentation_timestamp);
     frame->SetDecodeTimestamp(decode_timestamp);
 
+    if (has_dependent_audio_frames_ && frame->type() == DemuxerStream::AUDIO &&
+        !CheckAudioPresentationOrder(
+            *frame, track_buffer->needs_random_access_point())) {
+      MEDIA_LOG(ERROR, media_log_)
+          << "Dependent audio frame with invalid decreasing presentation "
+             "timestamp detected.";
+      return false;
+    }
+
     // Attempt to trim audio exactly to fit the append window.
     if (frame->type() == DemuxerStream::AUDIO &&
+        (frame->is_key_frame() || !track_buffer->needs_random_access_point()) &&
         HandlePartialAppendWindowTrimming(append_window_start,
                                           append_window_end, frame)) {
       // |frame| has been partially trimmed or had preroll added.  Though

diff --git a/media/filters/frame_processor.h b/media/filters/frame_processor.h
@@ -116,13 +116,25 @@ class MEDIA_EXPORT FrameProcessor {
   // after |append_window_end| will be marked for post-decode discard.
   //
   // If |buffer| lies entirely before |append_window_start|, and thus would
-  // normally be discarded, |audio_preroll_buffer_| will be set to |buffer| and
-  // the method will return false.
+  // normally be discarded, |audio_preroll_buffer_| will be updated and the
+  // method will return false. In this case, the updated preroll will be
+  // |buffer| iff |buffer| is a keyframe, otherwise the preroll will be cleared.
   bool HandlePartialAppendWindowTrimming(
       base::TimeDelta append_window_start,
       base::TimeDelta append_window_end,
       scoped_refptr<StreamParserBuffer> buffer);
 
+  // Enables rejection of audio frame streams with nonkeyframe timestamps that
+  // do not monotonically increase since the last keyframe. Returns true if
+  // |frame| appears to be in order, false if |frame|'s order is not supported.
+  // |track_needs_random_access_point| should be the corresponding value for the
+  // frame's track buffer. This helper should only be called when
+  // |has_dependent_audio_frames_| is true, and only for an audio |frame|. This
+  // method also uses and updates
+  // |last_audio_pts_for_nonkeyframe_monotonicity_check_|.
+  bool CheckAudioPresentationOrder(const StreamParserBuffer& frame,
+                                   bool track_needs_random_access_point);
+
   // Helper that processes one frame with the coded frame processing algorithm.
   // Returns false on error or true on success.
   bool ProcessFrame(scoped_refptr<StreamParserBuffer> frame,
@@ -140,9 +152,23 @@ class MEDIA_EXPORT FrameProcessor {
   scoped_refptr<StreamParserBuffer> audio_preroll_buffer_;
 
   // The AudioDecoderConfig associated with buffers handed to ProcessFrames().
+  // TODO(wolenetz): Associate current audio config and the derived
+  // |has_dependent_audio_frames_|, |sample_duration_| and
+  // |last_audio_pts_for_nonkeyframe_monotonicity_check_| with MseTrackBuffer
+  // instead to enable handling more than 1 audio track in a SourceBuffer
+  // simultaneously. See https://crbug.com/1081952.
   AudioDecoderConfig current_audio_config_;
+  bool has_dependent_audio_frames_ = false;
   base::TimeDelta sample_duration_;
 
+  // When |has_dependent_audio_frames_| is true, holds the PTS of the last
+  // successfully processed audio frame. If the next audio frame is not a
+  // keyframe and has lower PTS, the stream is invalid. Currently, the only
+  // supported audio streams that could contain nonkeyframes are in-order (PTS
+  // increases monotonically since last keyframe), e.g. xHE-AAC.
+  base::TimeDelta last_audio_pts_for_nonkeyframe_monotonicity_check_ =
+      kNoTimestamp;
+
   // The AppendMode of the associated SourceBuffer.
   // See SetSequenceMode() for interpretation of |sequence_mode_|.
   // Per http://www.w3.org/TR/media-source/#widl-SourceBuffer-mode: