Skip to content

Allow multiple DecoderSpecificInfo entries in non-strict mode #426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 92 additions & 35 deletions mp4parse/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ pub enum Status {
ElstBadVersion,
EsdsBadAudioSampleEntry,
EsdsBadDescriptor,
EsdsDecSpecificIntoTagQuantity,
EsdsDecSpecificInfoTagQuantity,
FtypBadSize,
FtypNotFirst,
HdlrNameNoNul,
Expand Down Expand Up @@ -514,7 +514,7 @@ impl From<Status> for &str {
Status::EsdsBadDescriptor => {
"Invalid descriptor."
}
Status::EsdsDecSpecificIntoTagQuantity => {
Status::EsdsDecSpecificInfoTagQuantity => {
"There can be only one DecSpecificInfoTag descriptor"
}
Status::FtypBadSize => {
Expand Down Expand Up @@ -2417,7 +2417,7 @@ pub fn read_avif<T: Read>(f: &mut T, strictness: ParseStrictness) -> Result<Avif
if image_sequence.is_some() {
return Status::MoovBadQuantity.into();
}
image_sequence = Some(read_moov(&mut b, None)?);
image_sequence = Some(read_moov(&mut b, None, strictness)?);
}
BoxType::MediaDataBox => {
let file_offset = b.offset();
Expand Down Expand Up @@ -4058,7 +4058,7 @@ fn read_iloc<T: Read>(src: &mut BMFFBox<T>) -> Result<TryHashMap<ItemId, ItemLoc
}

/// Read the contents of a box, including sub boxes.
pub fn read_mp4<T: Read>(f: &mut T) -> Result<MediaContext> {
pub fn read_mp4<T: Read>(f: &mut T, strictness: ParseStrictness) -> Result<MediaContext> {
let mut context = None;
let mut found_ftyp = false;
// TODO(kinetik): Top-level parsing should handle zero-sized boxes
Expand Down Expand Up @@ -4087,7 +4087,7 @@ pub fn read_mp4<T: Read>(f: &mut T) -> Result<MediaContext> {
debug!("{:?}", ftyp);
}
BoxType::MovieBox => {
context = Some(read_moov(&mut b, context)?);
context = Some(read_moov(&mut b, context, strictness)?);
}
#[cfg(feature = "meta-xml")]
BoxType::MetadataBox => {
Expand Down Expand Up @@ -4133,7 +4133,11 @@ fn parse_mvhd<T: Read>(f: &mut BMFFBox<T>) -> Result<Option<MediaTimeScale>> {
/// Note that despite the spec indicating "exactly one" moov box should exist at
/// the file container level, we support reading and merging multiple moov boxes
/// such as with tests/test_case_1185230.mp4.
fn read_moov<T: Read>(f: &mut BMFFBox<T>, context: Option<MediaContext>) -> Result<MediaContext> {
fn read_moov<T: Read>(
f: &mut BMFFBox<T>,
context: Option<MediaContext>,
strictness: ParseStrictness,
) -> Result<MediaContext> {
let MediaContext {
mut timescale,
mut tracks,
Expand All @@ -4152,7 +4156,7 @@ fn read_moov<T: Read>(f: &mut BMFFBox<T>, context: Option<MediaContext>) -> Resu
}
BoxType::TrackBox => {
let mut track = Track::new(tracks.len());
read_trak(&mut b, &mut track)?;
read_trak(&mut b, &mut track, strictness)?;
tracks.push(track)?;
}
BoxType::MovieExtendsBox => {
Expand Down Expand Up @@ -4262,7 +4266,11 @@ fn read_mehd<T: Read>(src: &mut BMFFBox<T>) -> Result<MediaScaledTime> {

/// Parse a Track Box
/// See ISOBMFF (ISO 14496-12:2020) § 8.3.1.
fn read_trak<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
fn read_trak<T: Read>(
f: &mut BMFFBox<T>,
track: &mut Track,
strictness: ParseStrictness,
) -> Result<()> {
let mut iter = f.box_iter();
while let Some(mut b) = iter.next_box()? {
match b.head.name {
Expand All @@ -4273,7 +4281,7 @@ fn read_trak<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
debug!("{:?}", tkhd);
}
BoxType::EditBox => read_edts(&mut b, track)?,
BoxType::MediaBox => read_mdia(&mut b, track)?,
BoxType::MediaBox => read_mdia(&mut b, track, strictness)?,
BoxType::TrackReferenceBox => track.tref = Some(read_tref(&mut b)?),
_ => skip_box_content(&mut b)?,
};
Expand Down Expand Up @@ -4346,7 +4354,11 @@ fn parse_mdhd<T: Read>(
Ok((mdhd, duration, timescale))
}

fn read_mdia<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
fn read_mdia<T: Read>(
f: &mut BMFFBox<T>,
track: &mut Track,
strictness: ParseStrictness,
) -> Result<()> {
let mut iter = f.box_iter();
while let Some(mut b) = iter.next_box()? {
match b.head.name {
Expand All @@ -4369,7 +4381,7 @@ fn read_mdia<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
}
debug!("{:?}", hdlr);
}
BoxType::MediaInformationBox => read_minf(&mut b, track)?,
BoxType::MediaInformationBox => read_minf(&mut b, track, strictness)?,
_ => skip_box_content(&mut b)?,
};
check_parser_state!(b.content);
Expand Down Expand Up @@ -4403,24 +4415,32 @@ fn read_tref_auxl<T: Read>(f: &mut BMFFBox<T>) -> Result<TrackReference> {
Ok(TrackReference { track_ids })
}

fn read_minf<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
fn read_minf<T: Read>(
f: &mut BMFFBox<T>,
track: &mut Track,
strictness: ParseStrictness,
) -> Result<()> {
let mut iter = f.box_iter();
while let Some(mut b) = iter.next_box()? {
match b.head.name {
BoxType::SampleTableBox => read_stbl(&mut b, track)?,
BoxType::SampleTableBox => read_stbl(&mut b, track, strictness)?,
_ => skip_box_content(&mut b)?,
};
check_parser_state!(b.content);
}
Ok(())
}

fn read_stbl<T: Read>(f: &mut BMFFBox<T>, track: &mut Track) -> Result<()> {
fn read_stbl<T: Read>(
f: &mut BMFFBox<T>,
track: &mut Track,
strictness: ParseStrictness,
) -> Result<()> {
let mut iter = f.box_iter();
while let Some(mut b) = iter.next_box()? {
match b.head.name {
BoxType::SampleDescriptionBox => {
let stsd = read_stsd(&mut b, track)?;
let stsd = read_stsd(&mut b, track, strictness)?;
debug!("{:?}", stsd);
track.stsd = Some(stsd);
}
Expand Down Expand Up @@ -4942,7 +4962,11 @@ fn read_flac_metadata<T: Read>(src: &mut BMFFBox<T>) -> Result<FLACMetadataBlock
}

/// See MPEG-4 Systems (ISO 14496-1:2010) § 7.2.6.5
fn find_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
fn find_descriptor(
data: &[u8],
esds: &mut ES_Descriptor,
strictness: ParseStrictness,
) -> Result<()> {
// Tags for elementary stream description
const ESDESCR_TAG: u8 = 0x03;
const DECODER_CONFIG_TAG: u8 = 0x04;
Expand Down Expand Up @@ -4982,13 +5006,13 @@ fn find_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {

match tag {
ESDESCR_TAG => {
read_es_descriptor(descriptor, esds)?;
read_es_descriptor(descriptor, esds, strictness)?;
}
DECODER_CONFIG_TAG => {
read_dc_descriptor(descriptor, esds)?;
read_dc_descriptor(descriptor, esds, strictness)?;
}
DECODER_SPECIFIC_TAG => {
read_ds_descriptor(descriptor, esds)?;
read_ds_descriptor(descriptor, esds, strictness)?;
}
_ => {
debug!("Unsupported descriptor, tag {}", tag);
Expand All @@ -5014,7 +5038,11 @@ fn get_audio_object_type(bit_reader: &mut BitReader) -> Result<u16> {
}

/// See MPEG-4 Systems (ISO 14496-1:2010) § 7.2.6.7; the audio payload (AudioSpecificConfig) is defined in MPEG-4 Audio (ISO 14496-3).
fn read_ds_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
fn read_ds_descriptor(
data: &[u8],
esds: &mut ES_Descriptor,
strictness: ParseStrictness,
) -> Result<()> {
#[cfg(feature = "mp4v")]
// Check if we are in a Visual esds Box.
if esds.video_codec != CodecType::Unknown {
Expand Down Expand Up @@ -5170,7 +5198,10 @@ fn read_ds_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
esds.audio_sample_rate = Some(sample_frequency_value);
esds.audio_channel_count = Some(channel_counts);
if !esds.decoder_specific_data.is_empty() {
return Status::EsdsDecSpecificIntoTagQuantity.into();
fail_with_status_if(
strictness == ParseStrictness::Strict,
Status::EsdsDecSpecificInfoTagQuantity,
)?;
}
esds.decoder_specific_data.extend_from_slice(data)?;

Expand All @@ -5191,7 +5222,11 @@ fn read_surround_channel_count(bit_reader: &mut BitReader, channels: u8) -> Resu
}

/// See MPEG-4 Systems (ISO 14496-1:2010) § 7.2.6.6
fn read_dc_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
fn read_dc_descriptor(
data: &[u8],
esds: &mut ES_Descriptor,
strictness: ParseStrictness,
) -> Result<()> {
let des = &mut Cursor::new(data);
let object_profile = des.read_u8()?;

Expand All @@ -5207,7 +5242,11 @@ fn read_dc_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
skip(des, 12)?;

if data.len().to_u64() > des.position() {
find_descriptor(&data[des.position().try_into()?..data.len()], esds)?;
find_descriptor(
&data[des.position().try_into()?..data.len()],
esds,
strictness,
)?;
}

esds.audio_codec = match object_profile {
Expand All @@ -5225,7 +5264,11 @@ fn read_dc_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
}

/// See MPEG-4 Systems (ISO 14496-1:2010) § 7.2.6.5
fn read_es_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
fn read_es_descriptor(
data: &[u8],
esds: &mut ES_Descriptor,
strictness: ParseStrictness,
) -> Result<()> {
let des = &mut Cursor::new(data);

skip(des, 2)?;
Expand All @@ -5246,20 +5289,24 @@ fn read_es_descriptor(data: &[u8], esds: &mut ES_Descriptor) -> Result<()> {
}

if data.len().to_u64() > des.position() {
find_descriptor(&data[des.position().try_into()?..data.len()], esds)?;
find_descriptor(
&data[des.position().try_into()?..data.len()],
esds,
strictness,
)?;
}

Ok(())
}

/// See MP4 (ISO 14496-14:2020) § 6.7.2
fn read_esds<T: Read>(src: &mut BMFFBox<T>) -> Result<ES_Descriptor> {
fn read_esds<T: Read>(src: &mut BMFFBox<T>, strictness: ParseStrictness) -> Result<ES_Descriptor> {
let (_, _) = read_fullbox_extra(src)?;

let esds_array = read_buf(src, src.bytes_left())?;

let mut es_data = ES_Descriptor::default();
find_descriptor(&esds_array, &mut es_data)?;
find_descriptor(&esds_array, &mut es_data, strictness)?;

es_data.codec_esds = esds_array;

Expand Down Expand Up @@ -5558,7 +5605,7 @@ fn read_video_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry>
{
// Read ES_Descriptor inside an esds box.
// See MPEG-4 Systems (ISO 14496-1:2010) § 7.2.6.5
let esds = read_esds(&mut b)?;
let esds = read_esds(&mut b, ParseStrictness::Normal)?;
codec_specific =
Some(VideoCodecSpecific::ESDSConfig(esds.decoder_specific_data));
}
Expand Down Expand Up @@ -5621,13 +5668,16 @@ fn read_video_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry>
)
}

fn read_qt_wave_atom<T: Read>(src: &mut BMFFBox<T>) -> Result<ES_Descriptor> {
fn read_qt_wave_atom<T: Read>(
src: &mut BMFFBox<T>,
strictness: ParseStrictness,
) -> Result<ES_Descriptor> {
let mut codec_specific = None;
let mut iter = src.box_iter();
while let Some(mut b) = iter.next_box()? {
match b.head.name {
BoxType::ESDBox => {
let esds = read_esds(&mut b)?;
let esds = read_esds(&mut b, strictness)?;
codec_specific = Some(esds);
}
_ => skip_box_content(&mut b)?,
Expand All @@ -5639,7 +5689,10 @@ fn read_qt_wave_atom<T: Read>(src: &mut BMFFBox<T>) -> Result<ES_Descriptor> {

/// Parse an audio description inside an stsd box.
/// See ISOBMFF (ISO 14496-12:2020) § 12.2.3
fn read_audio_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry> {
fn read_audio_sample_entry<T: Read>(
src: &mut BMFFBox<T>,
strictness: ParseStrictness,
) -> Result<SampleEntry> {
let name = src.get_header().name;

// Skip uninteresting fields.
Expand Down Expand Up @@ -5713,7 +5766,7 @@ fn read_audio_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry>
{
return Status::StsdBadAudioSampleEntry.into();
}
let esds = read_esds(&mut b)?;
let esds = read_esds(&mut b, strictness)?;
codec_type = esds.audio_codec;
codec_specific = Some(AudioCodecSpecific::ES_Descriptor(esds));
}
Expand Down Expand Up @@ -5746,7 +5799,7 @@ fn read_audio_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry>
codec_specific = Some(AudioCodecSpecific::ALACSpecificBox(alac));
}
BoxType::QTWaveAtom => {
let qt_esds = read_qt_wave_atom(&mut b)?;
let qt_esds = read_qt_wave_atom(&mut b, strictness)?;
codec_type = qt_esds.audio_codec;
codec_specific = Some(AudioCodecSpecific::ES_Descriptor(qt_esds));
}
Expand Down Expand Up @@ -5799,7 +5852,11 @@ fn read_audio_sample_entry<T: Read>(src: &mut BMFFBox<T>) -> Result<SampleEntry>
/// Parse a stsd box.
/// See ISOBMFF (ISO 14496-12:2020) § 8.5.2
/// See MP4 (ISO 14496-14:2020) § 6.7.2
fn read_stsd<T: Read>(src: &mut BMFFBox<T>, track: &Track) -> Result<SampleDescriptionBox> {
fn read_stsd<T: Read>(
src: &mut BMFFBox<T>,
track: &Track,
strictness: ParseStrictness,
) -> Result<SampleDescriptionBox> {
let (_, flags) = read_fullbox_extra(src)?;

if flags != 0 {
Expand All @@ -5819,7 +5876,7 @@ fn read_stsd<T: Read>(src: &mut BMFFBox<T>, track: &Track) -> Result<SampleDescr
TrackType::Video => read_video_sample_entry(&mut b),
TrackType::Picture => read_video_sample_entry(&mut b),
TrackType::AuxiliaryVideo => read_video_sample_entry(&mut b),
TrackType::Audio => read_audio_sample_entry(&mut b),
TrackType::Audio => read_audio_sample_entry(&mut b, strictness),
TrackType::Metadata => Err(Error::Unsupported("metadata track")),
TrackType::Unknown => Err(Error::Unsupported("unknown track type")),
};
Expand Down
Loading
Loading