use std::collections::BTreeMap;
use std::io::{Read, Seek};

use crate::{
    skip_box, BoxHeader, BoxType, EmsgBox, Error, FtypBox, MoofBox, MoovBox, ReadBox, Result,
    StblBox, StsdBoxContent, TfhdBox, TrackId, TrackKind, TrakBox, TrunBox,
};

/// A parsed MP4 file.
#[derive(Debug)]
pub struct Mp4 {
    pub ftyp: FtypBox,
    pub moov: MoovBox,
    pub moofs: Vec<MoofBox>,
    pub emsgs: Vec<EmsgBox>,
    tracks: BTreeMap<TrackId, Track>,
}

impl Mp4 {
    /// Parses the contents of a byte slice as MP4 data.
    ///
    /// Sample ranges returned by the resulting [`Mp4`] should be used with the same input buffer.
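    ///
    /// A minimal usage sketch (the crate path `mp4` and the file name are placeholders,
    /// so the example is marked `ignore`):
    ///
    /// ```ignore
    /// let bytes = std::fs::read("video.mp4")?;
    /// let mp4 = mp4::Mp4::read_bytes(&bytes)?;
    /// for (track_id, track) in mp4.tracks() {
    ///     for sample in &track.samples {
    ///         // Sample byte ranges index into the same buffer that was parsed.
    ///         let data = &bytes[sample.byte_range()];
    ///         println!("track {track_id}: sample {} is {} bytes", sample.id, data.len());
    ///     }
    /// }
    /// ```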
    pub fn read_bytes(bytes: &[u8]) -> Result<Self> {
        let mp4 = Self::read(std::io::Cursor::new(bytes), bytes.len() as u64)?;
        Ok(mp4)
    }

    /// Reads the contents of a file as MP4 data, and returns both the parsed MP4 and its raw data.
    ///
    /// Sample ranges returned by the resulting [`Mp4`] should be used with the same input buffer.
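    ///
    /// A minimal usage sketch (the crate path `mp4` and the file name are placeholders,
    /// so the example is marked `ignore`):
    ///
    /// ```ignore
    /// let (mp4, bytes) = mp4::Mp4::read_file("video.mp4")?;
    /// for track in mp4.tracks().values() {
    ///     println!("track {}: {} samples", track.track_id, track.samples.len());
    /// }
    /// ```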
    pub fn read_file(file_path: impl AsRef<std::path::Path>) -> Result<(Self, Vec<u8>)> {
        let bytes = std::fs::read(file_path)?;
        Ok((Self::read_bytes(&bytes)?, bytes))
    }

    pub fn read<R: Read + Seek>(mut reader: R, size: u64) -> Result<Self> {
        let start = reader.stream_position()?;

        let mut ftyp = None;
        let mut moov = None;
        let mut moofs = Vec::new();
        let mut moof_offsets = Vec::new();
        let mut emsgs = Vec::new();

        let mut current = start;
        while current < size {
            // Get box header.
            let header = BoxHeader::read(&mut reader)?;
            let BoxHeader { name, size: s } = header;
            if s > size {
                return Err(Error::InvalidData(
                    "file contains a box with a larger size than it",
                ));
            }

            // Break on a zero-sized box header, which would otherwise cause an infinite loop.
            if s == 0 {
                break;
            }

            // Match and parse the atom boxes.
            match name {
                BoxType::FtypBox => {
                    ftyp = Some(FtypBox::read_box(&mut reader, s)?);
                }
                BoxType::FreeBox => {
                    skip_box(&mut reader, s)?;
                }
                BoxType::MdatBox => {
                    skip_box(&mut reader, s)?;
                }
                BoxType::MoovBox => {
                    moov = Some(MoovBox::read_box(&mut reader, s)?);
                }
                BoxType::MoofBox => {
                    let moof_offset = reader.stream_position()? - 8;
                    let moof = MoofBox::read_box(&mut reader, s)?;
                    moofs.push(moof);
                    moof_offsets.push(moof_offset);
                }
                BoxType::EmsgBox => {
                    let emsg = EmsgBox::read_box(&mut reader, s)?;
                    emsgs.push(emsg);
                }
                _ => {
                    // XXX warn!()
                    skip_box(&mut reader, s)?;
                }
            }
            current = reader.stream_position()?;
        }

        let Some(ftyp) = ftyp else {
            return Err(Error::BoxNotFound(BoxType::FtypBox));
        };
        let Some(moov) = moov else {
            return Err(Error::BoxNotFound(BoxType::MoovBox));
        };

        let mut this = Self {
            ftyp,
            moov,
            moofs,
            emsgs,
            tracks: Default::default(),
        };

        let mut tracks = this.build_tracks();
        this.update_sample_list(&mut tracks)?;
        this.tracks = tracks;
        this.update_tracks();

        Ok(this)
    }

    /// All tracks found in the file, keyed by track ID.
    pub fn tracks(&self) -> &BTreeMap<TrackId, Track> {
        &self.tracks
    }

    /// Process each `trak` box to obtain a list of samples for each track.
    ///
    /// Note that the list will be incomplete if the file is fragmented.
    fn build_tracks(&mut self) -> BTreeMap<TrackId, Track> {
        let mut tracks = BTreeMap::new();

        // load samples from traks
        for trak in &self.moov.traks {
            let mut sample_n = 0usize;
            let mut chunk_index = 1u64;
            let mut chunk_run_index = 0usize;
            let mut last_sample_in_chunk = 0u64;
            let mut offset_in_chunk = 0u64;
            let mut last_chunk_in_run = 0u64;
            let mut last_sample_in_stts_run = -1i64;
            let mut stts_run_index = -1i64;
            let mut last_stss_index = 0;
            let mut last_sample_in_ctts_run = -1i64;
            let mut ctts_run_index = -1i64;
            let mut dts_shift = 0;

            // The smallest presentation timestamp observed in this stream.
            //
            // This is typically 0, but in the presence of sample reordering (caused by AVC/HEVC B-frames), it may be non-zero.
            // In fact, many formats don't require this to be zero, but video players typically
            // normalize the shown time to start at zero.
            // This is roughly equivalent to FFmpeg's internal `min_corrected_pts`
            // https://github.com/FFmpeg/FFmpeg/blob/4047b887fc44b110bccb1da09bcb79d6e454b88b/libavformat/isom.h#L202
            // To learn more about this I recommend reading the patch that introduced this in FFmpeg:
            // https://patchwork.ffmpeg.org/project/ffmpeg/patch/20170606181601.25187-1-isasi@google.com/#12592
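            //
            // Illustrative (made-up) numbers: with unsigned `ctts` offsets of [1, 2, 0] and decode
            // timestamps of [0, 1, 2], the composition timestamps come out as [1, 3, 2]; the smallest
            // one is 1, so both DTS and CTS are shifted down by 1 further below.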
            let mut min_composition_timestamp = i64::MAX;

            let mut samples = Vec::<Sample>::new();

            // Chunk offsets are stored in either the 32-bit `stco` box or the 64-bit `co64` box.
            fn get_sample_chunk_offset(stbl: &StblBox, chunk_index: u64) -> u64 {
                if let Some(stco) = &stbl.stco {
                    stco.entries[chunk_index as usize - 1] as u64
                } else if let Some(co64) = &stbl.co64 {
                    co64.entries[chunk_index as usize - 1]
                } else {
                    panic!("`stbl` box must contain either an `stco` or a `co64` box")
                }
            }

            let stbl = &trak.mdia.minf.stbl;
            let stsc = &stbl.stsc;
            let stsz = &stbl.stsz;
            let stts = &stbl.stts;

            while sample_n < stsz.sample_sizes.len() {
                // compute offset
                if sample_n == 0 {
                    chunk_index = 1;
                    chunk_run_index = 0;
                    last_sample_in_chunk = stsc.entries[chunk_run_index].samples_per_chunk as u64;
                    offset_in_chunk = 0;

                    if chunk_run_index + 1 < stsc.entries.len() {
                        last_chunk_in_run =
                            stsc.entries[chunk_run_index + 1].first_chunk as u64 - 1;
                    } else {
                        last_chunk_in_run = u64::MAX;
                    }
                } else if sample_n < last_sample_in_chunk as usize {
                    // Still in the same chunk as the previous sample: nothing to advance here.
                } else {
                    chunk_index += 1;
                    offset_in_chunk = 0;
                    if chunk_index > last_chunk_in_run {
                        chunk_run_index += 1;
                        if chunk_run_index + 1 < stsc.entries.len() {
                            last_chunk_in_run =
                                stsc.entries[chunk_run_index + 1].first_chunk as u64 - 1;
                        } else {
                            last_chunk_in_run = u64::MAX;
                        }
                    }

                    last_sample_in_chunk += stsc.entries[chunk_run_index].samples_per_chunk as u64;
                }

                // compute timestamp, duration, is_sync
                if sample_n as i64 > last_sample_in_stts_run {
                    stts_run_index += 1;
                    if last_sample_in_stts_run < 0 {
                        last_sample_in_stts_run = 0;
                    }
                    last_sample_in_stts_run +=
                        stts.entries[stts_run_index as usize].sample_count as i64;
                }

                let timescale = trak.mdia.mdhd.timescale as u64;
                let size = stsz.sample_sizes[sample_n] as u64;
                let offset = get_sample_chunk_offset(stbl, chunk_index) + offset_in_chunk;
                offset_in_chunk += size;

                let decode_timestamp = if sample_n > 0 {
                    samples[sample_n - 1].duration =
                        stts.entries[stts_run_index as usize].sample_delta as u64;

                    samples[sample_n - 1].decode_timestamp + samples[sample_n - 1].duration as i64
                } else {
                    0
                };

                let composition_timestamp = if let Some(ctts) = &stbl.ctts {
                    if sample_n as i64 >= last_sample_in_ctts_run {
                        ctts_run_index += 1;
                        if last_sample_in_ctts_run < 0 {
                            last_sample_in_ctts_run = 0;
                        }
                        last_sample_in_ctts_run +=
                            ctts.entries[ctts_run_index as usize].sample_count as i64;
                    }

                    // dts shift is determined by the smallest negative sample offset:
                    // https://github.com/FFmpeg/FFmpeg/blob/455db6fe109cf905fe518ea2690495948937438f/libavformat/mov.c#L3671
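                    //
                    // Illustrative (made-up) numbers: with sample offsets of [-2, 0, 1], the largest
                    // required shift is 2, and every decode timestamp is moved back by 2 further below.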
                    let offset = ctts.entries[ctts_run_index as usize].sample_offset as i64;
                    if offset < 0 {
                        dts_shift = dts_shift.max(-offset);
                    }

                    decode_timestamp + offset
                } else {
                    decode_timestamp
                };
                min_composition_timestamp = min_composition_timestamp.min(composition_timestamp);

                let is_sync = if let Some(stss) = &stbl.stss {
                    if last_stss_index < stss.entries.len()
                        && sample_n == stss.entries[last_stss_index] as usize - 1
                    {
                        last_stss_index += 1;
                        true
                    } else {
                        false
                    }
                } else {
                    true
                };

                samples.push(Sample {
                    id: samples.len() as u32,
                    timescale,
                    size,
                    offset,
                    decode_timestamp,
                    composition_timestamp,
                    is_sync,
                    duration: 0, // filled once we know next sample timestamp
                });
                sample_n += 1;
            }

            if let Some(last_sample) = samples.last_mut() {
                last_sample.duration =
                    trak.mdia.mdhd.duration - last_sample.decode_timestamp as u64;
            }

            // Fixup all DTS by the dts shift if there's one.
            // https://github.com/FFmpeg/FFmpeg/blob/455db6fe109cf905fe518ea2690495948937438f/libavformat/mov.c#L4271
            if dts_shift > 0 {
                for sample in &mut samples {
                    sample.decode_timestamp -= dts_shift;
                }
            }

            // Shift both DTS & CTS by the smallest CTS.
            // For details, see declaration of `min_composition_timestamp` above.
            if min_composition_timestamp != 0 {
                for sample in &mut samples {
                    sample.decode_timestamp -= min_composition_timestamp;
                    sample.composition_timestamp -= min_composition_timestamp;
                }
            }

            tracks.insert(
                trak.tkhd.track_id,
                Track {
                    track_id: trak.tkhd.track_id,
                    width: trak.tkhd.width.value(),
                    height: trak.tkhd.height.value(),
                    first_traf_merged: false,
                    timescale: trak.mdia.mdhd.timescale as u64,
                    duration: trak.mdia.mdhd.duration,
                    kind: trak.mdia.minf.stbl.stsd.kind(),
                    samples,
                },
            );
        }

        tracks
    }

    /// If the input file is fragmented, it contains one or more `moof` boxes,
    /// which must be processed to obtain the full list of samples for each track.
    fn update_sample_list(&mut self, tracks: &mut BTreeMap<TrackId, Track>) -> Result<()> {
        let mut last_run_position = 0;

        for moof in &self.moofs {
            // process moof to update sample list
            for traf in &moof.trafs {
                let track_id = traf.tfhd.track_id;
                let track = tracks
                    .get_mut(&track_id)
                    .ok_or(Error::TrakNotFound(track_id))?;
                let trak = self
                    .moov
                    .traks
                    .iter()
                    .find(|trak| trak.tkhd.track_id == track_id)
                    .ok_or(Error::TrakNotFound(track_id))?;
                let trex = if let Some(mvex) = &self.moov.mvex {
                    mvex.trexs
                        .iter()
                        .find(|trex| trex.track_id == track_id)
                        .ok_or(Error::BoxInTrafNotFound(track_id, BoxType::TrexBox))?
                        .clone()
                } else {
                    Default::default()
                };

                let default_sample_duration = traf
                    .tfhd
                    .default_sample_duration
                    .unwrap_or(trex.default_sample_duration);
                let default_sample_size = traf
                    .tfhd
                    .default_sample_size
                    .unwrap_or(trex.default_sample_size);
                let default_sample_flags = traf
                    .tfhd
                    .default_sample_flags
                    .unwrap_or(trex.default_sample_flags);

                for (traf_idx, trun) in traf.truns.iter().enumerate() {
                    for sample_n in 0..trun.sample_count as usize {
                        let mut sample_flags = default_sample_flags;
                        if trun.flags & TrunBox::FLAG_SAMPLE_FLAGS != 0 {
                            sample_flags = trun
                                .sample_flags
                                .get(sample_n)
                                .copied()
                                .unwrap_or(sample_flags);
                        } else if sample_n == 0
                            && (trun.flags & TrunBox::FLAG_FIRST_SAMPLE_FLAGS != 0)
                        {
                            sample_flags = trun.first_sample_flags.unwrap_or(sample_flags);
                        }

                        let mut decode_timestamp = 0;
                        if track.first_traf_merged || sample_n > 0 {
                            let prev = &track.samples[track.samples.len() - 1];
                            decode_timestamp = prev.decode_timestamp + prev.duration as i64;
                        } else {
                            if let Some(tfdt) = &traf.tfdt {
                                decode_timestamp = tfdt.base_media_decode_time as i64;
                            }
                            track.first_traf_merged = true;
                        }

                        let composition_timestamp = if trun.flags & TrunBox::FLAG_SAMPLE_CTS != 0 {
                            decode_timestamp
                                + trun.sample_cts.get(sample_n).copied().unwrap_or(0) as i64
                        } else {
                            decode_timestamp
                        };

                        let duration = trun
                            .sample_durations
                            .get(sample_n)
                            .copied()
                            .unwrap_or(default_sample_duration)
                            as u64;

                        let base_data_offset_present =
                            traf.tfhd.flags & TfhdBox::FLAG_BASE_DATA_OFFSET != 0;
                        let default_base_is_moof =
                            traf.tfhd.flags & TfhdBox::FLAG_DEFAULT_BASE_IS_MOOF != 0;
                        let data_offset_present = trun.flags & TrunBox::FLAG_DATA_OFFSET != 0;
                        let base_data_offset = if !base_data_offset_present {
                            if !default_base_is_moof {
                                if sample_n == 0 {
                                    // the first sample in the track fragment
                                    moof.start // the position of the first byte of the enclosing Movie Fragment Box
                                } else {
                                    last_run_position // the offset of the previous sample
                                }
                            } else {
                                moof.start
                            }
                        } else {
                            traf.tfhd.base_data_offset.unwrap_or(moof.start)
                        };

                        let sample_size =
                            trun.sample_sizes
                                .get(sample_n)
                                .copied()
                                .unwrap_or(default_sample_size) as u64;

                        // Sample offset in bytes. (Must be positive, otherwise this would be outside of the file.)
                        let sample_offset = if traf_idx == 0 && sample_n == 0 {
                            if data_offset_present {
                                base_data_offset
                                    .saturating_add_signed(trun.data_offset.unwrap_or(0) as i64)
                            } else {
                                base_data_offset
                            }
                        } else {
                            last_run_position
                        };

                        last_run_position = sample_offset + sample_size;

                        track.samples.push(Sample {
                            id: track.samples.len() as u32,
                            // Bit 16 of the sample flags is `sample_is_non_sync_sample`,
                            // so the sample is a sync sample when that bit is cleared.
                            is_sync: (sample_flags >> 16) & 0x1 == 0,
                            size: sample_size,
                            offset: sample_offset,
                            timescale: trak.mdia.mdhd.timescale as u64,
                            decode_timestamp,
                            composition_timestamp,
                            duration,
                        });
                    }
                }
            }
        }

        Ok(())
    }

    /// Update track metadata after all samples have been read
    fn update_tracks(&mut self) {
        for track in self.tracks.values_mut() {
            if track.duration == 0 {
                track.duration = track
                    .samples
                    .last()
                    .map(|v| v.duration.saturating_add_signed(v.composition_timestamp))
                    .unwrap_or_default();
            }
        }
    }
}

/// A single track in an MP4 file, together with its samples.
pub struct Track {
    /// Internal field used when decoding a fragmented MP4 file.
    first_traf_merged: bool,

    pub width: u16,
    pub height: u16,

    pub track_id: u32,

    /// Timescale of the track.
    ///
    /// One time unit is equal to `1.0 / timescale` seconds.
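    ///
    /// For example, with a timescale of `90_000`, a duration of `450_000` time units is 5 seconds.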
    pub timescale: u64,

    /// Duration of the track in time units.
    pub duration: u64,

    pub kind: Option<TrackKind>,

    /// List of samples in the track.
    pub samples: Vec<Sample>,
}

impl Track {
    pub fn trak<'a>(&self, mp4: &'a Mp4) -> &'a TrakBox {
        let Some(trak) = mp4
            .moov
            .traks
            .iter()
            .find(|trak| trak.tkhd.track_id == self.track_id)
        else {
            // `Track` structs are only constructed when we have `trak` boxes,
            // so unless the user removes the `trak` box from the `Mp4`, it
            // will always be present.
            unreachable!("track with id \"{}\" not found", self.track_id);
        };

        trak
    }

    /// Returns the raw contents of the codec-specific configuration box for this track
    /// (`av1C`, `avcC`, `hvcC`, or `vpcC`), if there is one.
    pub fn raw_codec_config(&self, mp4: &Mp4) -> Option<Vec<u8>> {
        let sample_description = &self.trak(mp4).mdia.minf.stbl.stsd;

        match &sample_description.contents {
            StsdBoxContent::Av01(content) => Some(content.av1c.raw.clone()),
            StsdBoxContent::Avc1(content) => Some(content.avcc.raw.clone()),
            StsdBoxContent::Hev1(content) | StsdBoxContent::Hvc1(content) => {
                Some(content.hvcc.raw.clone())
            }
            StsdBoxContent::Vp08(content) => Some(content.vpcc.raw.clone()),
            StsdBoxContent::Vp09(content) => Some(content.vpcc.raw.clone()),
            StsdBoxContent::Mp4a(_) | StsdBoxContent::Tx3g(_) | StsdBoxContent::Unknown(_) => None,
        }
    }

    /// Returns the codec string of this track, if known.
    pub fn codec_string(&self, mp4: &Mp4) -> Option<String> {
        self.trak(mp4).mdia.minf.stbl.stsd.contents.codec_string()
    }
}

/// A single sample in a track (for video, typically one frame).
#[derive(Default, Clone, Copy)]
pub struct Sample {
    /// Sample number (the 0-based index of the sample within its track).
    pub id: u32,

    /// Whether an entire frame can be decoded from this sample alone,
    /// or whether it needs the context of other samples.
    pub is_sync: bool,

    /// Size of the sample in bytes.
    pub size: u64,

    /// Offset of the sample in bytes from the start of the MP4 file.
    pub offset: u64,

    /// Timescale of the sample.
    ///
    /// One time unit is equal to `1.0 / timescale` seconds.
    pub timescale: u64,

    /// Timestamp at which the sample should be decoded,
    /// in time units.
    ///
    /// This is offset:
    /// * by the decode-timestamp shift determined from negative sample offsets
    /// * so that the first [`Self::composition_timestamp`] is zero.
    pub decode_timestamp: i64,

    /// Timestamp at which the sample should be displayed,
    /// in time units.
    ///
    /// This is offset such that the first composition timestamp is zero.
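    ///
    /// To convert to seconds, divide by [`Self::timescale`],
    /// e.g. `composition_timestamp as f64 / timescale as f64`.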
    pub composition_timestamp: i64,

    /// Duration of the sample in time units.
    pub duration: u64,
}

impl Sample {
    /// Returns the range of bytes in the input data that this sample covers.
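    ///
    /// A minimal sketch of slicing a sample out of the parsed buffer (the variable names here are
    /// placeholders, so the example is marked `ignore`):
    ///
    /// ```ignore
    /// // `bytes` must be the same buffer that was passed to e.g. `Mp4::read_bytes`.
    /// let sample_data: &[u8] = &bytes[sample.byte_range()];
    /// ```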
    pub fn byte_range(&self) -> std::ops::Range<usize> {
        self.offset as usize..(self.offset + self.size) as usize
    }
}

impl std::fmt::Debug for Track {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Track")
            .field("first_traf_merged", &self.first_traf_merged)
            .field("kind", &self.kind)
            .field("timescale", &self.timescale)
            .field("duration", &self.duration)
            .finish()
    }
}

impl std::fmt::Debug for Sample {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Sample")
            .field("is_sync", &self.is_sync)
            .field("size", &self.size)
            .field("offset", &self.offset)
            .field("decode_timestamp", &self.decode_timestamp)
            .field("composition_timestamp", &self.composition_timestamp)
            .field("duration", &self.duration)
            .finish()
    }
}