// content_inspector 0.2.4

//! A simple library for *fast* inspection of binary buffers to guess the type of content.
//!
//! This is mainly intended to quickly determine whether a given buffer contains "binary"
//! or "text" data. Programs like `grep` or `git diff` use similar mechanisms to decide whether
//! to treat some files as "binary data" or not.
//!
//! The analysis is based on a very simple heuristic: Searching for NULL bytes
//! (indicating "binary" content) and the detection of special [byte order
//! marks](https://en.wikipedia.org/wiki/Byte_order_mark) (indicating a particular kind of textual
//! encoding). Note that **this analysis can fail**. For example, even if unlikely, UTF-8-encoded
//! text can legally contain NULL bytes. Conversely, some particular binary formats (like binary
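//! [PGM](https://en.wikipedia.org/wiki/Netpbm_format)) may not contain NULL bytes. Also, for
//! performance reasons, only the first 1024 bytes are checked for NULL bytes (if no BOM was
//! detected). Buffers without a BOM that start with the magic number of a PDF document or a PNG
//! image are additionally reported as binary.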
//!
//! If this library reports a certain type of encoding (say `UTF_16LE`), there is **no guarantee**
//! that the binary buffer can *actually* be decoded as UTF-16LE.
//!
//! # Example
//! ```
//! use content_inspector::{ContentType, inspect};
//!
//! assert_eq!(ContentType::UTF_8, inspect(b"Hello"));
//! assert_eq!(ContentType::BINARY, inspect(b"\xFF\xE0\x00\x10\x4A\x46\x49\x46\x00"));
//!
//! assert!(inspect(b"Hello").is_text());
//! ```

extern crate memchr;

use memchr::memchr;
use std::cmp::min;
use std::fmt;

/// Upper bound on the number of leading bytes of a buffer that `inspect` scans for zero bytes.
const MAX_SCAN_SIZE: usize = 1024;

/// The type of encoding that was detected (for "text" data) or `BINARY` for "binary" data.
///
/// This is a plain, field-less tag: it is `Copy`, supports full equality (`Eq`) and can be
/// hashed, so it may be used as a key in `HashMap`/`HashSet` or stored in other collections that
/// require those traits.
// The variants are spelled like the encoding labels they stand for (`UTF_8`, `UTF_16LE`, ...),
// which does not follow Rust's CamelCase convention; the lint is silenced for that reason.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum ContentType {
    /// "binary" data
    BINARY,

    /// UTF-8 encoded "text" data (no byte order mark)
    UTF_8,

    /// UTF-8 encoded "text" data with a byte order mark.
    UTF_8_BOM,

    /// UTF-16 encoded "text" data (little endian)
    UTF_16LE,

    /// UTF-16 encoded "text" data (big endian)
    UTF_16BE,

    /// UTF-32 encoded "text" data (little endian)
    UTF_32LE,

    /// UTF-32 encoded "text" data (big endian)
    UTF_32BE,
}

impl ContentType {
    /// Reports whether this value is the `BINARY` variant.
    pub fn is_binary(self) -> bool {
        // Spelled out exhaustively so that adding a variant forces a decision here.
        match self {
            ContentType::BINARY => true,
            ContentType::UTF_8
            | ContentType::UTF_8_BOM
            | ContentType::UTF_16LE
            | ContentType::UTF_16BE
            | ContentType::UTF_32LE
            | ContentType::UTF_32BE => false,
        }
    }

    /// Reports whether this value is any variant other than `BINARY`, i.e. one of the text
    /// encodings.
    pub fn is_text(self) -> bool {
        self != ContentType::BINARY
    }
}

impl fmt::Display for ContentType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use ContentType::*;

        let name: &str = match *self {
            BINARY => "binary",
            UTF_8 => "UTF-8",
            UTF_8_BOM => "UTF-8-BOM",
            UTF_16LE => "UTF-16LE",
            UTF_16BE => "UTF-16BE",
            UTF_32LE => "UTF-32LE",
            UTF_32BE => "UTF-32BE",
        };
        write!(f, "{}", name)
    }
}

/// Byte order marks that are recognised at the start of a buffer, each paired with the content
/// type it indicates (see <https://en.wikipedia.org/wiki/Byte_order_mark>).
///
/// The order of the entries matters: the table is meant to be searched front to back, taking
/// the first mark that the buffer starts with.
static BYTE_ORDER_MARKS: &[(&[u8], ContentType)] = &[
    (b"\xEF\xBB\xBF", ContentType::UTF_8_BOM),
    // The UTF-32LE mark begins with the same two bytes as the UTF-16LE mark, so the four-byte
    // UTF-32 marks have to be listed ahead of the two-byte UTF-16 marks.
    (b"\x00\x00\xFE\xFF", ContentType::UTF_32BE),
    (b"\xFF\xFE\x00\x00", ContentType::UTF_32LE),
    (b"\xFE\xFF", ContentType::UTF_16BE),
    (b"\xFF\xFE", ContentType::UTF_16LE),
];

/// Magic numbers for some filetypes that could otherwise be characterized as text.
///
/// Each entry is a byte prefix; the two listed here are the leading bytes `%PDF` and
/// `\x89PNG`. Neither prefix contains a zero byte, so a buffer starting with one of them is not
/// necessarily caught by a zero-byte scan.
static MAGIC_NUMBERS: [&[u8]; 2] = [b"%PDF", b"\x89PNG"];

/// Try to determine the type of content in the given buffer. See the crate documentation for a
/// usage example and for more details on how this analysis is performed.
///
/// If the buffer is empty, the content type will be reported as `UTF_8`.
pub fn inspect(buffer: &[u8]) -> ContentType {
    use ContentType::*;

    for &(bom, content_type) in BYTE_ORDER_MARKS {
        if buffer.starts_with(bom) {
            return content_type;
        }
    }

    // Scan the first few bytes for zero-bytes
    let scan_size = min(buffer.len(), MAX_SCAN_SIZE);
    let has_zero_bytes = memchr(0x00, &buffer[..scan_size]).is_some();

    if has_zero_bytes {
        return BINARY;
    }

    if MAGIC_NUMBERS.iter().any(|magic| buffer.starts_with(magic)) {
        return BINARY;
    }

    UTF_8
}

// Unit tests for `inspect` and the `ContentType` helper methods.
//
// Several tests embed fixture files from `../testdata` at compile time via `include_bytes!`,
// so those files must be present for the test build to succeed.
#[cfg(test)]
mod tests {
    use {inspect, ContentType::*};

    // An empty buffer is reported as UTF-8.
    #[test]
    fn test_empty_buffer_utf_8() {
        assert_eq!(UTF_8, inspect(b""));
    }

    // A short in-memory string containing a non-ASCII character is reported as UTF-8.
    #[test]
    fn test_text_simple() {
        assert_eq!(UTF_8, inspect("Simple UTF-8 string ☔".as_bytes()));
    }

    // The UTF-8 fixture file (no BOM) is reported as UTF-8.
    #[test]
    fn test_text_utf8() {
        assert_eq!(UTF_8, inspect(include_bytes!("../testdata/text_UTF-8.txt")));
    }

    // The UTF-8 fixture file with a BOM is reported as UTF-8 with BOM.
    #[test]
    fn test_text_utf8_bom() {
        assert_eq!(
            UTF_8_BOM,
            inspect(include_bytes!("../testdata/text_UTF-8-BOM.txt"))
        );
    }

    // The UTF-16 little-endian fixture (with BOM) is reported as UTF-16LE.
    #[test]
    fn test_text_utf16le() {
        assert_eq!(
            UTF_16LE,
            inspect(include_bytes!("../testdata/text_UTF-16LE-BOM.txt"))
        );
    }

    // The UTF-16 big-endian fixture (with BOM) is reported as UTF-16BE.
    #[test]
    fn test_text_utf16be() {
        assert_eq!(
            UTF_16BE,
            inspect(include_bytes!("../testdata/text_UTF-16BE-BOM.txt"))
        );
    }

    // The UTF-32 little-endian fixture (with BOM) is reported as UTF-32LE.
    #[test]
    fn test_text_utf32le() {
        assert_eq!(
            UTF_32LE,
            inspect(include_bytes!("../testdata/text_UTF-32LE-BOM.txt"))
        );
    }

    // The UTF-32 big-endian fixture (with BOM) is reported as UTF-32BE.
    #[test]
    fn test_text_utf32be() {
        assert_eq!(
            UTF_32BE,
            inspect(include_bytes!("../testdata/text_UTF-32BE-BOM.txt"))
        );
    }

    // The PNG fixture is reported as binary.
    #[test]
    fn test_png() {
        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.png")));
    }

    // The JPEG fixture is reported as binary.
    #[test]
    fn test_jpg() {
        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.jpg")));
    }

    // The PDF fixture is reported as binary.
    #[test]
    fn test_pdf() {
        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.pdf")));
    }

    // Text encodings, with or without a BOM-based variant, count as text.
    #[test]
    fn test_is_text() {
        assert!(UTF_8.is_text());
        assert!(UTF_32LE.is_text());
    }

    // The `BINARY` variant counts as binary.
    #[test]
    fn test_is_binary() {
        assert!(BINARY.is_binary());
    }
}