[go: up one dir, main page]

yore/
lib.rs

1use std::borrow::Cow;
2
3use thiserror::Error;
4
5pub mod code_pages;
6pub(crate) mod decoder;
7mod encoder;
8pub(crate) use encoder::Encoder;
9
10#[derive(Error, Debug)]
11#[error("Character in UTF-8 string has no mapping defined in code page")]
12pub struct EncodeError {}
13
14pub trait CodePage: Encoder {
15    /// Encode UTF-8 string into single-byte encoding
16    ///
17    /// Undefined characters will result in [`EncodeError`]
18    ///
19    /// # Examples
20    ///
21    /// ```
22    /// use yore::{CodePage, EncodeError};
23    ///
24    /// // Erase type for example - prefer concrete type over trait object whenever possible
25    /// let cp850: &dyn CodePage = &yore::code_pages::CP850;
26    /// assert_eq!(cp850.encode("text").unwrap(), vec![116, 101, 120, 116]);
27    /// assert!(matches!(cp850.encode("text 🦀"), EncodeError));
28    /// ```
29    #[inline]
30    fn encode<'a>(&self, s: &'a str) -> Result<Cow<'a, [u8]>, EncodeError> {
31        self.encode_helper(s, None)
32    }
33
34    /// Encode UTF-8 string into single-byte encoding
35    ///
36    /// Undefined characters will be replaced with byte `fallback`
37    ///
38    /// # Examples
39    ///
40    /// ```
41    /// use yore::CodePage;
42    ///
43    /// // Erase type for example - prefer concrete type over trait object whenever possible
44    /// let cp850: &dyn CodePage = &yore::code_pages::CP850;
45    /// assert_eq!(cp850.encode_lossy("text 🦀", 168), vec![116, 101, 120, 116, 32, 168])
46    /// ```
47    #[inline]
48    fn encode_lossy<'a>(&self, s: &'a str, fallback: u8) -> Cow<'a, [u8]> {
49        self.encode_helper(s, Some(fallback)).unwrap()
50    }
51
52    /// Decode single-byte encoding into UTF-8 string
53    ///
54    /// Undefined codepoints will result in [`DecodeError`]
55    ///
56    /// # Examples
57    ///
58    /// ```
59    /// use yore::{CodePage, DecodeError};
60    ///
61    /// // Erase types for example - prefer concrete type over trait object whenever possible
62    /// let cp850: &dyn CodePage = &yore::code_pages::CP850;
63    /// let cp857: &dyn CodePage = &yore::code_pages::CP857;
64    /// assert_eq!(cp850.decode(&[116, 101, 120, 116]).unwrap(), "text");
65    ///
66    /// //codepoint 231 is undefined
67    /// assert!(matches!(cp857.decode(&[116, 101, 120, 116, 231]), Err(DecodeError{position: 4, value: 231})));
68    /// ```
69    fn decode<'a>(&self, bytes: &'a [u8]) -> Result<Cow<'a, str>, DecodeError>;
70
71    /// Decode single-byte encoding into UTF-8 string
72    ///
73    /// Undefined codepoints will be replaced with `'�'`
74    ///
75    /// # Examples
76    ///
77    /// ```
78    /// use yore::CodePage;
79    ///
80    /// // Erase type for example - prefer concrete type over trait object whenever possible
81    /// let cp857: &dyn CodePage = &yore::code_pages::CP857;
82    /// //codepoint 231 is undefined
83    /// assert_eq!(cp857.decode_lossy(&[116, 101, 120, 116, 32, 231]), "text �");
84    /// ```
85    #[inline(always)]
86    fn decode_lossy<'a>(&self, bytes: &'a [u8]) -> Cow<'a, str> {
87        self.decode(bytes).unwrap()
88    }
89
90    /// Decode single-byte encoding into UTF-8 string
91    ///
92    /// Undefined codepoints will be replaced with `fallback`
93    ///
94    /// # Examples
95    ///
96    /// ```
97    /// use yore::CodePage;
98    ///
99    /// // Erase type for example - prefer concrete type over trait object whenever possible
100    /// let cp857: &dyn CodePage = &yore::code_pages::CP857;
101    /// //codepoint 231 is undefined
102    /// assert_eq!(cp857.decode_lossy_fallback(&[116, 101, 120, 116, 32, 231], '�'), "text �");
103    /// ```
104    #[inline(always)]
105    fn decode_lossy_fallback<'a>(&self, bytes: &'a [u8], _fallback: char) -> Cow<'a, str> {
106        self.decode(bytes).unwrap()
107    }
108}
109
110#[derive(Error, Debug)]
111#[error("Undefined codepoint {value} at offset {position}")]
112pub struct DecodeError {
113    pub position: usize,
114    pub value: u8,
115}
116
#[cfg(test)]
mod tests {
    use crate::code_pages::CP864;

    /// Byte 0x25 decodes to '٪' in CP864, while both '٪' and the standard '%'
    /// encode to 0x25.
    #[test]
    fn test_nonstandard_ascii() {
        // CP864 has a nonstandard mapping for 0x25
        let short_bytes = [0x25, 253];
        let nonstandard_text = "٪ﻱ";
        assert_eq!(CP864.decode(&short_bytes).unwrap(), nonstandard_text);
        assert_eq!(short_bytes, *CP864.encode(nonstandard_text).unwrap());

        // Standard '%' should still map to 0x25
        let standard_text = "%ﻱ";
        assert_eq!(short_bytes, *CP864.encode(standard_text).unwrap());

        // Should decode to nonstandard, even if whole usize-len is ascii
        let long_text = "AAAAAAA٪";
        let long_bytes = [65, 65, 65, 65, 65, 65, 65, 0x25];
        assert_eq!(CP864.decode(&long_bytes).unwrap(), long_text);
    }
}