pub struct Extractor { /* private fields */ }
Expand description
Extractor for extracting text from different file formats.
The Extractor uses the builder pattern to set configurations. This allows configuring the extractor and extracting text in a single chained expression. For example:
use extractous::{CharSet, Extractor};
let (text, metadata) = Extractor::new()
.set_extract_string_max_length(1000)
.extract_file_to_string("README.md")
.unwrap();
println!("{}", text);
Implementations§
Source§impl Extractor
impl Extractor
pub fn new() -> Self
Sourcepub fn set_extract_string_max_length(self, max_length: i32) -> Self
pub fn set_extract_string_max_length(self, max_length: i32) -> Self
Set the maximum length of the extracted text. Used only by the extract_*_to_string functions. Default: 500_000.
Sourcepub fn set_encoding(self, encoding: CharSet) -> Self
pub fn set_encoding(self, encoding: CharSet) -> Self
Set the encoding to use when extracting text to a stream. Not used by the extract_*_to_string functions. Default: CharSet::UTF_8.
Sourcepub fn set_pdf_config(self, config: PdfParserConfig) -> Self
pub fn set_pdf_config(self, config: PdfParserConfig) -> Self
Set the configuration for the PDF parser
Sourcepub fn set_office_config(self, config: OfficeParserConfig) -> Self
pub fn set_office_config(self, config: OfficeParserConfig) -> Self
Set the configuration for the Office parser
Sourcepub fn set_ocr_config(self, config: TesseractOcrConfig) -> Self
pub fn set_ocr_config(self, config: TesseractOcrConfig) -> Self
Set the configuration for the Tesseract OCR
Sourcepub fn set_xml_output(self, xml_output: bool) -> Self
pub fn set_xml_output(self, xml_output: bool) -> Self
Set whether the extracted content is output as XML.
Sourcepub fn extract_file(
&self,
file_path: &str,
) -> ExtractResult<(StreamReader, Metadata)>
pub fn extract_file( &self, file_path: &str, ) -> ExtractResult<(StreamReader, Metadata)>
Extracts text from the file at the given path. Returns a tuple containing a stream of the extracted text and the metadata.
The stream is decoded using the extractor’s encoding.
Sourcepub fn extract_bytes(
&self,
buffer: &[u8],
) -> ExtractResult<(StreamReader, Metadata)>
pub fn extract_bytes( &self, buffer: &[u8], ) -> ExtractResult<(StreamReader, Metadata)>
Extracts text from a byte buffer. Returns a tuple containing a stream of the extracted text and the metadata.
The stream is decoded using the extractor’s encoding.
Sourcepub fn extract_url(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)>
pub fn extract_url(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)>
Extracts text from a URL. Returns a tuple containing a stream of the extracted text and the metadata.
The stream is decoded using the extractor’s encoding.
Sourcepub fn extract_file_to_string(
&self,
file_path: &str,
) -> ExtractResult<(String, Metadata)>
pub fn extract_file_to_string( &self, file_path: &str, ) -> ExtractResult<(String, Metadata)>
Extracts text from the file at the given path. Returns a tuple containing the extracted string, whose length is at most
the extractor’s extract_string_max_length, and the metadata.
Sourcepub fn extract_bytes_to_string(
&self,
buffer: &[u8],
) -> ExtractResult<(String, Metadata)>
pub fn extract_bytes_to_string( &self, buffer: &[u8], ) -> ExtractResult<(String, Metadata)>
Extracts text from a byte buffer. Returns a tuple containing the extracted string, whose length is at most
the extractor’s extract_string_max_length, and the metadata.
Sourcepub fn extract_url_to_string(
&self,
url: &str,
) -> ExtractResult<(String, Metadata)>
pub fn extract_url_to_string( &self, url: &str, ) -> ExtractResult<(String, Metadata)>
Extracts text from a URL. Returns a tuple containing the extracted string, whose length is at most
the extractor’s extract_string_max_length, and the metadata.