diff --git a/Cargo.lock b/Cargo.lock index 2bc8ec6c327639d9647f7c06b2a970b7356ac546..ec56aac3bd001e426bb56eac065903049031610e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,7 @@ dependencies = [ "pretty_assertions", "regex", "reqwest", + "sanitize-filename", "scraper", "serde", "serde_json", @@ -114,7 +115,9 @@ dependencies = [ "termination", "thiserror", "tokio", + "unicode-width", "url", + "urlencoding", ] [[package]] @@ -1545,6 +1548,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "sanitize-filename" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ed72fbaf78e6f2d41744923916966c4fbe3d7c74e3037a8ee482f1115572603" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "schannel" version = "0.1.23" @@ -2060,6 +2073,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index 1e428042c3612528f7ed6d68cd7adb069b9386bd..a56e976543d8a750c2794d81f4e497a4a332402e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ itertools = "0.11.0" num_cpus = "1.16.0" regex = "1.10.2" reqwest = "0.11.22" +sanitize-filename = "0.5.0" scraper = "0.18.1" serde = { version = "1.0.190", features = ["derive"] } serde_json = "1.0.108" @@ -32,7 +33,9 @@ serde_yaml = "0.9.27" termination = "0.1.2" thiserror = "1.0.50" tokio = { version = "1.33.0", features = ["full"] } +unicode-width = "0.1.11" url = "2.4.1" +urlencoding = "2.1.3" [dev-dependencies] assert_cmd = "2.0.12" diff --git a/README.md b/README.md index 04090076479cb604a9f48cb892dc5598dfc01bad..a404e625767a080de62b158b60c0c4c7b59851e9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # archwiki-rs 📖 A CLI tool to read pages from the ArchWiki -## Table of contents + + - [Installation](#installation) * [crates.io](#cratesio) * [Source](#source) @@ -10,18 +11,22 @@ A CLI tool to read pages from the ArchWiki + [Basic request](#basic-request) + [Using a different format](#using-a-different-format) + [Caching](#caching) - + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--̥̥̥n-̥̥̥-) + + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--%CC%A5%CC%A5%CC%A5n-%CC%A5%CC%A5%CC%A5-) * [Searching the ArchWiki](#searching-the-archwiki) + [Search by title](#search-by-title) + [Search for text](#search-for-text) * [Downloading wiki info](#downloading-wiki-info) - + [Possible speed-ups](#possible-speed-ups) * [Listing ArchWiki information](#listing-archwiki-information) + [Listing pages](#listing-pages) + [Listing categories](#listing-categories) + [Listing languages](#listing-languages) + * [Downloading a local copy of the ArchWiki](#downloading-a-local-copy-of-the-archwiki) + + [Possible speed-ups](#possible-speed-ups) * [Other Information](#other-information) - [Plugins](#plugins) +- [Alternatives](#alternatives) + + ## Installation Currently, you can only install this tool from [ crates.io ](https://crates.io/crates/archwiki-rs) @@ -83,7 +88,7 @@ uses stderr to give the user suggestions on what they might have wanted to type. An example shell script to do something like this is available in the [repository](https://github.com/jackboxx/archwiki-rs) -under the name `example.sh`. +under the name `example.sh` which can be used like this `sh example.sh `. 
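The gist of such a wrapper is to run `read-page` and, when it fails, fall back to the similar page names that `archwiki-rs` prints to stderr. A minimal sketch of that idea (not the actual `example.sh` shipped in the repository, which may differ):

```sh
#!/bin/sh
# Usage: sh example.sh <page>
# Read a page; if it is not found, show the suggestions archwiki-rs prints to stderr.
page="$1"
suggestions="$(mktemp)"

if content="$(archwiki-rs read-page "$page" 2>"$suggestions")"; then
    printf '%s\n' "$content"
else
    echo "page '$page' not found, did you mean:" >&2
    cat "$suggestions" >&2
fi

rm -f "$suggestions"
```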
### Searching the ArchWiki @@ -106,25 +111,13 @@ that the search term is in ### Downloading wiki info -Page names are stored locally to prevent having to scrape the entire table of contents of -the ArchWiki with every command. - -Use this command to fetch all page names. -Be warned, since this scrapes multiple thousand links, this can be quite slow (-, - )…zzzZZ +Page and category names are stored locally for faster look-ups. +Use this command to fetch all page and category names. ```sh archwiki-rs sync-wiki ``` -#### Possible speed-ups - -If you don't mind your CPU and network becoming a bit saturated you can increase the -amount of threads used to fetch data from the wiki - -```sh -archwiki-rs sync-wiki -t 8 -``` - ### Listing ArchWiki information #### Listing pages @@ -163,6 +156,27 @@ And the same for available languages archwiki-rs list-languages ``` +### Downloading a local copy of the ArchWiki + +Use this command to download a local copy of the ArchWiki. Be warned, this command makes over +10,000 requests to the ArchWiki so it takes a while to finish (-, -)…zzzZZ + +```sh +archwiki-rs local-wiki ~/local-archwiki --format markdown +``` + +#### Possible speed-ups + +If you don't mind your CPU and network becoming a bit saturated you can increase the +amount of threads used to fetch data from the wiki. + +Keep in mind that you might get rate limited by the ArchWiki if make too many requests at once. + +```sh +archwiki-rs local-wiki -t 8 +``` + + ### Other Information Other information such as the value/location of the `cache directory` can be obtained @@ -185,3 +199,8 @@ Here's a list of programs that have plugins for `archwiki-rs` to make your life - [Neovim](https://github.com/Jackboxx/archwiki-nvim) - [Obsidian](https://github.com/Jackboxx/archwiki-obsidian) + +## Alternatives + +If you are using Arch Linux a great alternative for this tool is the `wikiman` CLI tool +in combination with the `arch-wiki-docs` package. diff --git a/src/categories.rs b/src/categories.rs index a26a7020998901823b50f489a09e865c3f792ae4..f14a0e52a55a204a220e460c9de96ebd9b36d031 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -1,21 +1,9 @@ -use ::futures::future; -use indicatif::{MultiProgress, ProgressBar}; -use itertools::Itertools; -use scraper::{Html, Node, Selector}; -use std::{collections::HashMap, thread, time::Duration}; -use url::Url; +#![allow(unused)] -#[derive(Debug, Clone)] -struct CategoryListItem { - name: String, - url: String, -} +use itertools::Itertools; +use std::collections::HashMap; -use crate::{ - error::WikiError, - utils::{extract_tag_attr, get_elements_by_tag, HtmlTag}, - wiki_api::fetch_page_by_url, -}; +use crate::error::WikiError; /// Returns a print ready list of the provided page names in /// 1. A tree format if `flatten` is `false`: @@ -39,13 +27,32 @@ use crate::{ /// If it is not flattened the list is first ordered by category names and then by page names withing those /// categories. /// If it is flattened then it will by sorted by page names. 
-pub fn list_pages(categories: &HashMap>, flatten: bool) -> String { +pub fn list_pages( + wiki_tree: &HashMap>, + categories_filter: Option<&[String]>, + flatten: bool, +) -> String { if flatten { - return categories.values().flatten().unique().sorted().join("\n"); + return wiki_tree + .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some(pages)) + .unwrap_or(Some(pages)) + }) + .flatten() + .unique() + .sorted() + .join("\n"); } - categories + wiki_tree .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some((cat, pages))) + .unwrap_or(Some((cat, pages))) + }) .sorted() .map(|(cat, pages)| { let list = pages.iter().map(|p| format!("───┤{p}")).join("\n"); @@ -54,128 +61,3 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S }) .join("\n\n") } - -/// Scrapes the ArchWiki for all page names and their immediate parent category. Category nesting -/// is ignored as a category can be a sub category of multiple other categories. -/// -/// Caution this function will most likely take several minutes to finish (-, – )…zzzZZ -pub async fn fetch_all_pages( - hide_progress: bool, - thread_count: usize, - max_categories: Option, - start_at: Option<&str>, -) -> Result>, WikiError> { - let from = start_at.unwrap_or(""); - let limit = max_categories.unwrap_or(10000); - - let base_url = "https://wiki.archlinux.org/index.php?title=Special:Categories"; - - let url = Url::parse_with_params( - base_url, - &[("from", from), ("limit", limit.to_string().as_str())], - )?; - - let document = fetch_page_by_url(url).await?; - - let body_class = ".mw-spcontent"; - let selector = Selector::parse(body_class) - .unwrap_or_else(|_| panic!("{body_class} should be valid selector")); - - let body = document.select(&selector).next().unwrap(); - - let category_list_element = get_elements_by_tag(*body, &HtmlTag::Ul) - .into_iter() - .next() - .unwrap(); - - let items = parse_category_list(category_list_element); - let multi_bar = MultiProgress::new(); - - let chunk_count = items.len() / thread_count; - let tasks = items - .chunks(chunk_count) - .map(|chunk| { - let chunk = chunk.to_vec(); - let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); - let bar = multi_bar.add(bar); - if hide_progress { - bar.finish_and_clear(); - } - - tokio::spawn(async move { - let mut res = Vec::with_capacity(chunk.len()); - for item in chunk { - let pages = match fetch_page_names_from_categoriy(&item.url).await { - Ok(pages) => pages, - - Err(_) => { - thread::sleep(Duration::from_secs(1)); - fetch_page_names_from_categoriy(&item.url) - .await - .unwrap_or_else(|err| { - eprintln!( - "failed to fetch pages in category {}\n ERROR {err}", - item.name - ); - vec![] - }) - } - }; - - res.push((item.name, pages)); - bar.inc(1); - } - - res - }) - }) - .collect_vec(); - - let out = future::join_all(tasks) - .await - .into_iter() - .flatten() - .flatten() - .collect_vec(); - - Ok(HashMap::from_iter(out)) -} - -fn parse_category_list(list_node: ego_tree::NodeRef<'_, scraper::Node>) -> Vec { - let list_items = get_elements_by_tag(list_node, &HtmlTag::Li); - list_items - .into_iter() - .flat_map(|li| { - let a_tag = li.first_child()?; - let a_tag_element = a_tag.value().as_element()?; - - let name = a_tag.first_child()?.value().as_text()?.to_string(); - let url = extract_tag_attr(a_tag_element, &HtmlTag::A, "href")?; - - Some(CategoryListItem { name, url }) - }) - .collect() -} - -/// Scrape the ArchWiki for a list of all 
page names that belong to a specific category -async fn fetch_page_names_from_categoriy(url_str: &str) -> Result, WikiError> { - let selector = Selector::parse("#mw-pages").expect("#mw-pages to be a valid css selector"); - - let body = reqwest::get(url_str).await?.text().await?; - let document = Html::parse_document(&body); - - let Some(page_container) = document.select(&selector).next() else { - return Ok(vec![]) - }; - - Ok(page_container - .descendants() - .filter_map(|node| { - if let Node::Element(e) = node.value() { - extract_tag_attr(e, &HtmlTag::A, "title") - } else { - None - } - }) - .collect()) -} diff --git a/src/cli.rs b/src/cli.rs index b2f6a58da18d5078fb39bf0f3b0a6509fc25b769..6be1d1d8fcd7e8835ad890a3dc4a4398cbbc7f8f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -15,7 +15,7 @@ pub struct CliArgs { pub enum Commands { #[command( about = "Read a page from the ArchWiki", - long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended. A list of page names is in the pages.yml file which can be updated with the 'sync-wiki' command." + long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended" )] ReadPage { #[arg(short, long)] @@ -26,7 +26,7 @@ pub enum Commands { ignore_cache: bool, #[arg(short, long)] /// Don't invalidate the cache even if it is considered stale. A cache is considered stale - /// after it hasn't been updated in more then 14 days. + /// after it hasn't been updated in more then 14 days disable_cache_invalidation: bool, #[arg(short, long)] /// Show URLs for plain-text output @@ -42,7 +42,7 @@ pub enum Commands { }, #[command( about = "Search the ArchWiki for pages", - long_about = "Search the ArchWiki for pages by title. Uses the 'opensearch' API action to perform queries." + long_about = "Search the ArchWiki for pages" )] Search { search: String, @@ -53,74 +53,90 @@ pub enum Commands { /// Maximum number of results limit: u16, #[arg(short, long)] - /// Search for pages by text content instead of title. Uses the 'query' API action instead - /// of 'opensearch'. + /// Search for pages by text content instead of title text_search: bool, }, #[command( about = "List all pages from the ArchWiki that have been downloaded", - long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading" )] ListPages { #[arg(short, long)] /// Flatten all pages and don't show their category names flatten: bool, + #[arg(short, long, value_delimiter = ',')] + /// Only show pages in these categories + categories: Vec, #[arg(short, long)] - /// Only show pages in this category - category: Option, - #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( about = "List all categories from the ArchWiki that have been downloaded", - long_about = "List categories from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List categories from the ArchWiki that have been downloaded. 
See 'sync-wiki' for information on downloading" )] ListCategories { #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( about = "List all languages that the ArchWiki supports", - long_about = "List all languages that the ArchWiki supports." + long_about = "List all languages that the ArchWiki supports" )] ListLanguages, #[command( - about = "Download the names of all pages on the ArchWiki", - long_about = "Download the names of all pages on the ArchWiki. Page names are used for the 'list-pages' and 'list-categories' commands" + about = "Download information about the pages and categories on the ArchWiki", + long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' sub-commands" )] SyncWiki { #[arg(short = 'H', long)] /// Hide progress indicators hide_progress: bool, #[arg(short, long)] - /// Number of threads to use for fetching data from the ArchWiki + /// Print result to stdout instead of writing to a file. Output is formatted as YAML + print: bool, + #[arg(short, long)] + /// Use custom output file location + out_file: Option, }, + #[command( + about = "Download a copy of the ArchWiki. Will take a long time :)", + long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not maintained; sub-categories are put at the top level of the wiki directory" + )] + LocalWiki { + #[arg(short, long)] + /// Number of threads to use for fetching pages from the ArchWiki. If not provided, the + /// number of physical cores is used thread_count: Option, #[arg(short, long)] - /// Maximum amount of categories to fetch. If no value if provided all categories are - /// fetched. - max_categories: Option, + /// Use a different file to read pages from + page_file: Option, + #[arg(short = 'H', long)] + /// Hide progress indicators + hide_progress: bool, #[arg(short, long)] - /// First category that will be fetched. See 'https://wiki.archlinux.org/index.php?title=Special:Categories' for more information. - start_at: Option, + /// Show URLs in plain-text files + show_urls: bool, #[arg(short, long)] - /// Print result to stdout instead of writing to a file. Output is formatted as YAML. - print: bool, + /// Override already downloaded files + override_existing_files: bool, + #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] + /// The format that the page should be displayed in + format: PageFormat, + /// Location to store the local copy of the wiki at + location: PathBuf, }, #[command( about = "Retrieve information related to this tool", - long_about = "Retrive information related to this tool. All Info is shown by default."
+ long_about = "Retrive information related to this tool" )] Info { #[arg(short = 'c', long)] - /// Location of the cache directory show_cache_dir: bool, #[arg(short = 'd', long)] - /// Location of the data directory show_data_dir: bool, #[arg(short, long)] - /// Only show values and not the properties they belong to or their descriptions only_values: bool, }, } diff --git a/src/error.rs b/src/error.rs index 4cc13cd6ccaa0d505edd3e1065eaed4868888064..c7294064543a8f5a9bc509a95d17383dd1a5b676 100644 --- a/src/error.rs +++ b/src/error.rs @@ -48,6 +48,4 @@ pub enum WikiError { InvalidApiResponse(InvalidApiResponseError), #[error("{}", .0)] NoPageFound(String), - #[error("The category '{}' could not be found", .0)] - NoCategoryFound(String), } diff --git a/src/formats/html.rs b/src/formats/html.rs index 090cafd3dace04734b3426193155765ad4993f01..6ceca4cad042b90255c14ef249ff2487be469144 100644 --- a/src/formats/html.rs +++ b/src/formats/html.rs @@ -1,41 +1,39 @@ -use scraper::Html; - -use crate::utils::get_page_content; +use scraper::{Html, Selector}; /// Converts the body of the ArchWiki page to a HTML string pub fn convert_page_to_html(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - + let body_selector = Selector::parse("body").expect("body should be a valid css selector"); format!( "

{heading}

\n{body}", heading = page, - body = content.html() + body = document + .select(&body_selector) + .next() + .map(|body| body.inner_html()) + .unwrap_or_default() ) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_html() { let page = "test page"; - let input = format!( - r#"
+ let input = r#"
Hello, world! -
"# - ); +
"#; let expected_output = format!( r#"

{page}

-
+
Hello, world!
"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_html(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/markdown.rs b/src/formats/markdown.rs index c36449d03b8bdf5ed615292ab1d90c21202cb6ca..23a2ba90ccc4d9c1427ee9574896ef5eb78b3a22 100644 --- a/src/formats/markdown.rs +++ b/src/formats/markdown.rs @@ -1,29 +1,22 @@ use scraper::Html; -use crate::utils::get_page_content; - /// Converts the body of the ArchWiki page to a Markdown string pub fn convert_page_to_markdown(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - - let md = html2md::parse_html(&content.html()); + let md = html2md::parse_html(&document.html()); format!("# {heading}\n\n{body}", heading = page, body = md) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_markdown() { let page = "test page"; - let input = format!( - r#"
-

Hello, world!

-
"# - ); + let input = r#"
+

Hello, world!

+
"#; let expected_output = format!( r#"# {page} @@ -31,7 +24,7 @@ mod tests { ### Hello, world! ###"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_markdown(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index 6b7b6a2818f1cd32fd6af65fcaef470c4c1bbddb..754d4f95d6daf9d3ac4412ac7ecafb6bc2d1e0c2 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -2,14 +2,13 @@ use colored::Colorize; use ego_tree::NodeRef; use scraper::{Html, Node}; -use crate::utils::{extract_tag_attr, get_page_content, HtmlTag}; +use crate::utils::extract_tag_attr; /// Converts the body of the ArchWiki page to a plain text string, removing all tags and /// only leaving the text node content. URLs can be shown in a markdown like syntax. pub fn convert_page_to_plain_text(document: &Html, show_urls: bool) -> String { - let content = get_page_content(document).expect("page should have content"); - - content + document + .root_element() .children() .map(|node| format_children(node, show_urls)) .collect::>() @@ -30,7 +29,7 @@ pub fn format_children(node: NodeRef, show_urls: bool) -> String { if show_urls { wrap_text_in_url( &child_text, - &extract_tag_attr(e, &HtmlTag::A, "href").unwrap_or("".to_string()), + &extract_tag_attr(e, "a", "href").unwrap_or("".to_string()), ) } else { child_text @@ -86,41 +85,34 @@ fn wrap_text_in_url(text: &str, url: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_plain_text() { { - let input = format!( - r#"
-

Hello, world!

-
how are you
- I'm great -
"# - ); - - let expected_output = format!( - r#" - Hello, world! - how are you - I'm great -"# - ); - - let document = Html::parse_document(&input); + let input = r#" +

Hello, world!

+
how are you
+ I'm great +
"#; + + let expected_output = r#" + Hello, world! + how are you + I'm great + "#; + + let document = Html::parse_document(input); let output = convert_page_to_plain_text(&document, false); assert_eq!(output, expected_output); } { - let input = format!( - r#"
+ let input = r#"

Hello, world!

example -
"# - ); +
"#; let expected_output = format!( r#" diff --git a/src/main.rs b/src/main.rs index 493cb2a7802b7aa029c83a95fb2cf628d76f4ae7..9ccc9cf3d6748012e8692269afe719708e772d7b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,22 +1,24 @@ -use std::{collections::HashMap, fs}; +use std::fs; use clap::Parser; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; + use itertools::Itertools; -use scraper::Html; -use url::Url; -use wiki_api::fetch_page_by_url; use crate::{ - categories::{fetch_all_pages, list_pages}, + categories::list_pages, formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, - utils::{create_cache_page_path, get_page_content, page_cache_exists, read_pages_file_as_str}, + utils::{ + create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, + UNCATEGORIZED_KEY, + }, wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, + wiki_download::{download_wiki, sync_wiki_info}, }; mod categories; @@ -27,6 +29,7 @@ mod languages; mod search; mod utils; mod wiki_api; +mod wiki_download; const PAGE_FILE_NAME: &str = "pages.yml"; @@ -47,8 +50,10 @@ async fn main() -> Result<(), WikiError> { let cache_dir = base_dir.cache_dir().join("archwiki-rs"); let data_dir = base_dir.data_local_dir().join("archwiki-rs"); + let log_dir = data_dir.join("logs"); fs::create_dir_all(&cache_dir)?; fs::create_dir_all(&data_dir)?; + fs::create_dir_all(&log_dir)?; let default_page_file_path = data_dir.join(PAGE_FILE_NAME); @@ -69,7 +74,7 @@ async fn main() -> Result<(), WikiError> { let out = if use_cached_page { fs::read_to_string(&page_cache_path)? } else { - match fetch_document(&page, lang.as_deref()).await { + match fetch_page(&page, lang.as_deref()).await { Ok(document) => match format { PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), PageFormat::Markdown => convert_page_to_markdown(&document, &page), @@ -119,34 +124,35 @@ async fn main() -> Result<(), WikiError> { } Commands::ListPages { flatten, - category, + categories, page_file, } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; - - let out = if let Some(category) = category { - pages_map - .get(&category) - .ok_or(WikiError::NoCategoryFound(category))? 
- .iter() - .sorted() - .join("\n") - } else { - list_pages(&pages_map, flatten) - }; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + let out = list_pages( + &wiki_tree, + (!categories.is_empty()).then_some(&categories), + flatten, + ); println!("{out}"); } Commands::ListCategories { page_file } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + let out = wiki_tree + .keys() + .unique() + .sorted() + .filter(|cat| cat.as_str() != UNCATEGORIZED_KEY) + .join("\n"); - let out = pages_map.keys().unique().sorted().join("\n"); println!("{out}"); } Commands::ListLanguages => { @@ -157,31 +163,40 @@ async fn main() -> Result<(), WikiError> { } Commands::SyncWiki { hide_progress, - thread_count, - max_categories, - start_at, print, + out_file, } => { - let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = fetch_all_pages( - hide_progress, - thread_count, - max_categories, - start_at.as_deref(), - ) - .await?; + let path = out_file.unwrap_or(default_page_file_path); + sync_wiki_info(&path, print, hide_progress).await?; + } + Commands::LocalWiki { + location, + format, + page_file, + thread_count, + show_urls, + override_existing_files, + hide_progress, + } => { + let thread_count = thread_count.unwrap_or(num_cpus::get_physical()).max(1); - let out = serde_yaml::to_string(&res)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); - if !print { - fs::write(&default_page_file_path, out)?; + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; - if !hide_progress { - println!("data saved to {}", default_page_file_path.to_string_lossy()); - } - } else { - println!("{out}"); - } + download_wiki( + wiki_tree, + format, + location, + &log_dir, + thread_count, + override_existing_files, + hide_progress, + show_urls, + ) + .await?; } Commands::Info { show_cache_dir, @@ -232,19 +247,3 @@ async fn main() -> Result<(), WikiError> { Ok(()) } - -async fn fetch_document(page: &str, lang: Option<&str>) -> Result { - match Url::parse(page) { - Ok(url) => { - let document = fetch_page_by_url(url).await?; - if get_page_content(&document).is_none() { - return Err(WikiError::NoPageFound( - "page is not a valid ArchWiki page".to_owned(), - )); - } - - Ok(document) - } - Err(_) => fetch_page(page, lang).await, - } -} diff --git a/src/search.rs b/src/search.rs index 15e85e2c6267c6cbd5e8335ffa0b2287a99c3ef6..6af0f923a41becca71c5df21ca80986b3e299263 100644 --- a/src/search.rs +++ b/src/search.rs @@ -131,43 +131,26 @@ pub fn open_search_to_page_names( } } -/// Checks if the open search result contains a name that exactly matches the provided page name. -/// If there is a match the corresponding page URL is returned. 
-pub fn open_search_get_exact_match_url( - page: &str, +/// Return provided page name if the top search result exactly matches it +pub fn open_search_is_page_exact_match<'a>( + page: &'a str, search_result: &[OpenSearchItem], -) -> Result, WikiError> { +) -> Result, WikiError> { use crate::error::InvalidApiResponseError as IAR; let page_names = search_result.get(1).ok_or(WikiError::InvalidApiResponse( IAR::OpenSearchMissingNthElement(1), ))?; - let page_urls = search_result.get(3).ok_or(WikiError::InvalidApiResponse( - IAR::OpenSearchMissingNthElement(3), - ))?; - let OpenSearchItem::Array(names) = page_names else { return Err(WikiError::InvalidApiResponse( IAR::OpenSearchNthElementShouldBeArray(1), - )) + )); }; - let OpenSearchItem::Array(urls) = page_urls else { - return Err(WikiError::InvalidApiResponse( - IAR::OpenSearchNthElementShouldBeArray(3), - )) - }; - - if let Some(name) = names.first() { - if name == page { - Ok(urls.first().cloned()) - } else { - Ok(None) - } - } else { - Ok(None) - } + Ok(names + .first() + .and_then(|name| (name == page).then_some(page))) } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index b670f00d47d928cdc6fba5048031746661baf360..89288bc54ae98dc86da5e4b1be8586724d7a200c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,32 +1,16 @@ use std::{ + collections::HashMap, fs, io::{self, ErrorKind}, path::{Path, PathBuf}, }; -use ego_tree::NodeRef; -use regex::Regex; -use scraper::{node::Element, ElementRef, Html, Node, Selector}; +use itertools::Itertools; +use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub const PAGE_CONTENT_CLASS: &str = "mw-parser-output"; - -pub enum HtmlTag { - A, - Ul, - Li, -} - -impl HtmlTag { - pub fn name(&self) -> String { - match *self { - HtmlTag::A => "a".to_owned(), - HtmlTag::Ul => "ul".to_owned(), - HtmlTag::Li => "li".to_owned(), - } - } -} +pub const UNCATEGORIZED_KEY: &str = "Uncategorized"; /// Construct a path to cache a page. Different page formats are cached separately. 
/// All none word characters are escaped with an '_' @@ -63,32 +47,8 @@ pub fn page_cache_exists( Ok(secs_since_modified < fourteen_days) } -/// Selects the body of an ArchWiki page -pub fn get_page_content(document: &Html) -> Option> { - let class = format!(".{PAGE_CONTENT_CLASS}"); - let selector = - Selector::parse(&class).unwrap_or_else(|_| panic!("{class} should be valid selector")); - document.select(&selector).next() -} - -pub fn get_elements_by_tag<'a>(root: NodeRef<'a, Node>, tag: &HtmlTag) -> Vec> { - root.children() - .flat_map(|n| { - if let Node::Element(e) = n.value() { - if e.name() == tag.name() { - Some(n) - } else { - None - } - } else { - None - } - }) - .collect() -} - -pub fn extract_tag_attr(element: &Element, tag: &HtmlTag, attr: &str) -> Option { - if element.name() == tag.name() { +pub fn extract_tag_attr(element: &Element, tag: &str, attr: &str) -> Option { + if element.name() == tag { element.attr(attr).map(|attr| attr.to_owned()) } else { None @@ -106,18 +66,103 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: PathBuf) -> Result { - fs::read_to_string(&path).map_err(|err| { +pub fn read_pages_file_as_category_tree( + path: &Path, + is_default_path: bool, +) -> Result>, WikiError> { + let content = fs::read_to_string(path).map_err(|err| { match err.kind() { - ErrorKind::NotFound => WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{}'. Try running 'archwiki-rs sync-wiki' to create the missing file.", path.to_string_lossy()))), + ErrorKind::NotFound => { + let path_str = path.to_string_lossy(); + let extra_path_arg = if is_default_path { + String::new() + } else { + format!(" --out-file {path_str}") + }; + + WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{path_str}'. Try running 'archwiki-rs sync-wiki{extra_path_arg}' to create the missing file." 
))) + } _ => err.into() } - }) + })?; + + let page_to_category_map: HashMap> = serde_yaml::from_str(&content)?; + + let mut category_to_page_map = HashMap::new(); + let mut uncategorized_pages = vec![]; + + for (page, cats) in page_to_category_map.into_iter().collect_vec() { + if cats.is_empty() { + uncategorized_pages.push(page) + } else { + for cat in cats { + let mut pages: Vec = + category_to_page_map.get(&cat).cloned().unwrap_or_default(); + pages.push(page.clone()); + + category_to_page_map.insert(cat, pages); + } + } + } + + if !uncategorized_pages.is_empty() { + for (i, uncategoriesed_chunk) in uncategorized_pages + .into_iter() + .sorted() + .chunks(500) + .into_iter() + .enumerate() + { + let key = format!("{UNCATEGORIZED_KEY} #{n}", n = i + 1); + category_to_page_map.insert(key, uncategoriesed_chunk.collect_vec()); + } + } + + Ok(category_to_page_map) +} + +pub fn to_save_file_name(page: &str) -> String { + sanitize_filename::sanitize(page) +} + +pub fn truncate_unicode_str(n: usize, text: &str) -> String { + let mut count = 0; + let mut res = vec![]; + let mut chars = text.chars(); + + while count < n { + if let Some(char) = chars.next() { + count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); + res.push(char); + } else { + break; + } + } + + res.into_iter().collect::() +} + +pub fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { + let ext = match format { + PageFormat::PlainText => "", + PageFormat::Markdown => "md", + PageFormat::Html => "html", + }; + + parent_dir.join(to_save_file_name(page)).with_extension(ext) } -fn to_save_file_name(page: &str) -> String { - let regex = Regex::new("[^-0-9A-Za-z_]").expect("'[^0-9A-Za-z_]' should be a valid regex"); - regex.replace_all(page, "_").to_string() +pub fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { + match fs::create_dir(dir) { + Ok(_) => {} + Err(err) => { + if err.kind() != io::ErrorKind::AlreadyExists { + return Err(err.into()); + } + } + } + + Ok(()) } #[cfg(test)] @@ -129,10 +174,10 @@ mod tests { fn test_to_save_file_name() { let cases = [ ("Neovim", "Neovim"), - ("3D Mouse", "3D_Mouse"), - ("/etc/fstab", "_etc_fstab"), - (".NET", "_NET"), - ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS_MeMO_Pad_7__ME176C_X__"), + ("3D Mouse", "3D Mouse"), + ("/etc/fstab", "etcfstab"), + (".NET", ".NET"), + ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS MeMO Pad 7 (ME176C(X))"), ]; for (input, output) in cases { diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 3c3875878b34b0a62931547a7384cdf362007754..e6811909477f58d0c4979f2a94b4ef22618794bd 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,20 +1,41 @@ +use std::collections::HashMap; + use scraper::Html; +use serde::Deserialize; use url::Url; use crate::{ error::WikiError, search::{ - open_search_get_exact_match_url, open_search_to_page_names, OpenSearchItem, + open_search_is_page_exact_match, open_search_to_page_names, OpenSearchItem, TextSearchApiResponse, TextSearchItem, }, utils::update_relative_urls, }; +const BLOCK_LISTED_CATEGORY_PREFIXES: &[&str] = &[ + "Pages flagged with", + "Sections flagged with", + "Pages or sections flagged with", + "Pages where template include size is exceeded", + "Pages with broken package links", + "Pages with broken section links", + "Pages with missing package links", + "Pages with missing section links", + "Pages with dead links", +]; + #[derive(Debug, Clone, serde::Deserialize)] pub struct ApiResponse { pub query: T, } +#[derive(Debug, Clone, serde::Deserialize)] +pub struct 
ApiResponseWithContinue { + pub query: T, + pub r#continue: Option, +} + pub async fn fetch_open_search( search: &str, lang: &str, @@ -25,7 +46,10 @@ pub async fn fetch_open_search( let res: Vec = serde_json::from_str(&body)?; // the first item in the response should be the search term - debug_assert_eq!(res.first(), Some(&OpenSearchItem::Single(search.to_owned()))); + debug_assert_eq!( + res.first(), + Some(&OpenSearchItem::Single(search.to_owned())) + ); Ok(res) } @@ -46,40 +70,40 @@ pub async fn fetch_text_search( Ok(res.query.search) } -/// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// Gets the HTML content of an ArchWiki page. /// -/// If the ArchWiki page doesn't have exists the top 5 pages that are most +/// If the ArchWiki page doesn't exists the top 5 pages that are most /// like the page that was given as an argument are returned as a `NoPageFound` error. pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result { let lang = lang.unwrap_or("en"); - let search_res = fetch_open_search(page, lang, 5).await?; - let Some(url) = open_search_get_exact_match_url(page, &search_res)? else { + let Some(page_title) = open_search_is_page_exact_match(page, &search_res)? else { let similar_pages = open_search_to_page_names(&search_res)?; return Err(WikiError::NoPageFound(similar_pages.join("\n"))); }; - let parsed_url = Url::parse(&url) - .unwrap_or(Url::parse("https://wiki.archlinux.org").expect("should be a valid URL")); - let base_url = format!( - "{schema}://{host}", - schema = parsed_url.scheme(), - host = parsed_url.host_str().unwrap_or("") - ); + fetch_page_without_recommendations(page_title).await +} - let body = reqwest::get(&url).await?.text().await?; - let body_with_abs_urls = update_relative_urls(&body, &base_url); +/// Gets the HTML content of an ArchWiki page. +pub async fn fetch_page_without_recommendations(page: &str) -> Result { + let raw_url = format!( + "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", + title = urlencoding::encode(page) + ); - Ok(Html::parse_document(&body_with_abs_urls)) + let url = Url::parse(&raw_url)?; + let document = fetch_page_by_url(url).await?; + Ok(document) } /// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim`. +/// A different base URL is used for pages that aren't hosted directly on `wiki.archlinux.org` /// /// If the page has no content a `NoPageFound` Error is returned. -pub async fn fetch_page_by_url(url: Url) -> Result { +async fn fetch_page_by_url(url: Url) -> Result { let base_url = format!( "{schema}://{host}", schema = url.scheme(), @@ -91,3 +115,94 @@ pub async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } + +/// Gets the names of all pages on the ArchWiki and the categories that they belong to. 
+/// +/// ### Example +/// +/// ```sh +/// Wine # page name +/// - Emulation # category +/// - Gaming # category +/// ``` +pub async fn fetch_all_pages() -> Result>, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiAllPagesQuery { + pages: HashMap, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + categories: Option>, + } + + #[derive(Debug, Deserialize)] + struct Category { + title: String, + } + + impl From for String { + fn from(value: Category) -> Self { + value + .title + .split_once("Category:") + .map(|(_, title)| title.to_owned()) + .unwrap_or(value.title) + } + } + + #[derive(Debug, Deserialize)] + struct ApiAllPageContinueParams { + gapcontinue: Option, + clcontinue: Option, + } + + let api_url = + "https://wiki.archlinux.org/api.php?action=query&generator=allpages&prop=categories&format=json&gaplimit=max&cllimit=max"; + + let mut pages: Vec = vec![]; + + let body = reqwest::get(api_url).await?.text().await?; + let mut api_resp: ApiResponseWithContinue = + serde_json::from_str(&body)?; + + pages.append(&mut api_resp.query.pages.into_values().collect()); + + while let Some(continue_params) = api_resp.r#continue { + let next_api_url = if let Some(gapcontinue) = continue_params.gapcontinue { + format!("{api_url}&gapcontinue={}", gapcontinue) + } else if let Some(clcontinue) = continue_params.clcontinue { + format!("{api_url}&clcontinue={}", clcontinue) + } else { + break; + }; + + let body = reqwest::get(&next_api_url).await?.text().await?; + api_resp = serde_json::from_str(&body)?; + + pages.append(&mut api_resp.query.pages.into_values().collect()); + } + + let page_category_tree = pages.into_iter().map(|page| { + ( + page.title, + page.categories + .map(|cats| { + cats.into_iter() + .map::(Into::into) + .filter(|cat| !is_blocked_category(cat)) + .collect() + }) + .unwrap_or_default(), + ) + }); + + Ok(HashMap::from_iter(page_category_tree)) +} + +fn is_blocked_category(category: &str) -> bool { + BLOCK_LISTED_CATEGORY_PREFIXES + .iter() + .any(|blocked_prefix| category.starts_with(blocked_prefix)) +} diff --git a/src/wiki_download.rs b/src/wiki_download.rs new file mode 100644 index 0000000000000000000000000000000000000000..8137b4cbf59b7d6ff9baaf87e6f47d41f574698b --- /dev/null +++ b/src/wiki_download.rs @@ -0,0 +1,286 @@ +use std::{ + collections::HashMap, + fs, + path::{Path, PathBuf}, + sync::Arc, +}; + +use super::formats::plain_text::convert_page_to_plain_text; + +use clap::{builder::PossibleValue, ValueEnum}; +use futures::future; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use itertools::Itertools; + +use crate::{ + error::WikiError, + formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, + utils::truncate_unicode_str, + utils::{create_dir_if_not_exists, page_path, to_save_file_name}, + wiki_api::fetch_all_pages, + wiki_api::fetch_page_without_recommendations, +}; + +pub async fn sync_wiki_info( + page_path: &Path, + print: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + let spinner = ProgressBar::new_spinner(); + if hide_progress { + spinner.finish_and_clear(); + } + + let _spin_task = std::thread::spawn(move || loop { + spinner.tick(); + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + let wiki_tree = fetch_all_pages().await?; + let out = serde_yaml::to_string(&wiki_tree)?; + + if !print { + fs::write(page_path, out)?; + + if !hide_progress { + println!("data saved to {}", page_path.to_string_lossy()); + } + } else { + println!("{out}"); + } + + Ok(()) +} + 
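For reference, the page file that `sync_wiki_info` writes is a flat page-to-categories map, which `read_pages_file_as_category_tree` later inverts into the category-to-pages tree that `download_wiki` consumes. A standalone sketch of that shape (illustrative only; the two YAML entries are made up, the real file holds thousands):

```rust
use std::collections::HashMap;

// Hypothetical excerpt of the YAML written by `sync_wiki_info`
// (page name -> categories it belongs to).
const EXAMPLE_PAGES_YML: &str = "
Wine:
  - Emulation
  - Gaming
Neovim:
  - Text editors
";

fn main() -> Result<(), serde_yaml::Error> {
    let page_to_categories: HashMap<String, Vec<String>> =
        serde_yaml::from_str(EXAMPLE_PAGES_YML)?;

    // Invert the map into category -> pages, the shape download_wiki works with.
    let mut category_to_pages: HashMap<String, Vec<String>> = HashMap::new();
    for (page, categories) in page_to_categories {
        for category in categories {
            category_to_pages
                .entry(category)
                .or_default()
                .push(page.clone());
        }
    }

    assert_eq!(
        category_to_pages.get("Gaming"),
        Some(&vec!["Wine".to_string()])
    );
    Ok(())
}
```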
+#[allow(clippy::too_many_arguments)] +pub async fn download_wiki( + wiki_tree: HashMap>, + format: PageFormat, + location: PathBuf, + log_dir: &Path, + thread_count: usize, + override_exisiting_files: bool, + hide_progress: bool, + show_urls: bool, +) -> Result<(), WikiError> { + create_dir_if_not_exists(&location)?; + + let total_page_count = wiki_tree.values().map(|pages| pages.len()).sum::(); + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + .as_ref() + .map(PossibleValue::get_name) + { + println!("downloading {total_page_count} pages as {format}\n",) + } + } + + let multibar = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = multibar.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("---FETCHING CATEGORIES---") + .with_style( + ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + let wiki_tree_without_empty_cats = wiki_tree + .into_iter() + .filter(|(_, p)| !p.is_empty()) + .collect_vec(); + + let format = Arc::new(format); + let location = Arc::new(location); + let multibar = Arc::new(multibar); + let catbar = Arc::new(category_bar); + + let wiki_tree_chunks = + chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); + + let tasks = wiki_tree_chunks + .into_iter() + .map(|chunk| { + let format_ref = Arc::clone(&format); + let location_ref = Arc::clone(&location); + let multibar_ref = Arc::clone(&multibar); + let catbar_ref = Arc::clone(&catbar); + + tokio::spawn(async move { + download_wiki_chunk( + &chunk, + &format_ref, + &location_ref, + hide_progress, + show_urls, + override_exisiting_files, + &multibar_ref, + &catbar_ref, + ) + .await + }) + }) + .collect_vec(); + + let results = future::join_all(tasks).await; + let mut all_failed_fetches = vec![]; + + for result in results { + match result { + Ok(Ok(mut failed_fetches)) => all_failed_fetches.append(&mut failed_fetches), + Ok(Err(thread_err)) => { + eprintln!( + "ERROR: a thread panicked, some pages might be missing\nREASON: {thread_err}" + ); + } + Err(_) => { + eprintln!("ERROR: failed to join threads, some pages might be missing"); + } + } + } + + if !hide_progress { + let successfully_fetched_pages = total_page_count - all_failed_fetches.len(); + println!("downloaded {successfully_fetched_pages} pages successfully"); + } + + if !all_failed_fetches.is_empty() { + if !hide_progress { + println!("failed to download {} pages", all_failed_fetches.len()); + } + + let failed_fetches_str = all_failed_fetches + .into_iter() + .map(|(page, err)| format!("failed to fetch page '{page}'\nREASON: {err}")) + .collect_vec() + .join("\n\n"); + + let path = log_dir.join("local-wiki-download-err.log"); + let write = fs::write(&path, failed_fetches_str); + + if write.is_ok() && !hide_progress { + println!("error log written to '{}'", path.to_string_lossy()); + } + } + + if !hide_progress { + println!( + "saved local copy of the ArchWiki to '{}'", + location.to_string_lossy() + ) + } + + Ok(()) +} + +type FailedPageFetches = Vec<(String, WikiError)>; + +#[allow(clippy::too_many_arguments)] +async fn download_wiki_chunk( + chunk: &[(String, Vec)], + format: &PageFormat, + location: &Path, + hide_progress: bool, + show_urls: bool, + override_exisiting_files: bool, + multibar: &MultiProgress, + catbar: &ProgressBar, +) -> Result { + let mut failed_fetches = vec![]; + + for
(cat, pages) in chunk { + let cat_dir = location.join(to_save_file_name(cat)); + create_dir_if_not_exists(&cat_dir)?; + + let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); + + let leak_str: &'static str = Box::leak( + format!( + " fetching pages in \"{}\"", + if width <= 18 { + truncate_unicode_str(18, cat) + } else { + truncate_unicode_str(15, cat) + "..." + } + ) + .into_boxed_str(), + ); + + let bar = multibar.add( + ProgressBar::new(pages.len().try_into().unwrap_or(0)) + .with_prefix(leak_str) + .with_style( + ProgressStyle::with_template( + "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", + ) + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + bar.finish_and_clear(); + } + + catbar.inc(1); + for page in pages { + bar.inc(1); + + let path = page_path(page, format, &cat_dir); + if override_exisiting_files || !path.exists() { + match write_page_to_local_wiki(page, &path, format, show_urls).await { + Ok(()) => {} + Err(err) => failed_fetches.push((page.to_owned(), err)), + } + } + } + } + + Ok(failed_fetches) +} + +async fn write_page_to_local_wiki( + page: &str, + page_path: &Path, + format: &PageFormat, + show_urls: bool, +) -> Result<(), WikiError> { + let document = fetch_page_without_recommendations(page).await?; + let content = match format { + PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), + PageFormat::Markdown => convert_page_to_markdown(&document, page), + PageFormat::Html => convert_page_to_html(&document, page), + }; + + fs::write(page_path, content)?; + Ok(()) +} + +fn chunk_wiki_with_even_page_distribution( + wiki_tree: Vec<(String, Vec)>, + chunk_count: usize, +) -> Vec)>> { + let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect(); + + for entry in wiki_tree { + if let Some(chunk) = chunks.iter_mut().min_by(|a, b| { + let count_a = a.iter().map(|(_, pages)| pages.len()).sum::(); + let count_b = b.iter().map(|(_, pages)| pages.len()).sum::(); + + count_a.cmp(&count_b) + }) { + chunk.push(entry); + } + } + + chunks +} diff --git a/tests/cli.rs b/tests/cli.rs index 5759687dd9ef45459a08a938775e0e2b93d37987..6857dbb3f028ce5d4b021f57e4845b135fca6d76 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,9 +1,5 @@ use assert_cmd::Command; -use assert_fs::prelude::{FileWriteStr, PathChild}; -use predicates::{ - prelude::{predicate, PredicateBooleanExt}, - Predicate, -}; +use predicates::prelude::{predicate, PredicateBooleanExt}; #[test] fn test_cli_info_cmd() -> Result<(), Box> { @@ -55,22 +51,6 @@ fn test_cli_read_page_cmd() -> Result<(), Box> { cmd.assert().failure().stderr(pstr::starts_with("Neovim")); } - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://wiki.archlinux.org/title/Emacs"]); - - cmd.assert() - .success() - .stdout(pstr::contains("Installation")); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://google.com"]); - - cmd.assert().failure(); - } - Ok(()) } @@ -121,45 +101,3 @@ fn test_cli_list_languages_cmd() -> Result<(), Box> { Ok(()) } - -#[test] -fn test_cli_local_wiki_info() -> Result<(), Box> { - use predicate::str as pstr; - - let stdout = { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["sync-wiki", "-p", "-m", "10"]); - - let stdout = String::from_utf8(cmd.assert().success().get_output().stdout.clone()).unwrap(); - pstr::contains("About Arch").eval(&stdout); - - stdout - }; - - let tmp_dir = assert_fs::TempDir::new().unwrap(); - 
tmp_dir.child("pages.yml").write_str(&stdout).unwrap(); - - let tmp_file_path = tmp_dir.path().join("pages.yml"); - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-pages", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert().success().stdout(pstr::contains( - "About Arch: -───┤Arch boot process -───┤Arch build system", - )); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-categories", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert() - .success() - .stdout(pstr::contains("\n").count(10)); - } - - Ok(()) -}