From c398b681a6fdaaa328bb418290688097ea64e33f Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 19:48:47 +0100 Subject: [PATCH 01/24] use wiki media api to get page source --- Cargo.lock | 7 +++ Cargo.toml | 1 + src/categories.rs | 128 ++------------------------------------ src/formats/html.rs | 11 +--- src/formats/markdown.rs | 9 +-- src/formats/plain_text.rs | 12 ++-- src/main.rs | 13 +--- src/search.rs | 27 ++------ src/utils.rs | 29 +-------- src/wiki_api.rs | 34 +++++----- 10 files changed, 48 insertions(+), 223 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bc8ec6..cbcb5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -115,6 +115,7 @@ dependencies = [ "thiserror", "tokio", "url", + "urlencoding", ] [[package]] @@ -2060,6 +2061,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index 1e42804..ac4c33e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ termination = "0.1.2" thiserror = "1.0.50" tokio = { version = "1.33.0", features = ["full"] } url = "2.4.1" +urlencoding = "2.1.3" [dev-dependencies] assert_cmd = "2.0.12" diff --git a/src/categories.rs b/src/categories.rs index a26a702..23fb3ae 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -1,9 +1,5 @@ -use ::futures::future; -use indicatif::{MultiProgress, ProgressBar}; use itertools::Itertools; -use scraper::{Html, Node, Selector}; -use std::{collections::HashMap, thread, time::Duration}; -use url::Url; +use std::collections::HashMap; #[derive(Debug, Clone)] struct CategoryListItem { @@ -11,11 +7,7 @@ struct CategoryListItem { url: String, } -use crate::{ - error::WikiError, - utils::{extract_tag_attr, get_elements_by_tag, HtmlTag}, - wiki_api::fetch_page_by_url, -}; +use crate::error::WikiError; /// Returns a print ready list of the provided page names in /// 1. A tree format if `flatten` is `false`: @@ -55,127 +47,17 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S .join("\n\n") } +/// TODO replace with api call /// Scrapes the ArchWiki for all page names and their immediate parent category. Category nesting /// is ignored as a category can be a sub category of multiple other categories. 
/// /// Caution this function will most likely take several minutes to finish (-, – )…zzzZZ +#[allow(unused)] pub async fn fetch_all_pages( hide_progress: bool, thread_count: usize, max_categories: Option, start_at: Option<&str>, ) -> Result>, WikiError> { - let from = start_at.unwrap_or(""); - let limit = max_categories.unwrap_or(10000); - - let base_url = "https://wiki.archlinux.org/index.php?title=Special:Categories"; - - let url = Url::parse_with_params( - base_url, - &[("from", from), ("limit", limit.to_string().as_str())], - )?; - - let document = fetch_page_by_url(url).await?; - - let body_class = ".mw-spcontent"; - let selector = Selector::parse(body_class) - .unwrap_or_else(|_| panic!("{body_class} should be valid selector")); - - let body = document.select(&selector).next().unwrap(); - - let category_list_element = get_elements_by_tag(*body, &HtmlTag::Ul) - .into_iter() - .next() - .unwrap(); - - let items = parse_category_list(category_list_element); - let multi_bar = MultiProgress::new(); - - let chunk_count = items.len() / thread_count; - let tasks = items - .chunks(chunk_count) - .map(|chunk| { - let chunk = chunk.to_vec(); - let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); - let bar = multi_bar.add(bar); - if hide_progress { - bar.finish_and_clear(); - } - - tokio::spawn(async move { - let mut res = Vec::with_capacity(chunk.len()); - for item in chunk { - let pages = match fetch_page_names_from_categoriy(&item.url).await { - Ok(pages) => pages, - - Err(_) => { - thread::sleep(Duration::from_secs(1)); - fetch_page_names_from_categoriy(&item.url) - .await - .unwrap_or_else(|err| { - eprintln!( - "failed to fetch pages in category {}\n ERROR {err}", - item.name - ); - vec![] - }) - } - }; - - res.push((item.name, pages)); - bar.inc(1); - } - - res - }) - }) - .collect_vec(); - - let out = future::join_all(tasks) - .await - .into_iter() - .flatten() - .flatten() - .collect_vec(); - - Ok(HashMap::from_iter(out)) -} - -fn parse_category_list(list_node: ego_tree::NodeRef<'_, scraper::Node>) -> Vec { - let list_items = get_elements_by_tag(list_node, &HtmlTag::Li); - list_items - .into_iter() - .flat_map(|li| { - let a_tag = li.first_child()?; - let a_tag_element = a_tag.value().as_element()?; - - let name = a_tag.first_child()?.value().as_text()?.to_string(); - let url = extract_tag_attr(a_tag_element, &HtmlTag::A, "href")?; - - Some(CategoryListItem { name, url }) - }) - .collect() -} - -/// Scrape the ArchWiki for a list of all page names that belong to a specific category -async fn fetch_page_names_from_categoriy(url_str: &str) -> Result, WikiError> { - let selector = Selector::parse("#mw-pages").expect("#mw-pages to be a valid css selector"); - - let body = reqwest::get(url_str).await?.text().await?; - let document = Html::parse_document(&body); - - let Some(page_container) = document.select(&selector).next() else { - return Ok(vec![]) - }; - - Ok(page_container - .descendants() - .filter_map(|node| { - if let Node::Element(e) = node.value() { - extract_tag_attr(e, &HtmlTag::A, "title") - } else { - None - } - }) - .collect()) + todo!() } diff --git a/src/formats/html.rs b/src/formats/html.rs index 090cafd..ca60728 100644 --- a/src/formats/html.rs +++ b/src/formats/html.rs @@ -1,36 +1,31 @@ use scraper::Html; -use crate::utils::get_page_content; - /// Converts the body of the ArchWiki page to a HTML string pub fn convert_page_to_html(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - format!( "

{heading}

\n{body}", heading = page, - body = content.html() + body = document.html() ) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_html() { let page = "test page"; let input = format!( - r#"
+ r#"
Hello, world!
"# ); let expected_output = format!( r#"

{page}

-
+
Hello, world!
"# ); diff --git a/src/formats/markdown.rs b/src/formats/markdown.rs index c36449d..126fa58 100644 --- a/src/formats/markdown.rs +++ b/src/formats/markdown.rs @@ -1,26 +1,21 @@ use scraper::Html; -use crate::utils::get_page_content; - /// Converts the body of the ArchWiki page to a Markdown string pub fn convert_page_to_markdown(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - - let md = html2md::parse_html(&content.html()); + let md = html2md::parse_html(&document.html()); format!("# {heading}\n\n{body}", heading = page, body = md) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_markdown() { let page = "test page"; let input = format!( - r#"
+ r#"

Hello, world!

"# ); diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index 6b7b6a2..3f100c4 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -2,14 +2,13 @@ use colored::Colorize; use ego_tree::NodeRef; use scraper::{Html, Node}; -use crate::utils::{extract_tag_attr, get_page_content, HtmlTag}; +use crate::utils::{extract_tag_attr, HtmlTag}; /// Converts the body of the ArchWiki page to a plain text string, removing all tags and /// only leaving the text node content. URLs can be shown in a markdown like syntax. pub fn convert_page_to_plain_text(document: &Html, show_urls: bool) -> String { - let content = get_page_content(document).expect("page should have content"); - - content + document + .root_element() .children() .map(|node| format_children(node, show_urls)) .collect::>() @@ -86,14 +85,13 @@ fn wrap_text_in_url(text: &str, url: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_plain_text() { { let input = format!( - r#"
+ r#"

Hello, world!

how are you
I'm great @@ -116,7 +114,7 @@ mod tests { { let input = format!( - r#"
+ r#"

Hello, world!

example
"# diff --git a/src/main.rs b/src/main.rs index 493cb2a..c64be63 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ use crate::{ formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, - utils::{create_cache_page_path, get_page_content, page_cache_exists, read_pages_file_as_str}, + utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, }; @@ -235,16 +235,7 @@ async fn main() -> Result<(), WikiError> { async fn fetch_document(page: &str, lang: Option<&str>) -> Result { match Url::parse(page) { - Ok(url) => { - let document = fetch_page_by_url(url).await?; - if get_page_content(&document).is_none() { - return Err(WikiError::NoPageFound( - "page is not a valid ArchWiki page".to_owned(), - )); - } - - Ok(document) - } + Ok(url) => fetch_page_by_url(url).await, Err(_) => fetch_page(page, lang).await, } } diff --git a/src/search.rs b/src/search.rs index 15e85e2..e42e093 100644 --- a/src/search.rs +++ b/src/search.rs @@ -131,9 +131,10 @@ pub fn open_search_to_page_names( } } +/// TODO /// Checks if the open search result contains a name that exactly matches the provided page name. /// If there is a match the corresponding page URL is returned. -pub fn open_search_get_exact_match_url( +pub fn open_search_is_page_exact_match( page: &str, search_result: &[OpenSearchItem], ) -> Result, WikiError> { @@ -143,31 +144,15 @@ pub fn open_search_get_exact_match_url( IAR::OpenSearchMissingNthElement(1), ))?; - let page_urls = search_result.get(3).ok_or(WikiError::InvalidApiResponse( - IAR::OpenSearchMissingNthElement(3), - ))?; - let OpenSearchItem::Array(names) = page_names else { return Err(WikiError::InvalidApiResponse( IAR::OpenSearchNthElementShouldBeArray(1), - )) + )); }; - let OpenSearchItem::Array(urls) = page_urls else { - return Err(WikiError::InvalidApiResponse( - IAR::OpenSearchNthElementShouldBeArray(3), - )) - }; - - if let Some(name) = names.first() { - if name == page { - Ok(urls.first().cloned()) - } else { - Ok(None) - } - } else { - Ok(None) - } + Ok(names + .first() + .and_then(|name| (name == page).then_some(name.to_owned()))) } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index b670f00..2dc1d6c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,14 +4,11 @@ use std::{ path::{Path, PathBuf}, }; -use ego_tree::NodeRef; use regex::Regex; -use scraper::{node::Element, ElementRef, Html, Node, Selector}; +use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub const PAGE_CONTENT_CLASS: &str = "mw-parser-output"; - pub enum HtmlTag { A, Ul, @@ -63,30 +60,6 @@ pub fn page_cache_exists( Ok(secs_since_modified < fourteen_days) } -/// Selects the body of an ArchWiki page -pub fn get_page_content(document: &Html) -> Option> { - let class = format!(".{PAGE_CONTENT_CLASS}"); - let selector = - Selector::parse(&class).unwrap_or_else(|_| panic!("{class} should be valid selector")); - document.select(&selector).next() -} - -pub fn get_elements_by_tag<'a>(root: NodeRef<'a, Node>, tag: &HtmlTag) -> Vec> { - root.children() - .flat_map(|n| { - if let Node::Element(e) = n.value() { - if e.name() == tag.name() { - Some(n) - } else { - None - } - } else { - None - } - }) - .collect() -} - pub fn extract_tag_attr(element: &Element, tag: &HtmlTag, attr: &str) -> Option { if element.name() == tag.name() { 
element.attr(attr).map(|attr| attr.to_owned()) diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 3c38758..ce4dd85 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -4,7 +4,7 @@ use url::Url; use crate::{ error::WikiError, search::{ - open_search_get_exact_match_url, open_search_to_page_names, OpenSearchItem, + open_search_is_page_exact_match, open_search_to_page_names, OpenSearchItem, TextSearchApiResponse, TextSearchItem, }, utils::update_relative_urls, @@ -25,7 +25,10 @@ pub async fn fetch_open_search( let res: Vec = serde_json::from_str(&body)?; // the first item in the response should be the search term - debug_assert_eq!(res.first(), Some(&OpenSearchItem::Single(search.to_owned()))); + debug_assert_eq!( + res.first(), + Some(&OpenSearchItem::Single(search.to_owned())) + ); Ok(res) } @@ -46,37 +49,32 @@ pub async fn fetch_text_search( Ok(res.query.search) } -/// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// Gets the HTML content of an ArchWiki page. /// -/// If the ArchWiki page doesn't have exists the top 5 pages that are most +/// If the ArchWiki page doesn't exists the top 5 pages that are most /// like the page that was given as an argument are returned as a `NoPageFound` error. pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result { let lang = lang.unwrap_or("en"); - let search_res = fetch_open_search(page, lang, 5).await?; - let Some(url) = open_search_get_exact_match_url(page, &search_res)? else { + let Some(page_title) = open_search_is_page_exact_match(page, &search_res)? else { let similar_pages = open_search_to_page_names(&search_res)?; return Err(WikiError::NoPageFound(similar_pages.join("\n"))); }; - let parsed_url = Url::parse(&url) - .unwrap_or(Url::parse("https://wiki.archlinux.org").expect("should be a valid URL")); - let base_url = format!( - "{schema}://{host}", - schema = parsed_url.scheme(), - host = parsed_url.host_str().unwrap_or("") + let raw_url = format!( + "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", + title = urlencoding::encode(&page_title) ); + let url = Url::parse(&raw_url)?; - let body = reqwest::get(&url).await?.text().await?; - let body_with_abs_urls = update_relative_urls(&body, &base_url); - - Ok(Html::parse_document(&body_with_abs_urls)) + let document = fetch_page_by_url(url).await?; + Ok(document) } /// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim`. +/// A different base URL is used for pages that aren't hosted directly on `wiki.archlinux.org` /// /// If the page has no content a `NoPageFound` Error is returned. 
pub async fn fetch_page_by_url(url: Url) -> Result { -- GitLab From 50a71e3959a4f1480483533f80a892339fec92c3 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 20:22:18 +0100 Subject: [PATCH 02/24] wip: use fetch all endpoint to get pages --- src/categories.rs | 21 ++---------- src/cli.rs | 8 ++--- src/formats/plain_text.rs | 4 +-- src/languages.rs | 2 +- src/main.rs | 17 ++++------ src/search.rs | 12 +++---- src/utils.rs | 20 ++---------- src/wiki_api.rs | 68 +++++++++++++++++++++++++++++++++++++-- 8 files changed, 86 insertions(+), 66 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 23fb3ae..03a53e7 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -1,12 +1,8 @@ +#![allow(unused)] + use itertools::Itertools; use std::collections::HashMap; -#[derive(Debug, Clone)] -struct CategoryListItem { - name: String, - url: String, -} - use crate::error::WikiError; /// Returns a print ready list of the provided page names in @@ -47,17 +43,6 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S .join("\n\n") } -/// TODO replace with api call -/// Scrapes the ArchWiki for all page names and their immediate parent category. Category nesting -/// is ignored as a category can be a sub category of multiple other categories. -/// -/// Caution this function will most likely take several minutes to finish (-, – )…zzzZZ -#[allow(unused)] -pub async fn fetch_all_pages( - hide_progress: bool, - thread_count: usize, - max_categories: Option, - start_at: Option<&str>, -) -> Result>, WikiError> { +pub async fn fetch_page_categories(page: &str) -> Result, WikiError> { todo!() } diff --git a/src/cli.rs b/src/cli.rs index b2f6a58..8932176 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -98,12 +98,8 @@ pub enum Commands { /// Number of threads to use for fetching data from the ArchWiki thread_count: Option, #[arg(short, long)] - /// Maximum amount of categories to fetch. If no value if provided all categories are - /// fetched. - max_categories: Option, - #[arg(short, long)] - /// First category that will be fetched. See 'https://wiki.archlinux.org/index.php?title=Special:Categories' for more information. - start_at: Option, + /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. + delay: Option, #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index 3f100c4..b160857 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -2,7 +2,7 @@ use colored::Colorize; use ego_tree::NodeRef; use scraper::{Html, Node}; -use crate::utils::{extract_tag_attr, HtmlTag}; +use crate::utils::extract_tag_attr; /// Converts the body of the ArchWiki page to a plain text string, removing all tags and /// only leaving the text node content. URLs can be shown in a markdown like syntax. 
@@ -29,7 +29,7 @@ pub fn format_children(node: NodeRef, show_urls: bool) -> String { if show_urls { wrap_text_in_url( &child_text, - &extract_tag_attr(e, &HtmlTag::A, "href").unwrap_or("".to_string()), + &extract_tag_attr(e, "a", "href").unwrap_or("".to_string()), ) } else { child_text diff --git a/src/languages.rs b/src/languages.rs index 543084f..42f714b 100644 --- a/src/languages.rs +++ b/src/languages.rs @@ -20,7 +20,7 @@ pub struct Language { pub async fn fetch_all_langs() -> Result, WikiError> { let body = reqwest::get(LANGUAGE_API_URL).await?.text().await?; - let json: ApiResponse = serde_json::from_str(&body)?; + let json: ApiResponse = serde_json::from_str(&body)?; Ok(json.query.languages) } diff --git a/src/main.rs b/src/main.rs index c64be63..862a44f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,12 +11,12 @@ use url::Url; use wiki_api::fetch_page_by_url; use crate::{ - categories::{fetch_all_pages, list_pages}, + categories::list_pages, formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; mod categories; @@ -158,18 +158,13 @@ async fn main() -> Result<(), WikiError> { Commands::SyncWiki { hide_progress, thread_count, - max_categories, - start_at, + delay, print, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = fetch_all_pages( - hide_progress, - thread_count, - max_categories, - start_at.as_deref(), - ) - .await?; + let res = fetch_all_pages().await?; + println!("{}", res.join("\n")); + panic!("oops"); let out = serde_yaml::to_string(&res)?; diff --git a/src/search.rs b/src/search.rs index e42e093..6af0f92 100644 --- a/src/search.rs +++ b/src/search.rs @@ -131,13 +131,11 @@ pub fn open_search_to_page_names( } } -/// TODO -/// Checks if the open search result contains a name that exactly matches the provided page name. -/// If there is a match the corresponding page URL is returned. -pub fn open_search_is_page_exact_match( - page: &str, +/// Return provided page name if the top search result exactly matches it +pub fn open_search_is_page_exact_match<'a>( + page: &'a str, search_result: &[OpenSearchItem], -) -> Result, WikiError> { +) -> Result, WikiError> { use crate::error::InvalidApiResponseError as IAR; let page_names = search_result.get(1).ok_or(WikiError::InvalidApiResponse( @@ -152,7 +150,7 @@ pub fn open_search_is_page_exact_match( Ok(names .first() - .and_then(|name| (name == page).then_some(name.to_owned()))) + .and_then(|name| (name == page).then_some(page))) } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index 2dc1d6c..44d11ed 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -9,22 +9,6 @@ use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub enum HtmlTag { - A, - Ul, - Li, -} - -impl HtmlTag { - pub fn name(&self) -> String { - match *self { - HtmlTag::A => "a".to_owned(), - HtmlTag::Ul => "ul".to_owned(), - HtmlTag::Li => "li".to_owned(), - } - } -} - /// Construct a path to cache a page. Different page formats are cached separately. 
/// All none word characters are escaped with an '_' pub fn create_cache_page_path(page: &str, format: &PageFormat, cache_dir: &Path) -> PathBuf { @@ -60,8 +44,8 @@ pub fn page_cache_exists( Ok(secs_since_modified < fourteen_days) } -pub fn extract_tag_attr(element: &Element, tag: &HtmlTag, attr: &str) -> Option { - if element.name() == tag.name() { +pub fn extract_tag_attr(element: &Element, tag: &str, attr: &str) -> Option { + if element.name() == tag { element.attr(attr).map(|attr| attr.to_owned()) } else { None diff --git a/src/wiki_api.rs b/src/wiki_api.rs index ce4dd85..237997e 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,4 +1,5 @@ use scraper::Html; +use serde::Deserialize; use url::Url; use crate::{ @@ -11,8 +12,9 @@ use crate::{ }; #[derive(Debug, Clone, serde::Deserialize)] -pub struct ApiResponse { +pub struct ApiResponse { pub query: T, + pub r#continue: Option, } pub async fn fetch_open_search( @@ -40,7 +42,7 @@ pub async fn fetch_text_search( ) -> Result, WikiError> { let url = format!("https://wiki.archlinux.org/api.php?action=query&list=search&format=json&srwhat=text&uselang={lang}&srlimit={limit}&srsearch={search}"); let body = reqwest::get(url).await?.text().await?; - let mut res: ApiResponse = serde_json::from_str(&body)?; + let mut res: ApiResponse = serde_json::from_str(&body)?; for item in res.query.search.as_mut_slice() { item.prettify_snippet(search); @@ -64,7 +66,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { Ok(Html::parse_document(&body_with_abs_urls)) } + +pub async fn fetch_all_pages() -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiAllPagesQuery { + allpages: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + #[derive(Debug, Deserialize)] + struct ApiAllPageContinueParams { + apcontinue: String, + } + + let api_url = format!( + "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500" + ); + + let mut pages: Vec = vec![]; + + let body = reqwest::get(&api_url).await?.text().await?; + let mut api_resp: ApiResponse = + serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + + while let Some(continue_params) = api_resp.r#continue { + let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + + let body = reqwest::get(&next_api_url).await?.text().await?; + api_resp = serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + } + + Ok(pages) +} -- GitLab From e817b9374b7042ec5de8e70653be00ae54e0ab54 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 20:59:35 +0100 Subject: [PATCH 03/24] wip: fetch pages per category --- src/languages.rs | 2 +- src/main.rs | 2 +- src/wiki_api.rs | 98 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 76 insertions(+), 26 deletions(-) diff --git a/src/languages.rs b/src/languages.rs index 42f714b..543084f 100644 --- a/src/languages.rs +++ b/src/languages.rs @@ -20,7 +20,7 @@ pub struct Language { pub async fn fetch_all_langs() -> Result, WikiError> { let body = reqwest::get(LANGUAGE_API_URL).await?.text().await?; - let json: ApiResponse = serde_json::from_str(&body)?; + let json: ApiResponse = serde_json::from_str(&body)?; Ok(json.query.languages) } diff --git a/src/main.rs b/src/main.rs index 862a44f..91a83a7 
100644 --- a/src/main.rs +++ b/src/main.rs @@ -163,7 +163,7 @@ async fn main() -> Result<(), WikiError> { } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); let res = fetch_all_pages().await?; - println!("{}", res.join("\n")); + dbg!(res); panic!("oops"); let out = serde_yaml::to_string(&res)?; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 237997e..bf564f6 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use scraper::Html; use serde::Deserialize; use url::Url; @@ -12,7 +14,12 @@ use crate::{ }; #[derive(Debug, Clone, serde::Deserialize)] -pub struct ApiResponse { +pub struct ApiResponse { + pub query: T, +} + +#[derive(Debug, Clone, serde::Deserialize)] +pub struct ApiResponseWithContinue { pub query: T, pub r#continue: Option, } @@ -42,7 +49,7 @@ pub async fn fetch_text_search( ) -> Result, WikiError> { let url = format!("https://wiki.archlinux.org/api.php?action=query&list=search&format=json&srwhat=text&uselang={lang}&srlimit={limit}&srsearch={search}"); let body = reqwest::get(url).await?.text().await?; - let mut res: ApiResponse = serde_json::from_str(&body)?; + let mut res: ApiResponse = serde_json::from_str(&body)?; for item in res.query.search.as_mut_slice() { item.prettify_snippet(search); @@ -92,62 +99,105 @@ pub async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -pub async fn fetch_all_pages() -> Result, WikiError> { +pub async fn fetch_all_pages() -> Result>, WikiError> { + let categories = fetch_all_categories().await?; + let mut wiki = HashMap::new(); + + for category in categories { + let pages = fetch_pages_in_category(&category).await?; + if !pages.is_empty() { + wiki.insert(category, pages); + } + } + + Ok(wiki) +} + +async fn fetch_all_categories() -> Result, WikiError> { #[derive(Debug, Deserialize)] - struct ApiAllPagesQuery { - allpages: Vec, + struct ApiAllCategoriesQuery { + allcategories: Vec, } #[derive(Debug, Deserialize)] - struct Page { - title: String, + struct Category { + #[serde[rename = "*"]] + name: String, } - impl From for String { - fn from(value: Page) -> Self { - value.title + impl From for String { + fn from(value: Category) -> Self { + value.name } } #[derive(Debug, Deserialize)] - struct ApiAllPageContinueParams { - apcontinue: String, + struct ApiAllCategoryContinueParams { + accontinue: String, } - let api_url = format!( - "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500" - ); + let api_url = "https://wiki.archlinux.org/api.php?action=query&list=allcategories&format=json&aclimit=500"; - let mut pages: Vec = vec![]; + let mut categories: Vec = vec![]; - let body = reqwest::get(&api_url).await?.text().await?; - let mut api_resp: ApiResponse = + let body = reqwest::get(api_url).await?.text().await?; + let mut api_resp: ApiResponseWithContinue = serde_json::from_str(&body)?; - pages.append( + categories.append( &mut api_resp .query - .allpages + .allcategories .into_iter() .map(Into::into) .collect(), ); while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + let next_api_url = format!("{api_url}&accontinue={}", continue_params.accontinue); let body = reqwest::get(&next_api_url).await?.text().await?; api_resp = serde_json::from_str(&body)?; - pages.append( + categories.append( &mut api_resp .query - .allpages + .allcategories .into_iter() .map(Into::into) .collect(), ); } - Ok(pages) + Ok(categories) 
+} + +async fn fetch_pages_in_category(category: &str) -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiCategoryMembersQuery { + categorymembers: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); + + let body = reqwest::get(dbg!(api_url)).await?.text().await?; + let api_resp: ApiResponse = serde_json::from_str(&dbg!(body))?; + + Ok(api_resp + .query + .categorymembers + .into_iter() + .map(Into::into) + .collect()) } -- GitLab From a11407c565fe90778ab87337a02e0b130d77c32c Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:27:55 +0100 Subject: [PATCH 04/24] add --fast flag to allow only fetching pages without categories --- src/categories.rs | 4 -- src/cli.rs | 7 ++- src/main.rs | 14 +++-- src/wiki_api.rs | 136 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 143 insertions(+), 18 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 03a53e7..1285c2e 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -42,7 +42,3 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S }) .join("\n\n") } - -pub async fn fetch_page_categories(page: &str) -> Result, WikiError> { - todo!() -} diff --git a/src/cli.rs b/src/cli.rs index 8932176..c0e268f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -87,8 +87,8 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download the names of all pages on the ArchWiki", - long_about = "Download the names of all pages on the ArchWiki. Page names are used for the 'list-pages' and 'list-categories' commands" + about = "Download information about the pages and categories on the ArchWiki", + long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { #[arg(short = 'H', long)] @@ -101,6 +101,9 @@ pub enum Commands { /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. delay: Option, #[arg(short, long)] + /// Only fetch page names without parent category information. + fast: bool, + #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, }, diff --git a/src/main.rs b/src/main.rs index 91a83a7..b507a32 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,7 +16,9 @@ use crate::{ languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{ + fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search, fetch_wiki_tree, + }, }; mod categories; @@ -159,12 +161,16 @@ async fn main() -> Result<(), WikiError> { hide_progress, thread_count, delay, + fast, print, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = fetch_all_pages().await?; - dbg!(res); - panic!("oops"); + let res = if !fast { + fetch_wiki_tree(thread_count, delay.unwrap_or(0), hide_progress).await? 
+ } else { + let all_pages = fetch_all_pages().await?; + HashMap::from([("*".to_owned(), all_pages)]) + }; let out = serde_yaml::to_string(&res)?; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index bf564f6..1dd5662 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,5 +1,9 @@ +use core::panic; use std::collections::HashMap; +use futures::future; +use indicatif::{MultiProgress, ProgressBar}; +use itertools::Itertools; use scraper::Html; use serde::Deserialize; use url::Url; @@ -99,20 +103,136 @@ pub async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -pub async fn fetch_all_pages() -> Result>, WikiError> { +/// Gets a list of all ArchWiki categories and the pages inside them. +/// All categories are treated as top-level and sub categories are ignored. +pub async fn fetch_wiki_tree( + thread_count: usize, + delay: u64, + hide_progress: bool, +) -> Result>, WikiError> { let categories = fetch_all_categories().await?; - let mut wiki = HashMap::new(); - for category in categories { - let pages = fetch_pages_in_category(&category).await?; - if !pages.is_empty() { - wiki.insert(category, pages); + let multi_bar = MultiProgress::new(); + let chunk_count = categories.len() / thread_count; + + let tasks = categories + .chunks(chunk_count) + .map(|chunk| { + let chunk = chunk.to_vec(); + + let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); + let bar = multi_bar.add(bar); + if hide_progress { + bar.finish_and_clear(); + } + + tokio::spawn(async move { + let mut wiki_sectoin = HashMap::new(); + for category in chunk { + tokio::time::sleep(std::time::Duration::from_millis(delay)).await; + + let pages = match fetch_pages_in_category(&category).await { + Ok(pages) => pages, + Err(_) => { + // wait if rate limited + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + fetch_pages_in_category(&category) + .await + .unwrap_or_else(|err| { + eprintln!( + "failed to fetch pages in category {}\n ERROR {err}", + category + ); + vec![] + }) + } + }; + + if !pages.is_empty() { + wiki_sectoin.insert(category.to_owned(), pages); + } + bar.inc(1); + } + + wiki_sectoin + }) + }) + .collect_vec(); + + let mut wiki = HashMap::new(); + let sections = future::join_all(tasks).await; + + for section in sections { + match section { + Ok(data) => { + wiki.extend(data); + } + Err(err) => panic!("failed to sync wiki\nERROR: {err}"), } } Ok(wiki) } +pub async fn fetch_all_pages() -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiAllPagesQuery { + allpages: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + #[derive(Debug, Deserialize)] + struct ApiAllPageContinueParams { + apcontinue: String, + } + + let api_url = + "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500"; + + let mut pages: Vec = vec![]; + + let body = reqwest::get(api_url).await?.text().await?; + let mut api_resp: ApiResponseWithContinue = + serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + + while let Some(continue_params) = api_resp.r#continue { + let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + + let body = reqwest::get(&next_api_url).await?.text().await?; + api_resp = serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + 
.collect(), + ); + } + + Ok(pages) +} + async fn fetch_all_categories() -> Result, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllCategoriesQuery { @@ -191,8 +311,8 @@ async fn fetch_pages_in_category(category: &str) -> Result, WikiErro let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); - let body = reqwest::get(dbg!(api_url)).await?.text().await?; - let api_resp: ApiResponse = serde_json::from_str(&dbg!(body))?; + let body = reqwest::get(api_url).await?.text().await?; + let api_resp: ApiResponse = serde_json::from_str(&body)?; Ok(api_resp .query -- GitLab From 23bea9d7a51d74dc933f86f94f3096154988ff87 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:42:43 +0100 Subject: [PATCH 05/24] wip: add local-wiki command & change save paths to be urlencoded --- src/cli.rs | 18 +++++++++++++++--- src/error.rs | 2 ++ src/main.rs | 22 ++++++++++++++++++++-- src/utils.rs | 7 +++---- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c0e268f..87fb214 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; +use html2md::common; use crate::formats::PageFormat; @@ -69,7 +70,7 @@ pub enum Commands { /// Only show pages in this category category: Option, #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( @@ -78,7 +79,7 @@ pub enum Commands { )] ListCategories { #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( @@ -87,7 +88,7 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download information about the pages and categories on the ArchWiki", + about = "Download information about the pages and categories on the ArchWiki (takes a while)", long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { @@ -107,6 +108,17 @@ pub enum Commands { /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, }, + #[command( + about = "Download a copy of the ArchWiki. Will take a long time :)", + long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." + )] + LocalWiki { + /// Location to store the local copy of the wiki at. + location: PathBuf, + #[arg(short, long)] + /// Use a different file to read pages from + page_file: Option, + }, #[command( about = "Retrive information related to this tool", long_about = "Retrive information related to this tool. All Info is shown by default." 
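
The wiki_api.rs hunks above page through the MediaWiki list endpoints (`list=allcategories`, `list=allpages`) by re-requesting with the returned continue token, while `fetch_pages_in_category` currently takes a single `cmlimit=500` batch. As a rough, self-contained sketch of that continuation pattern applied to `list=categorymembers` (not code from the patch; it assumes the `reqwest`, `serde_json`, `urlencoding` and, for brevity, `anyhow` crates):

```rust
use serde_json::Value;

/// Illustrative only: fetch every page title in one ArchWiki category,
/// following the MediaWiki `continue.cmcontinue` token until exhausted.
async fn pages_in_category(category: &str) -> anyhow::Result<Vec<String>> {
    let base = format!(
        "https://wiki.archlinux.org/api.php?action=query&list=categorymembers\
         &format=json&cmtype=page&cmlimit=500&cmtitle=Category:{}",
        urlencoding::encode(category)
    );

    let mut titles = Vec::new();
    let mut cont: Option<String> = None;

    loop {
        // Re-issue the same query with the continuation token, if we have one.
        let url = match &cont {
            Some(token) => format!("{base}&cmcontinue={}", urlencoding::encode(token)),
            None => base.clone(),
        };

        let text = reqwest::get(&url).await?.text().await?;
        let body: Value = serde_json::from_str(&text)?;

        // Collect the page titles from this batch.
        if let Some(members) = body["query"]["categorymembers"].as_array() {
            titles.extend(
                members
                    .iter()
                    .filter_map(|m| m["title"].as_str().map(str::to_owned)),
            );
        }

        // The API keeps returning `continue.cmcontinue` while more results remain.
        match body["continue"]["cmcontinue"].as_str() {
            Some(token) => cont = Some(token.to_owned()),
            None => break,
        }
    }

    Ok(titles)
}
```

Because the patch sends one `cmlimit=500` request per category, a category with more than 500 pages would be truncated; a continuation loop like the sketch above is one way to cover that case.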
diff --git a/src/error.rs b/src/error.rs index 4cc13cd..5734c32 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,4 +50,6 @@ pub enum WikiError { NoPageFound(String), #[error("The category '{}' could not be found", .0)] NoCategoryFound(String), + #[error("{}", .0)] + Other(String), } diff --git a/src/main.rs b/src/main.rs index b507a32..f97aad6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -125,7 +125,7 @@ async fn main() -> Result<(), WikiError> { page_file, } => { let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; + let file = read_pages_file_as_str(&path)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -144,7 +144,7 @@ async fn main() -> Result<(), WikiError> { } Commands::ListCategories { page_file } => { let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; + let file = read_pages_file_as_str(&path)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -184,6 +184,24 @@ async fn main() -> Result<(), WikiError> { println!("{out}"); } } + Commands::LocalWiki { + location, + page_file, + } => { + let path = page_file.unwrap_or(default_page_file_path); + let Ok(file) = read_pages_file_as_str(&path) else { + return Err(WikiError::Path("page file does not exist".to_owned())); + }; + + let Ok(pages_map) = serde_yaml::from_str::>>(&file) else { + return Err(WikiError::Other(format!( + "page file is malformed\nfile: {}", + path.to_string_lossy() + ))); + }; + + todo!("oh boy"); + } Commands::Info { show_cache_dir, show_data_dir, diff --git a/src/utils.rs b/src/utils.rs index 44d11ed..ba75ffe 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -63,8 +63,8 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: PathBuf) -> Result { - fs::read_to_string(&path).map_err(|err| { +pub fn read_pages_file_as_str(path: &Path) -> Result { + fs::read_to_string(path).map_err(|err| { match err.kind() { ErrorKind::NotFound => WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{}'. 
Try running 'archwiki-rs sync-wiki' to create the missing file.", path.to_string_lossy()))), _ => err.into() @@ -73,8 +73,7 @@ pub fn read_pages_file_as_str(path: PathBuf) -> Result { } fn to_save_file_name(page: &str) -> String { - let regex = Regex::new("[^-0-9A-Za-z_]").expect("'[^0-9A-Za-z_]' should be a valid regex"); - regex.replace_all(page, "_").to_string() + urlencoding::encode(page).to_string() } #[cfg(test)] -- GitLab From 8e3e09c1d04071d8359eef25fd157e680cd390b0 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:57:26 +0100 Subject: [PATCH 06/24] fix tests --- src/cli.rs | 1 - src/formats/html.rs | 17 ++++++----- src/formats/markdown.rs | 10 +++--- src/formats/plain_text.rs | 32 ++++++++------------ src/main.rs | 12 +------- src/utils.rs | 17 +++++++---- src/wiki_api.rs | 2 +- tests/cli.rs | 64 +-------------------------------------- 8 files changed, 41 insertions(+), 114 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 87fb214..c764214 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,7 +1,6 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; -use html2md::common; use crate::formats::PageFormat; diff --git a/src/formats/html.rs b/src/formats/html.rs index ca60728..6ceca4c 100644 --- a/src/formats/html.rs +++ b/src/formats/html.rs @@ -1,11 +1,16 @@ -use scraper::Html; +use scraper::{Html, Selector}; /// Converts the body of the ArchWiki page to a HTML string pub fn convert_page_to_html(document: &Html, page: &str) -> String { + let body_selector = Selector::parse("body").expect("body should be a valid css selector"); format!( "

{heading}

\n{body}", heading = page, - body = document.html() + body = document + .select(&body_selector) + .next() + .map(|body| body.inner_html()) + .unwrap_or_default() ) } @@ -17,11 +22,9 @@ mod tests { #[tokio::test] async fn test_convert_page_to_html() { let page = "test page"; - let input = format!( - r#"
+ let input = r#"
Hello, world! -
"# - ); +
"#; let expected_output = format!( r#"

{page}

@@ -30,7 +33,7 @@ mod tests {
"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_html(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/markdown.rs b/src/formats/markdown.rs index 126fa58..23a2ba9 100644 --- a/src/formats/markdown.rs +++ b/src/formats/markdown.rs @@ -14,11 +14,9 @@ mod tests { #[tokio::test] async fn test_convert_page_to_markdown() { let page = "test page"; - let input = format!( - r#"
-

Hello, world!

-
"# - ); + let input = r#"
+

Hello, world!

+
"#; let expected_output = format!( r#"# {page} @@ -26,7 +24,7 @@ mod tests { ### Hello, world! ###"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_markdown(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index b160857..754d4f9 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -90,35 +90,29 @@ mod tests { #[tokio::test] async fn test_convert_page_to_plain_text() { { - let input = format!( - r#" -

Hello, world!

-
how are you
- I'm great -
"# - ); + let input = r#" +

Hello, world!

+
how are you
+ I'm great +
"#; - let expected_output = format!( - r#" - Hello, world! - how are you - I'm great -"# - ); + let expected_output = r#" + Hello, world! + how are you + I'm great + "#; - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_plain_text(&document, false); assert_eq!(output, expected_output); } { - let input = format!( - r#"
+ let input = r#"

Hello, world!

example -
"# - ); +
"#; let expected_output = format!( r#" diff --git a/src/main.rs b/src/main.rs index f97aad6..44e24af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,9 +6,6 @@ use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; use itertools::Itertools; -use scraper::Html; -use url::Url; -use wiki_api::fetch_page_by_url; use crate::{ categories::list_pages, @@ -71,7 +68,7 @@ async fn main() -> Result<(), WikiError> { let out = if use_cached_page { fs::read_to_string(&page_cache_path)? } else { - match fetch_document(&page, lang.as_deref()).await { + match fetch_page(&page, lang.as_deref()).await { Ok(document) => match format { PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), PageFormat::Markdown => convert_page_to_markdown(&document, &page), @@ -251,10 +248,3 @@ async fn main() -> Result<(), WikiError> { Ok(()) } - -async fn fetch_document(page: &str, lang: Option<&str>) -> Result { - match Url::parse(page) { - Ok(url) => fetch_page_by_url(url).await, - Err(_) => fetch_page(page, lang).await, - } -} diff --git a/src/utils.rs b/src/utils.rs index ba75ffe..81347c7 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,6 @@ use std::{ path::{Path, PathBuf}, }; -use regex::Regex; use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; @@ -73,7 +72,10 @@ pub fn read_pages_file_as_str(path: &Path) -> Result { } fn to_save_file_name(page: &str) -> String { - urlencoding::encode(page).to_string() + urlencoding::encode(page) + .to_string() + .replace('.', "\\.") + .replace('~', "\\~") } #[cfg(test)] @@ -85,10 +87,13 @@ mod tests { fn test_to_save_file_name() { let cases = [ ("Neovim", "Neovim"), - ("3D Mouse", "3D_Mouse"), - ("/etc/fstab", "_etc_fstab"), - (".NET", "_NET"), - ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS_MeMO_Pad_7__ME176C_X__"), + ("3D Mouse", "3D%20Mouse"), + ("/etc/fstab", "%2Fetc%2Ffstab"), + (".NET", "\\.NET"), + ( + "ASUS MeMO Pad 7 (ME176C(X))", + "ASUS%20MeMO%20Pad%207%20%28ME176C%28X%29%29", + ), ]; for (input, output) in cases { diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 1dd5662..cb00e67 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -90,7 +90,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { +async fn fetch_page_by_url(url: Url) -> Result { let base_url = format!( "{schema}://{host}", schema = url.scheme(), diff --git a/tests/cli.rs b/tests/cli.rs index 5759687..6857dbb 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,9 +1,5 @@ use assert_cmd::Command; -use assert_fs::prelude::{FileWriteStr, PathChild}; -use predicates::{ - prelude::{predicate, PredicateBooleanExt}, - Predicate, -}; +use predicates::prelude::{predicate, PredicateBooleanExt}; #[test] fn test_cli_info_cmd() -> Result<(), Box> { @@ -55,22 +51,6 @@ fn test_cli_read_page_cmd() -> Result<(), Box> { cmd.assert().failure().stderr(pstr::starts_with("Neovim")); } - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://wiki.archlinux.org/title/Emacs"]); - - cmd.assert() - .success() - .stdout(pstr::contains("Installation")); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://google.com"]); - - cmd.assert().failure(); - } - Ok(()) } @@ -121,45 +101,3 @@ fn test_cli_list_languages_cmd() -> Result<(), Box> { Ok(()) } - -#[test] -fn test_cli_local_wiki_info() -> Result<(), Box> { - use predicate::str as pstr; - - let stdout = { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["sync-wiki", 
"-p", "-m", "10"]); - - let stdout = String::from_utf8(cmd.assert().success().get_output().stdout.clone()).unwrap(); - pstr::contains("About Arch").eval(&stdout); - - stdout - }; - - let tmp_dir = assert_fs::TempDir::new().unwrap(); - tmp_dir.child("pages.yml").write_str(&stdout).unwrap(); - - let tmp_file_path = tmp_dir.path().join("pages.yml"); - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-pages", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert().success().stdout(pstr::contains( - "About Arch: -───┤Arch boot process -───┤Arch build system", - )); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-categories", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert() - .success() - .stdout(pstr::contains("\n").count(10)); - } - - Ok(()) -} -- GitLab From bf8435bc96abfcb2d10f002448d818860816d487 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Tue, 16 Jan 2024 14:45:05 +0100 Subject: [PATCH 07/24] improve missing page file error --- src/cli.rs | 3 +++ src/main.rs | 27 +++++++++++++++++++-------- src/utils.rs | 13 +++++++++++-- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c764214..f02e9cb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -106,6 +106,9 @@ pub enum Commands { #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, + #[arg(short, long)] + /// Use custom output file location + out_file: Option, }, #[command( about = "Download a copy of the ArchWiki. Will take a long time :)", diff --git a/src/main.rs b/src/main.rs index 44e24af..9e417ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -121,8 +121,11 @@ async fn main() -> Result<(), WikiError> { category, page_file, } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(&path)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let file = read_pages_file_as_str(&path, is_default)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -140,8 +143,11 @@ async fn main() -> Result<(), WikiError> { println!("{out}"); } Commands::ListCategories { page_file } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(&path)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let file = read_pages_file_as_str(&path, is_default)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -160,6 +166,7 @@ async fn main() -> Result<(), WikiError> { delay, fast, print, + out_file, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); let res = if !fast { @@ -172,10 +179,11 @@ async fn main() -> Result<(), WikiError> { let out = serde_yaml::to_string(&res)?; if !print { - fs::write(&default_page_file_path, out)?; + let path = out_file.unwrap_or(default_page_file_path); + fs::write(&path, out)?; if !hide_progress { - println!("data saved to {}", default_page_file_path.to_string_lossy()); + println!("data saved to {}", path.to_string_lossy()); } } else { println!("{out}"); @@ -185,8 +193,11 @@ async fn main() -> Result<(), WikiError> { location, page_file, } => { - let path = page_file.unwrap_or(default_page_file_path); - let Ok(file) = read_pages_file_as_str(&path) else { + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let Ok(file) = read_pages_file_as_str(&path, 
is_default) else { return Err(WikiError::Path("page file does not exist".to_owned())); }; diff --git a/src/utils.rs b/src/utils.rs index 81347c7..efa94ae 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -62,10 +62,19 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: &Path) -> Result { +pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result { fs::read_to_string(path).map_err(|err| { match err.kind() { - ErrorKind::NotFound => WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{}'. Try running 'archwiki-rs sync-wiki' to create the missing file.", path.to_string_lossy()))), + ErrorKind::NotFound => { + let path_str =path.to_string_lossy(); + let extra_path_arg = if is_default_path { + String::new() + } else { + format!(" --out-file {path_str}") + }; + + WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{path_str}'. Try running 'archwiki-rs sync-wiki{extra_path_arg}' to create the missing file." ))) + } _ => err.into() } }) -- GitLab From 81fe38f5f0ecb5f7a91dca1022d82cdd5f2a5271 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Tue, 16 Jan 2024 22:05:13 +0100 Subject: [PATCH 08/24] use wiki media generator api to hugely improve preformance --- src/cli.rs | 11 +-- src/error.rs | 2 - src/main.rs | 64 ++++++------- src/utils.rs | 40 +++++++- src/wiki_api.rs | 242 ++++++++++++------------------------------------ 5 files changed, 122 insertions(+), 237 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index f02e9cb..a6f68d4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -87,7 +87,7 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download information about the pages and categories on the ArchWiki (takes a while)", + about = "Download information about the pages and categories on the ArchWiki", long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { @@ -95,15 +95,6 @@ pub enum Commands { /// Hide progress indicators hide_progress: bool, #[arg(short, long)] - /// Number of threads to use for fetching data from the ArchWiki - thread_count: Option, - #[arg(short, long)] - /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. - delay: Option, - #[arg(short, long)] - /// Only fetch page names without parent category information. - fast: bool, - #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. 
print: bool, #[arg(short, long)] diff --git a/src/error.rs b/src/error.rs index 5734c32..4cc13cd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,6 +50,4 @@ pub enum WikiError { NoPageFound(String), #[error("The category '{}' could not be found", .0)] NoCategoryFound(String), - #[error("{}", .0)] - Other(String), } diff --git a/src/main.rs b/src/main.rs index 9e417ed..3e1aa3c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,11 @@ -use std::{collections::HashMap, fs}; +use std::fs; use clap::Parser; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; +use indicatif::ProgressBar; use itertools::Itertools; use crate::{ @@ -12,10 +13,11 @@ use crate::{ formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, - utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{ - fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search, fetch_wiki_tree, + utils::{ + create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, + UNCATEGORIZED_KEY, }, + wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; mod categories; @@ -125,19 +127,16 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let file = read_pages_file_as_str(&path, is_default)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; - + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; let out = if let Some(category) = category { - pages_map + wiki_tree .get(&category) .ok_or(WikiError::NoCategoryFound(category))? .iter() .sorted() .join("\n") } else { - list_pages(&pages_map, flatten) + list_pages(&wiki_tree, flatten) }; println!("{out}"); @@ -147,11 +146,14 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let file = read_pages_file_as_str(&path, is_default)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + let out = wiki_tree + .keys() + .unique() + .sorted() + .filter(|cat| cat.as_str() != UNCATEGORIZED_KEY) + .join("\n"); - let out = pages_map.keys().unique().sorted().join("\n"); println!("{out}"); } Commands::ListLanguages => { @@ -162,21 +164,22 @@ async fn main() -> Result<(), WikiError> { } Commands::SyncWiki { hide_progress, - thread_count, - delay, - fast, print, out_file, } => { - let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = if !fast { - fetch_wiki_tree(thread_count, delay.unwrap_or(0), hide_progress).await? 
- } else { - let all_pages = fetch_all_pages().await?; - HashMap::from([("*".to_owned(), all_pages)]) - }; + let spinner = ProgressBar::new_spinner(); + if hide_progress { + spinner.finish_and_clear(); + } - let out = serde_yaml::to_string(&res)?; + let _spin_task = std::thread::spawn(move || loop { + spinner.tick(); + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + let wiki_tree = fetch_all_pages().await?; + + let out = serde_yaml::to_string(&wiki_tree)?; if !print { let path = out_file.unwrap_or(default_page_file_path); @@ -197,17 +200,6 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let Ok(file) = read_pages_file_as_str(&path, is_default) else { - return Err(WikiError::Path("page file does not exist".to_owned())); - }; - - let Ok(pages_map) = serde_yaml::from_str::>>(&file) else { - return Err(WikiError::Other(format!( - "page file is malformed\nfile: {}", - path.to_string_lossy() - ))); - }; - todo!("oh boy"); } Commands::Info { diff --git a/src/utils.rs b/src/utils.rs index efa94ae..570381a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,13 +1,17 @@ use std::{ + collections::HashMap, fs, io::{self, ErrorKind}, path::{Path, PathBuf}, }; +use itertools::Itertools; use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; +pub const UNCATEGORIZED_KEY: &str = "UNCATEGORIZED"; + /// Construct a path to cache a page. Different page formats are cached separately. /// All none word characters are escaped with an '_' pub fn create_cache_page_path(page: &str, format: &PageFormat, cache_dir: &Path) -> PathBuf { @@ -62,11 +66,14 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result { - fs::read_to_string(path).map_err(|err| { +pub fn read_pages_file_as_category_tree( + path: &Path, + is_default_path: bool, +) -> Result>, WikiError> { + let content = fs::read_to_string(path).map_err(|err| { match err.kind() { ErrorKind::NotFound => { - let path_str =path.to_string_lossy(); + let path_str = path.to_string_lossy(); let extra_path_arg = if is_default_path { String::new() } else { @@ -77,7 +84,32 @@ pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result err.into() } - }) + })?; + + let page_to_category_map: HashMap> = serde_yaml::from_str(&content)?; + + let mut category_to_page_map = HashMap::new(); + let mut uncategorized_pages = vec![]; + + for (page, cats) in page_to_category_map.into_iter().collect_vec() { + if cats.is_empty() { + uncategorized_pages.push(page) + } else { + for cat in cats { + let mut pages: Vec = + category_to_page_map.get(&cat).cloned().unwrap_or_default(); + pages.push(page.clone()); + + category_to_page_map.insert(cat, pages); + } + } + } + + if !uncategorized_pages.is_empty() { + category_to_page_map.insert(UNCATEGORIZED_KEY.to_owned(), uncategorized_pages); + } + + Ok(category_to_page_map) } fn to_save_file_name(page: &str) -> String { diff --git a/src/wiki_api.rs b/src/wiki_api.rs index cb00e67..d636236 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,9 +1,5 @@ -use core::panic; use std::collections::HashMap; -use futures::future; -use indicatif::{MultiProgress, ProgressBar}; -use itertools::Itertools; use scraper::Html; use serde::Deserialize; use url::Url; @@ -17,6 +13,18 @@ use crate::{ utils::update_relative_urls, }; +const BLOCK_LISTED_CATEGORY_PREFIXES: &[&str] = &[ + 
"Pages flagged with", + "Sections flagged with", + "Pages or sections flagged with", + "Pages where template include size is exceeded", + "Pages with broken package links", + "Pages with broken section links", + "Pages with missing package links", + "Pages with missing section links", + "Pages with dead links", +]; + #[derive(Debug, Clone, serde::Deserialize)] pub struct ApiResponse { pub query: T, @@ -103,221 +111,85 @@ async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -/// Gets a list of all ArchWiki categories and the pages inside them. -/// All categories are treated as top-level and sub categories are ignored. -pub async fn fetch_wiki_tree( - thread_count: usize, - delay: u64, - hide_progress: bool, -) -> Result>, WikiError> { - let categories = fetch_all_categories().await?; - - let multi_bar = MultiProgress::new(); - let chunk_count = categories.len() / thread_count; - - let tasks = categories - .chunks(chunk_count) - .map(|chunk| { - let chunk = chunk.to_vec(); - - let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); - let bar = multi_bar.add(bar); - if hide_progress { - bar.finish_and_clear(); - } - - tokio::spawn(async move { - let mut wiki_sectoin = HashMap::new(); - for category in chunk { - tokio::time::sleep(std::time::Duration::from_millis(delay)).await; - - let pages = match fetch_pages_in_category(&category).await { - Ok(pages) => pages, - Err(_) => { - // wait if rate limited - tokio::time::sleep(std::time::Duration::from_secs(1)).await; - fetch_pages_in_category(&category) - .await - .unwrap_or_else(|err| { - eprintln!( - "failed to fetch pages in category {}\n ERROR {err}", - category - ); - vec![] - }) - } - }; - - if !pages.is_empty() { - wiki_sectoin.insert(category.to_owned(), pages); - } - bar.inc(1); - } - - wiki_sectoin - }) - }) - .collect_vec(); - - let mut wiki = HashMap::new(); - let sections = future::join_all(tasks).await; - - for section in sections { - match section { - Ok(data) => { - wiki.extend(data); - } - Err(err) => panic!("failed to sync wiki\nERROR: {err}"), - } - } - - Ok(wiki) -} - -pub async fn fetch_all_pages() -> Result, WikiError> { +/// TODO +pub async fn fetch_all_pages() -> Result>, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllPagesQuery { - allpages: Vec, + pages: HashMap, } #[derive(Debug, Deserialize)] struct Page { title: String, - } - - impl From for String { - fn from(value: Page) -> Self { - value.title - } - } - - #[derive(Debug, Deserialize)] - struct ApiAllPageContinueParams { - apcontinue: String, - } - - let api_url = - "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500"; - - let mut pages: Vec = vec![]; - - let body = reqwest::get(api_url).await?.text().await?; - let mut api_resp: ApiResponseWithContinue = - serde_json::from_str(&body)?; - - pages.append( - &mut api_resp - .query - .allpages - .into_iter() - .map(Into::into) - .collect(), - ); - - while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); - - let body = reqwest::get(&next_api_url).await?.text().await?; - api_resp = serde_json::from_str(&body)?; - - pages.append( - &mut api_resp - .query - .allpages - .into_iter() - .map(Into::into) - .collect(), - ); - } - - Ok(pages) -} - -async fn fetch_all_categories() -> Result, WikiError> { - #[derive(Debug, Deserialize)] - struct ApiAllCategoriesQuery { - allcategories: Vec, + categories: Option>, } #[derive(Debug, Deserialize)] struct 
Category { - #[serde[rename = "*"]] - name: String, + title: String, } impl From for String { fn from(value: Category) -> Self { - value.name + value + .title + .split_once("Category:") + .map(|(_, title)| title.to_owned()) + .unwrap_or(value.title) } } #[derive(Debug, Deserialize)] - struct ApiAllCategoryContinueParams { - accontinue: String, + struct ApiAllPageContinueParams { + gapcontinue: Option, + clcontinue: Option, } - let api_url = "https://wiki.archlinux.org/api.php?action=query&list=allcategories&format=json&aclimit=500"; + let api_url = + "https://wiki.archlinux.org/api.php?action=query&generator=allpages&prop=categories&format=json&gaplimit=max&cllimit=max"; - let mut categories: Vec = vec![]; + let mut pages: Vec = vec![]; let body = reqwest::get(api_url).await?.text().await?; - let mut api_resp: ApiResponseWithContinue = + let mut api_resp: ApiResponseWithContinue = serde_json::from_str(&body)?; - categories.append( - &mut api_resp - .query - .allcategories - .into_iter() - .map(Into::into) - .collect(), - ); + pages.append(&mut api_resp.query.pages.into_values().collect()); while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&accontinue={}", continue_params.accontinue); + let next_api_url = if let Some(gapcontinue) = continue_params.gapcontinue { + format!("{api_url}&gapcontinue={}", gapcontinue) + } else if let Some(clcontinue) = continue_params.clcontinue { + format!("{api_url}&clcontinue={}", clcontinue) + } else { + break; + }; let body = reqwest::get(&next_api_url).await?.text().await?; api_resp = serde_json::from_str(&body)?; - categories.append( - &mut api_resp - .query - .allcategories - .into_iter() - .map(Into::into) - .collect(), - ); + pages.append(&mut api_resp.query.pages.into_values().collect()); } - Ok(categories) + let page_category_tree = pages.into_iter().map(|page| { + ( + page.title, + page.categories + .map(|cats| { + cats.into_iter() + .map::(Into::into) + .filter(|cat| !is_blocked_category(cat)) + .collect() + }) + .unwrap_or_default(), + ) + }); + + Ok(HashMap::from_iter(page_category_tree)) } -async fn fetch_pages_in_category(category: &str) -> Result, WikiError> { - #[derive(Debug, Deserialize)] - struct ApiCategoryMembersQuery { - categorymembers: Vec, - } - - #[derive(Debug, Deserialize)] - struct Page { - title: String, - } - - impl From for String { - fn from(value: Page) -> Self { - value.title - } - } - - let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); - - let body = reqwest::get(api_url).await?.text().await?; - let api_resp: ApiResponse = serde_json::from_str(&body)?; - - Ok(api_resp - .query - .categorymembers - .into_iter() - .map(Into::into) - .collect()) +fn is_blocked_category(category: &str) -> bool { + BLOCK_LISTED_CATEGORY_PREFIXES + .iter() + .any(|blocked_prefix| category.starts_with(blocked_prefix)) } -- GitLab From 3c6d03986f2bf51f4e8e8f318c5630d2e009454d Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 10:43:39 +0100 Subject: [PATCH 09/24] add local-wiki function --- src/cli.rs | 47 ++++++++++-------- src/main.rs | 125 +++++++++++++++++++++++++++++++++++++++++++++--- src/utils.rs | 4 +- src/wiki_api.rs | 9 +++- 4 files changed, 156 insertions(+), 29 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index a6f68d4..5a71859 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -19,25 +19,25 @@ pub enum Commands { )] ReadPage { #[arg(short, 
long)] - /// Don't cache the read page locally + /// Don't cache the read page locally. no_cache_write: bool, #[arg(short, long)] - /// Don't read the page from cache even if an entry for it is cached + /// Don't read the page from cache even if an entry for it is cached. ignore_cache: bool, #[arg(short, long)] /// Don't invalidate the cache even if it is considered stale. A cache is considered stale /// after it hasn't been updated in more then 14 days. disable_cache_invalidation: bool, #[arg(short, long)] - /// Show URLs for plain-text output + /// Show URLs for plain-text output. show_urls: bool, #[arg(short, long)] /// Preferred page language lang: Option, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] - /// The format that the page should be displayed in + /// The format that the page should be displayed in. format: PageFormat, - /// The name of the page to read or an absolute URL to the page + /// The name of the page to read or an absolute URL to the page. page: String, }, #[command( @@ -47,10 +47,10 @@ pub enum Commands { Search { search: String, #[arg(short, long, default_value_t = String::from("en"))] - /// Preferred language of the content to search for + /// Preferred language of the content to search for. lang: String, #[arg(short = 'L', long, default_value_t = 5)] - /// Maximum number of results + /// Maximum number of results. limit: u16, #[arg(short, long)] /// Search for pages by text content instead of title. Uses the 'query' API action instead @@ -63,13 +63,13 @@ pub enum Commands { )] ListPages { #[arg(short, long)] - /// Flatten all pages and don't show their category names + /// Flatten all pages and don't show their category names. flatten: bool, #[arg(short, long)] - /// Only show pages in this category + /// Only show pages in this category. category: Option, #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, }, #[command( @@ -78,7 +78,7 @@ pub enum Commands { )] ListCategories { #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, }, #[command( @@ -92,13 +92,13 @@ pub enum Commands { )] SyncWiki { #[arg(short = 'H', long)] - /// Hide progress indicators + /// Hide progress indicators. hide_progress: bool, #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, #[arg(short, long)] - /// Use custom output file location + /// Use custom output file location. out_file: Option, }, #[command( @@ -106,11 +106,20 @@ pub enum Commands { long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." )] LocalWiki { - /// Location to store the local copy of the wiki at. - location: PathBuf, #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, + #[arg(short = 'H', long)] + /// Hide progress indicators. + hide_progress: bool, + #[arg(short, long)] + /// Override directory at 'location' if it already exists. + override_wiki_directory: bool, + #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] + /// The format that the page should be displayed in. + format: PageFormat, + /// Location to store the local copy of the wiki at. 
+ location: PathBuf, }, #[command( about = "Retrive information related to this tool", @@ -118,13 +127,13 @@ pub enum Commands { )] Info { #[arg(short = 'c', long)] - /// Location of the cache directory + /// Location of the cache directory. show_cache_dir: bool, #[arg(short = 'd', long)] - /// Location of the data directory + /// Location of the data directory. show_data_dir: bool, #[arg(short, long)] - /// Only show values and not the properties they belong to or their descriptions + /// Only show values and not the properties they belong to or their descriptions. only_values: bool, }, } diff --git a/src/main.rs b/src/main.rs index 3e1aa3c..f5451aa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,13 @@ -use std::fs; +use std::{fs, io, path::Path}; -use clap::Parser; +use clap::{builder::PossibleValue, Parser, ValueEnum}; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; -use indicatif::ProgressBar; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use itertools::Itertools; +use wiki_api::fetch_page_without_recommendations; use crate::{ categories::list_pages, @@ -15,7 +16,7 @@ use crate::{ search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{ create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, - UNCATEGORIZED_KEY, + to_save_file_name, UNCATEGORIZED_KEY, }, wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; @@ -178,7 +179,6 @@ async fn main() -> Result<(), WikiError> { }); let wiki_tree = fetch_all_pages().await?; - let out = serde_yaml::to_string(&wiki_tree)?; if !print { @@ -194,13 +194,89 @@ async fn main() -> Result<(), WikiError> { } Commands::LocalWiki { location, + format, page_file, + override_wiki_directory, + hide_progress, } => { let (path, is_default) = page_file .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - todo!("oh boy"); + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + + create_dir_if_not_exists(&location, !override_wiki_directory)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + .as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let all_bars = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = all_bars.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("fetching categories") + .with_style( + ProgressStyle::with_template("[{prefix:^22}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + for (cat, pages) in wiki_tree { + if pages.is_empty() { + continue; + } + + let cat_dir = location.join(to_save_file_name(&cat)); + create_dir_if_not_exists(&cat_dir, !override_wiki_directory)?; + + let bar = all_bars.add( + ProgressBar::new(pages.len().try_into().unwrap_or(0)) + .with_prefix("fetching sub-pages") + .with_style( + ProgressStyle::with_template( + "[{prefix:^22}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", + ) + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + bar.finish_and_clear(); + } + + category_bar.inc(1); + for page in pages { + bar.inc(1); + + match write_page_to_local_wiki(&page, &cat_dir, &format).await { + Ok(()) => {} + Err(err) => { + eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") + } + } + } + } + + if 
!hide_progress { + println!( + "saved local copy of the ArchWiki to '{}'", + location.to_string_lossy() + ) + } } Commands::Info { show_cache_dir, @@ -251,3 +327,40 @@ async fn main() -> Result<(), WikiError> { Ok(()) } + +async fn write_page_to_local_wiki( + page: &str, + parent_dir: &Path, + format: &PageFormat, +) -> Result<(), WikiError> { + let document = fetch_page_without_recommendations(page).await?; + + let (content, ext) = match format { + PageFormat::PlainText => (convert_page_to_plain_text(&document, false), ""), + PageFormat::Markdown => (convert_page_to_markdown(&document, page), "md"), + PageFormat::Html => (convert_page_to_html(&document, page), "html"), + }; + + let file_path = parent_dir.join(to_save_file_name(page)).with_extension(ext); + + fs::write(file_path, content)?; + Ok(()) +} + +fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), WikiError> { + match fs::create_dir(dir) { + Ok(_) => {} + Err(err) => { + if err.kind() != io::ErrorKind::AlreadyExists { + return Err(err.into()); + } else if err_when_exists { + return Err(WikiError::Path(format!( + "ERROR: directory '{}' already exists", + dir.to_string_lossy() + ))); + } + } + } + + Ok(()) +} diff --git a/src/utils.rs b/src/utils.rs index 570381a..9db1671 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -10,7 +10,7 @@ use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub const UNCATEGORIZED_KEY: &str = "UNCATEGORIZED"; +pub const UNCATEGORIZED_KEY: &str = "Uncategorized"; /// Construct a path to cache a page. Different page formats are cached separately. /// All none word characters are escaped with an '_' @@ -112,7 +112,7 @@ pub fn read_pages_file_as_category_tree( Ok(category_to_page_map) } -fn to_save_file_name(page: &str) -> String { +pub fn to_save_file_name(page: &str) -> String { urlencoding::encode(page) .to_string() .replace('.', "\\.") diff --git a/src/wiki_api.rs b/src/wiki_api.rs index d636236..eb842d8 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -83,12 +83,17 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { let raw_url = format!( "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", - title = urlencoding::encode(page_title) + title = urlencoding::encode(page) ); - let url = Url::parse(&raw_url)?; + let url = Url::parse(&raw_url)?; let document = fetch_page_by_url(url).await?; Ok(document) } -- GitLab From 50be65f15c2a86b715a8d76755c7381d5dda74fb Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:26:51 +0100 Subject: [PATCH 10/24] multi-thread archwiki page fetching --- Cargo.lock | 1 + Cargo.toml | 1 + src/cli.rs | 4 + src/main.rs | 259 +++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 192 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cbcb5a7..76d3788 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,6 +114,7 @@ dependencies = [ "termination", "thiserror", "tokio", + "unicode-width", "url", "urlencoding", ] diff --git a/Cargo.toml b/Cargo.toml index ac4c33e..c85ef8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ serde_yaml = "0.9.27" termination = "0.1.2" thiserror = "1.0.50" tokio = { version = "1.33.0", features = ["full"] } +unicode-width = "0.1.11" url = "2.4.1" urlencoding = "2.1.3" diff --git a/src/cli.rs b/src/cli.rs index 5a71859..1498f1b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -106,6 +106,10 @@ pub enum Commands { long_about = "Download a copy of the ArchWiki. Will take a long time :). 
The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." )] LocalWiki { + #[arg(short, long)] + /// Amount of threads to use for fetching pages from the ArchWiki. If not provided the + /// number of physical cores is used. + thread_count: Option, #[arg(short, long)] /// Use a different file to read pages from. page_file: Option, diff --git a/src/main.rs b/src/main.rs index f5451aa..61de8bd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,16 @@ -use std::{fs, io, path::Path}; +use std::{ + collections::HashMap, + fs, io, + path::{Path, PathBuf}, + sync::Arc, +}; use clap::{builder::PossibleValue, Parser, ValueEnum}; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; +use futures::future; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use itertools::Itertools; use wiki_api::fetch_page_without_recommendations; @@ -196,87 +202,27 @@ async fn main() -> Result<(), WikiError> { location, format, page_file, + thread_count, override_wiki_directory, hide_progress, } => { + let thread_count = thread_count.unwrap_or(num_cpus::get_physical()).max(1); + let (path, is_default) = page_file .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; - create_dir_if_not_exists(&location, !override_wiki_directory)?; - - if !hide_progress { - if let Some(format) = format - .to_possible_value() - .as_ref() - .map(PossibleValue::get_name) - { - println!("downloading pages as {format}\n",) - } - } - - let all_bars = MultiProgress::new(); - - let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); - let category_bar = all_bars.add( - ProgressBar::new(category_count.try_into().unwrap_or(0)) - .with_prefix("fetching categories") - .with_style( - ProgressStyle::with_template("[{prefix:^22}]\t {pos:>4}/{len:4}") - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - category_bar.finish_and_clear(); - } - - for (cat, pages) in wiki_tree { - if pages.is_empty() { - continue; - } - - let cat_dir = location.join(to_save_file_name(&cat)); - create_dir_if_not_exists(&cat_dir, !override_wiki_directory)?; - - let bar = all_bars.add( - ProgressBar::new(pages.len().try_into().unwrap_or(0)) - .with_prefix("fetching sub-pages") - .with_style( - ProgressStyle::with_template( - "[{prefix:^22}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", - ) - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - bar.finish_and_clear(); - } - - category_bar.inc(1); - for page in pages { - bar.inc(1); - - match write_page_to_local_wiki(&page, &cat_dir, &format).await { - Ok(()) => {} - Err(err) => { - eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") - } - } - } - } - - if !hide_progress { - println!( - "saved local copy of the ArchWiki to '{}'", - location.to_string_lossy() - ) - } + download_wiki( + wiki_tree, + format, + location, + thread_count, + override_wiki_directory, + hide_progress, + ) + .await?; } Commands::Info { show_cache_dir, @@ -328,6 +274,156 @@ async fn main() -> Result<(), WikiError> { Ok(()) } +async fn download_wiki( + wiki_tree: HashMap>, + format: PageFormat, + location: PathBuf, + thread_count: usize, + override_wiki_directory: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + create_dir_if_not_exists(&location, !override_wiki_directory)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + 
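+            // PossibleValue::get_name is the user-facing CLI name of the format
+            // (e.g. "markdown", assuming PageFormat keeps clap's default value-enum
+            // naming), which reads nicer in this message than the raw enum variant.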
.as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let multibar = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = multibar.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("---FETCHING CATEGORIES---") + .with_style( + ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + let wiki_tree_without_empty_cats = wiki_tree + .into_iter() + .filter(|(_, p)| !p.is_empty()) + .collect_vec(); + + let chunk_count = wiki_tree_without_empty_cats.len() / thread_count; + + let format = Arc::new(format); + let location = Arc::new(location); + let multibar = Arc::new(multibar); + let catbar = Arc::new(category_bar); + + let wiki_tree_chunks = wiki_tree_without_empty_cats + .chunks(chunk_count) + .map(ToOwned::to_owned) + .map(Arc::new) + .collect_vec(); + + let tasks = wiki_tree_chunks + .into_iter() + .map(|chunk| { + let chunk = Arc::clone(&chunk); + + let format_ref = Arc::clone(&format); + let location_ref = Arc::clone(&location); + let multibar_ref = Arc::clone(&multibar); + let catbar_ref = Arc::clone(&catbar); + + tokio::spawn(async move { + download_wiki_chunk( + &chunk, + &format_ref, + &location_ref, + hide_progress, + &multibar_ref, + &catbar_ref, + ) + .await + .unwrap(); + }) + }) + .collect_vec(); + + future::join_all(tasks).await; + + if !hide_progress { + println!( + "saved local copy of the ArchWiki to '{}'", + location.to_string_lossy() + ) + } + + Ok(()) +} + +async fn download_wiki_chunk( + chunk: &[(String, Vec)], + format: &PageFormat, + location: &Path, + hide_progress: bool, + multibar: &MultiProgress, + catbar: &ProgressBar, +) -> Result<(), WikiError> { + for (cat, pages) in chunk { + let cat_dir = location.join(to_save_file_name(cat)); + create_dir_if_not_exists(&cat_dir, false)?; + + let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); + + let leak_str: &'static str = Box::leak( + format!( + " fetching pages in \"{}\"", + if width <= 18 { + truncate_unicode_str(18, cat) + } else { + truncate_unicode_str(15, cat) + "..." 
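+                    // category names wider than 18 columns are cut to 15 columns plus "..."
+                    // so the " fetching pages in \"<name>\"" prefix still fits the
+                    // 40-column {prefix:<40} slot of the progress bar template below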
+ } + ) + .into_boxed_str(), + ); + + let bar = multibar.add( + ProgressBar::new(pages.len().try_into().unwrap_or(0)) + .with_prefix(leak_str) + .with_style( + ProgressStyle::with_template( + "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", + ) + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + bar.finish_and_clear(); + } + + catbar.inc(1); + for page in pages { + bar.inc(1); + + match write_page_to_local_wiki(page, &cat_dir, format).await { + Ok(()) => {} + Err(err) => { + eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") + } + } + } + } + + Ok(()) +} + async fn write_page_to_local_wiki( page: &str, parent_dir: &Path, @@ -364,3 +460,20 @@ fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), Wik Ok(()) } + +fn truncate_unicode_str(n: usize, text: &str) -> String { + let mut count = 0; + let mut res = vec![]; + let mut chars = text.chars(); + + while count < n { + if let Some(char) = chars.next() { + count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); + res.push(char); + } else { + break; + } + } + + res.into_iter().collect::() +} -- GitLab From 17747aa5df1bf359547a538769e19ddaaab4b4a3 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:46:42 +0100 Subject: [PATCH 11/24] improve error reporting in local-wiki sub-command --- src/main.rs | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index 61de8bd..65f0465 100644 --- a/src/main.rs +++ b/src/main.rs @@ -349,12 +349,31 @@ async fn download_wiki( &catbar_ref, ) .await - .unwrap(); }) }) .collect_vec(); - future::join_all(tasks).await; + let results = future::join_all(tasks).await; + + for result in results { + match result { + Ok(Ok(failed_fetchs)) => { + if !failed_fetchs.is_empty() { + for (page, err) in failed_fetchs { + eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); + } + } + } + Ok(Err(thread_err)) => { + eprintln!( + "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" + ); + } + Err(_) => { + eprintln!("ERROR: failed to join threads, some pages might be missing"); + } + } + } if !hide_progress { println!( @@ -366,6 +385,8 @@ async fn download_wiki( Ok(()) } +type FailedPageFetches = Vec<(String, WikiError)>; + async fn download_wiki_chunk( chunk: &[(String, Vec)], format: &PageFormat, @@ -373,7 +394,9 @@ async fn download_wiki_chunk( hide_progress: bool, multibar: &MultiProgress, catbar: &ProgressBar, -) -> Result<(), WikiError> { +) -> Result { + let mut failed_fetches = vec![]; + for (cat, pages) in chunk { let cat_dir = location.join(to_save_file_name(cat)); create_dir_if_not_exists(&cat_dir, false)?; @@ -414,14 +437,12 @@ async fn download_wiki_chunk( match write_page_to_local_wiki(page, &cat_dir, format).await { Ok(()) => {} - Err(err) => { - eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") - } + Err(err) => failed_fetches.push((page.to_owned(), err)), } } } - Ok(()) + Ok(failed_fetches) } async fn write_page_to_local_wiki( -- GitLab From 3dca9544b15900bba3fdc663bbb0876c125cc029 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:54:26 +0100 Subject: [PATCH 12/24] split up un-categorized data into chunks to improve local-wiki performance --- src/utils.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index 9db1671..34a3f1e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -106,7 +106,16 @@ pub fn read_pages_file_as_category_tree( 
} if !uncategorized_pages.is_empty() { - category_to_page_map.insert(UNCATEGORIZED_KEY.to_owned(), uncategorized_pages); + for (i, uncategoriesed_chunk) in uncategorized_pages + .into_iter() + .sorted() + .chunks(500) + .into_iter() + .enumerate() + { + let key = format!("{UNCATEGORIZED_KEY} #{n}", n = i + 1); + category_to_page_map.insert(key, uncategoriesed_chunk.collect_vec()); + } } Ok(category_to_page_map) -- GitLab From 9625ccf0f57d79244fbc9c7449542f49da9ea9ac Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 13:08:11 +0100 Subject: [PATCH 13/24] fix variable name --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 65f0465..9a85d47 100644 --- a/src/main.rs +++ b/src/main.rs @@ -316,7 +316,7 @@ async fn download_wiki( .filter(|(_, p)| !p.is_empty()) .collect_vec(); - let chunk_count = wiki_tree_without_empty_cats.len() / thread_count; + let chunk_size = wiki_tree_without_empty_cats.len() / thread_count; let format = Arc::new(format); let location = Arc::new(location); @@ -324,7 +324,7 @@ async fn download_wiki( let catbar = Arc::new(category_bar); let wiki_tree_chunks = wiki_tree_without_empty_cats - .chunks(chunk_count) + .chunks(chunk_size) .map(ToOwned::to_owned) .map(Arc::new) .collect_vec(); -- GitLab From a0dc74ca88337e3a8ffd26f7f6252738243e50cd Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 20:58:32 +0100 Subject: [PATCH 14/24] evenly chunk wiki categories based on page count --- src/main.rs | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9a85d47..3848d1a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -316,24 +316,17 @@ async fn download_wiki( .filter(|(_, p)| !p.is_empty()) .collect_vec(); - let chunk_size = wiki_tree_without_empty_cats.len() / thread_count; - let format = Arc::new(format); let location = Arc::new(location); let multibar = Arc::new(multibar); let catbar = Arc::new(category_bar); - let wiki_tree_chunks = wiki_tree_without_empty_cats - .chunks(chunk_size) - .map(ToOwned::to_owned) - .map(Arc::new) - .collect_vec(); + let wiki_tree_chunks = + chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); let tasks = wiki_tree_chunks .into_iter() .map(|chunk| { - let chunk = Arc::clone(&chunk); - let format_ref = Arc::clone(&format); let location_ref = Arc::clone(&location); let multibar_ref = Arc::clone(&multibar); @@ -498,3 +491,23 @@ fn truncate_unicode_str(n: usize, text: &str) -> String { res.into_iter().collect::() } + +fn chunk_wiki_with_even_page_distribution( + wiki_tree: Vec<(String, Vec)>, + chunk_count: usize, +) -> Vec)>> { + let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect(); + + for entry in wiki_tree { + if let Some(chunk) = chunks.iter_mut().min_by(|a, b| { + let count_a = a.iter().map(|(_, pages)| pages.len()).sum::(); + let count_b = b.iter().map(|(_, pages)| pages.len()).sum::(); + + count_a.cmp(&count_b) + }) { + chunk.push(entry); + } + } + + chunks +} -- GitLab From f4105d35d2564dc42e9d28c9bb0beecc6763facc Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 21:36:24 +0100 Subject: [PATCH 15/24] use 'sanitize_filename' to create save file names --- Cargo.lock | 11 +++++++++++ Cargo.toml | 1 + src/utils.rs | 16 +++++----------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76d3788..ec56aac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,7 @@ dependencies = [ 
"pretty_assertions", "regex", "reqwest", + "sanitize-filename", "scraper", "serde", "serde_json", @@ -1547,6 +1548,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "sanitize-filename" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ed72fbaf78e6f2d41744923916966c4fbe3d7c74e3037a8ee482f1115572603" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "schannel" version = "0.1.23" diff --git a/Cargo.toml b/Cargo.toml index c85ef8b..a56e976 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ itertools = "0.11.0" num_cpus = "1.16.0" regex = "1.10.2" reqwest = "0.11.22" +sanitize-filename = "0.5.0" scraper = "0.18.1" serde = { version = "1.0.190", features = ["derive"] } serde_json = "1.0.108" diff --git a/src/utils.rs b/src/utils.rs index 34a3f1e..8c095a5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -122,10 +122,7 @@ pub fn read_pages_file_as_category_tree( } pub fn to_save_file_name(page: &str) -> String { - urlencoding::encode(page) - .to_string() - .replace('.', "\\.") - .replace('~', "\\~") + sanitize_filename::sanitize(page) } #[cfg(test)] @@ -137,13 +134,10 @@ mod tests { fn test_to_save_file_name() { let cases = [ ("Neovim", "Neovim"), - ("3D Mouse", "3D%20Mouse"), - ("/etc/fstab", "%2Fetc%2Ffstab"), - (".NET", "\\.NET"), - ( - "ASUS MeMO Pad 7 (ME176C(X))", - "ASUS%20MeMO%20Pad%207%20%28ME176C%28X%29%29", - ), + ("3D Mouse", "3D Mouse"), + ("/etc/fstab", "etcfstab"), + (".NET", ".NET"), + ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS MeMO Pad 7 (ME176C(X))"), ]; for (input, output) in cases { -- GitLab From 83ffa1445815c68fcb00761b58e9f2dcd6bbc952 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Thu, 18 Jan 2024 18:31:09 +0100 Subject: [PATCH 16/24] don't override existing wiki files by default --- src/cli.rs | 2 +- src/main.rs | 53 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 1498f1b..024f92a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -118,7 +118,7 @@ pub enum Commands { hide_progress: bool, #[arg(short, long)] /// Override directory at 'location' if it already exists. - override_wiki_directory: bool, + override_existing_files: bool, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] /// The format that the page should be displayed in. 
format: PageFormat, diff --git a/src/main.rs b/src/main.rs index 3848d1a..2bcaaa0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -203,7 +203,7 @@ async fn main() -> Result<(), WikiError> { format, page_file, thread_count, - override_wiki_directory, + override_existing_files, hide_progress, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()).max(1); @@ -219,7 +219,7 @@ async fn main() -> Result<(), WikiError> { format, location, thread_count, - override_wiki_directory, + override_existing_files, hide_progress, ) .await?; @@ -279,10 +279,10 @@ async fn download_wiki( format: PageFormat, location: PathBuf, thread_count: usize, - override_wiki_directory: bool, + override_exisiting_files: bool, hide_progress: bool, ) -> Result<(), WikiError> { - create_dir_if_not_exists(&location, !override_wiki_directory)?; + create_dir_if_not_exists(&location)?; if !hide_progress { if let Some(format) = format @@ -338,6 +338,7 @@ async fn download_wiki( &format_ref, &location_ref, hide_progress, + override_exisiting_files, &multibar_ref, &catbar_ref, ) @@ -385,6 +386,7 @@ async fn download_wiki_chunk( format: &PageFormat, location: &Path, hide_progress: bool, + override_exisiting_files: bool, multibar: &MultiProgress, catbar: &ProgressBar, ) -> Result { @@ -392,7 +394,7 @@ async fn download_wiki_chunk( for (cat, pages) in chunk { let cat_dir = location.join(to_save_file_name(cat)); - create_dir_if_not_exists(&cat_dir, false)?; + create_dir_if_not_exists(&cat_dir)?; let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); @@ -428,9 +430,12 @@ async fn download_wiki_chunk( for page in pages { bar.inc(1); - match write_page_to_local_wiki(page, &cat_dir, format).await { - Ok(()) => {} - Err(err) => failed_fetches.push((page.to_owned(), err)), + let path = page_path(page, format, &cat_dir); + if override_exisiting_files || !path.exists() { + match write_page_to_local_wiki(page, &path, format).await { + Ok(()) => {} + Err(err) => failed_fetches.push((page.to_owned(), err)), + } } } } @@ -440,34 +445,36 @@ async fn download_wiki_chunk( async fn write_page_to_local_wiki( page: &str, - parent_dir: &Path, + page_path: &Path, format: &PageFormat, ) -> Result<(), WikiError> { let document = fetch_page_without_recommendations(page).await?; - - let (content, ext) = match format { - PageFormat::PlainText => (convert_page_to_plain_text(&document, false), ""), - PageFormat::Markdown => (convert_page_to_markdown(&document, page), "md"), - PageFormat::Html => (convert_page_to_html(&document, page), "html"), + let content = match format { + PageFormat::PlainText => convert_page_to_plain_text(&document, false), + PageFormat::Markdown => convert_page_to_markdown(&document, page), + PageFormat::Html => convert_page_to_html(&document, page), }; - let file_path = parent_dir.join(to_save_file_name(page)).with_extension(ext); - - fs::write(file_path, content)?; + fs::write(page_path, content)?; Ok(()) } -fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), WikiError> { +fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { + let ext = match format { + PageFormat::PlainText => "", + PageFormat::Markdown => "md", + PageFormat::Html => "html", + }; + + parent_dir.join(to_save_file_name(page)).with_extension(ext) +} + +fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { match fs::create_dir(dir) { Ok(_) => {} Err(err) => { if err.kind() != io::ErrorKind::AlreadyExists { return Err(err.into()); - } else if err_when_exists { - return 
Err(WikiError::Path(format!( - "ERROR: directory '{}' already exists", - dir.to_string_lossy() - ))); } } } -- GitLab From a7fc2bc6846e049e15c02ec9788f0bc3aee67008 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Thu, 18 Jan 2024 18:48:29 +0100 Subject: [PATCH 17/24] move functions out of main and add doc comments --- src/main.rs | 289 ++----------------------------------------- src/utils.rs | 40 ++++++ src/wiki_api.rs | 12 +- src/wiki_download.rs | 258 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 317 insertions(+), 282 deletions(-) create mode 100644 src/wiki_download.rs diff --git a/src/main.rs b/src/main.rs index 2bcaaa0..cd55d7b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,19 +1,12 @@ -use std::{ - collections::HashMap, - fs, io, - path::{Path, PathBuf}, - sync::Arc, -}; +use std::fs; -use clap::{builder::PossibleValue, Parser, ValueEnum}; +use clap::Parser; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; -use futures::future; -use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; + use itertools::Itertools; -use wiki_api::fetch_page_without_recommendations; use crate::{ categories::list_pages, @@ -22,9 +15,10 @@ use crate::{ search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{ create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, - to_save_file_name, UNCATEGORIZED_KEY, + UNCATEGORIZED_KEY, }, - wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, + wiki_download::{download_wiki, sync_wiki_info}, }; mod categories; @@ -35,6 +29,7 @@ mod languages; mod search; mod utils; mod wiki_api; +mod wiki_download; const PAGE_FILE_NAME: &str = "pages.yml"; @@ -174,29 +169,8 @@ async fn main() -> Result<(), WikiError> { print, out_file, } => { - let spinner = ProgressBar::new_spinner(); - if hide_progress { - spinner.finish_and_clear(); - } - - let _spin_task = std::thread::spawn(move || loop { - spinner.tick(); - std::thread::sleep(std::time::Duration::from_millis(100)); - }); - - let wiki_tree = fetch_all_pages().await?; - let out = serde_yaml::to_string(&wiki_tree)?; - - if !print { - let path = out_file.unwrap_or(default_page_file_path); - fs::write(&path, out)?; - - if !hide_progress { - println!("data saved to {}", path.to_string_lossy()); - } - } else { - println!("{out}"); - } + let path = out_file.unwrap_or(default_page_file_path); + sync_wiki_info(&path, print, hide_progress).await?; } Commands::LocalWiki { location, @@ -273,248 +247,3 @@ async fn main() -> Result<(), WikiError> { Ok(()) } - -async fn download_wiki( - wiki_tree: HashMap>, - format: PageFormat, - location: PathBuf, - thread_count: usize, - override_exisiting_files: bool, - hide_progress: bool, -) -> Result<(), WikiError> { - create_dir_if_not_exists(&location)?; - - if !hide_progress { - if let Some(format) = format - .to_possible_value() - .as_ref() - .map(PossibleValue::get_name) - { - println!("downloading pages as {format}\n",) - } - } - - let multibar = MultiProgress::new(); - - let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); - let category_bar = multibar.add( - ProgressBar::new(category_count.try_into().unwrap_or(0)) - .with_prefix("---FETCHING CATEGORIES---") - .with_style( - ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - 
category_bar.finish_and_clear(); - } - - let wiki_tree_without_empty_cats = wiki_tree - .into_iter() - .filter(|(_, p)| !p.is_empty()) - .collect_vec(); - - let format = Arc::new(format); - let location = Arc::new(location); - let multibar = Arc::new(multibar); - let catbar = Arc::new(category_bar); - - let wiki_tree_chunks = - chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); - - let tasks = wiki_tree_chunks - .into_iter() - .map(|chunk| { - let format_ref = Arc::clone(&format); - let location_ref = Arc::clone(&location); - let multibar_ref = Arc::clone(&multibar); - let catbar_ref = Arc::clone(&catbar); - - tokio::spawn(async move { - download_wiki_chunk( - &chunk, - &format_ref, - &location_ref, - hide_progress, - override_exisiting_files, - &multibar_ref, - &catbar_ref, - ) - .await - }) - }) - .collect_vec(); - - let results = future::join_all(tasks).await; - - for result in results { - match result { - Ok(Ok(failed_fetchs)) => { - if !failed_fetchs.is_empty() { - for (page, err) in failed_fetchs { - eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); - } - } - } - Ok(Err(thread_err)) => { - eprintln!( - "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" - ); - } - Err(_) => { - eprintln!("ERROR: failed to join threads, some pages might be missing"); - } - } - } - - if !hide_progress { - println!( - "saved local copy of the ArchWiki to '{}'", - location.to_string_lossy() - ) - } - - Ok(()) -} - -type FailedPageFetches = Vec<(String, WikiError)>; - -async fn download_wiki_chunk( - chunk: &[(String, Vec)], - format: &PageFormat, - location: &Path, - hide_progress: bool, - override_exisiting_files: bool, - multibar: &MultiProgress, - catbar: &ProgressBar, -) -> Result { - let mut failed_fetches = vec![]; - - for (cat, pages) in chunk { - let cat_dir = location.join(to_save_file_name(cat)); - create_dir_if_not_exists(&cat_dir)?; - - let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); - - let leak_str: &'static str = Box::leak( - format!( - " fetching pages in \"{}\"", - if width <= 18 { - truncate_unicode_str(18, cat) - } else { - truncate_unicode_str(15, cat) + "..." 
- } - ) - .into_boxed_str(), - ); - - let bar = multibar.add( - ProgressBar::new(pages.len().try_into().unwrap_or(0)) - .with_prefix(leak_str) - .with_style( - ProgressStyle::with_template( - "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", - ) - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - bar.finish_and_clear(); - } - - catbar.inc(1); - for page in pages { - bar.inc(1); - - let path = page_path(page, format, &cat_dir); - if override_exisiting_files || !path.exists() { - match write_page_to_local_wiki(page, &path, format).await { - Ok(()) => {} - Err(err) => failed_fetches.push((page.to_owned(), err)), - } - } - } - } - - Ok(failed_fetches) -} - -async fn write_page_to_local_wiki( - page: &str, - page_path: &Path, - format: &PageFormat, -) -> Result<(), WikiError> { - let document = fetch_page_without_recommendations(page).await?; - let content = match format { - PageFormat::PlainText => convert_page_to_plain_text(&document, false), - PageFormat::Markdown => convert_page_to_markdown(&document, page), - PageFormat::Html => convert_page_to_html(&document, page), - }; - - fs::write(page_path, content)?; - Ok(()) -} - -fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { - let ext = match format { - PageFormat::PlainText => "", - PageFormat::Markdown => "md", - PageFormat::Html => "html", - }; - - parent_dir.join(to_save_file_name(page)).with_extension(ext) -} - -fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { - match fs::create_dir(dir) { - Ok(_) => {} - Err(err) => { - if err.kind() != io::ErrorKind::AlreadyExists { - return Err(err.into()); - } - } - } - - Ok(()) -} - -fn truncate_unicode_str(n: usize, text: &str) -> String { - let mut count = 0; - let mut res = vec![]; - let mut chars = text.chars(); - - while count < n { - if let Some(char) = chars.next() { - count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); - res.push(char); - } else { - break; - } - } - - res.into_iter().collect::() -} - -fn chunk_wiki_with_even_page_distribution( - wiki_tree: Vec<(String, Vec)>, - chunk_count: usize, -) -> Vec)>> { - let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect(); - - for entry in wiki_tree { - if let Some(chunk) = chunks.iter_mut().min_by(|a, b| { - let count_a = a.iter().map(|(_, pages)| pages.len()).sum::(); - let count_b = b.iter().map(|(_, pages)| pages.len()).sum::(); - - count_a.cmp(&count_b) - }) { - chunk.push(entry); - } - } - - chunks -} diff --git a/src/utils.rs b/src/utils.rs index 8c095a5..89288bc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -125,6 +125,46 @@ pub fn to_save_file_name(page: &str) -> String { sanitize_filename::sanitize(page) } +pub fn truncate_unicode_str(n: usize, text: &str) -> String { + let mut count = 0; + let mut res = vec![]; + let mut chars = text.chars(); + + while count < n { + if let Some(char) = chars.next() { + count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); + res.push(char); + } else { + break; + } + } + + res.into_iter().collect::() +} + +pub fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { + let ext = match format { + PageFormat::PlainText => "", + PageFormat::Markdown => "md", + PageFormat::Html => "html", + }; + + parent_dir.join(to_save_file_name(page)).with_extension(ext) +} + +pub fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { + match fs::create_dir(dir) { + Ok(_) => {} + Err(err) => { + if err.kind() != io::ErrorKind::AlreadyExists { + return Err(err.into()); + } + 
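+                // (an AlreadyExists error never reaches this branch and falls through
+                // to the Ok(()) below, so existing output directories are simply reused)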
} + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index eb842d8..e681190 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -86,7 +86,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { let raw_url = format!( "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", @@ -116,7 +116,15 @@ async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -/// TODO +/// Gets the names of all pages on the ArchWiki and the categories that they belong to. +/// +/// ### Example +/// +/// ```sh +/// Wine # page name +/// - Emulation # category +/// - Gaming # category +/// ``` pub async fn fetch_all_pages() -> Result>, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllPagesQuery { diff --git a/src/wiki_download.rs b/src/wiki_download.rs new file mode 100644 index 0000000..4c252f4 --- /dev/null +++ b/src/wiki_download.rs @@ -0,0 +1,258 @@ +use std::{ + collections::HashMap, + fs, + path::{Path, PathBuf}, + sync::Arc, +}; + +use super::formats::plain_text::convert_page_to_plain_text; + +use clap::{builder::PossibleValue, ValueEnum}; +use futures::future; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use itertools::Itertools; + +use crate::{ + error::WikiError, + formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, + utils::truncate_unicode_str, + utils::{create_dir_if_not_exists, page_path, to_save_file_name}, + wiki_api::fetch_all_pages, + wiki_api::fetch_page_without_recommendations, +}; + +pub async fn sync_wiki_info( + page_path: &Path, + print: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + let spinner = ProgressBar::new_spinner(); + if hide_progress { + spinner.finish_and_clear(); + } + + let _spin_task = std::thread::spawn(move || loop { + spinner.tick(); + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + let wiki_tree = fetch_all_pages().await?; + let out = serde_yaml::to_string(&wiki_tree)?; + + if !print { + fs::write(page_path, out)?; + + if !hide_progress { + println!("data saved to {}", page_path.to_string_lossy()); + } + } else { + println!("{out}"); + } + + Ok(()) +} + +pub async fn download_wiki( + wiki_tree: HashMap>, + format: PageFormat, + location: PathBuf, + thread_count: usize, + override_exisiting_files: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + create_dir_if_not_exists(&location)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + .as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let multibar = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = multibar.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("---FETCHING CATEGORIES---") + .with_style( + ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + let wiki_tree_without_empty_cats = wiki_tree + .into_iter() + .filter(|(_, p)| !p.is_empty()) + .collect_vec(); + + let format = Arc::new(format); + let location = Arc::new(location); + let multibar = Arc::new(multibar); + let catbar = Arc::new(category_bar); + + let wiki_tree_chunks = + chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); + + let tasks = wiki_tree_chunks + .into_iter() + .map(|chunk| { + let 
format_ref = Arc::clone(&format);
+            let location_ref = Arc::clone(&location);
+            let multibar_ref = Arc::clone(&multibar);
+            let catbar_ref = Arc::clone(&catbar);
+
+            tokio::spawn(async move {
+                download_wiki_chunk(
+                    &chunk,
+                    &format_ref,
+                    &location_ref,
+                    hide_progress,
+                    override_exisiting_files,
+                    &multibar_ref,
+                    &catbar_ref,
+                )
+                .await
+            })
+        })
+        .collect_vec();
+
+    let results = future::join_all(tasks).await;
+
+    for result in results {
+        match result {
+            Ok(Ok(failed_fetchs)) => {
+                if !failed_fetchs.is_empty() {
+                    for (page, err) in failed_fetchs {
+                        eprintln!("WARNING: failed to fetch page '{page}'\nREASON: {err}");
+                    }
+                }
+            }
+            Ok(Err(thread_err)) => {
+                eprintln!(
+                    "ERROR: a thread panicked, some pages might be missing\nREASON: {thread_err}"
+                );
+            }
+            Err(_) => {
+                eprintln!("ERROR: failed to join threads, some pages might be missing");
+            }
+        }
+    }
+
+    if !hide_progress {
+        println!(
+            "saved local copy of the ArchWiki to '{}'",
+            location.to_string_lossy()
+        )
+    }
+
+    Ok(())
+}
+
+type FailedPageFetches = Vec<(String, WikiError)>;
+
+async fn download_wiki_chunk(
+    chunk: &[(String, Vec)],
+    format: &PageFormat,
+    location: &Path,
+    hide_progress: bool,
+    override_exisiting_files: bool,
+    multibar: &MultiProgress,
+    catbar: &ProgressBar,
+) -> Result {
+    let mut failed_fetches = vec![];
+
+    for (cat, pages) in chunk {
+        let cat_dir = location.join(to_save_file_name(cat));
+        create_dir_if_not_exists(&cat_dir)?;
+
+        let width = unicode_width::UnicodeWidthStr::width(cat.as_str());
+
+        let leak_str: &'static str = Box::leak(
+            format!(
+                " fetching pages in \"{}\"",
+                if width <= 18 {
+                    truncate_unicode_str(18, cat)
+                } else {
+                    truncate_unicode_str(15, cat) + "..."
+                }
+            )
+            .into_boxed_str(),
+        );
+
+        let bar = multibar.add(
+            ProgressBar::new(pages.len().try_into().unwrap_or(0))
+                .with_prefix(leak_str)
+                .with_style(
+                    ProgressStyle::with_template(
+                        "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}",
+                    )
+                    .unwrap()
+                    .progress_chars("##-"),
+                ),
+        );
+
+        if hide_progress {
+            bar.finish_and_clear();
+        }
+
+        catbar.inc(1);
+        for page in pages {
+            bar.inc(1);
+
+            let path = page_path(page, format, &cat_dir);
+            if override_exisiting_files || !path.exists() {
+                match write_page_to_local_wiki(page, &path, format).await {
+                    Ok(()) => {}
+                    Err(err) => failed_fetches.push((page.to_owned(), err)),
+                }
+            }
+        }
+    }
+
+    Ok(failed_fetches)
+}
+
+async fn write_page_to_local_wiki(
+    page: &str,
+    page_path: &Path,
+    format: &PageFormat,
+) -> Result<(), WikiError> {
+    let document = fetch_page_without_recommendations(page).await?;
+    let content = match format {
+        PageFormat::PlainText => convert_page_to_plain_text(&document, false),
+        PageFormat::Markdown => convert_page_to_markdown(&document, page),
+        PageFormat::Html => convert_page_to_html(&document, page),
+    };
+
+    fs::write(page_path, content)?;
+    Ok(())
+}
+
+fn chunk_wiki_with_even_page_distribution(
+    wiki_tree: Vec<(String, Vec)>,
+    chunk_count: usize,
+) -> Vec)>> {
+    let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect();
+
+    for entry in wiki_tree {
+        if let Some(chunk) = chunks.iter_mut().min_by(|a, b| {
+            let count_a = a.iter().map(|(_, pages)| pages.len()).sum::();
+            let count_b = b.iter().map(|(_, pages)| pages.len()).sum::();
+
+            count_a.cmp(&count_b)
+        }) {
+            chunk.push(entry);
+        }
+    }
+
+    chunks
+}
-- 
GitLab


From 856fe5b01ec595641d66cdbe27a1a09b4d093c2d Mon Sep 17 00:00:00 2001
From: jackboxx
Date: Fri, 19 Jan 2024 14:08:04 +0100
Subject: [PATCH 18/24] improve help command messages

---
 src/cli.rs | 64 
+++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 024f92a..14af1ab 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -15,129 +15,125 @@ pub struct CliArgs { pub enum Commands { #[command( about = "Read a page from the ArchWiki", - long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended. A list of page names is in the pages.yml file which can be updated with the 'sync-wiki' command." + long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended" )] ReadPage { #[arg(short, long)] - /// Don't cache the read page locally. + /// Don't cache the read page locally no_cache_write: bool, #[arg(short, long)] - /// Don't read the page from cache even if an entry for it is cached. + /// Don't read the page from cache even if an entry for it is cached ignore_cache: bool, #[arg(short, long)] /// Don't invalidate the cache even if it is considered stale. A cache is considered stale - /// after it hasn't been updated in more then 14 days. + /// after it hasn't been updated in more then 14 days disable_cache_invalidation: bool, #[arg(short, long)] - /// Show URLs for plain-text output. + /// Show URLs for plain-text output show_urls: bool, #[arg(short, long)] /// Preferred page language lang: Option, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] - /// The format that the page should be displayed in. + /// The format that the page should be displayed in format: PageFormat, - /// The name of the page to read or an absolute URL to the page. + /// The name of the page to read or an absolute URL to the page page: String, }, #[command( about = "Search the ArchWiki for pages", - long_about = "Search the ArchWiki for pages by title. Uses the 'opensearch' API action to perform queries." + long_about = "Search the ArchWiki for pages" )] Search { search: String, #[arg(short, long, default_value_t = String::from("en"))] - /// Preferred language of the content to search for. + /// Preferred language of the content to search for lang: String, #[arg(short = 'L', long, default_value_t = 5)] - /// Maximum number of results. + /// Maximum number of results limit: u16, #[arg(short, long)] - /// Search for pages by text content instead of title. Uses the 'query' API action instead - /// of 'opensearch'. + /// Search for pages by text content instead of title text_search: bool, }, #[command( about = "List all pages from the ArchWiki that have been downloaded", - long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading" )] ListPages { #[arg(short, long)] - /// Flatten all pages and don't show their category names. + /// Flatten all pages and don't show their category names flatten: bool, #[arg(short, long)] - /// Only show pages in this category. + /// Only show pages in this category category: Option, #[arg(short, long)] - /// Use a different file to read pages from. + /// Use a different file to read pages from page_file: Option, }, #[command( about = "List all categories from the ArchWiki that have been downloaded", - long_about = "List categories from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List categories from the ArchWiki that have been downloaded. 
See 'sync-wiki' for information on downloading"
     )]
     ListCategories {
         #[arg(short, long)]
-        /// Use a different file to read pages from.
+        /// Use a different file to read pages from
         page_file: Option,
     },
     #[command(
         about = "List all languages that the ArchWiki supports",
-        long_about = "List all languages that the ArchWiki supports."
+        long_about = "List all languages that the ArchWiki supports"
     )]
     ListLanguages,
     #[command(
         about = "Download information about the pages and categories on the ArchWiki",
-        long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands"
+        long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' sub-commands"
     )]
     SyncWiki {
         #[arg(short = 'H', long)]
-        /// Hide progress indicators.
+        /// Hide progress indicators
         hide_progress: bool,
         #[arg(short, long)]
-        /// Print result to stdout instead of writing to a file. Output is formatted as YAML.
+        /// Print result to stdout instead of writing to a file. Output is formatted as YAML
         print: bool,
         #[arg(short, long)]
-        /// Use custom output file location.
+        /// Use custom output file location
        out_file: Option,
     },
     #[command(
         about = "Download a copy of the ArchWiki. Will take a long time :)",
-        long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory."
+        long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not maintained, sub-categories are put at the top level of the wiki directory"
     )]
     LocalWiki {
         #[arg(short, long)]
         /// Amount of threads to use for fetching pages from the ArchWiki. If not provided the
-        /// number of physical cores is used.
+        /// number of physical cores is used
         thread_count: Option,
         #[arg(short, long)]
-        /// Use a different file to read pages from.
+        /// Use a different file to read pages from
         page_file: Option,
         #[arg(short = 'H', long)]
-        /// Hide progress indicators.
+        /// Hide progress indicators
         hide_progress: bool,
         #[arg(short, long)]
-        /// Override directory at 'location' if it already exists.
+        /// Override already downloaded files
         override_existing_files: bool,
         #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)]
-        /// The format that the page should be displayed in.
+        /// The format that the page should be displayed in
         format: PageFormat,
-        /// Location to store the local copy of the wiki at.
+        /// Location to store the local copy of the wiki at
         location: PathBuf,
     },
     #[command(
         about = "Retrive information related to this tool",
-        long_about = "Retrive information related to this tool. All Info is shown by default."
+        long_about = "Retrieve information related to this tool"
     )]
     Info {
         #[arg(short = 'c', long)]
-        /// Location of the cache directory.
         show_cache_dir: bool,
         #[arg(short = 'd', long)]
-        /// Location of the data directory.
         show_data_dir: bool,
         #[arg(short, long)]
-        /// Only show values and not the properties they belong to or their descriptions.
only_values: bool, }, } -- GitLab From bba7db23eb1435250d7e9facd3d8e974ade2f1e1 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:42:17 +0100 Subject: [PATCH 19/24] update README --- README.md | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 0409007..b92dc43 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # archwiki-rs 📖 A CLI tool to read pages from the ArchWiki -## Table of contents + + - [Installation](#installation) * [crates.io](#cratesio) * [Source](#source) @@ -10,18 +11,22 @@ A CLI tool to read pages from the ArchWiki + [Basic request](#basic-request) + [Using a different format](#using-a-different-format) + [Caching](#caching) - + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--̥̥̥n-̥̥̥-) + + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--%CC%A5%CC%A5%CC%A5n-%CC%A5%CC%A5%CC%A5-) * [Searching the ArchWiki](#searching-the-archwiki) + [Search by title](#search-by-title) + [Search for text](#search-for-text) * [Downloading wiki info](#downloading-wiki-info) - + [Possible speed-ups](#possible-speed-ups) * [Listing ArchWiki information](#listing-archwiki-information) + [Listing pages](#listing-pages) + [Listing categories](#listing-categories) + [Listing languages](#listing-languages) + * [Downloading a local copy of the ArchWiki](#downloading-a-local-copy-of-the-archwiki) + + [Possible speed-ups](#possible-speed-ups) * [Other Information](#other-information) - [Plugins](#plugins) +- [Alternatives](#alternatives) + + ## Installation Currently, you can only install this tool from [ crates.io ](https://crates.io/crates/archwiki-rs) @@ -83,7 +88,7 @@ uses stderr to give the user suggestions on what they might have wanted to type. An example shell script to do something like this is available in the [repository](https://github.com/jackboxx/archwiki-rs) -under the name `example.sh`. +under the name `example.sh` which can be used like this `sh example.sh `. ### Searching the ArchWiki @@ -106,25 +111,13 @@ that the search term is in ### Downloading wiki info -Page names are stored locally to prevent having to scrape the entire table of contents of -the ArchWiki with every command. - -Use this command to fetch all page names. -Be warned, since this scrapes multiple thousand links, this can be quite slow (-, - )…zzzZZ +Page and category names are stored locally for faster look-ups. +Use this command to fetch all page and category names. ```sh archwiki-rs sync-wiki ``` -#### Possible speed-ups - -If you don't mind your CPU and network becoming a bit saturated you can increase the -amount of threads used to fetch data from the wiki - -```sh -archwiki-rs sync-wiki -t 8 -``` - ### Listing ArchWiki information #### Listing pages @@ -163,6 +156,27 @@ And the same for available languages archwiki-rs list-languages ``` +### Downloading a local copy of the ArchWiki + +Use this command to download a local copy of the ArchWiki. Be warned, that this makes over +10,000 requests for page content to the ArchWiki so it takes a while to finish (-, -)…zzzZZ + +```sh +archwiki-rs local-wiki ~/local-archwiki --format markdown +``` + +#### Possible speed-ups + +If you don't mind your CPU and network becoming a bit saturated you can increase the +amount of threads used to fetch data from the wiki. + +Keep in mind that you might get rate limited by the ArchWiki if make too many requests at once. 
+ +```sh +archwiki-rs local-wiki -t 8 +``` + + ### Other Information Other information such as the value/location of the `cache directory` can be obtained @@ -185,3 +199,8 @@ Here's a list of programs that have plugins for `archwiki-rs` to make your life - [Neovim](https://github.com/Jackboxx/archwiki-nvim) - [Obsidian](https://github.com/Jackboxx/archwiki-obsidian) + +## Alternatives + +If you are using Arch Linux a great alternative for this tool is the `wikiman` CLI tool +in combination with the `arch-wiki-docs` package. -- GitLab From c4b79c142b6dbf3ce803b5c38fa153316164812a Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:43:51 +0100 Subject: [PATCH 20/24] update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b92dc43..a404e62 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ archwiki-rs list-languages ### Downloading a local copy of the ArchWiki -Use this command to download a local copy of the ArchWiki. Be warned, that this makes over -10,000 requests for page content to the ArchWiki so it takes a while to finish (-, -)…zzzZZ +Use this command to download a local copy of the ArchWiki. Be warned, this command makes over +10,000 requests to the ArchWiki so it takes a while to finish (-, -)…zzzZZ ```sh archwiki-rs local-wiki ~/local-archwiki --format markdown -- GitLab From f989b36f128ee6c3d2bda9d7efef52ac2be1d71b Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:48:44 +0100 Subject: [PATCH 21/24] add --show-urls option to local-wiki sub-command --- src/cli.rs | 3 +++ src/main.rs | 2 ++ src/wiki_download.rs | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 14af1ab..02b5ebb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -116,6 +116,9 @@ pub enum Commands { /// Hide progress indicators hide_progress: bool, #[arg(short, long)] + /// Show URLs in plain-text files + show_urls: bool, + #[arg(short, long)] /// Override already downloaded files override_existing_files: bool, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] diff --git a/src/main.rs b/src/main.rs index cd55d7b..f2fbec5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -177,6 +177,7 @@ async fn main() -> Result<(), WikiError> { format, page_file, thread_count, + show_urls, override_existing_files, hide_progress, } => { @@ -195,6 +196,7 @@ async fn main() -> Result<(), WikiError> { thread_count, override_existing_files, hide_progress, + show_urls, ) .await?; } diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 4c252f4..30bb827 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -59,6 +59,7 @@ pub async fn download_wiki( thread_count: usize, override_exisiting_files: bool, hide_progress: bool, + show_urls: bool, ) -> Result<(), WikiError> { create_dir_if_not_exists(&location)?; @@ -116,6 +117,7 @@ pub async fn download_wiki( &format_ref, &location_ref, hide_progress, + show_urls, override_exisiting_files, &multibar_ref, &catbar_ref, @@ -159,11 +161,13 @@ pub async fn download_wiki( type FailedPageFetches = Vec<(String, WikiError)>; +#[allow(clippy::too_many_arguments)] async fn download_wiki_chunk( chunk: &[(String, Vec)], format: &PageFormat, location: &Path, hide_progress: bool, + show_urls: bool, override_exisiting_files: bool, multibar: &MultiProgress, catbar: &ProgressBar, @@ -210,7 +214,7 @@ async fn download_wiki_chunk( let path = page_path(page, format, &cat_dir); if override_exisiting_files || !path.exists() { - 
match write_page_to_local_wiki(page, &path, format).await { + match write_page_to_local_wiki(page, &path, format, show_urls).await { Ok(()) => {} Err(err) => failed_fetches.push((page.to_owned(), err)), } @@ -225,10 +229,11 @@ async fn write_page_to_local_wiki( page: &str, page_path: &Path, format: &PageFormat, + show_urls: bool, ) -> Result<(), WikiError> { let document = fetch_page_without_recommendations(page).await?; let content = match format { - PageFormat::PlainText => convert_page_to_plain_text(&document, false), + PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), PageFormat::Markdown => convert_page_to_markdown(&document, page), PageFormat::Html => convert_page_to_html(&document, page), }; -- GitLab From 12c6f510a922503c24d53d004702e4731ebb9852 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:06:11 +0100 Subject: [PATCH 22/24] improve error reporting for local-wiki command --- src/main.rs | 3 +++ src/wiki_download.rs | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/main.rs b/src/main.rs index f2fbec5..b138809 100644 --- a/src/main.rs +++ b/src/main.rs @@ -50,8 +50,10 @@ async fn main() -> Result<(), WikiError> { let cache_dir = base_dir.cache_dir().join("archwiki-rs"); let data_dir = base_dir.data_local_dir().join("archwiki-rs"); + let log_dir = data_dir.join("logs"); fs::create_dir_all(&cache_dir)?; fs::create_dir_all(&data_dir)?; + fs::create_dir_all(&log_dir)?; let default_page_file_path = data_dir.join(PAGE_FILE_NAME); @@ -193,6 +195,7 @@ async fn main() -> Result<(), WikiError> { wiki_tree, format, location, + &log_dir, thread_count, override_existing_files, hide_progress, diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 30bb827..9c54637 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -52,10 +52,12 @@ pub async fn sync_wiki_info( Ok(()) } +#[allow(clippy::too_many_arguments)] pub async fn download_wiki( wiki_tree: HashMap>, format: PageFormat, location: PathBuf, + log_dir: &Path, thread_count: usize, override_exisiting_files: bool, hide_progress: bool, @@ -63,13 +65,15 @@ pub async fn download_wiki( ) -> Result<(), WikiError> { create_dir_if_not_exists(&location)?; + let total_page_count = wiki_tree.values().map(|pages| pages.len()).sum::(); + if !hide_progress { if let Some(format) = format .to_possible_value() .as_ref() .map(PossibleValue::get_name) { - println!("downloading pages as {format}\n",) + println!("downloading {total_page_count} pages as {format}\n",) } } @@ -128,16 +132,11 @@ pub async fn download_wiki( .collect_vec(); let results = future::join_all(tasks).await; + let mut all_failed_fetches = vec![]; for result in results { match result { - Ok(Ok(failed_fetchs)) => { - if !failed_fetchs.is_empty() { - for (page, err) in failed_fetchs { - eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); - } - } - } + Ok(Ok(mut failed_fetchs)) => all_failed_fetches.append(&mut failed_fetchs), Ok(Err(thread_err)) => { eprintln!( "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" @@ -149,6 +148,28 @@ pub async fn download_wiki( } } + if !hide_progress { + let successfuly_fetched_pages = total_page_count - all_failed_fetches.len(); + + println!("downloaded {successfuly_fetched_pages} pages successfully"); + println!("failed to download {} pages", all_failed_fetches.len()); + } + + if !all_failed_fetches.is_empty() { + let failed_fetches_str = all_failed_fetches + .into_iter() + .map(|(page, err)| format!("failed 
to page '{page}'\nREASON: {err}")) + .collect_vec() + .join("\n\n"); + + let path = log_dir.join("local-wiki-download-err.log"); + let write = fs::write(&path, failed_fetches_str); + + if write.is_ok() && !hide_progress { + println!("error log written to '{}'", path.to_string_lossy()); + } + } + if !hide_progress { println!( "saved local copy of the ArchWiki to '{}'", -- GitLab From b132a53601d983b197ad79102a62e3518acc2370 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:15:07 +0100 Subject: [PATCH 23/24] allow filtering list-pages sub-command by multiple categories instead of just 1 --- src/categories.rs | 25 ++++++++++++++++++++++--- src/cli.rs | 6 +++--- src/error.rs | 2 -- src/main.rs | 17 ++++++----------- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 1285c2e..f14a0e5 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -27,13 +27,32 @@ use crate::error::WikiError; /// If it is not flattened the list is first ordered by category names and then by page names withing those /// categories. /// If it is flattened then it will by sorted by page names. -pub fn list_pages(categories: &HashMap>, flatten: bool) -> String { +pub fn list_pages( + wiki_tree: &HashMap>, + categories_filter: Option<&[String]>, + flatten: bool, +) -> String { if flatten { - return categories.values().flatten().unique().sorted().join("\n"); + return wiki_tree + .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some(pages)) + .unwrap_or(Some(pages)) + }) + .flatten() + .unique() + .sorted() + .join("\n"); } - categories + wiki_tree .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some((cat, pages))) + .unwrap_or(Some((cat, pages))) + }) .sorted() .map(|(cat, pages)| { let list = pages.iter().map(|p| format!("───┤{p}")).join("\n"); diff --git a/src/cli.rs b/src/cli.rs index 02b5ebb..6be1d1d 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -64,9 +64,9 @@ pub enum Commands { #[arg(short, long)] /// Flatten all pages and don't show their category names flatten: bool, - #[arg(short, long)] - /// Only show pages in this category - category: Option, + #[arg(short, long, value_delimiter = ',')] + /// Only show pages in these categories + categories: Vec, #[arg(short, long)] /// Use a different file to read pages from page_file: Option, diff --git a/src/error.rs b/src/error.rs index 4cc13cd..c729406 100644 --- a/src/error.rs +++ b/src/error.rs @@ -48,6 +48,4 @@ pub enum WikiError { InvalidApiResponse(InvalidApiResponseError), #[error("{}", .0)] NoPageFound(String), - #[error("The category '{}' could not be found", .0)] - NoCategoryFound(String), } diff --git a/src/main.rs b/src/main.rs index b138809..9ccc9cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -124,7 +124,7 @@ async fn main() -> Result<(), WikiError> { } Commands::ListPages { flatten, - category, + categories, page_file, } => { let (path, is_default) = page_file @@ -132,16 +132,11 @@ async fn main() -> Result<(), WikiError> { .unwrap_or((default_page_file_path, true)); let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; - let out = if let Some(category) = category { - wiki_tree - .get(&category) - .ok_or(WikiError::NoCategoryFound(category))? 
- .iter() - .sorted() - .join("\n") - } else { - list_pages(&wiki_tree, flatten) - }; + let out = list_pages( + &wiki_tree, + (!categories.is_empty()).then_some(&categories), + flatten, + ); println!("{out}"); } -- GitLab From 0125fa49d3986021e1d395c6971ac81acbc1d0a2 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:26:28 +0100 Subject: [PATCH 24/24] show failed fetch count only if failures occured --- src/wiki_download.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 9c54637..8137b4c 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -150,12 +150,14 @@ pub async fn download_wiki( if !hide_progress { let successfuly_fetched_pages = total_page_count - all_failed_fetches.len(); - println!("downloaded {successfuly_fetched_pages} pages successfully"); - println!("failed to download {} pages", all_failed_fetches.len()); } if !all_failed_fetches.is_empty() { + if !hide_progress { + println!("failed to download {} pages", all_failed_fetches.len()); + } + let failed_fetches_str = all_failed_fetches .into_iter() .map(|(page, err)| format!("failed to page '{page}'\nREASON: {err}")) -- GitLab
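
Putting the changes from patches 21–24 together, a minimal usage sketch of the new flags (flag names are taken from the diffs above; the paths and category names below are only placeholders):

```sh
# local plain-text copy, keeping URLs in the output (patch 21) and fetching with 8 threads;
# per patch 22, pages that fail to download are logged to <data dir>/logs/local-wiki-download-err.log
archwiki-rs local-wiki --show-urls -t 8 ~/local-archwiki

# list downloaded pages from a comma-separated set of categories (patch 23)
archwiki-rs list-pages --categories Xorg,Wayland
```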