From c398b681a6fdaaa328bb418290688097ea64e33f Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 19:48:47 +0100 Subject: [PATCH 01/24] use wiki media api to get page source --- Cargo.lock | 7 +++ Cargo.toml | 1 + src/categories.rs | 128 ++------------------------------------ src/formats/html.rs | 11 +--- src/formats/markdown.rs | 9 +-- src/formats/plain_text.rs | 12 ++-- src/main.rs | 13 +--- src/search.rs | 27 ++------ src/utils.rs | 29 +-------- src/wiki_api.rs | 34 +++++----- 10 files changed, 48 insertions(+), 223 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bc8ec6..cbcb5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -115,6 +115,7 @@ dependencies = [ "thiserror", "tokio", "url", + "urlencoding", ] [[package]] @@ -2060,6 +2061,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index 1e42804..ac4c33e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ termination = "0.1.2" thiserror = "1.0.50" tokio = { version = "1.33.0", features = ["full"] } url = "2.4.1" +urlencoding = "2.1.3" [dev-dependencies] assert_cmd = "2.0.12" diff --git a/src/categories.rs b/src/categories.rs index a26a702..23fb3ae 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -1,9 +1,5 @@ -use ::futures::future; -use indicatif::{MultiProgress, ProgressBar}; use itertools::Itertools; -use scraper::{Html, Node, Selector}; -use std::{collections::HashMap, thread, time::Duration}; -use url::Url; +use std::collections::HashMap; #[derive(Debug, Clone)] struct CategoryListItem { @@ -11,11 +7,7 @@ struct CategoryListItem { url: String, } -use crate::{ - error::WikiError, - utils::{extract_tag_attr, get_elements_by_tag, HtmlTag}, - wiki_api::fetch_page_by_url, -}; +use crate::error::WikiError; /// Returns a print ready list of the provided page names in /// 1. A tree format if `flatten` is `false`: @@ -55,127 +47,17 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S .join("\n\n") } +/// TODO replace with api call /// Scrapes the ArchWiki for all page names and their immediate parent category. Category nesting /// is ignored as a category can be a sub category of multiple other categories. 
/// /// Caution this function will most likely take several minutes to finish (-, – )…zzzZZ +#[allow(unused)] pub async fn fetch_all_pages( hide_progress: bool, thread_count: usize, max_categories: Option, start_at: Option<&str>, ) -> Result>, WikiError> { - let from = start_at.unwrap_or(""); - let limit = max_categories.unwrap_or(10000); - - let base_url = "https://wiki.archlinux.org/index.php?title=Special:Categories"; - - let url = Url::parse_with_params( - base_url, - &[("from", from), ("limit", limit.to_string().as_str())], - )?; - - let document = fetch_page_by_url(url).await?; - - let body_class = ".mw-spcontent"; - let selector = Selector::parse(body_class) - .unwrap_or_else(|_| panic!("{body_class} should be valid selector")); - - let body = document.select(&selector).next().unwrap(); - - let category_list_element = get_elements_by_tag(*body, &HtmlTag::Ul) - .into_iter() - .next() - .unwrap(); - - let items = parse_category_list(category_list_element); - let multi_bar = MultiProgress::new(); - - let chunk_count = items.len() / thread_count; - let tasks = items - .chunks(chunk_count) - .map(|chunk| { - let chunk = chunk.to_vec(); - let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); - let bar = multi_bar.add(bar); - if hide_progress { - bar.finish_and_clear(); - } - - tokio::spawn(async move { - let mut res = Vec::with_capacity(chunk.len()); - for item in chunk { - let pages = match fetch_page_names_from_categoriy(&item.url).await { - Ok(pages) => pages, - - Err(_) => { - thread::sleep(Duration::from_secs(1)); - fetch_page_names_from_categoriy(&item.url) - .await - .unwrap_or_else(|err| { - eprintln!( - "failed to fetch pages in category {}\n ERROR {err}", - item.name - ); - vec![] - }) - } - }; - - res.push((item.name, pages)); - bar.inc(1); - } - - res - }) - }) - .collect_vec(); - - let out = future::join_all(tasks) - .await - .into_iter() - .flatten() - .flatten() - .collect_vec(); - - Ok(HashMap::from_iter(out)) -} - -fn parse_category_list(list_node: ego_tree::NodeRef<'_, scraper::Node>) -> Vec { - let list_items = get_elements_by_tag(list_node, &HtmlTag::Li); - list_items - .into_iter() - .flat_map(|li| { - let a_tag = li.first_child()?; - let a_tag_element = a_tag.value().as_element()?; - - let name = a_tag.first_child()?.value().as_text()?.to_string(); - let url = extract_tag_attr(a_tag_element, &HtmlTag::A, "href")?; - - Some(CategoryListItem { name, url }) - }) - .collect() -} - -/// Scrape the ArchWiki for a list of all page names that belong to a specific category -async fn fetch_page_names_from_categoriy(url_str: &str) -> Result, WikiError> { - let selector = Selector::parse("#mw-pages").expect("#mw-pages to be a valid css selector"); - - let body = reqwest::get(url_str).await?.text().await?; - let document = Html::parse_document(&body); - - let Some(page_container) = document.select(&selector).next() else { - return Ok(vec![]) - }; - - Ok(page_container - .descendants() - .filter_map(|node| { - if let Node::Element(e) = node.value() { - extract_tag_attr(e, &HtmlTag::A, "title") - } else { - None - } - }) - .collect()) + todo!() } diff --git a/src/formats/html.rs b/src/formats/html.rs index 090cafd..ca60728 100644 --- a/src/formats/html.rs +++ b/src/formats/html.rs @@ -1,36 +1,31 @@ use scraper::Html; -use crate::utils::get_page_content; - /// Converts the body of the ArchWiki page to a HTML string pub fn convert_page_to_html(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - format!( "

{heading}

\n{body}", heading = page, - body = content.html() + body = document.html() ) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_html() { let page = "test page"; let input = format!( - r#"
+ r#"
Hello, world!
"# ); let expected_output = format!( r#"

{page}

-
+
Hello, world!
"# ); diff --git a/src/formats/markdown.rs b/src/formats/markdown.rs index c36449d..126fa58 100644 --- a/src/formats/markdown.rs +++ b/src/formats/markdown.rs @@ -1,26 +1,21 @@ use scraper::Html; -use crate::utils::get_page_content; - /// Converts the body of the ArchWiki page to a Markdown string pub fn convert_page_to_markdown(document: &Html, page: &str) -> String { - let content = get_page_content(document).expect("page should have content"); - - let md = html2md::parse_html(&content.html()); + let md = html2md::parse_html(&document.html()); format!("# {heading}\n\n{body}", heading = page, body = md) } #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_markdown() { let page = "test page"; let input = format!( - r#"
+ r#"

Hello, world!

"# ); diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index 6b7b6a2..3f100c4 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -2,14 +2,13 @@ use colored::Colorize; use ego_tree::NodeRef; use scraper::{Html, Node}; -use crate::utils::{extract_tag_attr, get_page_content, HtmlTag}; +use crate::utils::{extract_tag_attr, HtmlTag}; /// Converts the body of the ArchWiki page to a plain text string, removing all tags and /// only leaving the text node content. URLs can be shown in a markdown like syntax. pub fn convert_page_to_plain_text(document: &Html, show_urls: bool) -> String { - let content = get_page_content(document).expect("page should have content"); - - content + document + .root_element() .children() .map(|node| format_children(node, show_urls)) .collect::>() @@ -86,14 +85,13 @@ fn wrap_text_in_url(text: &str, url: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::utils::PAGE_CONTENT_CLASS; use pretty_assertions::assert_eq; #[tokio::test] async fn test_convert_page_to_plain_text() { { let input = format!( - r#"
+ r#"

Hello, world!

how are you
I'm great @@ -116,7 +114,7 @@ mod tests { { let input = format!( - r#"
+ r#"

Hello, world!

example
"# diff --git a/src/main.rs b/src/main.rs index 493cb2a..c64be63 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ use crate::{ formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, - utils::{create_cache_page_path, get_page_content, page_cache_exists, read_pages_file_as_str}, + utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, }; @@ -235,16 +235,7 @@ async fn main() -> Result<(), WikiError> { async fn fetch_document(page: &str, lang: Option<&str>) -> Result { match Url::parse(page) { - Ok(url) => { - let document = fetch_page_by_url(url).await?; - if get_page_content(&document).is_none() { - return Err(WikiError::NoPageFound( - "page is not a valid ArchWiki page".to_owned(), - )); - } - - Ok(document) - } + Ok(url) => fetch_page_by_url(url).await, Err(_) => fetch_page(page, lang).await, } } diff --git a/src/search.rs b/src/search.rs index 15e85e2..e42e093 100644 --- a/src/search.rs +++ b/src/search.rs @@ -131,9 +131,10 @@ pub fn open_search_to_page_names( } } +/// TODO /// Checks if the open search result contains a name that exactly matches the provided page name. /// If there is a match the corresponding page URL is returned. -pub fn open_search_get_exact_match_url( +pub fn open_search_is_page_exact_match( page: &str, search_result: &[OpenSearchItem], ) -> Result, WikiError> { @@ -143,31 +144,15 @@ pub fn open_search_get_exact_match_url( IAR::OpenSearchMissingNthElement(1), ))?; - let page_urls = search_result.get(3).ok_or(WikiError::InvalidApiResponse( - IAR::OpenSearchMissingNthElement(3), - ))?; - let OpenSearchItem::Array(names) = page_names else { return Err(WikiError::InvalidApiResponse( IAR::OpenSearchNthElementShouldBeArray(1), - )) + )); }; - let OpenSearchItem::Array(urls) = page_urls else { - return Err(WikiError::InvalidApiResponse( - IAR::OpenSearchNthElementShouldBeArray(3), - )) - }; - - if let Some(name) = names.first() { - if name == page { - Ok(urls.first().cloned()) - } else { - Ok(None) - } - } else { - Ok(None) - } + Ok(names + .first() + .and_then(|name| (name == page).then_some(name.to_owned()))) } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index b670f00..2dc1d6c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,14 +4,11 @@ use std::{ path::{Path, PathBuf}, }; -use ego_tree::NodeRef; use regex::Regex; -use scraper::{node::Element, ElementRef, Html, Node, Selector}; +use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub const PAGE_CONTENT_CLASS: &str = "mw-parser-output"; - pub enum HtmlTag { A, Ul, @@ -63,30 +60,6 @@ pub fn page_cache_exists( Ok(secs_since_modified < fourteen_days) } -/// Selects the body of an ArchWiki page -pub fn get_page_content(document: &Html) -> Option> { - let class = format!(".{PAGE_CONTENT_CLASS}"); - let selector = - Selector::parse(&class).unwrap_or_else(|_| panic!("{class} should be valid selector")); - document.select(&selector).next() -} - -pub fn get_elements_by_tag<'a>(root: NodeRef<'a, Node>, tag: &HtmlTag) -> Vec> { - root.children() - .flat_map(|n| { - if let Node::Element(e) = n.value() { - if e.name() == tag.name() { - Some(n) - } else { - None - } - } else { - None - } - }) - .collect() -} - pub fn extract_tag_attr(element: &Element, tag: &HtmlTag, attr: &str) -> Option { if element.name() == tag.name() { 
element.attr(attr).map(|attr| attr.to_owned()) diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 3c38758..ce4dd85 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -4,7 +4,7 @@ use url::Url; use crate::{ error::WikiError, search::{ - open_search_get_exact_match_url, open_search_to_page_names, OpenSearchItem, + open_search_is_page_exact_match, open_search_to_page_names, OpenSearchItem, TextSearchApiResponse, TextSearchItem, }, utils::update_relative_urls, @@ -25,7 +25,10 @@ pub async fn fetch_open_search( let res: Vec = serde_json::from_str(&body)?; // the first item in the response should be the search term - debug_assert_eq!(res.first(), Some(&OpenSearchItem::Single(search.to_owned()))); + debug_assert_eq!( + res.first(), + Some(&OpenSearchItem::Single(search.to_owned())) + ); Ok(res) } @@ -46,37 +49,32 @@ pub async fn fetch_text_search( Ok(res.query.search) } -/// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// Gets the HTML content of an ArchWiki page. /// -/// If the ArchWiki page doesn't have exists the top 5 pages that are most +/// If the ArchWiki page doesn't exists the top 5 pages that are most /// like the page that was given as an argument are returned as a `NoPageFound` error. pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result { let lang = lang.unwrap_or("en"); - let search_res = fetch_open_search(page, lang, 5).await?; - let Some(url) = open_search_get_exact_match_url(page, &search_res)? else { + let Some(page_title) = open_search_is_page_exact_match(page, &search_res)? else { let similar_pages = open_search_to_page_names(&search_res)?; return Err(WikiError::NoPageFound(similar_pages.join("\n"))); }; - let parsed_url = Url::parse(&url) - .unwrap_or(Url::parse("https://wiki.archlinux.org").expect("should be a valid URL")); - let base_url = format!( - "{schema}://{host}", - schema = parsed_url.scheme(), - host = parsed_url.host_str().unwrap_or("") + let raw_url = format!( + "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", + title = urlencoding::encode(&page_title) ); + let url = Url::parse(&raw_url)?; - let body = reqwest::get(&url).await?.text().await?; - let body_with_abs_urls = update_relative_urls(&body, &base_url); - - Ok(Html::parse_document(&body_with_abs_urls)) + let document = fetch_page_by_url(url).await?; + Ok(document) } /// Gets an ArchWiki pages entire content. Also updates all relative URLs to absolute URLs. -/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim` +/// `/title/Neovim` -> `https://wiki.archlinux.org/title/Neovim`. +/// A different base URL is used for pages that aren't hosted directly on `wiki.archlinux.org` /// /// If the page has no content a `NoPageFound` Error is returned. 
pub async fn fetch_page_by_url(url: Url) -> Result { -- GitLab From 50a71e3959a4f1480483533f80a892339fec92c3 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 20:22:18 +0100 Subject: [PATCH 02/24] wip: use fetch all endpoint to get pages --- src/categories.rs | 21 ++---------- src/cli.rs | 8 ++--- src/formats/plain_text.rs | 4 +-- src/languages.rs | 2 +- src/main.rs | 17 ++++------ src/search.rs | 12 +++---- src/utils.rs | 20 ++---------- src/wiki_api.rs | 68 +++++++++++++++++++++++++++++++++++++-- 8 files changed, 86 insertions(+), 66 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 23fb3ae..03a53e7 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -1,12 +1,8 @@ +#![allow(unused)] + use itertools::Itertools; use std::collections::HashMap; -#[derive(Debug, Clone)] -struct CategoryListItem { - name: String, - url: String, -} - use crate::error::WikiError; /// Returns a print ready list of the provided page names in @@ -47,17 +43,6 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S .join("\n\n") } -/// TODO replace with api call -/// Scrapes the ArchWiki for all page names and their immediate parent category. Category nesting -/// is ignored as a category can be a sub category of multiple other categories. -/// -/// Caution this function will most likely take several minutes to finish (-, – )…zzzZZ -#[allow(unused)] -pub async fn fetch_all_pages( - hide_progress: bool, - thread_count: usize, - max_categories: Option, - start_at: Option<&str>, -) -> Result>, WikiError> { +pub async fn fetch_page_categories(page: &str) -> Result, WikiError> { todo!() } diff --git a/src/cli.rs b/src/cli.rs index b2f6a58..8932176 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -98,12 +98,8 @@ pub enum Commands { /// Number of threads to use for fetching data from the ArchWiki thread_count: Option, #[arg(short, long)] - /// Maximum amount of categories to fetch. If no value if provided all categories are - /// fetched. - max_categories: Option, - #[arg(short, long)] - /// First category that will be fetched. See 'https://wiki.archlinux.org/index.php?title=Special:Categories' for more information. - start_at: Option, + /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. + delay: Option, #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index 3f100c4..b160857 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -2,7 +2,7 @@ use colored::Colorize; use ego_tree::NodeRef; use scraper::{Html, Node}; -use crate::utils::{extract_tag_attr, HtmlTag}; +use crate::utils::extract_tag_attr; /// Converts the body of the ArchWiki page to a plain text string, removing all tags and /// only leaving the text node content. URLs can be shown in a markdown like syntax. 
@@ -29,7 +29,7 @@ pub fn format_children(node: NodeRef, show_urls: bool) -> String { if show_urls { wrap_text_in_url( &child_text, - &extract_tag_attr(e, &HtmlTag::A, "href").unwrap_or("".to_string()), + &extract_tag_attr(e, "a", "href").unwrap_or("".to_string()), ) } else { child_text diff --git a/src/languages.rs b/src/languages.rs index 543084f..42f714b 100644 --- a/src/languages.rs +++ b/src/languages.rs @@ -20,7 +20,7 @@ pub struct Language { pub async fn fetch_all_langs() -> Result, WikiError> { let body = reqwest::get(LANGUAGE_API_URL).await?.text().await?; - let json: ApiResponse = serde_json::from_str(&body)?; + let json: ApiResponse = serde_json::from_str(&body)?; Ok(json.query.languages) } diff --git a/src/main.rs b/src/main.rs index c64be63..862a44f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,12 +11,12 @@ use url::Url; use wiki_api::fetch_page_by_url; use crate::{ - categories::{fetch_all_pages, list_pages}, + categories::list_pages, formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; mod categories; @@ -158,18 +158,13 @@ async fn main() -> Result<(), WikiError> { Commands::SyncWiki { hide_progress, thread_count, - max_categories, - start_at, + delay, print, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = fetch_all_pages( - hide_progress, - thread_count, - max_categories, - start_at.as_deref(), - ) - .await?; + let res = fetch_all_pages().await?; + println!("{}", res.join("\n")); + panic!("oops"); let out = serde_yaml::to_string(&res)?; diff --git a/src/search.rs b/src/search.rs index e42e093..6af0f92 100644 --- a/src/search.rs +++ b/src/search.rs @@ -131,13 +131,11 @@ pub fn open_search_to_page_names( } } -/// TODO -/// Checks if the open search result contains a name that exactly matches the provided page name. -/// If there is a match the corresponding page URL is returned. -pub fn open_search_is_page_exact_match( - page: &str, +/// Return provided page name if the top search result exactly matches it +pub fn open_search_is_page_exact_match<'a>( + page: &'a str, search_result: &[OpenSearchItem], -) -> Result, WikiError> { +) -> Result, WikiError> { use crate::error::InvalidApiResponseError as IAR; let page_names = search_result.get(1).ok_or(WikiError::InvalidApiResponse( @@ -152,7 +150,7 @@ pub fn open_search_is_page_exact_match( Ok(names .first() - .and_then(|name| (name == page).then_some(name.to_owned()))) + .and_then(|name| (name == page).then_some(page))) } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index 2dc1d6c..44d11ed 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -9,22 +9,6 @@ use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub enum HtmlTag { - A, - Ul, - Li, -} - -impl HtmlTag { - pub fn name(&self) -> String { - match *self { - HtmlTag::A => "a".to_owned(), - HtmlTag::Ul => "ul".to_owned(), - HtmlTag::Li => "li".to_owned(), - } - } -} - /// Construct a path to cache a page. Different page formats are cached separately. 
/// All none word characters are escaped with an '_' pub fn create_cache_page_path(page: &str, format: &PageFormat, cache_dir: &Path) -> PathBuf { @@ -60,8 +44,8 @@ pub fn page_cache_exists( Ok(secs_since_modified < fourteen_days) } -pub fn extract_tag_attr(element: &Element, tag: &HtmlTag, attr: &str) -> Option { - if element.name() == tag.name() { +pub fn extract_tag_attr(element: &Element, tag: &str, attr: &str) -> Option { + if element.name() == tag { element.attr(attr).map(|attr| attr.to_owned()) } else { None diff --git a/src/wiki_api.rs b/src/wiki_api.rs index ce4dd85..237997e 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,4 +1,5 @@ use scraper::Html; +use serde::Deserialize; use url::Url; use crate::{ @@ -11,8 +12,9 @@ use crate::{ }; #[derive(Debug, Clone, serde::Deserialize)] -pub struct ApiResponse { +pub struct ApiResponse { pub query: T, + pub r#continue: Option, } pub async fn fetch_open_search( @@ -40,7 +42,7 @@ pub async fn fetch_text_search( ) -> Result, WikiError> { let url = format!("https://wiki.archlinux.org/api.php?action=query&list=search&format=json&srwhat=text&uselang={lang}&srlimit={limit}&srsearch={search}"); let body = reqwest::get(url).await?.text().await?; - let mut res: ApiResponse = serde_json::from_str(&body)?; + let mut res: ApiResponse = serde_json::from_str(&body)?; for item in res.query.search.as_mut_slice() { item.prettify_snippet(search); @@ -64,7 +66,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { Ok(Html::parse_document(&body_with_abs_urls)) } + +pub async fn fetch_all_pages() -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiAllPagesQuery { + allpages: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + #[derive(Debug, Deserialize)] + struct ApiAllPageContinueParams { + apcontinue: String, + } + + let api_url = format!( + "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500" + ); + + let mut pages: Vec = vec![]; + + let body = reqwest::get(&api_url).await?.text().await?; + let mut api_resp: ApiResponse = + serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + + while let Some(continue_params) = api_resp.r#continue { + let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + + let body = reqwest::get(&next_api_url).await?.text().await?; + api_resp = serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + } + + Ok(pages) +} -- GitLab From e817b9374b7042ec5de8e70653be00ae54e0ab54 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 20:59:35 +0100 Subject: [PATCH 03/24] wip: fetch pages per category --- src/languages.rs | 2 +- src/main.rs | 2 +- src/wiki_api.rs | 98 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 76 insertions(+), 26 deletions(-) diff --git a/src/languages.rs b/src/languages.rs index 42f714b..543084f 100644 --- a/src/languages.rs +++ b/src/languages.rs @@ -20,7 +20,7 @@ pub struct Language { pub async fn fetch_all_langs() -> Result, WikiError> { let body = reqwest::get(LANGUAGE_API_URL).await?.text().await?; - let json: ApiResponse = serde_json::from_str(&body)?; + let json: ApiResponse = serde_json::from_str(&body)?; Ok(json.query.languages) } diff --git a/src/main.rs b/src/main.rs index 862a44f..91a83a7 
100644 --- a/src/main.rs +++ b/src/main.rs @@ -163,7 +163,7 @@ async fn main() -> Result<(), WikiError> { } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); let res = fetch_all_pages().await?; - println!("{}", res.join("\n")); + dbg!(res); panic!("oops"); let out = serde_yaml::to_string(&res)?; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 237997e..bf564f6 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use scraper::Html; use serde::Deserialize; use url::Url; @@ -12,7 +14,12 @@ use crate::{ }; #[derive(Debug, Clone, serde::Deserialize)] -pub struct ApiResponse { +pub struct ApiResponse { + pub query: T, +} + +#[derive(Debug, Clone, serde::Deserialize)] +pub struct ApiResponseWithContinue { pub query: T, pub r#continue: Option, } @@ -42,7 +49,7 @@ pub async fn fetch_text_search( ) -> Result, WikiError> { let url = format!("https://wiki.archlinux.org/api.php?action=query&list=search&format=json&srwhat=text&uselang={lang}&srlimit={limit}&srsearch={search}"); let body = reqwest::get(url).await?.text().await?; - let mut res: ApiResponse = serde_json::from_str(&body)?; + let mut res: ApiResponse = serde_json::from_str(&body)?; for item in res.query.search.as_mut_slice() { item.prettify_snippet(search); @@ -92,62 +99,105 @@ pub async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -pub async fn fetch_all_pages() -> Result, WikiError> { +pub async fn fetch_all_pages() -> Result>, WikiError> { + let categories = fetch_all_categories().await?; + let mut wiki = HashMap::new(); + + for category in categories { + let pages = fetch_pages_in_category(&category).await?; + if !pages.is_empty() { + wiki.insert(category, pages); + } + } + + Ok(wiki) +} + +async fn fetch_all_categories() -> Result, WikiError> { #[derive(Debug, Deserialize)] - struct ApiAllPagesQuery { - allpages: Vec, + struct ApiAllCategoriesQuery { + allcategories: Vec, } #[derive(Debug, Deserialize)] - struct Page { - title: String, + struct Category { + #[serde[rename = "*"]] + name: String, } - impl From for String { - fn from(value: Page) -> Self { - value.title + impl From for String { + fn from(value: Category) -> Self { + value.name } } #[derive(Debug, Deserialize)] - struct ApiAllPageContinueParams { - apcontinue: String, + struct ApiAllCategoryContinueParams { + accontinue: String, } - let api_url = format!( - "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500" - ); + let api_url = "https://wiki.archlinux.org/api.php?action=query&list=allcategories&format=json&aclimit=500"; - let mut pages: Vec = vec![]; + let mut categories: Vec = vec![]; - let body = reqwest::get(&api_url).await?.text().await?; - let mut api_resp: ApiResponse = + let body = reqwest::get(api_url).await?.text().await?; + let mut api_resp: ApiResponseWithContinue = serde_json::from_str(&body)?; - pages.append( + categories.append( &mut api_resp .query - .allpages + .allcategories .into_iter() .map(Into::into) .collect(), ); while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + let next_api_url = format!("{api_url}&accontinue={}", continue_params.accontinue); let body = reqwest::get(&next_api_url).await?.text().await?; api_resp = serde_json::from_str(&body)?; - pages.append( + categories.append( &mut api_resp .query - .allpages + .allcategories .into_iter() .map(Into::into) .collect(), ); } - Ok(pages) + Ok(categories) 
+} + +async fn fetch_pages_in_category(category: &str) -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiCategoryMembersQuery { + categorymembers: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); + + let body = reqwest::get(dbg!(api_url)).await?.text().await?; + let api_resp: ApiResponse = serde_json::from_str(&dbg!(body))?; + + Ok(api_resp + .query + .categorymembers + .into_iter() + .map(Into::into) + .collect()) } -- GitLab From a11407c565fe90778ab87337a02e0b130d77c32c Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:27:55 +0100 Subject: [PATCH 04/24] add --fast flag to allow only fetching pages without categories --- src/categories.rs | 4 -- src/cli.rs | 7 ++- src/main.rs | 14 +++-- src/wiki_api.rs | 136 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 143 insertions(+), 18 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 03a53e7..1285c2e 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -42,7 +42,3 @@ pub fn list_pages(categories: &HashMap>, flatten: bool) -> S }) .join("\n\n") } - -pub async fn fetch_page_categories(page: &str) -> Result, WikiError> { - todo!() -} diff --git a/src/cli.rs b/src/cli.rs index 8932176..c0e268f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -87,8 +87,8 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download the names of all pages on the ArchWiki", - long_about = "Download the names of all pages on the ArchWiki. Page names are used for the 'list-pages' and 'list-categories' commands" + about = "Download information about the pages and categories on the ArchWiki", + long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { #[arg(short = 'H', long)] @@ -101,6 +101,9 @@ pub enum Commands { /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. delay: Option, #[arg(short, long)] + /// Only fetch page names without parent category information. + fast: bool, + #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, }, diff --git a/src/main.rs b/src/main.rs index 91a83a7..b507a32 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,7 +16,9 @@ use crate::{ languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{ + fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search, fetch_wiki_tree, + }, }; mod categories; @@ -159,12 +161,16 @@ async fn main() -> Result<(), WikiError> { hide_progress, thread_count, delay, + fast, print, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = fetch_all_pages().await?; - dbg!(res); - panic!("oops"); + let res = if !fast { + fetch_wiki_tree(thread_count, delay.unwrap_or(0), hide_progress).await? 
+ } else { + let all_pages = fetch_all_pages().await?; + HashMap::from([("*".to_owned(), all_pages)]) + }; let out = serde_yaml::to_string(&res)?; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index bf564f6..1dd5662 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,5 +1,9 @@ +use core::panic; use std::collections::HashMap; +use futures::future; +use indicatif::{MultiProgress, ProgressBar}; +use itertools::Itertools; use scraper::Html; use serde::Deserialize; use url::Url; @@ -99,20 +103,136 @@ pub async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -pub async fn fetch_all_pages() -> Result>, WikiError> { +/// Gets a list of all ArchWiki categories and the pages inside them. +/// All categories are treated as top-level and sub categories are ignored. +pub async fn fetch_wiki_tree( + thread_count: usize, + delay: u64, + hide_progress: bool, +) -> Result>, WikiError> { let categories = fetch_all_categories().await?; - let mut wiki = HashMap::new(); - for category in categories { - let pages = fetch_pages_in_category(&category).await?; - if !pages.is_empty() { - wiki.insert(category, pages); + let multi_bar = MultiProgress::new(); + let chunk_count = categories.len() / thread_count; + + let tasks = categories + .chunks(chunk_count) + .map(|chunk| { + let chunk = chunk.to_vec(); + + let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); + let bar = multi_bar.add(bar); + if hide_progress { + bar.finish_and_clear(); + } + + tokio::spawn(async move { + let mut wiki_sectoin = HashMap::new(); + for category in chunk { + tokio::time::sleep(std::time::Duration::from_millis(delay)).await; + + let pages = match fetch_pages_in_category(&category).await { + Ok(pages) => pages, + Err(_) => { + // wait if rate limited + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + fetch_pages_in_category(&category) + .await + .unwrap_or_else(|err| { + eprintln!( + "failed to fetch pages in category {}\n ERROR {err}", + category + ); + vec![] + }) + } + }; + + if !pages.is_empty() { + wiki_sectoin.insert(category.to_owned(), pages); + } + bar.inc(1); + } + + wiki_sectoin + }) + }) + .collect_vec(); + + let mut wiki = HashMap::new(); + let sections = future::join_all(tasks).await; + + for section in sections { + match section { + Ok(data) => { + wiki.extend(data); + } + Err(err) => panic!("failed to sync wiki\nERROR: {err}"), } } Ok(wiki) } +pub async fn fetch_all_pages() -> Result, WikiError> { + #[derive(Debug, Deserialize)] + struct ApiAllPagesQuery { + allpages: Vec, + } + + #[derive(Debug, Deserialize)] + struct Page { + title: String, + } + + impl From for String { + fn from(value: Page) -> Self { + value.title + } + } + + #[derive(Debug, Deserialize)] + struct ApiAllPageContinueParams { + apcontinue: String, + } + + let api_url = + "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500"; + + let mut pages: Vec = vec![]; + + let body = reqwest::get(api_url).await?.text().await?; + let mut api_resp: ApiResponseWithContinue = + serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + .collect(), + ); + + while let Some(continue_params) = api_resp.r#continue { + let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); + + let body = reqwest::get(&next_api_url).await?.text().await?; + api_resp = serde_json::from_str(&body)?; + + pages.append( + &mut api_resp + .query + .allpages + .into_iter() + .map(Into::into) + 
.collect(), + ); + } + + Ok(pages) +} + async fn fetch_all_categories() -> Result, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllCategoriesQuery { @@ -191,8 +311,8 @@ async fn fetch_pages_in_category(category: &str) -> Result, WikiErro let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); - let body = reqwest::get(dbg!(api_url)).await?.text().await?; - let api_resp: ApiResponse = serde_json::from_str(&dbg!(body))?; + let body = reqwest::get(api_url).await?.text().await?; + let api_resp: ApiResponse = serde_json::from_str(&body)?; Ok(api_resp .query -- GitLab From 23bea9d7a51d74dc933f86f94f3096154988ff87 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:42:43 +0100 Subject: [PATCH 05/24] wip: add local-wiki command & change save paths to be urlencoded --- src/cli.rs | 18 +++++++++++++++--- src/error.rs | 2 ++ src/main.rs | 22 ++++++++++++++++++++-- src/utils.rs | 7 +++---- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c0e268f..87fb214 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; +use html2md::common; use crate::formats::PageFormat; @@ -69,7 +70,7 @@ pub enum Commands { /// Only show pages in this category category: Option, #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( @@ -78,7 +79,7 @@ pub enum Commands { )] ListCategories { #[arg(short, long)] - /// Use different file to read pages from + /// Use a different file to read pages from page_file: Option, }, #[command( @@ -87,7 +88,7 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download information about the pages and categories on the ArchWiki", + about = "Download information about the pages and categories on the ArchWiki (takes a while)", long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { @@ -107,6 +108,17 @@ pub enum Commands { /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, }, + #[command( + about = "Download a copy of the ArchWiki. Will take a long time :)", + long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." + )] + LocalWiki { + /// Location to store the local copy of the wiki at. + location: PathBuf, + #[arg(short, long)] + /// Use a different file to read pages from + page_file: Option, + }, #[command( about = "Retrive information related to this tool", long_about = "Retrive information related to this tool. All Info is shown by default." 
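
The wiki_api.rs hunks above page through the MediaWiki list endpoints (`list=allcategories`, `list=allpages`) by re-requesting with the returned continue token, while `fetch_pages_in_category` currently takes a single `cmlimit=500` batch. As a rough, self-contained sketch of that continuation pattern applied to `list=categorymembers` (not code from the patch; it assumes the `reqwest`, `serde_json`, `urlencoding` and, for brevity, `anyhow` crates):

```rust
use serde_json::Value;

/// Illustrative only: fetch every page title in one ArchWiki category,
/// following the MediaWiki `continue.cmcontinue` token until exhausted.
async fn pages_in_category(category: &str) -> anyhow::Result<Vec<String>> {
    let base = format!(
        "https://wiki.archlinux.org/api.php?action=query&list=categorymembers\
         &format=json&cmtype=page&cmlimit=500&cmtitle=Category:{}",
        urlencoding::encode(category)
    );

    let mut titles = Vec::new();
    let mut cont: Option<String> = None;

    loop {
        // Re-issue the same query with the continuation token, if we have one.
        let url = match &cont {
            Some(token) => format!("{base}&cmcontinue={}", urlencoding::encode(token)),
            None => base.clone(),
        };

        let text = reqwest::get(&url).await?.text().await?;
        let body: Value = serde_json::from_str(&text)?;

        // Collect the page titles from this batch.
        if let Some(members) = body["query"]["categorymembers"].as_array() {
            titles.extend(
                members
                    .iter()
                    .filter_map(|m| m["title"].as_str().map(str::to_owned)),
            );
        }

        // The API keeps returning `continue.cmcontinue` while more results remain.
        match body["continue"]["cmcontinue"].as_str() {
            Some(token) => cont = Some(token.to_owned()),
            None => break,
        }
    }

    Ok(titles)
}
```

Because the patch sends one `cmlimit=500` request per category, a category with more than 500 pages would be truncated; a continuation loop like the sketch above is one way to cover that case.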
diff --git a/src/error.rs b/src/error.rs index 4cc13cd..5734c32 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,4 +50,6 @@ pub enum WikiError { NoPageFound(String), #[error("The category '{}' could not be found", .0)] NoCategoryFound(String), + #[error("{}", .0)] + Other(String), } diff --git a/src/main.rs b/src/main.rs index b507a32..f97aad6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -125,7 +125,7 @@ async fn main() -> Result<(), WikiError> { page_file, } => { let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; + let file = read_pages_file_as_str(&path)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -144,7 +144,7 @@ async fn main() -> Result<(), WikiError> { } Commands::ListCategories { page_file } => { let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(path)?; + let file = read_pages_file_as_str(&path)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -184,6 +184,24 @@ async fn main() -> Result<(), WikiError> { println!("{out}"); } } + Commands::LocalWiki { + location, + page_file, + } => { + let path = page_file.unwrap_or(default_page_file_path); + let Ok(file) = read_pages_file_as_str(&path) else { + return Err(WikiError::Path("page file does not exist".to_owned())); + }; + + let Ok(pages_map) = serde_yaml::from_str::>>(&file) else { + return Err(WikiError::Other(format!( + "page file is malformed\nfile: {}", + path.to_string_lossy() + ))); + }; + + todo!("oh boy"); + } Commands::Info { show_cache_dir, show_data_dir, diff --git a/src/utils.rs b/src/utils.rs index 44d11ed..ba75ffe 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -63,8 +63,8 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: PathBuf) -> Result { - fs::read_to_string(&path).map_err(|err| { +pub fn read_pages_file_as_str(path: &Path) -> Result { + fs::read_to_string(path).map_err(|err| { match err.kind() { ErrorKind::NotFound => WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{}'. 
Try running 'archwiki-rs sync-wiki' to create the missing file.", path.to_string_lossy()))), _ => err.into() @@ -73,8 +73,7 @@ pub fn read_pages_file_as_str(path: PathBuf) -> Result { } fn to_save_file_name(page: &str) -> String { - let regex = Regex::new("[^-0-9A-Za-z_]").expect("'[^0-9A-Za-z_]' should be a valid regex"); - regex.replace_all(page, "_").to_string() + urlencoding::encode(page).to_string() } #[cfg(test)] -- GitLab From 8e3e09c1d04071d8359eef25fd157e680cd390b0 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Mon, 15 Jan 2024 21:57:26 +0100 Subject: [PATCH 06/24] fix tests --- src/cli.rs | 1 - src/formats/html.rs | 17 ++++++----- src/formats/markdown.rs | 10 +++--- src/formats/plain_text.rs | 32 ++++++++------------ src/main.rs | 12 +------- src/utils.rs | 17 +++++++---- src/wiki_api.rs | 2 +- tests/cli.rs | 64 +-------------------------------------- 8 files changed, 41 insertions(+), 114 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 87fb214..c764214 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,7 +1,6 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; -use html2md::common; use crate::formats::PageFormat; diff --git a/src/formats/html.rs b/src/formats/html.rs index ca60728..6ceca4c 100644 --- a/src/formats/html.rs +++ b/src/formats/html.rs @@ -1,11 +1,16 @@ -use scraper::Html; +use scraper::{Html, Selector}; /// Converts the body of the ArchWiki page to a HTML string pub fn convert_page_to_html(document: &Html, page: &str) -> String { + let body_selector = Selector::parse("body").expect("body should be a valid css selector"); format!( "

{heading}

\n{body}", heading = page, - body = document.html() + body = document + .select(&body_selector) + .next() + .map(|body| body.inner_html()) + .unwrap_or_default() ) } @@ -17,11 +22,9 @@ mod tests { #[tokio::test] async fn test_convert_page_to_html() { let page = "test page"; - let input = format!( - r#"
+ let input = r#"
Hello, world! -
"# - ); +
"#; let expected_output = format!( r#"

{page}

@@ -30,7 +33,7 @@ mod tests {
"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_html(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/markdown.rs b/src/formats/markdown.rs index 126fa58..23a2ba9 100644 --- a/src/formats/markdown.rs +++ b/src/formats/markdown.rs @@ -14,11 +14,9 @@ mod tests { #[tokio::test] async fn test_convert_page_to_markdown() { let page = "test page"; - let input = format!( - r#"
-

Hello, world!

-
"# - ); + let input = r#"
+

Hello, world!

+
"#; let expected_output = format!( r#"# {page} @@ -26,7 +24,7 @@ mod tests { ### Hello, world! ###"# ); - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_markdown(&document, page); assert_eq!(output, expected_output); diff --git a/src/formats/plain_text.rs b/src/formats/plain_text.rs index b160857..754d4f9 100644 --- a/src/formats/plain_text.rs +++ b/src/formats/plain_text.rs @@ -90,35 +90,29 @@ mod tests { #[tokio::test] async fn test_convert_page_to_plain_text() { { - let input = format!( - r#" -

Hello, world!

-
how are you
- I'm great -
"# - ); + let input = r#" +

Hello, world!

+
how are you
+ I'm great +
"#; - let expected_output = format!( - r#" - Hello, world! - how are you - I'm great -"# - ); + let expected_output = r#" + Hello, world! + how are you + I'm great + "#; - let document = Html::parse_document(&input); + let document = Html::parse_document(input); let output = convert_page_to_plain_text(&document, false); assert_eq!(output, expected_output); } { - let input = format!( - r#"
+ let input = r#"

Hello, world!

example -
"# - ); +
"#; let expected_output = format!( r#" diff --git a/src/main.rs b/src/main.rs index f97aad6..44e24af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,9 +6,6 @@ use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; use itertools::Itertools; -use scraper::Html; -use url::Url; -use wiki_api::fetch_page_by_url; use crate::{ categories::list_pages, @@ -71,7 +68,7 @@ async fn main() -> Result<(), WikiError> { let out = if use_cached_page { fs::read_to_string(&page_cache_path)? } else { - match fetch_document(&page, lang.as_deref()).await { + match fetch_page(&page, lang.as_deref()).await { Ok(document) => match format { PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), PageFormat::Markdown => convert_page_to_markdown(&document, &page), @@ -251,10 +248,3 @@ async fn main() -> Result<(), WikiError> { Ok(()) } - -async fn fetch_document(page: &str, lang: Option<&str>) -> Result { - match Url::parse(page) { - Ok(url) => fetch_page_by_url(url).await, - Err(_) => fetch_page(page, lang).await, - } -} diff --git a/src/utils.rs b/src/utils.rs index ba75ffe..81347c7 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,6 @@ use std::{ path::{Path, PathBuf}, }; -use regex::Regex; use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; @@ -73,7 +72,10 @@ pub fn read_pages_file_as_str(path: &Path) -> Result { } fn to_save_file_name(page: &str) -> String { - urlencoding::encode(page).to_string() + urlencoding::encode(page) + .to_string() + .replace('.', "\\.") + .replace('~', "\\~") } #[cfg(test)] @@ -85,10 +87,13 @@ mod tests { fn test_to_save_file_name() { let cases = [ ("Neovim", "Neovim"), - ("3D Mouse", "3D_Mouse"), - ("/etc/fstab", "_etc_fstab"), - (".NET", "_NET"), - ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS_MeMO_Pad_7__ME176C_X__"), + ("3D Mouse", "3D%20Mouse"), + ("/etc/fstab", "%2Fetc%2Ffstab"), + (".NET", "\\.NET"), + ( + "ASUS MeMO Pad 7 (ME176C(X))", + "ASUS%20MeMO%20Pad%207%20%28ME176C%28X%29%29", + ), ]; for (input, output) in cases { diff --git a/src/wiki_api.rs b/src/wiki_api.rs index 1dd5662..cb00e67 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -90,7 +90,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { +async fn fetch_page_by_url(url: Url) -> Result { let base_url = format!( "{schema}://{host}", schema = url.scheme(), diff --git a/tests/cli.rs b/tests/cli.rs index 5759687..6857dbb 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,9 +1,5 @@ use assert_cmd::Command; -use assert_fs::prelude::{FileWriteStr, PathChild}; -use predicates::{ - prelude::{predicate, PredicateBooleanExt}, - Predicate, -}; +use predicates::prelude::{predicate, PredicateBooleanExt}; #[test] fn test_cli_info_cmd() -> Result<(), Box> { @@ -55,22 +51,6 @@ fn test_cli_read_page_cmd() -> Result<(), Box> { cmd.assert().failure().stderr(pstr::starts_with("Neovim")); } - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://wiki.archlinux.org/title/Emacs"]); - - cmd.assert() - .success() - .stdout(pstr::contains("Installation")); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["read-page", "-i", "https://google.com"]); - - cmd.assert().failure(); - } - Ok(()) } @@ -121,45 +101,3 @@ fn test_cli_list_languages_cmd() -> Result<(), Box> { Ok(()) } - -#[test] -fn test_cli_local_wiki_info() -> Result<(), Box> { - use predicate::str as pstr; - - let stdout = { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["sync-wiki", 
"-p", "-m", "10"]); - - let stdout = String::from_utf8(cmd.assert().success().get_output().stdout.clone()).unwrap(); - pstr::contains("About Arch").eval(&stdout); - - stdout - }; - - let tmp_dir = assert_fs::TempDir::new().unwrap(); - tmp_dir.child("pages.yml").write_str(&stdout).unwrap(); - - let tmp_file_path = tmp_dir.path().join("pages.yml"); - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-pages", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert().success().stdout(pstr::contains( - "About Arch: -───┤Arch boot process -───┤Arch build system", - )); - } - - { - let mut cmd = Command::cargo_bin("archwiki-rs")?; - cmd.args(["list-categories", "-p", tmp_file_path.to_str().unwrap()]); - - cmd.assert() - .success() - .stdout(pstr::contains("\n").count(10)); - } - - Ok(()) -} -- GitLab From bf8435bc96abfcb2d10f002448d818860816d487 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Tue, 16 Jan 2024 14:45:05 +0100 Subject: [PATCH 07/24] improve missing page file error --- src/cli.rs | 3 +++ src/main.rs | 27 +++++++++++++++++++-------- src/utils.rs | 13 +++++++++++-- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c764214..f02e9cb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -106,6 +106,9 @@ pub enum Commands { #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, + #[arg(short, long)] + /// Use custom output file location + out_file: Option, }, #[command( about = "Download a copy of the ArchWiki. Will take a long time :)", diff --git a/src/main.rs b/src/main.rs index 44e24af..9e417ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -121,8 +121,11 @@ async fn main() -> Result<(), WikiError> { category, page_file, } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(&path)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let file = read_pages_file_as_str(&path, is_default)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -140,8 +143,11 @@ async fn main() -> Result<(), WikiError> { println!("{out}"); } Commands::ListCategories { page_file } => { - let path = page_file.unwrap_or(default_page_file_path); - let file = read_pages_file_as_str(&path)?; + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let file = read_pages_file_as_str(&path, is_default)?; let pages_map: HashMap> = serde_yaml::from_str(&file)?; @@ -160,6 +166,7 @@ async fn main() -> Result<(), WikiError> { delay, fast, print, + out_file, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); let res = if !fast { @@ -172,10 +179,11 @@ async fn main() -> Result<(), WikiError> { let out = serde_yaml::to_string(&res)?; if !print { - fs::write(&default_page_file_path, out)?; + let path = out_file.unwrap_or(default_page_file_path); + fs::write(&path, out)?; if !hide_progress { - println!("data saved to {}", default_page_file_path.to_string_lossy()); + println!("data saved to {}", path.to_string_lossy()); } } else { println!("{out}"); @@ -185,8 +193,11 @@ async fn main() -> Result<(), WikiError> { location, page_file, } => { - let path = page_file.unwrap_or(default_page_file_path); - let Ok(file) = read_pages_file_as_str(&path) else { + let (path, is_default) = page_file + .map(|path| (path, false)) + .unwrap_or((default_page_file_path, true)); + + let Ok(file) = read_pages_file_as_str(&path, 
is_default) else { return Err(WikiError::Path("page file does not exist".to_owned())); }; diff --git a/src/utils.rs b/src/utils.rs index 81347c7..efa94ae 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -62,10 +62,19 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: &Path) -> Result { +pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result { fs::read_to_string(path).map_err(|err| { match err.kind() { - ErrorKind::NotFound => WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{}'. Try running 'archwiki-rs sync-wiki' to create the missing file.", path.to_string_lossy()))), + ErrorKind::NotFound => { + let path_str =path.to_string_lossy(); + let extra_path_arg = if is_default_path { + String::new() + } else { + format!(" --out-file {path_str}") + }; + + WikiError::IO(io::Error::new(ErrorKind::NotFound, format!("Could not find pages file at '{path_str}'. Try running 'archwiki-rs sync-wiki{extra_path_arg}' to create the missing file." ))) + } _ => err.into() } }) -- GitLab From 81fe38f5f0ecb5f7a91dca1022d82cdd5f2a5271 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Tue, 16 Jan 2024 22:05:13 +0100 Subject: [PATCH 08/24] use wiki media generator api to hugely improve preformance --- src/cli.rs | 11 +-- src/error.rs | 2 - src/main.rs | 64 ++++++------- src/utils.rs | 40 +++++++- src/wiki_api.rs | 242 ++++++++++++------------------------------------ 5 files changed, 122 insertions(+), 237 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index f02e9cb..a6f68d4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -87,7 +87,7 @@ pub enum Commands { )] ListLanguages, #[command( - about = "Download information about the pages and categories on the ArchWiki (takes a while)", + about = "Download information about the pages and categories on the ArchWiki", long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands" )] SyncWiki { @@ -95,15 +95,6 @@ pub enum Commands { /// Hide progress indicators hide_progress: bool, #[arg(short, long)] - /// Number of threads to use for fetching data from the ArchWiki - thread_count: Option, - #[arg(short, long)] - /// Delay (in milliseconds) between requests. Note that this applies on a per thread basis. - delay: Option, - #[arg(short, long)] - /// Only fetch page names without parent category information. - fast: bool, - #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. 
print: bool, #[arg(short, long)] diff --git a/src/error.rs b/src/error.rs index 5734c32..4cc13cd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,6 +50,4 @@ pub enum WikiError { NoPageFound(String), #[error("The category '{}' could not be found", .0)] NoCategoryFound(String), - #[error("{}", .0)] - Other(String), } diff --git a/src/main.rs b/src/main.rs index 9e417ed..3e1aa3c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,11 @@ -use std::{collections::HashMap, fs}; +use std::fs; use clap::Parser; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; +use indicatif::ProgressBar; use itertools::Itertools; use crate::{ @@ -12,10 +13,11 @@ use crate::{ formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, languages::{fetch_all_langs, format_lang_table}, search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, - utils::{create_cache_page_path, page_cache_exists, read_pages_file_as_str}, - wiki_api::{ - fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search, fetch_wiki_tree, + utils::{ + create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, + UNCATEGORIZED_KEY, }, + wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; mod categories; @@ -125,19 +127,16 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let file = read_pages_file_as_str(&path, is_default)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; - + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; let out = if let Some(category) = category { - pages_map + wiki_tree .get(&category) .ok_or(WikiError::NoCategoryFound(category))? .iter() .sorted() .join("\n") } else { - list_pages(&pages_map, flatten) + list_pages(&wiki_tree, flatten) }; println!("{out}"); @@ -147,11 +146,14 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let file = read_pages_file_as_str(&path, is_default)?; - - let pages_map: HashMap> = serde_yaml::from_str(&file)?; + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + let out = wiki_tree + .keys() + .unique() + .sorted() + .filter(|cat| cat.as_str() != UNCATEGORIZED_KEY) + .join("\n"); - let out = pages_map.keys().unique().sorted().join("\n"); println!("{out}"); } Commands::ListLanguages => { @@ -162,21 +164,22 @@ async fn main() -> Result<(), WikiError> { } Commands::SyncWiki { hide_progress, - thread_count, - delay, - fast, print, out_file, } => { - let thread_count = thread_count.unwrap_or(num_cpus::get_physical()); - let res = if !fast { - fetch_wiki_tree(thread_count, delay.unwrap_or(0), hide_progress).await? 
- } else { - let all_pages = fetch_all_pages().await?; - HashMap::from([("*".to_owned(), all_pages)]) - }; + let spinner = ProgressBar::new_spinner(); + if hide_progress { + spinner.finish_and_clear(); + } - let out = serde_yaml::to_string(&res)?; + let _spin_task = std::thread::spawn(move || loop { + spinner.tick(); + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + let wiki_tree = fetch_all_pages().await?; + + let out = serde_yaml::to_string(&wiki_tree)?; if !print { let path = out_file.unwrap_or(default_page_file_path); @@ -197,17 +200,6 @@ async fn main() -> Result<(), WikiError> { .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - let Ok(file) = read_pages_file_as_str(&path, is_default) else { - return Err(WikiError::Path("page file does not exist".to_owned())); - }; - - let Ok(pages_map) = serde_yaml::from_str::>>(&file) else { - return Err(WikiError::Other(format!( - "page file is malformed\nfile: {}", - path.to_string_lossy() - ))); - }; - todo!("oh boy"); } Commands::Info { diff --git a/src/utils.rs b/src/utils.rs index efa94ae..570381a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,13 +1,17 @@ use std::{ + collections::HashMap, fs, io::{self, ErrorKind}, path::{Path, PathBuf}, }; +use itertools::Itertools; use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; +pub const UNCATEGORIZED_KEY: &str = "UNCATEGORIZED"; + /// Construct a path to cache a page. Different page formats are cached separately. /// All none word characters are escaped with an '_' pub fn create_cache_page_path(page: &str, format: &PageFormat, cache_dir: &Path) -> PathBuf { @@ -62,11 +66,14 @@ pub fn update_relative_urls(html: &str, base_url: &str) -> String { .replace("poster=\"/", &format!("poster=\"{base_url}/")) } -pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result { - fs::read_to_string(path).map_err(|err| { +pub fn read_pages_file_as_category_tree( + path: &Path, + is_default_path: bool, +) -> Result>, WikiError> { + let content = fs::read_to_string(path).map_err(|err| { match err.kind() { ErrorKind::NotFound => { - let path_str =path.to_string_lossy(); + let path_str = path.to_string_lossy(); let extra_path_arg = if is_default_path { String::new() } else { @@ -77,7 +84,32 @@ pub fn read_pages_file_as_str(path: &Path, is_default_path: bool) -> Result err.into() } - }) + })?; + + let page_to_category_map: HashMap> = serde_yaml::from_str(&content)?; + + let mut category_to_page_map = HashMap::new(); + let mut uncategorized_pages = vec![]; + + for (page, cats) in page_to_category_map.into_iter().collect_vec() { + if cats.is_empty() { + uncategorized_pages.push(page) + } else { + for cat in cats { + let mut pages: Vec = + category_to_page_map.get(&cat).cloned().unwrap_or_default(); + pages.push(page.clone()); + + category_to_page_map.insert(cat, pages); + } + } + } + + if !uncategorized_pages.is_empty() { + category_to_page_map.insert(UNCATEGORIZED_KEY.to_owned(), uncategorized_pages); + } + + Ok(category_to_page_map) } fn to_save_file_name(page: &str) -> String { diff --git a/src/wiki_api.rs b/src/wiki_api.rs index cb00e67..d636236 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -1,9 +1,5 @@ -use core::panic; use std::collections::HashMap; -use futures::future; -use indicatif::{MultiProgress, ProgressBar}; -use itertools::Itertools; use scraper::Html; use serde::Deserialize; use url::Url; @@ -17,6 +13,18 @@ use crate::{ utils::update_relative_urls, }; +const BLOCK_LISTED_CATEGORY_PREFIXES: &[&str] = &[ + 
"Pages flagged with", + "Sections flagged with", + "Pages or sections flagged with", + "Pages where template include size is exceeded", + "Pages with broken package links", + "Pages with broken section links", + "Pages with missing package links", + "Pages with missing section links", + "Pages with dead links", +]; + #[derive(Debug, Clone, serde::Deserialize)] pub struct ApiResponse { pub query: T, @@ -103,221 +111,85 @@ async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -/// Gets a list of all ArchWiki categories and the pages inside them. -/// All categories are treated as top-level and sub categories are ignored. -pub async fn fetch_wiki_tree( - thread_count: usize, - delay: u64, - hide_progress: bool, -) -> Result>, WikiError> { - let categories = fetch_all_categories().await?; - - let multi_bar = MultiProgress::new(); - let chunk_count = categories.len() / thread_count; - - let tasks = categories - .chunks(chunk_count) - .map(|chunk| { - let chunk = chunk.to_vec(); - - let bar = ProgressBar::new(chunk.len().try_into().unwrap_or(0)); - let bar = multi_bar.add(bar); - if hide_progress { - bar.finish_and_clear(); - } - - tokio::spawn(async move { - let mut wiki_sectoin = HashMap::new(); - for category in chunk { - tokio::time::sleep(std::time::Duration::from_millis(delay)).await; - - let pages = match fetch_pages_in_category(&category).await { - Ok(pages) => pages, - Err(_) => { - // wait if rate limited - tokio::time::sleep(std::time::Duration::from_secs(1)).await; - fetch_pages_in_category(&category) - .await - .unwrap_or_else(|err| { - eprintln!( - "failed to fetch pages in category {}\n ERROR {err}", - category - ); - vec![] - }) - } - }; - - if !pages.is_empty() { - wiki_sectoin.insert(category.to_owned(), pages); - } - bar.inc(1); - } - - wiki_sectoin - }) - }) - .collect_vec(); - - let mut wiki = HashMap::new(); - let sections = future::join_all(tasks).await; - - for section in sections { - match section { - Ok(data) => { - wiki.extend(data); - } - Err(err) => panic!("failed to sync wiki\nERROR: {err}"), - } - } - - Ok(wiki) -} - -pub async fn fetch_all_pages() -> Result, WikiError> { +/// TODO +pub async fn fetch_all_pages() -> Result>, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllPagesQuery { - allpages: Vec, + pages: HashMap, } #[derive(Debug, Deserialize)] struct Page { title: String, - } - - impl From for String { - fn from(value: Page) -> Self { - value.title - } - } - - #[derive(Debug, Deserialize)] - struct ApiAllPageContinueParams { - apcontinue: String, - } - - let api_url = - "https://wiki.archlinux.org/api.php?action=query&list=allpages&format=json&aplimit=500"; - - let mut pages: Vec = vec![]; - - let body = reqwest::get(api_url).await?.text().await?; - let mut api_resp: ApiResponseWithContinue = - serde_json::from_str(&body)?; - - pages.append( - &mut api_resp - .query - .allpages - .into_iter() - .map(Into::into) - .collect(), - ); - - while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&apcontinue={}", continue_params.apcontinue); - - let body = reqwest::get(&next_api_url).await?.text().await?; - api_resp = serde_json::from_str(&body)?; - - pages.append( - &mut api_resp - .query - .allpages - .into_iter() - .map(Into::into) - .collect(), - ); - } - - Ok(pages) -} - -async fn fetch_all_categories() -> Result, WikiError> { - #[derive(Debug, Deserialize)] - struct ApiAllCategoriesQuery { - allcategories: Vec, + categories: Option>, } #[derive(Debug, Deserialize)] struct 
Category { - #[serde[rename = "*"]] - name: String, + title: String, } impl From for String { fn from(value: Category) -> Self { - value.name + value + .title + .split_once("Category:") + .map(|(_, title)| title.to_owned()) + .unwrap_or(value.title) } } #[derive(Debug, Deserialize)] - struct ApiAllCategoryContinueParams { - accontinue: String, + struct ApiAllPageContinueParams { + gapcontinue: Option, + clcontinue: Option, } - let api_url = "https://wiki.archlinux.org/api.php?action=query&list=allcategories&format=json&aclimit=500"; + let api_url = + "https://wiki.archlinux.org/api.php?action=query&generator=allpages&prop=categories&format=json&gaplimit=max&cllimit=max"; - let mut categories: Vec = vec![]; + let mut pages: Vec = vec![]; let body = reqwest::get(api_url).await?.text().await?; - let mut api_resp: ApiResponseWithContinue = + let mut api_resp: ApiResponseWithContinue = serde_json::from_str(&body)?; - categories.append( - &mut api_resp - .query - .allcategories - .into_iter() - .map(Into::into) - .collect(), - ); + pages.append(&mut api_resp.query.pages.into_values().collect()); while let Some(continue_params) = api_resp.r#continue { - let next_api_url = format!("{api_url}&accontinue={}", continue_params.accontinue); + let next_api_url = if let Some(gapcontinue) = continue_params.gapcontinue { + format!("{api_url}&gapcontinue={}", gapcontinue) + } else if let Some(clcontinue) = continue_params.clcontinue { + format!("{api_url}&clcontinue={}", clcontinue) + } else { + break; + }; let body = reqwest::get(&next_api_url).await?.text().await?; api_resp = serde_json::from_str(&body)?; - categories.append( - &mut api_resp - .query - .allcategories - .into_iter() - .map(Into::into) - .collect(), - ); + pages.append(&mut api_resp.query.pages.into_values().collect()); } - Ok(categories) + let page_category_tree = pages.into_iter().map(|page| { + ( + page.title, + page.categories + .map(|cats| { + cats.into_iter() + .map::(Into::into) + .filter(|cat| !is_blocked_category(cat)) + .collect() + }) + .unwrap_or_default(), + ) + }); + + Ok(HashMap::from_iter(page_category_tree)) } -async fn fetch_pages_in_category(category: &str) -> Result, WikiError> { - #[derive(Debug, Deserialize)] - struct ApiCategoryMembersQuery { - categorymembers: Vec, - } - - #[derive(Debug, Deserialize)] - struct Page { - title: String, - } - - impl From for String { - fn from(value: Page) -> Self { - value.title - } - } - - let api_url = format!("https://wiki.archlinux.org/api.php?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=500&cmtitle=Category:{title}", title = urlencoding::encode(category)); - - let body = reqwest::get(api_url).await?.text().await?; - let api_resp: ApiResponse = serde_json::from_str(&body)?; - - Ok(api_resp - .query - .categorymembers - .into_iter() - .map(Into::into) - .collect()) +fn is_blocked_category(category: &str) -> bool { + BLOCK_LISTED_CATEGORY_PREFIXES + .iter() + .any(|blocked_prefix| category.starts_with(blocked_prefix)) } -- GitLab From 3c6d03986f2bf51f4e8e8f318c5630d2e009454d Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 10:43:39 +0100 Subject: [PATCH 09/24] add local-wiki function --- src/cli.rs | 47 ++++++++++-------- src/main.rs | 125 +++++++++++++++++++++++++++++++++++++++++++++--- src/utils.rs | 4 +- src/wiki_api.rs | 9 +++- 4 files changed, 156 insertions(+), 29 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index a6f68d4..5a71859 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -19,25 +19,25 @@ pub enum Commands { )] ReadPage { #[arg(short, 
long)] - /// Don't cache the read page locally + /// Don't cache the read page locally. no_cache_write: bool, #[arg(short, long)] - /// Don't read the page from cache even if an entry for it is cached + /// Don't read the page from cache even if an entry for it is cached. ignore_cache: bool, #[arg(short, long)] /// Don't invalidate the cache even if it is considered stale. A cache is considered stale /// after it hasn't been updated in more then 14 days. disable_cache_invalidation: bool, #[arg(short, long)] - /// Show URLs for plain-text output + /// Show URLs for plain-text output. show_urls: bool, #[arg(short, long)] /// Preferred page language lang: Option, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] - /// The format that the page should be displayed in + /// The format that the page should be displayed in. format: PageFormat, - /// The name of the page to read or an absolute URL to the page + /// The name of the page to read or an absolute URL to the page. page: String, }, #[command( @@ -47,10 +47,10 @@ pub enum Commands { Search { search: String, #[arg(short, long, default_value_t = String::from("en"))] - /// Preferred language of the content to search for + /// Preferred language of the content to search for. lang: String, #[arg(short = 'L', long, default_value_t = 5)] - /// Maximum number of results + /// Maximum number of results. limit: u16, #[arg(short, long)] /// Search for pages by text content instead of title. Uses the 'query' API action instead @@ -63,13 +63,13 @@ pub enum Commands { )] ListPages { #[arg(short, long)] - /// Flatten all pages and don't show their category names + /// Flatten all pages and don't show their category names. flatten: bool, #[arg(short, long)] - /// Only show pages in this category + /// Only show pages in this category. category: Option, #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, }, #[command( @@ -78,7 +78,7 @@ pub enum Commands { )] ListCategories { #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, }, #[command( @@ -92,13 +92,13 @@ pub enum Commands { )] SyncWiki { #[arg(short = 'H', long)] - /// Hide progress indicators + /// Hide progress indicators. hide_progress: bool, #[arg(short, long)] /// Print result to stdout instead of writing to a file. Output is formatted as YAML. print: bool, #[arg(short, long)] - /// Use custom output file location + /// Use custom output file location. out_file: Option, }, #[command( @@ -106,11 +106,20 @@ pub enum Commands { long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." )] LocalWiki { - /// Location to store the local copy of the wiki at. - location: PathBuf, #[arg(short, long)] - /// Use a different file to read pages from + /// Use a different file to read pages from. page_file: Option, + #[arg(short = 'H', long)] + /// Hide progress indicators. + hide_progress: bool, + #[arg(short, long)] + /// Override directory at 'location' if it already exists. + override_wiki_directory: bool, + #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] + /// The format that the page should be displayed in. + format: PageFormat, + /// Location to store the local copy of the wiki at. 
+ location: PathBuf, }, #[command( about = "Retrive information related to this tool", @@ -118,13 +127,13 @@ pub enum Commands { )] Info { #[arg(short = 'c', long)] - /// Location of the cache directory + /// Location of the cache directory. show_cache_dir: bool, #[arg(short = 'd', long)] - /// Location of the data directory + /// Location of the data directory. show_data_dir: bool, #[arg(short, long)] - /// Only show values and not the properties they belong to or their descriptions + /// Only show values and not the properties they belong to or their descriptions. only_values: bool, }, } diff --git a/src/main.rs b/src/main.rs index 3e1aa3c..f5451aa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,13 @@ -use std::fs; +use std::{fs, io, path::Path}; -use clap::Parser; +use clap::{builder::PossibleValue, Parser, ValueEnum}; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; -use indicatif::ProgressBar; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use itertools::Itertools; +use wiki_api::fetch_page_without_recommendations; use crate::{ categories::list_pages, @@ -15,7 +16,7 @@ use crate::{ search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{ create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, - UNCATEGORIZED_KEY, + to_save_file_name, UNCATEGORIZED_KEY, }, wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, }; @@ -178,7 +179,6 @@ async fn main() -> Result<(), WikiError> { }); let wiki_tree = fetch_all_pages().await?; - let out = serde_yaml::to_string(&wiki_tree)?; if !print { @@ -194,13 +194,89 @@ async fn main() -> Result<(), WikiError> { } Commands::LocalWiki { location, + format, page_file, + override_wiki_directory, + hide_progress, } => { let (path, is_default) = page_file .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); - todo!("oh boy"); + let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; + + create_dir_if_not_exists(&location, !override_wiki_directory)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + .as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let all_bars = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = all_bars.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("fetching categories") + .with_style( + ProgressStyle::with_template("[{prefix:^22}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + for (cat, pages) in wiki_tree { + if pages.is_empty() { + continue; + } + + let cat_dir = location.join(to_save_file_name(&cat)); + create_dir_if_not_exists(&cat_dir, !override_wiki_directory)?; + + let bar = all_bars.add( + ProgressBar::new(pages.len().try_into().unwrap_or(0)) + .with_prefix("fetching sub-pages") + .with_style( + ProgressStyle::with_template( + "[{prefix:^22}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", + ) + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + bar.finish_and_clear(); + } + + category_bar.inc(1); + for page in pages { + bar.inc(1); + + match write_page_to_local_wiki(&page, &cat_dir, &format).await { + Ok(()) => {} + Err(err) => { + eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") + } + } + } + } + + if 
!hide_progress { + println!( + "saved local copy of the ArchWiki to '{}'", + location.to_string_lossy() + ) + } } Commands::Info { show_cache_dir, @@ -251,3 +327,40 @@ async fn main() -> Result<(), WikiError> { Ok(()) } + +async fn write_page_to_local_wiki( + page: &str, + parent_dir: &Path, + format: &PageFormat, +) -> Result<(), WikiError> { + let document = fetch_page_without_recommendations(page).await?; + + let (content, ext) = match format { + PageFormat::PlainText => (convert_page_to_plain_text(&document, false), ""), + PageFormat::Markdown => (convert_page_to_markdown(&document, page), "md"), + PageFormat::Html => (convert_page_to_html(&document, page), "html"), + }; + + let file_path = parent_dir.join(to_save_file_name(page)).with_extension(ext); + + fs::write(file_path, content)?; + Ok(()) +} + +fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), WikiError> { + match fs::create_dir(dir) { + Ok(_) => {} + Err(err) => { + if err.kind() != io::ErrorKind::AlreadyExists { + return Err(err.into()); + } else if err_when_exists { + return Err(WikiError::Path(format!( + "ERROR: directory '{}' already exists", + dir.to_string_lossy() + ))); + } + } + } + + Ok(()) +} diff --git a/src/utils.rs b/src/utils.rs index 570381a..9db1671 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -10,7 +10,7 @@ use scraper::node::Element; use crate::{error::WikiError, formats::PageFormat}; -pub const UNCATEGORIZED_KEY: &str = "UNCATEGORIZED"; +pub const UNCATEGORIZED_KEY: &str = "Uncategorized"; /// Construct a path to cache a page. Different page formats are cached separately. /// All none word characters are escaped with an '_' @@ -112,7 +112,7 @@ pub fn read_pages_file_as_category_tree( Ok(category_to_page_map) } -fn to_save_file_name(page: &str) -> String { +pub fn to_save_file_name(page: &str) -> String { urlencoding::encode(page) .to_string() .replace('.', "\\.") diff --git a/src/wiki_api.rs b/src/wiki_api.rs index d636236..eb842d8 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -83,12 +83,17 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { let raw_url = format!( "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", - title = urlencoding::encode(page_title) + title = urlencoding::encode(page) ); - let url = Url::parse(&raw_url)?; + let url = Url::parse(&raw_url)?; let document = fetch_page_by_url(url).await?; Ok(document) } -- GitLab From 50be65f15c2a86b715a8d76755c7381d5dda74fb Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:26:51 +0100 Subject: [PATCH 10/24] multi-thread archwiki page fetching --- Cargo.lock | 1 + Cargo.toml | 1 + src/cli.rs | 4 + src/main.rs | 259 +++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 192 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cbcb5a7..76d3788 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,6 +114,7 @@ dependencies = [ "termination", "thiserror", "tokio", + "unicode-width", "url", "urlencoding", ] diff --git a/Cargo.toml b/Cargo.toml index ac4c33e..c85ef8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ serde_yaml = "0.9.27" termination = "0.1.2" thiserror = "1.0.50" tokio = { version = "1.33.0", features = ["full"] } +unicode-width = "0.1.11" url = "2.4.1" urlencoding = "2.1.3" diff --git a/src/cli.rs b/src/cli.rs index 5a71859..1498f1b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -106,6 +106,10 @@ pub enum Commands { long_about = "Download a copy of the ArchWiki. Will take a long time :). 
The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory." )] LocalWiki { + #[arg(short, long)] + /// Amount of threads to use for fetching pages from the ArchWiki. If not provided the + /// number of physical cores is used. + thread_count: Option, #[arg(short, long)] /// Use a different file to read pages from. page_file: Option, diff --git a/src/main.rs b/src/main.rs index f5451aa..61de8bd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,16 @@ -use std::{fs, io, path::Path}; +use std::{ + collections::HashMap, + fs, io, + path::{Path, PathBuf}, + sync::Arc, +}; use clap::{builder::PossibleValue, Parser, ValueEnum}; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; +use futures::future; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use itertools::Itertools; use wiki_api::fetch_page_without_recommendations; @@ -196,87 +202,27 @@ async fn main() -> Result<(), WikiError> { location, format, page_file, + thread_count, override_wiki_directory, hide_progress, } => { + let thread_count = thread_count.unwrap_or(num_cpus::get_physical()).max(1); + let (path, is_default) = page_file .map(|path| (path, false)) .unwrap_or((default_page_file_path, true)); let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; - create_dir_if_not_exists(&location, !override_wiki_directory)?; - - if !hide_progress { - if let Some(format) = format - .to_possible_value() - .as_ref() - .map(PossibleValue::get_name) - { - println!("downloading pages as {format}\n",) - } - } - - let all_bars = MultiProgress::new(); - - let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); - let category_bar = all_bars.add( - ProgressBar::new(category_count.try_into().unwrap_or(0)) - .with_prefix("fetching categories") - .with_style( - ProgressStyle::with_template("[{prefix:^22}]\t {pos:>4}/{len:4}") - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - category_bar.finish_and_clear(); - } - - for (cat, pages) in wiki_tree { - if pages.is_empty() { - continue; - } - - let cat_dir = location.join(to_save_file_name(&cat)); - create_dir_if_not_exists(&cat_dir, !override_wiki_directory)?; - - let bar = all_bars.add( - ProgressBar::new(pages.len().try_into().unwrap_or(0)) - .with_prefix("fetching sub-pages") - .with_style( - ProgressStyle::with_template( - "[{prefix:^22}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", - ) - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - bar.finish_and_clear(); - } - - category_bar.inc(1); - for page in pages { - bar.inc(1); - - match write_page_to_local_wiki(&page, &cat_dir, &format).await { - Ok(()) => {} - Err(err) => { - eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") - } - } - } - } - - if !hide_progress { - println!( - "saved local copy of the ArchWiki to '{}'", - location.to_string_lossy() - ) - } + download_wiki( + wiki_tree, + format, + location, + thread_count, + override_wiki_directory, + hide_progress, + ) + .await?; } Commands::Info { show_cache_dir, @@ -328,6 +274,156 @@ async fn main() -> Result<(), WikiError> { Ok(()) } +async fn download_wiki( + wiki_tree: HashMap>, + format: PageFormat, + location: PathBuf, + thread_count: usize, + override_wiki_directory: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + create_dir_if_not_exists(&location, !override_wiki_directory)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + 
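+            // PossibleValue::get_name is the user-facing CLI name of the format
+            // (e.g. "markdown", assuming PageFormat keeps clap's default value-enum
+            // naming), which reads nicer in this message than the raw enum variant.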
.as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let multibar = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = multibar.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("---FETCHING CATEGORIES---") + .with_style( + ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + let wiki_tree_without_empty_cats = wiki_tree + .into_iter() + .filter(|(_, p)| !p.is_empty()) + .collect_vec(); + + let chunk_count = wiki_tree_without_empty_cats.len() / thread_count; + + let format = Arc::new(format); + let location = Arc::new(location); + let multibar = Arc::new(multibar); + let catbar = Arc::new(category_bar); + + let wiki_tree_chunks = wiki_tree_without_empty_cats + .chunks(chunk_count) + .map(ToOwned::to_owned) + .map(Arc::new) + .collect_vec(); + + let tasks = wiki_tree_chunks + .into_iter() + .map(|chunk| { + let chunk = Arc::clone(&chunk); + + let format_ref = Arc::clone(&format); + let location_ref = Arc::clone(&location); + let multibar_ref = Arc::clone(&multibar); + let catbar_ref = Arc::clone(&catbar); + + tokio::spawn(async move { + download_wiki_chunk( + &chunk, + &format_ref, + &location_ref, + hide_progress, + &multibar_ref, + &catbar_ref, + ) + .await + .unwrap(); + }) + }) + .collect_vec(); + + future::join_all(tasks).await; + + if !hide_progress { + println!( + "saved local copy of the ArchWiki to '{}'", + location.to_string_lossy() + ) + } + + Ok(()) +} + +async fn download_wiki_chunk( + chunk: &[(String, Vec)], + format: &PageFormat, + location: &Path, + hide_progress: bool, + multibar: &MultiProgress, + catbar: &ProgressBar, +) -> Result<(), WikiError> { + for (cat, pages) in chunk { + let cat_dir = location.join(to_save_file_name(cat)); + create_dir_if_not_exists(&cat_dir, false)?; + + let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); + + let leak_str: &'static str = Box::leak( + format!( + " fetching pages in \"{}\"", + if width <= 18 { + truncate_unicode_str(18, cat) + } else { + truncate_unicode_str(15, cat) + "..." 
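+                    // category names wider than 18 columns are cut to 15 columns plus "..."
+                    // so the " fetching pages in \"<name>\"" prefix still fits the
+                    // 40-column {prefix:<40} slot of the progress bar template below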
+ } + ) + .into_boxed_str(), + ); + + let bar = multibar.add( + ProgressBar::new(pages.len().try_into().unwrap_or(0)) + .with_prefix(leak_str) + .with_style( + ProgressStyle::with_template( + "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", + ) + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + bar.finish_and_clear(); + } + + catbar.inc(1); + for page in pages { + bar.inc(1); + + match write_page_to_local_wiki(page, &cat_dir, format).await { + Ok(()) => {} + Err(err) => { + eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") + } + } + } + } + + Ok(()) +} + async fn write_page_to_local_wiki( page: &str, parent_dir: &Path, @@ -364,3 +460,20 @@ fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), Wik Ok(()) } + +fn truncate_unicode_str(n: usize, text: &str) -> String { + let mut count = 0; + let mut res = vec![]; + let mut chars = text.chars(); + + while count < n { + if let Some(char) = chars.next() { + count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); + res.push(char); + } else { + break; + } + } + + res.into_iter().collect::() +} -- GitLab From 17747aa5df1bf359547a538769e19ddaaab4b4a3 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:46:42 +0100 Subject: [PATCH 11/24] improve error reporting in local-wiki sub-command --- src/main.rs | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index 61de8bd..65f0465 100644 --- a/src/main.rs +++ b/src/main.rs @@ -349,12 +349,31 @@ async fn download_wiki( &catbar_ref, ) .await - .unwrap(); }) }) .collect_vec(); - future::join_all(tasks).await; + let results = future::join_all(tasks).await; + + for result in results { + match result { + Ok(Ok(failed_fetchs)) => { + if !failed_fetchs.is_empty() { + for (page, err) in failed_fetchs { + eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); + } + } + } + Ok(Err(thread_err)) => { + eprintln!( + "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" + ); + } + Err(_) => { + eprintln!("ERROR: failed to join threads, some pages might be missing"); + } + } + } if !hide_progress { println!( @@ -366,6 +385,8 @@ async fn download_wiki( Ok(()) } +type FailedPageFetches = Vec<(String, WikiError)>; + async fn download_wiki_chunk( chunk: &[(String, Vec)], format: &PageFormat, @@ -373,7 +394,9 @@ async fn download_wiki_chunk( hide_progress: bool, multibar: &MultiProgress, catbar: &ProgressBar, -) -> Result<(), WikiError> { +) -> Result { + let mut failed_fetches = vec![]; + for (cat, pages) in chunk { let cat_dir = location.join(to_save_file_name(cat)); create_dir_if_not_exists(&cat_dir, false)?; @@ -414,14 +437,12 @@ async fn download_wiki_chunk( match write_page_to_local_wiki(page, &cat_dir, format).await { Ok(()) => {} - Err(err) => { - eprintln!("[WARNING] FAILED TO FETCH PAGE '{page}'\nERROR: {err}") - } + Err(err) => failed_fetches.push((page.to_owned(), err)), } } } - Ok(()) + Ok(failed_fetches) } async fn write_page_to_local_wiki( -- GitLab From 3dca9544b15900bba3fdc663bbb0876c125cc029 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 12:54:26 +0100 Subject: [PATCH 12/24] split up un-categorized data into chunks to improve local-wiki performance --- src/utils.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index 9db1671..34a3f1e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -106,7 +106,16 @@ pub fn read_pages_file_as_category_tree( 
} if !uncategorized_pages.is_empty() { - category_to_page_map.insert(UNCATEGORIZED_KEY.to_owned(), uncategorized_pages); + for (i, uncategoriesed_chunk) in uncategorized_pages + .into_iter() + .sorted() + .chunks(500) + .into_iter() + .enumerate() + { + let key = format!("{UNCATEGORIZED_KEY} #{n}", n = i + 1); + category_to_page_map.insert(key, uncategoriesed_chunk.collect_vec()); + } } Ok(category_to_page_map) -- GitLab From 9625ccf0f57d79244fbc9c7449542f49da9ea9ac Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 13:08:11 +0100 Subject: [PATCH 13/24] fix variable name --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 65f0465..9a85d47 100644 --- a/src/main.rs +++ b/src/main.rs @@ -316,7 +316,7 @@ async fn download_wiki( .filter(|(_, p)| !p.is_empty()) .collect_vec(); - let chunk_count = wiki_tree_without_empty_cats.len() / thread_count; + let chunk_size = wiki_tree_without_empty_cats.len() / thread_count; let format = Arc::new(format); let location = Arc::new(location); @@ -324,7 +324,7 @@ async fn download_wiki( let catbar = Arc::new(category_bar); let wiki_tree_chunks = wiki_tree_without_empty_cats - .chunks(chunk_count) + .chunks(chunk_size) .map(ToOwned::to_owned) .map(Arc::new) .collect_vec(); -- GitLab From a0dc74ca88337e3a8ffd26f7f6252738243e50cd Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 20:58:32 +0100 Subject: [PATCH 14/24] evenly chunk wiki categories based on page count --- src/main.rs | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9a85d47..3848d1a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -316,24 +316,17 @@ async fn download_wiki( .filter(|(_, p)| !p.is_empty()) .collect_vec(); - let chunk_size = wiki_tree_without_empty_cats.len() / thread_count; - let format = Arc::new(format); let location = Arc::new(location); let multibar = Arc::new(multibar); let catbar = Arc::new(category_bar); - let wiki_tree_chunks = wiki_tree_without_empty_cats - .chunks(chunk_size) - .map(ToOwned::to_owned) - .map(Arc::new) - .collect_vec(); + let wiki_tree_chunks = + chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); let tasks = wiki_tree_chunks .into_iter() .map(|chunk| { - let chunk = Arc::clone(&chunk); - let format_ref = Arc::clone(&format); let location_ref = Arc::clone(&location); let multibar_ref = Arc::clone(&multibar); @@ -498,3 +491,23 @@ fn truncate_unicode_str(n: usize, text: &str) -> String { res.into_iter().collect::() } + +fn chunk_wiki_with_even_page_distribution( + wiki_tree: Vec<(String, Vec)>, + chunk_count: usize, +) -> Vec)>> { + let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect(); + + for entry in wiki_tree { + if let Some(chunk) = chunks.iter_mut().min_by(|a, b| { + let count_a = a.iter().map(|(_, pages)| pages.len()).sum::(); + let count_b = b.iter().map(|(_, pages)| pages.len()).sum::(); + + count_a.cmp(&count_b) + }) { + chunk.push(entry); + } + } + + chunks +} -- GitLab From f4105d35d2564dc42e9d28c9bb0beecc6763facc Mon Sep 17 00:00:00 2001 From: jackboxx Date: Wed, 17 Jan 2024 21:36:24 +0100 Subject: [PATCH 15/24] use 'sanitize_filename' to create save file names --- Cargo.lock | 11 +++++++++++ Cargo.toml | 1 + src/utils.rs | 16 +++++----------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76d3788..ec56aac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,7 @@ dependencies = [ 
"pretty_assertions", "regex", "reqwest", + "sanitize-filename", "scraper", "serde", "serde_json", @@ -1547,6 +1548,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "sanitize-filename" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ed72fbaf78e6f2d41744923916966c4fbe3d7c74e3037a8ee482f1115572603" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "schannel" version = "0.1.23" diff --git a/Cargo.toml b/Cargo.toml index c85ef8b..a56e976 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ itertools = "0.11.0" num_cpus = "1.16.0" regex = "1.10.2" reqwest = "0.11.22" +sanitize-filename = "0.5.0" scraper = "0.18.1" serde = { version = "1.0.190", features = ["derive"] } serde_json = "1.0.108" diff --git a/src/utils.rs b/src/utils.rs index 34a3f1e..8c095a5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -122,10 +122,7 @@ pub fn read_pages_file_as_category_tree( } pub fn to_save_file_name(page: &str) -> String { - urlencoding::encode(page) - .to_string() - .replace('.', "\\.") - .replace('~', "\\~") + sanitize_filename::sanitize(page) } #[cfg(test)] @@ -137,13 +134,10 @@ mod tests { fn test_to_save_file_name() { let cases = [ ("Neovim", "Neovim"), - ("3D Mouse", "3D%20Mouse"), - ("/etc/fstab", "%2Fetc%2Ffstab"), - (".NET", "\\.NET"), - ( - "ASUS MeMO Pad 7 (ME176C(X))", - "ASUS%20MeMO%20Pad%207%20%28ME176C%28X%29%29", - ), + ("3D Mouse", "3D Mouse"), + ("/etc/fstab", "etcfstab"), + (".NET", ".NET"), + ("ASUS MeMO Pad 7 (ME176C(X))", "ASUS MeMO Pad 7 (ME176C(X))"), ]; for (input, output) in cases { -- GitLab From 83ffa1445815c68fcb00761b58e9f2dcd6bbc952 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Thu, 18 Jan 2024 18:31:09 +0100 Subject: [PATCH 16/24] don't override existing wiki files by default --- src/cli.rs | 2 +- src/main.rs | 53 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 1498f1b..024f92a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -118,7 +118,7 @@ pub enum Commands { hide_progress: bool, #[arg(short, long)] /// Override directory at 'location' if it already exists. - override_wiki_directory: bool, + override_existing_files: bool, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] /// The format that the page should be displayed in. 
format: PageFormat, diff --git a/src/main.rs b/src/main.rs index 3848d1a..2bcaaa0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -203,7 +203,7 @@ async fn main() -> Result<(), WikiError> { format, page_file, thread_count, - override_wiki_directory, + override_existing_files, hide_progress, } => { let thread_count = thread_count.unwrap_or(num_cpus::get_physical()).max(1); @@ -219,7 +219,7 @@ async fn main() -> Result<(), WikiError> { format, location, thread_count, - override_wiki_directory, + override_existing_files, hide_progress, ) .await?; @@ -279,10 +279,10 @@ async fn download_wiki( format: PageFormat, location: PathBuf, thread_count: usize, - override_wiki_directory: bool, + override_exisiting_files: bool, hide_progress: bool, ) -> Result<(), WikiError> { - create_dir_if_not_exists(&location, !override_wiki_directory)?; + create_dir_if_not_exists(&location)?; if !hide_progress { if let Some(format) = format @@ -338,6 +338,7 @@ async fn download_wiki( &format_ref, &location_ref, hide_progress, + override_exisiting_files, &multibar_ref, &catbar_ref, ) @@ -385,6 +386,7 @@ async fn download_wiki_chunk( format: &PageFormat, location: &Path, hide_progress: bool, + override_exisiting_files: bool, multibar: &MultiProgress, catbar: &ProgressBar, ) -> Result { @@ -392,7 +394,7 @@ async fn download_wiki_chunk( for (cat, pages) in chunk { let cat_dir = location.join(to_save_file_name(cat)); - create_dir_if_not_exists(&cat_dir, false)?; + create_dir_if_not_exists(&cat_dir)?; let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); @@ -428,9 +430,12 @@ async fn download_wiki_chunk( for page in pages { bar.inc(1); - match write_page_to_local_wiki(page, &cat_dir, format).await { - Ok(()) => {} - Err(err) => failed_fetches.push((page.to_owned(), err)), + let path = page_path(page, format, &cat_dir); + if override_exisiting_files || !path.exists() { + match write_page_to_local_wiki(page, &path, format).await { + Ok(()) => {} + Err(err) => failed_fetches.push((page.to_owned(), err)), + } } } } @@ -440,34 +445,36 @@ async fn download_wiki_chunk( async fn write_page_to_local_wiki( page: &str, - parent_dir: &Path, + page_path: &Path, format: &PageFormat, ) -> Result<(), WikiError> { let document = fetch_page_without_recommendations(page).await?; - - let (content, ext) = match format { - PageFormat::PlainText => (convert_page_to_plain_text(&document, false), ""), - PageFormat::Markdown => (convert_page_to_markdown(&document, page), "md"), - PageFormat::Html => (convert_page_to_html(&document, page), "html"), + let content = match format { + PageFormat::PlainText => convert_page_to_plain_text(&document, false), + PageFormat::Markdown => convert_page_to_markdown(&document, page), + PageFormat::Html => convert_page_to_html(&document, page), }; - let file_path = parent_dir.join(to_save_file_name(page)).with_extension(ext); - - fs::write(file_path, content)?; + fs::write(page_path, content)?; Ok(()) } -fn create_dir_if_not_exists(dir: &Path, err_when_exists: bool) -> Result<(), WikiError> { +fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { + let ext = match format { + PageFormat::PlainText => "", + PageFormat::Markdown => "md", + PageFormat::Html => "html", + }; + + parent_dir.join(to_save_file_name(page)).with_extension(ext) +} + +fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { match fs::create_dir(dir) { Ok(_) => {} Err(err) => { if err.kind() != io::ErrorKind::AlreadyExists { return Err(err.into()); - } else if err_when_exists { - return 
Err(WikiError::Path(format!( - "ERROR: directory '{}' already exists", - dir.to_string_lossy() - ))); } } } -- GitLab From a7fc2bc6846e049e15c02ec9788f0bc3aee67008 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Thu, 18 Jan 2024 18:48:29 +0100 Subject: [PATCH 17/24] move functions out of main and add doc comments --- src/main.rs | 289 ++----------------------------------------- src/utils.rs | 40 ++++++ src/wiki_api.rs | 12 +- src/wiki_download.rs | 258 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 317 insertions(+), 282 deletions(-) create mode 100644 src/wiki_download.rs diff --git a/src/main.rs b/src/main.rs index 2bcaaa0..cd55d7b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,19 +1,12 @@ -use std::{ - collections::HashMap, - fs, io, - path::{Path, PathBuf}, - sync::Arc, -}; +use std::fs; -use clap::{builder::PossibleValue, Parser, ValueEnum}; +use clap::Parser; use cli::{CliArgs, Commands}; use directories::BaseDirs; use error::WikiError; use formats::plain_text::convert_page_to_plain_text; -use futures::future; -use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; + use itertools::Itertools; -use wiki_api::fetch_page_without_recommendations; use crate::{ categories::list_pages, @@ -22,9 +15,10 @@ use crate::{ search::{format_open_search_table, format_text_search_table, open_search_to_page_url_tupel}, utils::{ create_cache_page_path, page_cache_exists, read_pages_file_as_category_tree, - to_save_file_name, UNCATEGORIZED_KEY, + UNCATEGORIZED_KEY, }, - wiki_api::{fetch_all_pages, fetch_open_search, fetch_page, fetch_text_search}, + wiki_api::{fetch_open_search, fetch_page, fetch_text_search}, + wiki_download::{download_wiki, sync_wiki_info}, }; mod categories; @@ -35,6 +29,7 @@ mod languages; mod search; mod utils; mod wiki_api; +mod wiki_download; const PAGE_FILE_NAME: &str = "pages.yml"; @@ -174,29 +169,8 @@ async fn main() -> Result<(), WikiError> { print, out_file, } => { - let spinner = ProgressBar::new_spinner(); - if hide_progress { - spinner.finish_and_clear(); - } - - let _spin_task = std::thread::spawn(move || loop { - spinner.tick(); - std::thread::sleep(std::time::Duration::from_millis(100)); - }); - - let wiki_tree = fetch_all_pages().await?; - let out = serde_yaml::to_string(&wiki_tree)?; - - if !print { - let path = out_file.unwrap_or(default_page_file_path); - fs::write(&path, out)?; - - if !hide_progress { - println!("data saved to {}", path.to_string_lossy()); - } - } else { - println!("{out}"); - } + let path = out_file.unwrap_or(default_page_file_path); + sync_wiki_info(&path, print, hide_progress).await?; } Commands::LocalWiki { location, @@ -273,248 +247,3 @@ async fn main() -> Result<(), WikiError> { Ok(()) } - -async fn download_wiki( - wiki_tree: HashMap>, - format: PageFormat, - location: PathBuf, - thread_count: usize, - override_exisiting_files: bool, - hide_progress: bool, -) -> Result<(), WikiError> { - create_dir_if_not_exists(&location)?; - - if !hide_progress { - if let Some(format) = format - .to_possible_value() - .as_ref() - .map(PossibleValue::get_name) - { - println!("downloading pages as {format}\n",) - } - } - - let multibar = MultiProgress::new(); - - let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); - let category_bar = multibar.add( - ProgressBar::new(category_count.try_into().unwrap_or(0)) - .with_prefix("---FETCHING CATEGORIES---") - .with_style( - ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - 
category_bar.finish_and_clear(); - } - - let wiki_tree_without_empty_cats = wiki_tree - .into_iter() - .filter(|(_, p)| !p.is_empty()) - .collect_vec(); - - let format = Arc::new(format); - let location = Arc::new(location); - let multibar = Arc::new(multibar); - let catbar = Arc::new(category_bar); - - let wiki_tree_chunks = - chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); - - let tasks = wiki_tree_chunks - .into_iter() - .map(|chunk| { - let format_ref = Arc::clone(&format); - let location_ref = Arc::clone(&location); - let multibar_ref = Arc::clone(&multibar); - let catbar_ref = Arc::clone(&catbar); - - tokio::spawn(async move { - download_wiki_chunk( - &chunk, - &format_ref, - &location_ref, - hide_progress, - override_exisiting_files, - &multibar_ref, - &catbar_ref, - ) - .await - }) - }) - .collect_vec(); - - let results = future::join_all(tasks).await; - - for result in results { - match result { - Ok(Ok(failed_fetchs)) => { - if !failed_fetchs.is_empty() { - for (page, err) in failed_fetchs { - eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); - } - } - } - Ok(Err(thread_err)) => { - eprintln!( - "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" - ); - } - Err(_) => { - eprintln!("ERROR: failed to join threads, some pages might be missing"); - } - } - } - - if !hide_progress { - println!( - "saved local copy of the ArchWiki to '{}'", - location.to_string_lossy() - ) - } - - Ok(()) -} - -type FailedPageFetches = Vec<(String, WikiError)>; - -async fn download_wiki_chunk( - chunk: &[(String, Vec)], - format: &PageFormat, - location: &Path, - hide_progress: bool, - override_exisiting_files: bool, - multibar: &MultiProgress, - catbar: &ProgressBar, -) -> Result { - let mut failed_fetches = vec![]; - - for (cat, pages) in chunk { - let cat_dir = location.join(to_save_file_name(cat)); - create_dir_if_not_exists(&cat_dir)?; - - let width = unicode_width::UnicodeWidthStr::width(cat.as_str()); - - let leak_str: &'static str = Box::leak( - format!( - " fetching pages in \"{}\"", - if width <= 18 { - truncate_unicode_str(18, cat) - } else { - truncate_unicode_str(15, cat) + "..." 
- } - ) - .into_boxed_str(), - ); - - let bar = multibar.add( - ProgressBar::new(pages.len().try_into().unwrap_or(0)) - .with_prefix(leak_str) - .with_style( - ProgressStyle::with_template( - "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}", - ) - .unwrap() - .progress_chars("##-"), - ), - ); - - if hide_progress { - bar.finish_and_clear(); - } - - catbar.inc(1); - for page in pages { - bar.inc(1); - - let path = page_path(page, format, &cat_dir); - if override_exisiting_files || !path.exists() { - match write_page_to_local_wiki(page, &path, format).await { - Ok(()) => {} - Err(err) => failed_fetches.push((page.to_owned(), err)), - } - } - } - } - - Ok(failed_fetches) -} - -async fn write_page_to_local_wiki( - page: &str, - page_path: &Path, - format: &PageFormat, -) -> Result<(), WikiError> { - let document = fetch_page_without_recommendations(page).await?; - let content = match format { - PageFormat::PlainText => convert_page_to_plain_text(&document, false), - PageFormat::Markdown => convert_page_to_markdown(&document, page), - PageFormat::Html => convert_page_to_html(&document, page), - }; - - fs::write(page_path, content)?; - Ok(()) -} - -fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { - let ext = match format { - PageFormat::PlainText => "", - PageFormat::Markdown => "md", - PageFormat::Html => "html", - }; - - parent_dir.join(to_save_file_name(page)).with_extension(ext) -} - -fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { - match fs::create_dir(dir) { - Ok(_) => {} - Err(err) => { - if err.kind() != io::ErrorKind::AlreadyExists { - return Err(err.into()); - } - } - } - - Ok(()) -} - -fn truncate_unicode_str(n: usize, text: &str) -> String { - let mut count = 0; - let mut res = vec![]; - let mut chars = text.chars(); - - while count < n { - if let Some(char) = chars.next() { - count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); - res.push(char); - } else { - break; - } - } - - res.into_iter().collect::() -} - -fn chunk_wiki_with_even_page_distribution( - wiki_tree: Vec<(String, Vec)>, - chunk_count: usize, -) -> Vec)>> { - let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect(); - - for entry in wiki_tree { - if let Some(chunk) = chunks.iter_mut().min_by(|a, b| { - let count_a = a.iter().map(|(_, pages)| pages.len()).sum::(); - let count_b = b.iter().map(|(_, pages)| pages.len()).sum::(); - - count_a.cmp(&count_b) - }) { - chunk.push(entry); - } - } - - chunks -} diff --git a/src/utils.rs b/src/utils.rs index 8c095a5..89288bc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -125,6 +125,46 @@ pub fn to_save_file_name(page: &str) -> String { sanitize_filename::sanitize(page) } +pub fn truncate_unicode_str(n: usize, text: &str) -> String { + let mut count = 0; + let mut res = vec![]; + let mut chars = text.chars(); + + while count < n { + if let Some(char) = chars.next() { + count += unicode_width::UnicodeWidthChar::width(char).unwrap_or(0); + res.push(char); + } else { + break; + } + } + + res.into_iter().collect::() +} + +pub fn page_path(page: &str, format: &PageFormat, parent_dir: &Path) -> PathBuf { + let ext = match format { + PageFormat::PlainText => "", + PageFormat::Markdown => "md", + PageFormat::Html => "html", + }; + + parent_dir.join(to_save_file_name(page)).with_extension(ext) +} + +pub fn create_dir_if_not_exists(dir: &Path) -> Result<(), WikiError> { + match fs::create_dir(dir) { + Ok(_) => {} + Err(err) => { + if err.kind() != io::ErrorKind::AlreadyExists { + return Err(err.into()); + } + 
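+                // (an AlreadyExists error never reaches this branch and falls through
+                // to the Ok(()) below, so existing output directories are simply reused)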
} + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/wiki_api.rs b/src/wiki_api.rs index eb842d8..e681190 100644 --- a/src/wiki_api.rs +++ b/src/wiki_api.rs @@ -86,7 +86,7 @@ pub async fn fetch_page(page: &str, lang: Option<&str>) -> Result Result { let raw_url = format!( "https://wiki.archlinux.org/rest.php/v1/page/{title}/html", @@ -116,7 +116,15 @@ async fn fetch_page_by_url(url: Url) -> Result { Ok(Html::parse_document(&body_with_abs_urls)) } -/// TODO +/// Gets the names of all pages on the ArchWiki and the categories that they belong to. +/// +/// ### Example +/// +/// ```sh +/// Wine # page name +/// - Emulation # category +/// - Gaming # category +/// ``` pub async fn fetch_all_pages() -> Result>, WikiError> { #[derive(Debug, Deserialize)] struct ApiAllPagesQuery { diff --git a/src/wiki_download.rs b/src/wiki_download.rs new file mode 100644 index 0000000..4c252f4 --- /dev/null +++ b/src/wiki_download.rs @@ -0,0 +1,258 @@ +use std::{ + collections::HashMap, + fs, + path::{Path, PathBuf}, + sync::Arc, +}; + +use super::formats::plain_text::convert_page_to_plain_text; + +use clap::{builder::PossibleValue, ValueEnum}; +use futures::future; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use itertools::Itertools; + +use crate::{ + error::WikiError, + formats::{html::convert_page_to_html, markdown::convert_page_to_markdown, PageFormat}, + utils::truncate_unicode_str, + utils::{create_dir_if_not_exists, page_path, to_save_file_name}, + wiki_api::fetch_all_pages, + wiki_api::fetch_page_without_recommendations, +}; + +pub async fn sync_wiki_info( + page_path: &Path, + print: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + let spinner = ProgressBar::new_spinner(); + if hide_progress { + spinner.finish_and_clear(); + } + + let _spin_task = std::thread::spawn(move || loop { + spinner.tick(); + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + let wiki_tree = fetch_all_pages().await?; + let out = serde_yaml::to_string(&wiki_tree)?; + + if !print { + fs::write(page_path, out)?; + + if !hide_progress { + println!("data saved to {}", page_path.to_string_lossy()); + } + } else { + println!("{out}"); + } + + Ok(()) +} + +pub async fn download_wiki( + wiki_tree: HashMap>, + format: PageFormat, + location: PathBuf, + thread_count: usize, + override_exisiting_files: bool, + hide_progress: bool, +) -> Result<(), WikiError> { + create_dir_if_not_exists(&location)?; + + if !hide_progress { + if let Some(format) = format + .to_possible_value() + .as_ref() + .map(PossibleValue::get_name) + { + println!("downloading pages as {format}\n",) + } + } + + let multibar = MultiProgress::new(); + + let category_count = wiki_tree.values().filter(|v| !v.is_empty()).count(); + let category_bar = multibar.add( + ProgressBar::new(category_count.try_into().unwrap_or(0)) + .with_prefix("---FETCHING CATEGORIES---") + .with_style( + ProgressStyle::with_template("[{prefix:^40}]\t {pos:>4}/{len:4}") + .unwrap() + .progress_chars("##-"), + ), + ); + + if hide_progress { + category_bar.finish_and_clear(); + } + + let wiki_tree_without_empty_cats = wiki_tree + .into_iter() + .filter(|(_, p)| !p.is_empty()) + .collect_vec(); + + let format = Arc::new(format); + let location = Arc::new(location); + let multibar = Arc::new(multibar); + let catbar = Arc::new(category_bar); + + let wiki_tree_chunks = + chunk_wiki_with_even_page_distribution(wiki_tree_without_empty_cats, thread_count); + + let tasks = wiki_tree_chunks + .into_iter() + .map(|chunk| { + let 
format_ref = Arc::clone(&format);
+            let location_ref = Arc::clone(&location);
+            let multibar_ref = Arc::clone(&multibar);
+            let catbar_ref = Arc::clone(&catbar);
+
+            tokio::spawn(async move {
+                download_wiki_chunk(
+                    &chunk,
+                    &format_ref,
+                    &location_ref,
+                    hide_progress,
+                    override_exisiting_files,
+                    &multibar_ref,
+                    &catbar_ref,
+                )
+                .await
+            })
+        })
+        .collect_vec();
+
+    let results = future::join_all(tasks).await;
+
+    for result in results {
+        match result {
+            Ok(Ok(failed_fetchs)) => {
+                if !failed_fetchs.is_empty() {
+                    for (page, err) in failed_fetchs {
+                        eprintln!("WARNING: failed to fetch page '{page}'\nREASON: {err}");
+                    }
+                }
+            }
+            Ok(Err(thread_err)) => {
+                eprintln!(
+                    "ERROR: a thread panicked, some pages might be missing\nREASON: {thread_err}"
+                );
+            }
+            Err(_) => {
+                eprintln!("ERROR: failed to join threads, some pages might be missing");
+            }
+        }
+    }
+
+    if !hide_progress {
+        println!(
+            "saved local copy of the ArchWiki to '{}'",
+            location.to_string_lossy()
+        )
+    }
+
+    Ok(())
+}
+
+type FailedPageFetches = Vec<(String, WikiError)>;
+
+async fn download_wiki_chunk(
+    chunk: &[(String, Vec)],
+    format: &PageFormat,
+    location: &Path,
+    hide_progress: bool,
+    override_exisiting_files: bool,
+    multibar: &MultiProgress,
+    catbar: &ProgressBar,
+) -> Result {
+    let mut failed_fetches = vec![];
+
+    for (cat, pages) in chunk {
+        let cat_dir = location.join(to_save_file_name(cat));
+        create_dir_if_not_exists(&cat_dir)?;
+
+        let width = unicode_width::UnicodeWidthStr::width(cat.as_str());
+
+        let leak_str: &'static str = Box::leak(
+            format!(
+                " fetching pages in \"{}\"",
+                if width <= 18 {
+                    truncate_unicode_str(18, cat)
+                } else {
+                    truncate_unicode_str(15, cat) + "..."
+                }
+            )
+            .into_boxed_str(),
+        );
+
+        let bar = multibar.add(
+            ProgressBar::new(pages.len().try_into().unwrap_or(0))
+                .with_prefix(leak_str)
+                .with_style(
+                    ProgressStyle::with_template(
+                        "[{prefix:<40}]\t {bar:40.cyan/blue} {pos:>4}/{len:4}",
+                    )
+                    .unwrap()
+                    .progress_chars("##-"),
+                ),
+        );
+
+        if hide_progress {
+            bar.finish_and_clear();
+        }
+
+        catbar.inc(1);
+        for page in pages {
+            bar.inc(1);
+
+            let path = page_path(page, format, &cat_dir);
+            if override_exisiting_files || !path.exists() {
+                match write_page_to_local_wiki(page, &path, format).await {
+                    Ok(()) => {}
+                    Err(err) => failed_fetches.push((page.to_owned(), err)),
+                }
+            }
+        }
+    }
+
+    Ok(failed_fetches)
+}
+
+async fn write_page_to_local_wiki(
+    page: &str,
+    page_path: &Path,
+    format: &PageFormat,
+) -> Result<(), WikiError> {
+    let document = fetch_page_without_recommendations(page).await?;
+    let content = match format {
+        PageFormat::PlainText => convert_page_to_plain_text(&document, false),
+        PageFormat::Markdown => convert_page_to_markdown(&document, page),
+        PageFormat::Html => convert_page_to_html(&document, page),
+    };
+
+    fs::write(page_path, content)?;
+    Ok(())
+}
+
+fn chunk_wiki_with_even_page_distribution(
+    wiki_tree: Vec<(String, Vec)>,
+    chunk_count: usize,
+) -> Vec)>> {
+    let mut chunks: Vec)>> = (0..chunk_count).map(|_| vec![]).collect();
+
+    for entry in wiki_tree {
+        if let Some(chunk) = chunks.iter_mut().min_by(|a, b| {
+            let count_a = a.iter().map(|(_, pages)| pages.len()).sum::();
+            let count_b = b.iter().map(|(_, pages)| pages.len()).sum::();
+
+            count_a.cmp(&count_b)
+        }) {
+            chunk.push(entry);
+        }
+    }
+
+    chunks
+}
-- 
GitLab


From 856fe5b01ec595641d66cdbe27a1a09b4d093c2d Mon Sep 17 00:00:00 2001
From: jackboxx
Date: Fri, 19 Jan 2024 14:08:04 +0100
Subject: [PATCH 18/24] improve help command messages

---
 src/cli.rs | 64 
+++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 024f92a..14af1ab 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -15,129 +15,125 @@ pub struct CliArgs { pub enum Commands { #[command( about = "Read a page from the ArchWiki", - long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended. A list of page names is in the pages.yml file which can be updated with the 'sync-wiki' command." + long_about = "Read a page from the ArchWiki, if the page is not found similar page names are recommended" )] ReadPage { #[arg(short, long)] - /// Don't cache the read page locally. + /// Don't cache the read page locally no_cache_write: bool, #[arg(short, long)] - /// Don't read the page from cache even if an entry for it is cached. + /// Don't read the page from cache even if an entry for it is cached ignore_cache: bool, #[arg(short, long)] /// Don't invalidate the cache even if it is considered stale. A cache is considered stale - /// after it hasn't been updated in more then 14 days. + /// after it hasn't been updated in more then 14 days disable_cache_invalidation: bool, #[arg(short, long)] - /// Show URLs for plain-text output. + /// Show URLs for plain-text output show_urls: bool, #[arg(short, long)] /// Preferred page language lang: Option, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] - /// The format that the page should be displayed in. + /// The format that the page should be displayed in format: PageFormat, - /// The name of the page to read or an absolute URL to the page. + /// The name of the page to read or an absolute URL to the page page: String, }, #[command( about = "Search the ArchWiki for pages", - long_about = "Search the ArchWiki for pages by title. Uses the 'opensearch' API action to perform queries." + long_about = "Search the ArchWiki for pages" )] Search { search: String, #[arg(short, long, default_value_t = String::from("en"))] - /// Preferred language of the content to search for. + /// Preferred language of the content to search for lang: String, #[arg(short = 'L', long, default_value_t = 5)] - /// Maximum number of results. + /// Maximum number of results limit: u16, #[arg(short, long)] - /// Search for pages by text content instead of title. Uses the 'query' API action instead - /// of 'opensearch'. + /// Search for pages by text content instead of title text_search: bool, }, #[command( about = "List all pages from the ArchWiki that have been downloaded", - long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List all pages from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading" )] ListPages { #[arg(short, long)] - /// Flatten all pages and don't show their category names. + /// Flatten all pages and don't show their category names flatten: bool, #[arg(short, long)] - /// Only show pages in this category. + /// Only show pages in this category category: Option, #[arg(short, long)] - /// Use a different file to read pages from. + /// Use a different file to read pages from page_file: Option, }, #[command( about = "List all categories from the ArchWiki that have been downloaded", - long_about = "List categories from the ArchWiki that have been downloaded. See 'sync-wiki' for information on downloading." + long_about = "List categories from the ArchWiki that have been downloaded. 
See 'sync-wiki' for information on downloading"
     )]
     ListCategories {
         #[arg(short, long)]
-        /// Use a different file to read pages from.
+        /// Use a different file to read pages from
         page_file: Option,
     },
     #[command(
         about = "List all languages that the ArchWiki supports",
-        long_about = "List all languages that the ArchWiki supports."
+        long_about = "List all languages that the ArchWiki supports"
     )]
     ListLanguages,
     #[command(
         about = "Download information about the pages and categories on the ArchWiki",
-        long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' commands"
+        long_about = "Download information about the pages and categories on the ArchWiki. Page and category names are used for the 'list-pages' and 'list-categories' sub-commands"
     )]
     SyncWiki {
         #[arg(short = 'H', long)]
-        /// Hide progress indicators.
+        /// Hide progress indicators
         hide_progress: bool,
         #[arg(short, long)]
-        /// Print result to stdout instead of writing to a file. Output is formatted as YAML.
+        /// Print result to stdout instead of writing to a file. Output is formatted as YAML
         print: bool,
         #[arg(short, long)]
-        /// Use custom output file location.
+        /// Use custom output file location
        out_file: Option,
     },
     #[command(
         about = "Download a copy of the ArchWiki. Will take a long time :)",
-        long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not mainted, sub categories are put at the top level of the directory."
+        long_about = "Download a copy of the ArchWiki. Will take a long time :). The exact hierarchy of the wiki is not maintained, sub-categories are put at the top level of the wiki directory"
     )]
     LocalWiki {
         #[arg(short, long)]
         /// Amount of threads to use for fetching pages from the ArchWiki. If not provided the
-        /// number of physical cores is used.
+        /// number of physical cores is used
         thread_count: Option,
         #[arg(short, long)]
-        /// Use a different file to read pages from.
+        /// Use a different file to read pages from
         page_file: Option,
         #[arg(short = 'H', long)]
-        /// Hide progress indicators.
+        /// Hide progress indicators
         hide_progress: bool,
         #[arg(short, long)]
-        /// Override directory at 'location' if it already exists.
+        /// Override already downloaded files
         override_existing_files: bool,
         #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)]
-        /// The format that the page should be displayed in.
+        /// The format that the page should be displayed in
         format: PageFormat,
-        /// Location to store the local copy of the wiki at.
+        /// Location to store the local copy of the wiki at
         location: PathBuf,
     },
     #[command(
         about = "Retrive information related to this tool",
-        long_about = "Retrive information related to this tool. All Info is shown by default."
+        long_about = "Retrieve information related to this tool"
     )]
     Info {
         #[arg(short = 'c', long)]
-        /// Location of the cache directory.
         show_cache_dir: bool,
         #[arg(short = 'd', long)]
-        /// Location of the data directory.
         show_data_dir: bool,
         #[arg(short, long)]
-        /// Only show values and not the properties they belong to or their descriptions.
only_values: bool, }, } -- GitLab From bba7db23eb1435250d7e9facd3d8e974ade2f1e1 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:42:17 +0100 Subject: [PATCH 19/24] update README --- README.md | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 0409007..b92dc43 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # archwiki-rs 📖 A CLI tool to read pages from the ArchWiki -## Table of contents + + - [Installation](#installation) * [crates.io](#cratesio) * [Source](#source) @@ -10,18 +11,22 @@ A CLI tool to read pages from the ArchWiki + [Basic request](#basic-request) + [Using a different format](#using-a-different-format) + [Caching](#caching) - + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--̥̥̥n-̥̥̥-) + + [404 page not found (-̥̥̥n-̥̥̥ )](#404-page-not-found--%CC%A5%CC%A5%CC%A5n-%CC%A5%CC%A5%CC%A5-) * [Searching the ArchWiki](#searching-the-archwiki) + [Search by title](#search-by-title) + [Search for text](#search-for-text) * [Downloading wiki info](#downloading-wiki-info) - + [Possible speed-ups](#possible-speed-ups) * [Listing ArchWiki information](#listing-archwiki-information) + [Listing pages](#listing-pages) + [Listing categories](#listing-categories) + [Listing languages](#listing-languages) + * [Downloading a local copy of the ArchWiki](#downloading-a-local-copy-of-the-archwiki) + + [Possible speed-ups](#possible-speed-ups) * [Other Information](#other-information) - [Plugins](#plugins) +- [Alternatives](#alternatives) + + ## Installation Currently, you can only install this tool from [ crates.io ](https://crates.io/crates/archwiki-rs) @@ -83,7 +88,7 @@ uses stderr to give the user suggestions on what they might have wanted to type. An example shell script to do something like this is available in the [repository](https://github.com/jackboxx/archwiki-rs) -under the name `example.sh`. +under the name `example.sh` which can be used like this `sh example.sh `. ### Searching the ArchWiki @@ -106,25 +111,13 @@ that the search term is in ### Downloading wiki info -Page names are stored locally to prevent having to scrape the entire table of contents of -the ArchWiki with every command. - -Use this command to fetch all page names. -Be warned, since this scrapes multiple thousand links, this can be quite slow (-, - )…zzzZZ +Page and category names are stored locally for faster look-ups. +Use this command to fetch all page and category names. ```sh archwiki-rs sync-wiki ``` -#### Possible speed-ups - -If you don't mind your CPU and network becoming a bit saturated you can increase the -amount of threads used to fetch data from the wiki - -```sh -archwiki-rs sync-wiki -t 8 -``` - ### Listing ArchWiki information #### Listing pages @@ -163,6 +156,27 @@ And the same for available languages archwiki-rs list-languages ``` +### Downloading a local copy of the ArchWiki + +Use this command to download a local copy of the ArchWiki. Be warned, that this makes over +10,000 requests for page content to the ArchWiki so it takes a while to finish (-, -)…zzzZZ + +```sh +archwiki-rs local-wiki ~/local-archwiki --format markdown +``` + +#### Possible speed-ups + +If you don't mind your CPU and network becoming a bit saturated you can increase the +amount of threads used to fetch data from the wiki. + +Keep in mind that you might get rate limited by the ArchWiki if make too many requests at once. 
+ +```sh +archwiki-rs local-wiki -t 8 +``` + + ### Other Information Other information such as the value/location of the `cache directory` can be obtained @@ -185,3 +199,8 @@ Here's a list of programs that have plugins for `archwiki-rs` to make your life - [Neovim](https://github.com/Jackboxx/archwiki-nvim) - [Obsidian](https://github.com/Jackboxx/archwiki-obsidian) + +## Alternatives + +If you are using Arch Linux a great alternative for this tool is the `wikiman` CLI tool +in combination with the `arch-wiki-docs` package. -- GitLab From c4b79c142b6dbf3ce803b5c38fa153316164812a Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:43:51 +0100 Subject: [PATCH 20/24] update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b92dc43..a404e62 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ archwiki-rs list-languages ### Downloading a local copy of the ArchWiki -Use this command to download a local copy of the ArchWiki. Be warned, that this makes over -10,000 requests for page content to the ArchWiki so it takes a while to finish (-, -)…zzzZZ +Use this command to download a local copy of the ArchWiki. Be warned, this command makes over +10,000 requests to the ArchWiki so it takes a while to finish (-, -)…zzzZZ ```sh archwiki-rs local-wiki ~/local-archwiki --format markdown -- GitLab From f989b36f128ee6c3d2bda9d7efef52ac2be1d71b Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 14:48:44 +0100 Subject: [PATCH 21/24] add --show-urls option to local-wiki sub-command --- src/cli.rs | 3 +++ src/main.rs | 2 ++ src/wiki_download.rs | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 14af1ab..02b5ebb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -116,6 +116,9 @@ pub enum Commands { /// Hide progress indicators hide_progress: bool, #[arg(short, long)] + /// Show URLs in plain-text files + show_urls: bool, + #[arg(short, long)] /// Override already downloaded files override_existing_files: bool, #[arg(short, long, value_enum, default_value_t = PageFormat::PlainText)] diff --git a/src/main.rs b/src/main.rs index cd55d7b..f2fbec5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -177,6 +177,7 @@ async fn main() -> Result<(), WikiError> { format, page_file, thread_count, + show_urls, override_existing_files, hide_progress, } => { @@ -195,6 +196,7 @@ async fn main() -> Result<(), WikiError> { thread_count, override_existing_files, hide_progress, + show_urls, ) .await?; } diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 4c252f4..30bb827 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -59,6 +59,7 @@ pub async fn download_wiki( thread_count: usize, override_exisiting_files: bool, hide_progress: bool, + show_urls: bool, ) -> Result<(), WikiError> { create_dir_if_not_exists(&location)?; @@ -116,6 +117,7 @@ pub async fn download_wiki( &format_ref, &location_ref, hide_progress, + show_urls, override_exisiting_files, &multibar_ref, &catbar_ref, @@ -159,11 +161,13 @@ pub async fn download_wiki( type FailedPageFetches = Vec<(String, WikiError)>; +#[allow(clippy::too_many_arguments)] async fn download_wiki_chunk( chunk: &[(String, Vec)], format: &PageFormat, location: &Path, hide_progress: bool, + show_urls: bool, override_exisiting_files: bool, multibar: &MultiProgress, catbar: &ProgressBar, @@ -210,7 +214,7 @@ async fn download_wiki_chunk( let path = page_path(page, format, &cat_dir); if override_exisiting_files || !path.exists() { - 
match write_page_to_local_wiki(page, &path, format).await { + match write_page_to_local_wiki(page, &path, format, show_urls).await { Ok(()) => {} Err(err) => failed_fetches.push((page.to_owned(), err)), } @@ -225,10 +229,11 @@ async fn write_page_to_local_wiki( page: &str, page_path: &Path, format: &PageFormat, + show_urls: bool, ) -> Result<(), WikiError> { let document = fetch_page_without_recommendations(page).await?; let content = match format { - PageFormat::PlainText => convert_page_to_plain_text(&document, false), + PageFormat::PlainText => convert_page_to_plain_text(&document, show_urls), PageFormat::Markdown => convert_page_to_markdown(&document, page), PageFormat::Html => convert_page_to_html(&document, page), }; -- GitLab From 12c6f510a922503c24d53d004702e4731ebb9852 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:06:11 +0100 Subject: [PATCH 22/24] improve error reporting for local-wiki command --- src/main.rs | 3 +++ src/wiki_download.rs | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/main.rs b/src/main.rs index f2fbec5..b138809 100644 --- a/src/main.rs +++ b/src/main.rs @@ -50,8 +50,10 @@ async fn main() -> Result<(), WikiError> { let cache_dir = base_dir.cache_dir().join("archwiki-rs"); let data_dir = base_dir.data_local_dir().join("archwiki-rs"); + let log_dir = data_dir.join("logs"); fs::create_dir_all(&cache_dir)?; fs::create_dir_all(&data_dir)?; + fs::create_dir_all(&log_dir)?; let default_page_file_path = data_dir.join(PAGE_FILE_NAME); @@ -193,6 +195,7 @@ async fn main() -> Result<(), WikiError> { wiki_tree, format, location, + &log_dir, thread_count, override_existing_files, hide_progress, diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 30bb827..9c54637 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -52,10 +52,12 @@ pub async fn sync_wiki_info( Ok(()) } +#[allow(clippy::too_many_arguments)] pub async fn download_wiki( wiki_tree: HashMap>, format: PageFormat, location: PathBuf, + log_dir: &Path, thread_count: usize, override_exisiting_files: bool, hide_progress: bool, @@ -63,13 +65,15 @@ pub async fn download_wiki( ) -> Result<(), WikiError> { create_dir_if_not_exists(&location)?; + let total_page_count = wiki_tree.values().map(|pages| pages.len()).sum::(); + if !hide_progress { if let Some(format) = format .to_possible_value() .as_ref() .map(PossibleValue::get_name) { - println!("downloading pages as {format}\n",) + println!("downloading {total_page_count} pages as {format}\n",) } } @@ -128,16 +132,11 @@ pub async fn download_wiki( .collect_vec(); let results = future::join_all(tasks).await; + let mut all_failed_fetches = vec![]; for result in results { match result { - Ok(Ok(failed_fetchs)) => { - if !failed_fetchs.is_empty() { - for (page, err) in failed_fetchs { - eprintln!("WARNING: failed to page '{page}'\nREASON: {err}"); - } - } - } + Ok(Ok(mut failed_fetchs)) => all_failed_fetches.append(&mut failed_fetchs), Ok(Err(thread_err)) => { eprintln!( "ERROR: a thread paniced, some pages might be missing\nREASON: {thread_err}" @@ -149,6 +148,28 @@ pub async fn download_wiki( } } + if !hide_progress { + let successfuly_fetched_pages = total_page_count - all_failed_fetches.len(); + + println!("downloaded {successfuly_fetched_pages} pages successfully"); + println!("failed to download {} pages", all_failed_fetches.len()); + } + + if !all_failed_fetches.is_empty() { + let failed_fetches_str = all_failed_fetches + .into_iter() + .map(|(page, err)| format!("failed 
to page '{page}'\nREASON: {err}")) + .collect_vec() + .join("\n\n"); + + let path = log_dir.join("local-wiki-download-err.log"); + let write = fs::write(&path, failed_fetches_str); + + if write.is_ok() && !hide_progress { + println!("error log written to '{}'", path.to_string_lossy()); + } + } + if !hide_progress { println!( "saved local copy of the ArchWiki to '{}'", -- GitLab From b132a53601d983b197ad79102a62e3518acc2370 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:15:07 +0100 Subject: [PATCH 23/24] allow filtering list-pages sub-command by multiple categories instead of just 1 --- src/categories.rs | 25 ++++++++++++++++++++++--- src/cli.rs | 6 +++--- src/error.rs | 2 -- src/main.rs | 17 ++++++----------- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/categories.rs b/src/categories.rs index 1285c2e..f14a0e5 100644 --- a/src/categories.rs +++ b/src/categories.rs @@ -27,13 +27,32 @@ use crate::error::WikiError; /// If it is not flattened the list is first ordered by category names and then by page names withing those /// categories. /// If it is flattened then it will by sorted by page names. -pub fn list_pages(categories: &HashMap>, flatten: bool) -> String { +pub fn list_pages( + wiki_tree: &HashMap>, + categories_filter: Option<&[String]>, + flatten: bool, +) -> String { if flatten { - return categories.values().flatten().unique().sorted().join("\n"); + return wiki_tree + .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some(pages)) + .unwrap_or(Some(pages)) + }) + .flatten() + .unique() + .sorted() + .join("\n"); } - categories + wiki_tree .iter() + .filter_map(|(cat, pages)| { + categories_filter + .map(|filter| filter.iter().contains(cat).then_some((cat, pages))) + .unwrap_or(Some((cat, pages))) + }) .sorted() .map(|(cat, pages)| { let list = pages.iter().map(|p| format!("───┤{p}")).join("\n"); diff --git a/src/cli.rs b/src/cli.rs index 02b5ebb..6be1d1d 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -64,9 +64,9 @@ pub enum Commands { #[arg(short, long)] /// Flatten all pages and don't show their category names flatten: bool, - #[arg(short, long)] - /// Only show pages in this category - category: Option, + #[arg(short, long, value_delimiter = ',')] + /// Only show pages in these categories + categories: Vec, #[arg(short, long)] /// Use a different file to read pages from page_file: Option, diff --git a/src/error.rs b/src/error.rs index 4cc13cd..c729406 100644 --- a/src/error.rs +++ b/src/error.rs @@ -48,6 +48,4 @@ pub enum WikiError { InvalidApiResponse(InvalidApiResponseError), #[error("{}", .0)] NoPageFound(String), - #[error("The category '{}' could not be found", .0)] - NoCategoryFound(String), } diff --git a/src/main.rs b/src/main.rs index b138809..9ccc9cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -124,7 +124,7 @@ async fn main() -> Result<(), WikiError> { } Commands::ListPages { flatten, - category, + categories, page_file, } => { let (path, is_default) = page_file @@ -132,16 +132,11 @@ async fn main() -> Result<(), WikiError> { .unwrap_or((default_page_file_path, true)); let wiki_tree = read_pages_file_as_category_tree(&path, is_default)?; - let out = if let Some(category) = category { - wiki_tree - .get(&category) - .ok_or(WikiError::NoCategoryFound(category))? 
- .iter() - .sorted() - .join("\n") - } else { - list_pages(&wiki_tree, flatten) - }; + let out = list_pages( + &wiki_tree, + (!categories.is_empty()).then_some(&categories), + flatten, + ); println!("{out}"); } -- GitLab From 0125fa49d3986021e1d395c6971ac81acbc1d0a2 Mon Sep 17 00:00:00 2001 From: jackboxx Date: Fri, 19 Jan 2024 15:26:28 +0100 Subject: [PATCH 24/24] show failed fetch count only if failures occured --- src/wiki_download.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/wiki_download.rs b/src/wiki_download.rs index 9c54637..8137b4c 100644 --- a/src/wiki_download.rs +++ b/src/wiki_download.rs @@ -150,12 +150,14 @@ pub async fn download_wiki( if !hide_progress { let successfuly_fetched_pages = total_page_count - all_failed_fetches.len(); - println!("downloaded {successfuly_fetched_pages} pages successfully"); - println!("failed to download {} pages", all_failed_fetches.len()); } if !all_failed_fetches.is_empty() { + if !hide_progress { + println!("failed to download {} pages", all_failed_fetches.len()); + } + let failed_fetches_str = all_failed_fetches .into_iter() .map(|(page, err)| format!("failed to page '{page}'\nREASON: {err}")) -- GitLab
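
Putting the changes from patches 21–24 together, a minimal usage sketch of the new flags (flag names are taken from the diffs above; the paths and category names below are only placeholders):

```sh
# local plain-text copy, keeping URLs in the output (patch 21) and fetching with 8 threads;
# per patch 22, pages that fail to download are logged to <data dir>/logs/local-wiki-download-err.log
archwiki-rs local-wiki --show-urls -t 8 ~/local-archwiki

# list downloaded pages from a comma-separated set of categories (patch 23)
archwiki-rs list-pages --categories Xorg,Wayland
```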