File: html.hpp

package info (click to toggle)
syndication-domination 1.0-3
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 408 kB
sloc: cpp: 1,101; makefile: 42; sh: 10
file content (104 lines) | stat: -rw-r--r-- 2,511 bytes
#pragma once


#include "utils.hpp"

#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

#include <tidy/tidy.h>
#include <tidy/tidybuffio.h>

#include <pugixml.hpp>


using namespace pugi;

/**
* An HTML document, reduced to the few fields a feed reader cares about.
*
* On construction the given HTML is converted with _tidy_ and the result is
* handed to pugixml for parsing.
*
* This is deliberately not a full-fledged HTML parser: it exposes only very
* little information about the document.
*
* Values are parsed on the fly, when they are requested, mostly to avoid the
* overhead of extracting information nobody asked for.
*
* A value that cannot be found is reported as an empty string.
*/
class Html {
private:
    xml_document doc;
    xml_node head;

    // std::string default-constructs to the empty string, so these members
    // need no explicit initializer to start out empty.
    std::string title;
    std::string icon_url;
    std::string img_url;
    std::string rss_url;
    std::string description;
    std::string article;
    std::string body;

    /**
    * Applies a default configuration set to a TidyDoc.
    *
    * @param doc the TidyDoc to configure
    */
    static void configure_tidy_doc(TidyDoc &doc);

    /**
    * Returns a TidyDoc given a valid file path.
    *
    * @param path path to the HTML file
    */
    TidyDoc tidy_doc_from_file(std::string path);

    /**
    * Converts a TidyDoc document to XML, and returns it as a string.
    *
    * @param doc the TidyDoc to convert
    * @return the XML text
    */
    std::string convert_to_xml(TidyDoc doc);

    // Element names considered irrelevant for parsing; presumably the list
    // consulted by remove_useless_children() -- confirm in the implementation.
    static inline const std::vector<std::string> USELESS_CHILDREN = {
        "script", "form", "input", "label", "nav", "footer", "header"
    };

    /**
    * Removes children that are deemed useless for the information this class
    * needs to parse.
    *
    * @param root the node whose children are pruned
    */
    void remove_useless_children(xml_node &root);

    /**
    * Constructs an Html object from a TidyDoc document.
    *
    * Private: outside code goes through Html(std::string) or from_string().
    */
    Html(TidyDoc &tdoc);

    /**
    * Returns the `body` node from the current xml_document.
    */
    xml_node get_body_node();

public:

    /**
    * Constructs the Html object from a valid file path.
    *
    * @param path a valid file path to a local HTML document.
    */
    Html(std::string path);

    /**
    * Constructs the Html object from a string containing valid HTML.
    *
    * @param s a string containing the HTML to parse
    */
    static Html from_string(std::string s);

    // Accessors. As stated in the class description, a value that cannot be
    // found is returned as an empty string.
    std::string get_title();
    std::string get_icon_url();
    std::string get_img_url();
    std::string get_rss_url();
    std::string get_body();
    std::string get_article();
    std::string get_description();

    // NOTE(review): judging by the name this serializes the document to JSON;
    // the exact effect of `metadata_only` is defined in the implementation and
    // is not visible from this header.
    std::string to_json(bool metadata_only=false);
};