File: feed.hpp

package info (click to toggle)
syndication-domination 1.0-3
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 408 kB
sloc: cpp: 1,101; makefile: 42; sh: 10
file content (142 lines) | stat: -rw-r--r-- 4,036 bytes
#pragma once

#include <pugixml.hpp>
#include <string>
#include <utility>
#include <vector>

#include "utils.hpp"
#include "feed_type.hpp"
#include "feed_item.hpp"
#include "extraction_param.hpp"

// NOTE(review): a using-directive at header scope makes every pugi name
// visible unqualified in each file that includes this header. The
// declarations in this header use it (e.g. unqualified xml_node), so it
// cannot simply be dropped without qualifying those names first.
using namespace pugi;

/**
* Represents an RSS/Atom feed.
*
* Upon construction it will try its best to parse useful information out of
* the file at the provided path (the constructor calls parse()).
*
* In case some value cannot be found, it will just contain an empty string.
*
* All getters are const and return copies of the stored values, so they can
* be called on a `const Feed` or through a `const Feed&`.
*/
class Feed {
private:

    // Path of the feed file, exactly as handed to the constructor.
    std::string path;
    // XML document for the feed (presumably loaded from `path` during
    // parse() -- confirm in the implementation file).
    xml_document doc;
    // Node holding the feed-level data; see get_feed_node().
    xml_node feed_node;

    // Stays INVALID until verify_feed() assigns the detected type.
    FeedType type{FeedType::INVALID};

    // Extracted feed metadata; default-constructed (empty) strings.
    std::string title;
    std::string description;
    std::string url;
    std::string last_update;
    std::string img_url;
    std::string rss_url;

    // Items (articles, posts) collected by extract_feed_items().
    std::vector<FeedItem> feed_items{};

    /**
    * Verifies if the feed is a valid RSS or Atom file; also sets the type
    * variable.
    */
    bool verify_feed();

    /**
    * Returns the xml_node containing the feed, which is `channel` for RSS
    * or the root node for Atom.
    */
    xml_node get_feed_node();

    /**
    * Makes relative urls absolute and/or adds the protocol if not present.
    * This fixing is done in place on the string passed.
    */
    void fix_url(std::string &s);

    /**
    * Tries to extract the website url and returns it.
    */
    std::string extract_url();

    // Candidate locations for the last update value: child nodes named
    // `updated`, `lastBuildDate`, `pubDate` and `dc:date`.
    // Presumably tried in this order by extract_last_update() -- confirm.
    //
    // NOTE(review): identifiers containing a double underscore are reserved
    // for the implementation in C++. The name is kept because code outside
    // this header may refer to it; rename only together with those uses.
    static inline const std::vector<ExtractionParam> __LAST_UPDATE_PARAMS{
        {ExtractionParam::ParamType::CHILD, {"updated"}},
        {ExtractionParam::ParamType::CHILD, {"lastBuildDate"}},
        {ExtractionParam::ParamType::CHILD, {"pubDate"}},
        {ExtractionParam::ParamType::CHILD, {"dc:date"}}
    };
    /**
    * Tries to extract and return the last update date and time, alternatively
    * returns the current date and time.
    */
    std::string extract_last_update();

    // Candidate locations for the feed image: the `icon`, `image`/`url`,
    // `image` and `logo` children, then the `href` attribute of
    // `itunes:image`. Presumably tried in this order by extract_img_url()
    // -- confirm. (Same reserved-identifier caveat as above.)
    static inline const std::vector<ExtractionParam> __IMG_URL_PARAMS{
        {ExtractionParam::ParamType::CHILD, {"icon"}},
        {ExtractionParam::ParamType::CHILD, {"image", "url"}},
        {ExtractionParam::ParamType::CHILD, {"image"}},
        {ExtractionParam::ParamType::CHILD, {"logo"}},
        {ExtractionParam::ParamType::ATTRIBUTE, {"itunes:image"}, "href"}
    };
    /**
    * Tries to extract an image url representing the feed (icon, logo...) and
    * returns it.
    */
    std::string extract_img_url();

    /**
    * Tries to extract the url to the same feed being parsed and returns it.
    * This is useful for updating the feed in the future.
    */
    std::string extract_rss_url();

    // Candidate locations for the feed description: the `description` and
    // `subtitle` children. Presumably consumed by extract_feed_data()
    // -- confirm. (Same reserved-identifier caveat as above.)
    static inline const std::vector<ExtractionParam> __DESCRIPTION_PARAMS{
        {ExtractionParam::ParamType::CHILD, {"description"}},
        {ExtractionParam::ParamType::CHILD, {"subtitle"}}
    };
    /**
    * Extracts all of the feed data and assigns the relevant internal
    * fields.
    */
    void extract_feed_data();

    /**
    * Extracts all of the items (articles, posts) in the feed and stores them
    * inside the `feed_items` vector as FeedItem objects.
    */
    void extract_feed_items();

    /**
    * Entry point of the class, parses all the relevant content. Called by
    * the constructor.
    */
    void parse();

public:

    /**
    * Constructs the Feed object from a valid RSS/Atom file path.
    * It will also automatically construct a vector of FeedItem objects
    * representing the feed items or articles found in the feed.
    *
    * The argument is taken by value and moved into the `path` member, so
    * passing a temporary does not cost an extra string copy.
    *
    * Deliberately left non-explicit so that existing call sites relying on
    * implicit conversion from a string keep compiling.
    *
    * @param path a valid file path to an RSS or Atom XML file.
    */
    Feed(std::string path) : path{std::move(path)} {
        parse();
    }

    /** @return a copy of the feed title (empty if none was found). */
    std::string get_title() const { return title; }
    /** @return a copy of the feed description (empty if none was found). */
    std::string get_description() const { return description; }
    /** @return a copy of the website url (empty if none was found). */
    std::string get_url() const { return url; }
    /** @return a copy of the last update value stored for this feed. */
    std::string get_last_update() const { return last_update; }
    /** @return a copy of the feed image url (empty if none was found). */
    std::string get_img_url() const { return img_url; }
    /** @return a copy of the url of the feed itself (empty if none was found). */
    std::string get_rss_url() const { return rss_url; }

    /** @return a copy of the vector of items extracted from the feed. */
    std::vector<FeedItem> get_items() const { return feed_items; }

    /**
    * Represents the Feed object (itself) as a json, returned as a string.
    *
    * @param no_items defaults to false; presumably leaves the items out of
    *                 the output when true -- confirm in the implementation.
    */
    std::string to_json(bool no_items=false);
};