1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
|
#include "feed.hpp"
/// Classifies the loaded document by the name of its root element and
/// records the outcome in `type`.
///
/// The root name is passed through SynDomUtils::lower before being compared
/// against "rss", "rdf:rdf" and "feed"; any other name marks the document as
/// FeedType::INVALID.
///
/// @return true when the root element matched one of the known feed kinds,
///         false otherwise.
bool Feed::verify_feed() {
    std::string root = doc.document_element().name();
    SynDomUtils::lower(root);

    if (root == "rss") {
        type = FeedType::RSS;
    } else if (root == "rdf:rdf") {
        type = FeedType::RDF;
    } else if (root == "feed") {
        type = FeedType::ATOM;
    } else {
        type = FeedType::INVALID;
    }
    return type != FeedType::INVALID;
}
/// Returns the node that carries the feed-level metadata.
///
/// For RSS and RDF feeds this is the "channel" child of the document root;
/// for every other `type` it is the document root itself.
xml_node Feed::get_feed_node() {
    xml_node root = doc.document_element();
    const bool metadata_in_channel =
        (type == FeedType::RSS || type == FeedType::RDF);
    return metadata_in_channel ? root.child("channel") : root;
}
/// Normalises `s` in place so that it looks like an absolute URL.
///
/// After trimming:
///  - an empty string, or one SynDomUtils::is_url already accepts, is left
///    untouched;
///  - a string starting with "/" that differs from the feed's `url` gets
///    `url` prepended;
///  - anything else gets "http://" prepended.
///
/// @param s  string to normalise; modified in place. When `s` is the `url`
///           member itself the comparison `s != url` is always false, so the
///           "http://" branch is taken.
void Feed::fix_url(std::string &s) {
    SynDomUtils::trim(s);

    const bool needs_fixing = !s.empty() && !SynDomUtils::is_url(s);
    if (!needs_fixing) {
        return;
    }

    const bool relative_to_site =
        SynDomUtils::str_has_prefix(s, "/") && s != url;
    s = (relative_to_site ? url : std::string("http://")) + s;
}
/// Looks up the website URL associated with the feed.
///
/// Candidates are tried in this order, stopping at the first non-empty one:
///  1. the text content of the "link" child of `feed_node`;
///  2. the result of SynDomUtils::extract_link for rel "alternate" and
///     type "text/html";
///  3. the text content of the "id" child, accepted only when
///     SynDomUtils::is_url approves it.
///
/// @return the URL found, or an empty string when no candidate qualifies.
std::string Feed::extract_url() {
    std::string candidate = feed_node.child("link").text().as_string();

    if (candidate.empty()) {
        candidate = SynDomUtils::extract_link(
            feed_node, {"alternate"}, {"text/html"}, false, true
        );
    }

    if (candidate.empty()) {
        candidate = feed_node.child("id").text().as_string();
        // The id is only usable when it is itself a URL.
        if (!SynDomUtils::is_url(candidate)) {
            candidate.clear();
        }
    }

    return candidate;
}
/// Reads the feed's last-update value from `feed_node` using
/// __LAST_UPDATE_PARAMS.
///
/// @return the extracted value, or SynDomUtils::current_time() when the
///         extraction yields an empty string.
std::string Feed::extract_last_update() {
    std::string stamp = SynDomUtils::extract_from_node(
        feed_node, __LAST_UPDATE_PARAMS
    );
    if (stamp.empty()) {
        stamp = SynDomUtils::current_time();
    }
    return stamp;
}
/// Reads the feed image URL from `feed_node` using __IMG_URL_PARAMS.
///
/// @return whatever SynDomUtils::extract_from_node yields for those
///         parameters; the value is not post-processed here.
std::string Feed::extract_img_url() {
    std::string image = SynDomUtils::extract_from_node(
        feed_node, __IMG_URL_PARAMS
    );
    return image;
}
/// Reads the feed's own address from `feed_node`.
///
/// Delegates to SynDomUtils::extract_link with rel "self" and the types
/// "application/rss+xml" and "application/atom+xml".
///
/// @return the link reported by SynDomUtils::extract_link, unmodified.
std::string Feed::extract_rss_url() {
    std::string self_link = SynDomUtils::extract_link(
        feed_node,
        {"self"}, {"application/rss+xml", "application/atom+xml"},
        false, true
    );
    return self_link;
}
/// Populates the feed-level members (`title`, `description`, `url`,
/// `last_update`, `img_url`, `rss_url`) from the parsed document.
///
/// `feed_node` is resolved first because every extraction below reads it.
void Feed::extract_feed_data() {
    feed_node = get_feed_node();

    // Plain-text fields.
    title = feed_node.child("title").text().as_string();
    SynDomUtils::trim(title);
    description = SynDomUtils::extract_from_node(
        feed_node, __DESCRIPTION_PARAMS
    );
    last_update = extract_last_update();

    // Website link. It has to be final before the other URLs are fixed,
    // because fix_url may prepend `url` to them.
    url = extract_url();
    fix_url(url);

    // Remaining URLs.
    img_url = extract_img_url();
    rss_url = extract_rss_url();
    fix_url(img_url);
    fix_url(rss_url);
}
void Feed::extract_feed_items() {
if (type == FeedType::RDF) {
xml_node item = feed_node.child("items").child("rdf:Seq").child(
"rdf:li"
);
while (item) {
std::string url = item.attribute("rdf:resource").value();
xml_node item_info = doc.document_element()
.find_child_by_attribute("item", "rdf:about", url.c_str());
if (item_info) {
feed_items.push_back(FeedItem(item_info, url));
}
item = item.next_sibling("rdf:li");
}
return;
}
std::string item_tag = "item";
xml_node first_item = feed_node.child(item_tag.c_str());
if (!first_item) {
item_tag = "entry";
first_item = feed_node.child(item_tag.c_str());
}
if (!first_item) return;
xml_node item = first_item;
while (item) {
feed_items.push_back(FeedItem(item, url));
item = item.next_sibling(item_tag.c_str());
}
}
void Feed::parse() {
xml_parse_result res = doc.load_file(path.c_str());
if (!res) {
throw std::runtime_error("Error parsing XML file: "+path);
}
if (!verify_feed()) {
throw std::runtime_error(
"Error: the XML file provided is not a feed: "+path
);
}
extract_feed_data();
extract_feed_items();
}
/// Serialises the feed as a JSON object string.
///
/// The object always contains "title", "description", "url", "last_update",
/// "img_url" and "rss_url". Unless `no_items` is set, an "items" array built
/// from FeedItem::to_json() of every element of `feed_items` follows.
///
/// @param no_items  when true, the "items" array is left out.
/// @return the JSON text, starting and ending with a newline.
///
/// NOTE(review): member values are inserted verbatim; this function performs
/// no JSON escaping. Confirm the extraction helpers already produce
/// escape-safe text.
std::string Feed::to_json(bool no_items) {
    // The closing quote of "rss_url" is emitted here so that both branches
    // below start from a syntactically complete member.
    std::string res = "\n{\n"
        " \"title\": \"" + title + "\",\n"
        " \"description\": \"" + description + "\",\n"
        " \"url\": \"" + url + "\",\n"
        " \"last_update\": \"" + last_update + "\",\n"
        " \"img_url\": \"" + img_url + "\",\n"
        " \"rss_url\": \"" + rss_url + "\"";

    if (no_items) {
        res += "\n}\n";
        return res;
    }

    res += ",\n \"items\": [\n";
    // Separators go between items rather than being appended and trimmed
    // afterwards, so an empty list still yields a well-formed "[ ]".
    bool wrote_item = false;
    for (auto &fi : feed_items) {
        if (wrote_item) {
            res += ",\n";
        }
        res += fi.to_json();
        wrote_item = true;
    }
    if (wrote_item) {
        res += "\n";
    }
    res += " ]\n}\n";
    return res;
}
|