commit 18cb194861bd22cc396f6284a05f185ecfaf7a86
parent 3ad1a9fd70b7f68915987477e88968ea56a8199f
Author: lash <dev@holbrook.no>
Date: Sun, 28 Jul 2024 01:25:31 +0100
Change rss parser, include dublincore date in entry rss parse
Diffstat:
7 files changed, 405 insertions(+), 3 deletions(-)
diff --git a/crier-lib/Cargo.lock b/crier-lib/Cargo.lock
@@ -39,6 +39,17 @@ dependencies = [
]
[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
+[[package]]
name = "autocfg"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -110,9 +121,11 @@ dependencies = [
"atom_syndication",
"chrono",
"digest",
+ "env_logger",
"feed-rs",
"http",
"itertools",
+ "log",
"mediatype",
"quick-xml 0.31.0",
"rs_sha512",
@@ -233,6 +246,19 @@ dependencies = [
]
[[package]]
+name = "env_logger"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
+dependencies = [
+ "atty",
+ "humantime",
+ "log",
+ "regex",
+ "termcolor",
+]
+
+[[package]]
name = "errno"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -302,6 +328,15 @@ dependencies = [
]
[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
name = "http"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -313,6 +348,12 @@ dependencies = [
]
[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
+[[package]]
name = "iana-time-zone"
version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -645,6 +686,15 @@ dependencies = [
]
[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
name = "tinyvec"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -773,6 +823,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/crier-lib/Cargo.toml b/crier-lib/Cargo.toml
@@ -8,9 +8,7 @@ authors = ["Louis Holbrook <dev@holbrook.no>"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-#rss = "^2.0.3"
feed-rs = "^2.0"
-rss = "^2.0"
#activitypub_federation = "^0.4.0"
#json-ld = "^0.14.1"
digest = "^0.10.7"
@@ -27,6 +25,10 @@ atom_syndication = "^0.12"
#version = "^0.31"
#features = ["serialize"]
+[dependencies.rss]
+version = "^2.0"
+features = ["atom"]
+
[patch.crates-io]
#atom_syndication = { path = "/home/lash/src/contrib/atom_syndication" }
atom_syndication = { git = "git://holbrook.no/contrib/atom_syndication", rev="9985c1610b2b819f5bd2f7a719567ee0b5419b85" } #branch="lash/entry-fromstr"
@@ -36,5 +38,14 @@ tempfile = "3.3.0"
mediatype = "^0.19"
quick-xml = "^0.31"
+[dependencies.env_logger]
+version = "^0.9"
+optional = true
+
+[dependencies.log]
+version = "^0.4"
+optional = true
+
[features]
fs = []
+logging = ["log", "env_logger"]
diff --git a/crier-lib/src/lib.rs b/crier-lib/src/lib.rs
@@ -28,6 +28,8 @@ pub mod mem;
mod meta;
mod cache;
+mod rss;
+mod log;
use meta::FeedMetadata;
use mem::CacheWriter;
use cache::Cache;
@@ -37,6 +39,7 @@ pub enum Error {
WriteError,
CacheError,
ParseError,
+ IncompleteError,
}
pub struct Sequencer<'a> {
@@ -254,6 +257,27 @@ impl SequencerEntry {
fn to_writer(&self, v: Vec<u8>) -> BufWriter<Vec<u8>> {
BufWriter::with_capacity(10241024, v)
}
+
+}
+
+/// Pick the reference date for a feed entry.
+///
+/// The `published` date is preferred; `updated` is used when `published` is
+/// absent.
+///
+/// # Panics
+///
+/// Panics if the entry carries neither date, or if the chosen date cannot be
+/// round-tripped through its RFC 2822 text form.
+fn get_base_date(entry: &Entry) -> FixedDateTime {
+	let base = entry
+		.published
+		.or(entry.updated)
+		.expect("entry has neither a published nor an updated date");
+
+	// The conversion still goes through the RFC 2822 string so the resulting
+	// value is identical to what the previous implementation produced.
+	FixedDateTime::parse_from_rfc2822(base.to_rfc2822().as_str()).unwrap()
+}
/// TODO: split out field translations to separate module
@@ -268,11 +292,13 @@ impl Into<Vec<u8>> for SequencerEntry {
b = Vec::new();
w = o.to_writer(b);
+ let mut d = get_base_date(&self.entry);
+
out_entry = OutEntry::default();
out_entry.set_id(self.entry.id);
out_entry.set_title(self.entry.title.unwrap().content);
- let mut d = FixedDateTime::parse_from_rfc2822(self.entry.published.unwrap().to_rfc2822().as_str()).unwrap();
+
out_entry.set_published(Some(d.clone()));
match self.entry.updated {
diff --git a/crier-lib/src/log.rs b/crier-lib/src/log.rs
@@ -0,0 +1,21 @@
+// Logging shim. With the `logging` feature the `log` macros and the
+// `env_logger` backend are used; without it every entry point is a no-op, so
+// callers never have to gate their own logging statements.
+
+#[cfg(feature = "logging")]
+pub use log::debug;
+#[cfg(feature = "logging")]
+pub use log::info;
+
+/// Initialise the `env_logger` backend.
+#[cfg(feature = "logging")]
+pub fn init() {
+	env_logger::init();
+}
+
+/// No-op stand-in used when the `logging` feature is disabled.
+#[cfg(not(feature = "logging"))]
+pub fn init() {}
+
+/// No-op replacement for `log::info!`; accepts and discards any arguments.
+#[cfg(not(feature = "logging"))]
+#[macro_export]
+macro_rules! info {
+	($($arg:tt)*) => {};
+}
+
+/// No-op replacement for `log::debug!`; accepts and discards any arguments.
+#[cfg(not(feature = "logging"))]
+#[macro_export]
+macro_rules! debug {
+	($($arg:tt)*) => {};
+}
+
+// `#[macro_export]` places the fallback macros at the crate root; re-export
+// them here so `use crate::log::{debug, info}` resolves with or without the
+// feature.
+#[cfg(not(feature = "logging"))]
+pub use crate::{debug, info};
+
diff --git a/crier-lib/src/rss.rs b/crier-lib/src/rss.rs
@@ -0,0 +1,184 @@
+use std::path::Path;
+use std::fs::File;
+use std::io::BufReader;
+use crate::Error;
+use crate::log::info;
+use crate::log::debug;
+
+use rss::Channel;
+use rss::Item;
+use rss::extension::dublincore::DublinCoreExtension;
+use atom_syndication::Feed;
+use atom_syndication::Entry;
+use atom_syndication::Text;
+use atom_syndication::TextType;
+use atom_syndication::FixedDateTime;
+use chrono::naive::NaiveDateTime;
+use chrono::Local;
+use chrono::offset::Utc;
+
+/// Attempt to interpret the item field as a date.
+///
+/// Formats are tried in order: RFC 2822, RFC 3339, then `%Y-%m-%dT%H:%M:%S`
+/// (first as an offset-aware value, then as a naive timestamp taken as UTC).
+/// Returns `Error::ParseError` when none of them match.
+fn parse_date(v: &String) -> Result<FixedDateTime, Error> {
+	let raw = v.as_str();
+	let layout = "%Y-%m-%dT%H:%M:%S";
+
+	if let Ok(parsed) = FixedDateTime::parse_from_rfc2822(raw) {
+		return Ok(parsed);
+	}
+	if let Ok(parsed) = FixedDateTime::parse_from_rfc3339(raw) {
+		return Ok(parsed);
+	}
+	if let Ok(parsed) = FixedDateTime::parse_from_str(raw, layout) {
+		return Ok(parsed);
+	}
+
+	// Last resort: no offset in the input, so read it as a UTC timestamp.
+	NaiveDateTime::parse_from_str(raw, layout)
+		.map(|naive| naive.and_utc().fixed_offset())
+		.map_err(|_| Error::ParseError)
+}
+
+/// Determine an item's date by probing its candidate fields.
+///
+/// `pub_date` is tried first; failing that, each Dublin Core date is tried in
+/// order. Returns `Error::IncompleteError` when no field yields a valid date.
+fn get_base_date(ipt: &Item) -> Result<FixedDateTime, Error> {
+	// A missing pub_date is parsed as the empty string, which simply fails.
+	let pub_date = ipt.pub_date.clone().unwrap_or_default();
+	if let Ok(date) = parse_date(&pub_date) {
+		return Ok(date);
+	}
+
+	if let Some(dc) = &ipt.dublin_core_ext {
+		for candidate in dc.dates() {
+			if let Ok(date) = parse_date(candidate) {
+				return Ok(date);
+			}
+			debug!("no date");
+		}
+	}
+
+	Err(Error::IncompleteError)
+}
+
+/// Convert an rss item into an atom entry.
+///
+/// The title is copied when the item has one. The published date comes from
+/// `get_base_date`; its error is passed on unchanged.
+fn translate_item(ipt: Item) -> Result<Entry, Error> {
+	let mut entry = Entry::default();
+
+	if let Some(title) = &ipt.title {
+		entry.set_title(Text::plain(title));
+	}
+
+	let published = get_base_date(&ipt)?;
+	entry.set_published(published);
+
+	Ok(entry)
+}
+
+
+/// Convert an rss channel into an atom feed.
+///
+/// The channel title and description become the feed title and subtitle, and
+/// every item is passed through `translate_item`. An item that fails to
+/// translate is dropped when `allow_fail` is set; otherwise the whole
+/// conversion stops with `Error::IncompleteError`. The feed's `updated` field
+/// is set to the current time.
+fn translate(ipt: Channel, allow_fail: bool) -> Result<Feed, Error> {
+	let mut feed = Feed::default();
+	feed.set_title(Text::plain(&ipt.title));
+	feed.set_subtitle(Some(Text::plain(&ipt.description)));
+
+	let mut converted: Vec<Entry> = Vec::new();
+	for item in ipt.into_items() {
+		match translate_item(item) {
+			Ok(entry) => converted.push(entry),
+			Err(_) if allow_fail => {},
+			Err(_) => return Err(Error::IncompleteError),
+		}
+	}
+
+	feed.set_entries(converted);
+	feed.set_updated(Local::now().to_utc());
+	Ok(feed)
+}
+
+/// Read an RSS channel from the file at `fp` and translate it into an atom feed.
+///
+/// When `allow_entry_fail` is true, items that cannot be translated are
+/// skipped; otherwise the first such item fails the whole call.
+///
+/// # Errors
+///
+/// Returns `Error::ParseError` if the file cannot be opened or cannot be read
+/// as an RSS channel; otherwise returns whatever `translate` yields.
+pub fn from_file(fp: &str, allow_entry_fail: bool) -> Result<Feed, Error> {
+	// NOTE(review): no I/O-specific variant is visible on `Error`, so an
+	// unreadable file is reported as `ParseError` instead of panicking.
+	let file = File::open(Path::new(fp)).map_err(|_| Error::ParseError)?;
+
+	let mut channel = Channel::read_from(BufReader::new(file))
+		.map_err(|_| Error::ParseError)?;
+
+	// Kept from the original: the channel-level Dublin Core extension is
+	// replaced with an empty one before translation.
+	channel.set_dublin_core_ext(DublinCoreExtension::default());
+	translate(channel, allow_entry_fail)
+}
+
+
+#[cfg(test)]
+mod test {
+	/// Parsing the bundled RSS fixture must succeed with strict item handling.
+	#[test]
+	fn test_rss_from_file() {
+		// env_logger is an optional dependency, so only touch it when the
+		// `logging` feature is on. try_init avoids a panic when another test
+		// has already installed a logger.
+		#[cfg(feature = "logging")]
+		let _ = env_logger::try_init();
+
+		if let Err(e) = super::from_file("testdata/test.rss.xml", false) {
+			panic!("{:?}", e);
+		}
+
+		// TODO: enable once an atom fixture is expected to be rejected.
+//		match super::from_file("testdata/test.atom.xml", false) {
+//			Ok(v) => {
+//				panic!("expected fail");
+//			},
+//			Err(e) => {
+//			},
+//		};
+	}
+}
diff --git a/crier-lib/src/tests.rs b/crier-lib/src/tests.rs
@@ -20,6 +20,7 @@ use crate::io::FeedGet;
use crate::meta::FeedMetadata;
use crate::Feed;
use crate::io::fs::FsCache;
+use crate::mem::MemCache;
#[cfg(feature = "fs")]
use crate::io::fs::FsFeed;
@@ -283,3 +284,15 @@ fn test_meta() {
o.apply(&mut feed).unwrap();
}
+/// Loads the RSS fixture through the filesystem getter and adds it to a
+/// sequencer backed by an in-memory cache.
+// FsFeed is only imported when the "fs" feature is enabled, so the test is
+// gated the same way.
+#[cfg(feature = "fs")]
+#[test]
+fn test_rss() {
+	let mut cache = MemCache::new();
+	let fs = FsFeed{};
+
+	let feed = fs.get("testdata/test.rss.xml", None).unwrap();
+	let mut seq = Sequencer::new();
+	seq = seq.with_cache(&mut cache);
+
+	seq.add_from(feed);
+}
diff --git a/crier-lib/testdata/test.rss.xml b/crier-lib/testdata/test.rss.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns="http://purl.org/rss/1.0/"
+ xmlns:admin="http://webns.net/mvcb/"
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:syn="http://purl.org/rss/1.0/modules/syndication/"
+ xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/"
+>
+
+<channel rdf:about="https://holbrook.no">
+<title>Bluto won't get Olivia</title>
+<link>https://holbrook.no</link>
+<description>This is the default summary of the project</description>
+<dc:date>2024-07-27T23:54:25</dc:date>
+<dc:publisher>Louis Holbrook</dc:publisher>
+<dc:creator>Louis Holbrook</dc:creator>
+<items>
+ <rdf:Seq>
+ <rdf:li rdf:resource="https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz" />
+ </rdf:Seq>
+</items>
+</channel>
+<item rdf:about="https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz">
+<title>bluto 0.0.2-alpha.1</title>
+<link>https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz</link>
+<description>Release announcement: Bluto won't get Olivia
+=============================================
+Version release: 0.0.2-alpha.1
+
+License: perl
+
+Source bundles
+--------------
+* https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz
+
+
+VCS
+---
+* git+git://git.defalsify.org/bluto.git
+
+
+ONLINE RESOURCES
+----------------
+* https://holbrook.no
+* https://git.defalsify.org
+
+
+CHANGELOG
+---------
+* foo bar baz
+* xyzzy foo foo
+* inky pinky blinky sue
+
+-----
+
+Generated by bluto v0.0.1 (perl v5.38.2) at 2024-07-27T23:54:24Z
+
+-----
+</description>
+<dc:date>2024-07-27T23:54:24</dc:date>
+</item>
+</rdf:RDF>
+\ No newline at end of file