crier

Unnamed repository; edit this file 'description' to name the repository.
Info | Log | Files | Refs

commit 18cb194861bd22cc396f6284a05f185ecfaf7a86
parent 3ad1a9fd70b7f68915987477e88968ea56a8199f
Author: lash <dev@holbrook.no>
Date:   Sun, 28 Jul 2024 01:25:31 +0100

Change rss parser, include dublincore date in entry rss parse

Diffstat:
Mcrier-lib/Cargo.lock | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrier-lib/Cargo.toml | 15+++++++++++++--
Mcrier-lib/src/lib.rs | 28+++++++++++++++++++++++++++-
Acrier-lib/src/log.rs | 21+++++++++++++++++++++
Acrier-lib/src/rss.rs | 184+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrier-lib/src/tests.rs | 13+++++++++++++
Acrier-lib/testdata/test.rss.xml | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 405 insertions(+), 3 deletions(-)

diff --git a/crier-lib/Cargo.lock b/crier-lib/Cargo.lock @@ -39,6 +39,17 @@ dependencies = [ ] [[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] name = "autocfg" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -110,9 +121,11 @@ dependencies = [ "atom_syndication", "chrono", "digest", + "env_logger", "feed-rs", "http", "itertools", + "log", "mediatype", "quick-xml 0.31.0", "rs_sha512", @@ -233,6 +246,19 @@ dependencies = [ ] [[package]] +name = "env_logger" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] name = "errno" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -302,6 +328,15 @@ dependencies = [ ] [[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] name = "http" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -313,6 +348,12 @@ dependencies = [ ] [[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] name = "iana-time-zone" version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -645,6 +686,15 @@ dependencies = [ ] [[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] name = "tinyvec" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -773,6 +823,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] name = "windows-core" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/crier-lib/Cargo.toml b/crier-lib/Cargo.toml @@ -8,9 +8,7 @@ authors = ["Louis Holbrook <dev@holbrook.no>"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -#rss = "^2.0.3" feed-rs = "^2.0" -rss = "^2.0" #activitypub_federation = "^0.4.0" #json-ld = "^0.14.1" digest = "^0.10.7" @@ -27,6 +25,10 @@ atom_syndication = "^0.12" #version = "^0.31" #features = ["serialize"] +[dependencies.rss] +version = "^2.0" +features = ["atom"] + [patch.crates-io] #atom_syndication = { path = "/home/lash/src/contrib/atom_syndication" } atom_syndication = { git = "git://holbrook.no/contrib/atom_syndication", rev="9985c1610b2b819f5bd2f7a719567ee0b5419b85" } #branch="lash/entry-fromstr" @@ -36,5 +38,14 @@ tempfile = "3.3.0" mediatype = "^0.19" quick-xml = "^0.31" +[dependencies.env_logger] +version = "^0.9" +optional = true + +[dependencies.log] +version = "^0.4" +optional = true + [features] fs = [] +logging = ["log", "env_logger"] diff --git a/crier-lib/src/lib.rs b/crier-lib/src/lib.rs @@ -28,6 +28,8 @@ pub mod mem; mod meta; mod cache; +mod rss; +mod log; use meta::FeedMetadata; use mem::CacheWriter; use cache::Cache; @@ -37,6 +39,7 @@ pub enum Error { WriteError, CacheError, ParseError, + IncompleteError, } pub struct Sequencer<'a> { @@ -254,6 +257,27 @@ impl SequencerEntry { fn to_writer(&self, v: Vec<u8>) -> BufWriter<Vec<u8>> { BufWriter::with_capacity(10241024, v) } + +} + +fn get_base_date(entry: &Entry) -> FixedDateTime { + let d: FixedDateTime; + + match entry.published { + Some(v) => { + return FixedDateTime::parse_from_rfc2822(v.to_rfc2822().as_str()).unwrap(); + }, + None => {}, + }; + + match entry.updated { + Some(v) => { + return FixedDateTime::parse_from_rfc2822(v.to_rfc2822().as_str()).unwrap(); + }, + None => {}, + }; + + return FixedDateTime::parse_from_rfc2822(entry.updated.unwrap().to_rfc2822().as_str()).unwrap(); } /// TODO: split out field translations to separate module @@ -268,11 +292,13 @@ impl Into<Vec<u8>> for SequencerEntry { b = Vec::new(); w = o.to_writer(b); + let mut d = get_base_date(&self.entry); + out_entry = OutEntry::default(); out_entry.set_id(self.entry.id); out_entry.set_title(self.entry.title.unwrap().content); - let mut d = FixedDateTime::parse_from_rfc2822(self.entry.published.unwrap().to_rfc2822().as_str()).unwrap(); + out_entry.set_published(Some(d.clone())); match self.entry.updated { diff --git a/crier-lib/src/log.rs b/crier-lib/src/log.rs @@ -0,0 +1,21 @@ +#[cfg(feature = "logging")] +use env_logger; +pub use log::debug; +pub use log::info; + +pub fn init() { + env_logger::init(); +} + +#[cfg(not(feature = "logging"))] +#[macro_export] +macro_rules! info { + (*) => {}; +} + +#[cfg(not(feature = "logging"))] +#[macro_export] +macro_rules! debug { + (*) => {}; +} + diff --git a/crier-lib/src/rss.rs b/crier-lib/src/rss.rs @@ -0,0 +1,184 @@ +use std::path::Path; +use std::fs::File; +use std::io::BufReader; +use crate::Error; +use crate::log::info; +use crate::log::debug; + +use rss::Channel; +use rss::Item; +use rss::extension::dublincore::DublinCoreExtension; +use atom_syndication::Feed; +use atom_syndication::Entry; +use atom_syndication::Text; +use atom_syndication::TextType; +use atom_syndication::FixedDateTime; +use chrono::naive::NaiveDateTime; +use chrono::Local; +use chrono::offset::Utc; + +/// try to coerce the item field into a valid date +fn parse_date(v: &String) -> Result<FixedDateTime, Error> { + match FixedDateTime::parse_from_rfc2822(v.as_str()) { + Ok(r) => { + return Ok(r); + }, + Err(e) => {}, + }; + match FixedDateTime::parse_from_rfc3339(v.as_str()) { + Ok(r) => { + return Ok(r); + }, + Err(e) => {}, + }; + match FixedDateTime::parse_from_str(v.as_str(), "%Y-%m-%dT%H:%M:%S") { + Ok(r) => { + return Ok(r); + }, + Err(e) => { + }, + }; + match NaiveDateTime::parse_from_str(v.as_str(), "%Y-%m-%dT%H:%M:%S") { + Ok(r) => { + return Ok(r.and_utc().fixed_offset()); + }, + Err(e) => { + }, + }; + + + Err(Error::ParseError) +} + +/// try different item fields to determine the date +fn get_base_date(ipt: &Item) -> Result<FixedDateTime, Error> { + let mut ds = String::new(); + + match &ipt.pub_date { + Some(v) => { + ds.push_str(v.as_str()); + }, + _ => {}, + }; + match parse_date(&ds) { + Ok(v) => { + return Ok(v); + }, + Err(e) => {}, + }; + + match &ipt.dublin_core_ext { + Some(v) => { + for vv in v.dates() { + match parse_date(vv) { + Ok(vvv) => { + return Ok(vvv); + }, + Err(e) => { + debug!("no date"); + }, + } + } + }, + _ => {}, + } + + Err(Error::IncompleteError) +} + +/// coerce the rss item into an atom entry +fn translate_item(ipt: Item) -> Result<Entry, Error> { + let mut opt = Entry::default(); + + match &ipt.title { + Some(v) => { + opt.set_title(Text::plain(v)); + }, + _ => {}, + }; + + match get_base_date(&ipt) { + Ok(v) => { + opt.set_published(v); + }, + Err(e) => { + return Err(e); + } + } + Ok(opt) +} + + +fn translate(ipt: Channel, allow_fail: bool) -> Result<Feed, Error> { + let mut entries: Vec<Entry>; + let mut opt = Feed::default(); + + opt.set_title(Text::plain(&ipt.title)); + + opt.set_subtitle(Some(Text::plain(&ipt.description))); + + entries = vec!(); + for v in ipt.into_items() { + match translate_item(v) { + Ok(v) => { + entries.push(v); + }, + Err(e) => { + if !allow_fail { + return Err(Error::IncompleteError); + } + }, + } + } + + opt.set_entries(entries); + opt.set_updated(Local::now().to_utc()); + Ok(opt) +} + +pub fn from_file(fp: &str, allow_entry_fail: bool) -> Result<Feed, Error> { + let mut o: Channel; + let r: Feed; + + let p = Path::new(fp); + let f = File::open(p).unwrap(); + let b = BufReader::new(f); + + match Channel::read_from(b) { + Ok(v) => { + o = v; + }, + Err(e) => { + return Err(Error::ParseError); + }, + }; + o.set_dublin_core_ext(DublinCoreExtension::default()); + translate(o, allow_entry_fail) +} + + +mod test { + use std::path::Path; + use atom_syndication::Feed; + use crate::log; + + #[test] + fn test_rss_from_file() { + env_logger::init(); + let mut r: Feed; + match super::from_file("testdata/test.rss.xml", false) { + Ok(v) => { + }, + Err(e) => { + panic!("{:?}", e); + }, + }; +// match super::from_file("testdata/test.atom.xml", false) { +// Ok(v) => { +// panic!("expected fail"); +// }, +// Err(e) => { +// }, +// }; + } +} diff --git a/crier-lib/src/tests.rs b/crier-lib/src/tests.rs @@ -20,6 +20,7 @@ use crate::io::FeedGet; use crate::meta::FeedMetadata; use crate::Feed; use crate::io::fs::FsCache; +use crate::mem::MemCache; #[cfg(feature = "fs")] use crate::io::fs::FsFeed; @@ -283,3 +284,15 @@ fn test_meta() { o.apply(&mut feed).unwrap(); } +#[test] +fn test_rss() { +let fs = FsFeed{}; + let mut cache = MemCache::new(); + let fs = FsFeed{}; + + let feed = fs.get("testdata/test.rss.xml", None).unwrap(); + let mut seq = Sequencer::new(); + seq = seq.with_cache(&mut cache); + + seq.add_from(feed); +} diff --git a/crier-lib/testdata/test.rss.xml b/crier-lib/testdata/test.rss.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<rdf:RDF + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns="http://purl.org/rss/1.0/" + xmlns:admin="http://webns.net/mvcb/" + xmlns:content="http://purl.org/rss/1.0/modules/content/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" + xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" +> + +<channel rdf:about="https://holbrook.no"> +<title>Bluto won&#x27;t get Olivia</title> +<link>https://holbrook.no</link> +<description>This is the default summary of the project</description> +<dc:date>2024-07-27T23:54:25</dc:date> +<dc:publisher>Louis Holbrook</dc:publisher> +<dc:creator>Louis Holbrook</dc:creator> +<items> + <rdf:Seq> + <rdf:li rdf:resource="https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz" /> + </rdf:Seq> +</items> +</channel> +<item rdf:about="https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz"> +<title>bluto 0.0.2-alpha.1</title> +<link>https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz</link> +<description>Release announcement: Bluto won&#x27;t get Olivia +============================================= +Version release: 0.0.2-alpha.1 + +License: perl + +Source bundles +-------------- +* https://holbrook.no/share/releases/bluto/bluto-0.0.2-alpha.1+build.7ed4da0449182876aad5e79f6ba8ed1e06766c12.tar.gz + + +VCS +--- +* git+git://git.defalsify.org/bluto.git + + +ONLINE RESOURCES +---------------- +* https://holbrook.no +* https://git.defalsify.org + + +CHANGELOG +--------- +* foo bar baz +* xyzzy foo foo +* inky pinky blinky sue + +----- + +Generated by bluto v0.0.1 (perl v5.38.2) at 2024-07-27T23:54:24Z + +----- +</description> +<dc:date>2024-07-27T23:54:24</dc:date> +</item> +</rdf:RDF> +\ No newline at end of file