kitab

Unnamed repository; edit this file 'description' to name the repository.
Info | Log | Files | Refs | LICENSE

commit 526fba3b6631caf3cf1a37e29ec5744fabe6ace0
parent 68a17a5fd159bf8e1cd9be2a492d3a3b157b6986
Author: lash <dev@holbrook.no>
Date:   Mon, 25 Jul 2022 11:02:38 +0000

Add multirecord rdf processing, auto-check parse matches

Diffstat:
M src/biblatex.rs | 20 ++++++++++++++++----
A src/error.rs | 2 ++
M src/lib.rs | 2 ++
M src/main.rs | 45 ++++++++++++++++++++++++++++++++++++++-------
M src/meta.rs | 15 +++++++--------
M src/rdf.rs | 51 +++++++++++++++++++++++++++++++++++++++++++--------
M testdata/meta.ttl | 7 +++++++
7 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/src/biblatex.rs b/src/biblatex.rs @@ -3,7 +3,10 @@ use std::io::{ }; use std::str; -use log::debug; +use log::{ + debug, + error, +}; use biblatex::{ Bibliography, Type, @@ -11,6 +14,7 @@ use biblatex::{ }; use crate::meta::MetaData; +use crate::error::ParseError; fn parse_digest(entry: &Entry) -> Vec<u8> { let note = entry.get("note").unwrap(); @@ -38,10 +42,18 @@ fn parse_digest(entry: &Entry) -> Vec<u8> { digest } -pub fn read_all(mut r: impl Read) -> Vec<MetaData> { +pub fn read_all(mut r: impl Read) -> Result<Vec<MetaData>, ParseError> { let mut s = String::new(); let c = r.read_to_string(&mut s); - let bib = Bibliography::parse(&s).unwrap(); + let bib = match Bibliography::parse(&s) { + Ok(v) => { + v + }, + Err(e) => { + error!("parse error for biblatex"); + return Err(ParseError); + }, + }; let mut rr: Vec<MetaData> = vec!(); @@ -83,5 +95,5 @@ pub fn read_all(mut r: impl Read) -> Vec<MetaData> { debug!("read metadata {:?}", &m); rr.push(m); } - rr + Ok(rr) } diff --git a/src/error.rs b/src/error.rs @@ -0,0 +1,2 @@ +#[derive(Debug)] +pub struct ParseError; diff --git a/src/lib.rs b/src/lib.rs @@ -10,6 +10,8 @@ pub mod rdf; pub mod biblatex; +pub mod error; + #[cfg(test)] mod tests { use env_logger; diff --git a/src/main.rs b/src/main.rs @@ -27,6 +27,7 @@ use log::{ use kitab::rdf::{ read as rdf_read, + read_all as rdf_read_all, write as rdf_write, }; use kitab::biblatex::{ @@ -129,19 +130,45 @@ fn store(index_path: &Path, m: &MetaData) { debug!("stored as rdf {:?}", fp); } -fn exec_import_rdf(f: &Path, index_path: &Path) { +fn exec_import_rdf(f: &Path, index_path: &Path) -> bool { let f = File::open(f).unwrap(); - let m = rdf_read(&f); - store(index_path, &m); + let entries = match rdf_read_all(&f) { + Ok(v) => { + v + }, + Err(e) => { + return false; + } + }; + + debug!("successfully processed rdf import source"); + + for m in entries { + info!("importing rdf source {:?}", &m); + store(index_path, &m); + } + true } -fn exec_import_biblatex(f: 
&Path, index_path: &Path) { +fn exec_import_biblatex(f: &Path, index_path: &Path) -> bool { let f = File::open(f).unwrap(); - let entries = biblatex_read_all(&f); + let entries = match biblatex_read_all(&f) { + Ok(v) => { + v + }, + Err(e) => { + return false; + } + }; + + debug!("successfully processed biblatex import source"); for m in entries { + info!("importing biblatex source {:?}", &m); store(index_path, &m); } + + true } fn exec_scan(p: &Path, index_path: &Path) { @@ -180,8 +207,12 @@ fn main() { Some(v) => { let p = str_to_path(v); info!("have path {:?}", &p); - //return exec_import_rdf(p.as_path(), index_dir.as_path()); - return exec_import_biblatex(p.as_path(), index_dir.as_path()); + if exec_import_rdf(p.as_path(), index_dir.as_path()) { + return; + } + if exec_import_biblatex(p.as_path(), index_dir.as_path()) { + return; + } }, _ => {}, } diff --git a/src/meta.rs b/src/meta.rs @@ -39,7 +39,6 @@ use crate::dc::{ use log::{ debug, - info, }; pub type PublishDate = (u8, u8, u32); @@ -308,27 +307,27 @@ impl MetaData { match predicate.to_lowercase().as_str() { "title" => { self.set_title(object); - info!("found title: {}", object); + debug!("found title: {}", object); }, "author" => { self.set_author(object); - info!("found author: {}", object); + debug!("found author: {}", object); }, "subject" => { self.set_subject(object); - info!("found subject: {}", object); + debug!("found subject: {}", object); }, "typ" => { self.set_typ(object); - info!("found typ: {}", object); + debug!("found typ: {}", object); }, "language" => { self.set_language(object); - info!("found language: {}", object); + debug!("found language: {}", object); }, "mime" => { self.set_mime_str(object); - info!("found mime: {}", object); + debug!("found mime: {}", object); }, _ => { return false; @@ -353,7 +352,7 @@ impl MetaData { if self.mime() == None { let mime = tree_magic::from_filepath(path); self.set_mime_str(&mime); - info!("magic set mime {}", mime); + debug!("magic set mime {}", 
mime); } } diff --git a/src/rdf.rs b/src/rdf.rs @@ -29,10 +29,11 @@ use urn::{ use log::{ debug, info, + error, }; use crate::meta::MetaData; - +use crate::error::ParseError; use crate::dc::{ DC_IRI_TITLE, DC_IRI_CREATOR, @@ -42,6 +43,7 @@ use crate::dc::{ DC_IRI_MEDIATYPE, }; +#[derive(Debug)] pub enum RdfError { UrnError(UrnError), HashMismatchError, @@ -125,7 +127,7 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf let v = subject_urn.nss(); let b = hex::decode(&v).unwrap(); if metadata.fingerprint().len() == 0 { - info!("setting fingerprint {}", v); + debug!("setting fingerprint {}", v); metadata.set_fingerprint(b); } else if metadata.fingerprint() != v { return Err(RdfError::HashMismatchError); @@ -136,32 +138,32 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf DC_IRI_TITLE => { let title = triple.object.to_string().replace("\"", ""); metadata.set_title(title.as_str()); - info!("found title: {}", title); + debug!("found title: {}", title); }, DC_IRI_CREATOR => { let author = triple.object.to_string().replace("\"", ""); metadata.set_author(author.as_str()); - info!("found author: {}", author); + debug!("found author: {}", author); }, DC_IRI_SUBJECT => { let mut subject = triple.object.to_string().replace("\"", ""); metadata.set_subject(subject.as_str()); - info!("found subject: {}", subject); + debug!("found subject: {}", subject); }, DC_IRI_LANGUAGE => { let mut lang = triple.object.to_string().replace("\"", ""); metadata.set_language(lang.as_str()); - info!("found language: {}", lang); + debug!("found language: {}", lang); }, DC_IRI_TYPE => { let mut typ = triple.object.to_string().replace("\"", ""); metadata.set_typ(typ.as_str()); - info!("found entry type: {}", typ); + debug!("found entry type: {}", typ); }, DC_IRI_MEDIATYPE => { let mut mime_type = triple.object.to_string().replace("\"", ""); metadata.set_mime_str(mime_type.as_str()); - info!("found mime type: {}", mime_type); + 
debug!("found mime type: {}", mime_type); }, _ => { debug!("skipping unknown predicate: {}", field); @@ -170,7 +172,40 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf Ok(()) } +pub fn read_all(r: impl Read) -> Result<Vec<MetaData>, ParseError> { + let mut rr: Vec<MetaData> = vec!(); + let bf = BufReader::new(r); + let mut tp = TurtleParser::new(bf, None); + rr.push(MetaData::empty()); + let mut i: usize = 0; + let r: Result<_, TurtleError> = tp.parse_all(&mut |r| { + match r { + Triple{subject, predicate, object } => { + match handle_parse_match(&mut rr[i], r) { + Err(HashMismatchError) => { + rr.push(MetaData::empty()); + i += 1; + match handle_parse_match(&mut rr[i], r) { + Err(e) => { + error!("{:?}", e); + }, + _ => {}, + }; + }, + _ => {}, + }; + }, + } + Ok(()) + }); + // TODO: should check validity of all records + if rr[0].fingerprint() == "" { + return Err(ParseError); + } + Ok(rr) +} pub fn read(r: impl Read) -> MetaData { + let mut rr: Vec<MetaData> = vec!(); let mut metadata = MetaData::empty(); let bf = BufReader::new(r); let mut tp = TurtleParser::new(bf, None); diff --git a/testdata/meta.ttl b/testdata/meta.ttl @@ -8,3 +8,10 @@ dcterms:type "article" ; dcterms:MediaType "application/pdf" ; dcterms:language "en-US" . + +<urn:sha512:f450b0b35ed8bd1c00b45b4f6ebd645079ae8bf3b8abd28aea62fc2ab3bab2878e021e0b6c182f776e24e5ed956c204d647b4c5b0f64a73e3753f736ffe2818c> + dcterms:title "Towards a Decentralized Data Marketplace for Smart Cities"; + dcterms:creator "Gowri Sankar Ramachandran, Rahul Radhakrishnan and Bhaskar Krishnamachari"; + dcterms:type "article" ; + dcterms:MediaType "application/pdf" ; + dcterms:language "en-US" .