commit 526fba3b6631caf3cf1a37e29ec5744fabe6ace0
parent 68a17a5fd159bf8e1cd9be2a492d3a3b157b6986
Author: lash <dev@holbrook.no>
Date: Mon, 25 Jul 2022 11:02:38 +0000
Add multirecord rdf processing, auto-check parse matches
Diffstat:
7 files changed, 115 insertions(+), 27 deletions(-)
diff --git a/src/biblatex.rs b/src/biblatex.rs
@@ -3,7 +3,10 @@ use std::io::{
};
use std::str;
-use log::debug;
+use log::{
+ debug,
+ error,
+};
use biblatex::{
Bibliography,
Type,
@@ -11,6 +14,7 @@ use biblatex::{
};
use crate::meta::MetaData;
+use crate::error::ParseError;
fn parse_digest(entry: &Entry) -> Vec<u8> {
let note = entry.get("note").unwrap();
@@ -38,10 +42,18 @@ fn parse_digest(entry: &Entry) -> Vec<u8> {
digest
}
-pub fn read_all(mut r: impl Read) -> Vec<MetaData> {
+pub fn read_all(mut r: impl Read) -> Result<Vec<MetaData>, ParseError> {
let mut s = String::new();
let c = r.read_to_string(&mut s);
- let bib = Bibliography::parse(&s).unwrap();
+ let bib = match Bibliography::parse(&s) {
+ Ok(v) => {
+ v
+ },
+ Err(e) => {
+ error!("parse error for biblatex");
+ return Err(ParseError);
+ },
+ };
let mut rr: Vec<MetaData> = vec!();
@@ -83,5 +95,5 @@ pub fn read_all(mut r: impl Read) -> Vec<MetaData> {
debug!("read metadata {:?}", &m);
rr.push(m);
}
- rr
+ Ok(rr)
}
diff --git a/src/error.rs b/src/error.rs
@@ -0,0 +1,2 @@
+/// Error returned by the import readers (`biblatex::read_all`, `rdf::read_all`)
+/// when a source could not be parsed.
+///
+/// This is a unit struct: it carries no detail about the cause of the failure.
+#[derive(Debug)]
+pub struct ParseError;
diff --git a/src/lib.rs b/src/lib.rs
@@ -10,6 +10,8 @@ pub mod rdf;
pub mod biblatex;
+pub mod error;
+
#[cfg(test)]
mod tests {
use env_logger;
diff --git a/src/main.rs b/src/main.rs
@@ -27,6 +27,7 @@ use log::{
use kitab::rdf::{
read as rdf_read,
+ read_all as rdf_read_all,
write as rdf_write,
};
use kitab::biblatex::{
@@ -129,19 +130,45 @@ fn store(index_path: &Path, m: &MetaData) {
debug!("stored as rdf {:?}", fp);
}
-fn exec_import_rdf(f: &Path, index_path: &Path) {
+/// Try to import every record of the rdf file at `f` into the index.
+///
+/// The file is read with `rdf_read_all`; each resulting record is handed to
+/// `store` together with `index_path`.
+///
+/// Returns `true` when the source was read as rdf and all its records were
+/// passed to `store`. Returns `false` when `rdf_read_all` reported an error,
+/// which lets the caller try another import format on the same path.
+///
+/// # Panics
+///
+/// Panics if the file at `f` cannot be opened.
+fn exec_import_rdf(f: &Path, index_path: &Path) -> bool {
     let f = File::open(f).unwrap();
-    let m = rdf_read(&f);
-    store(index_path, &m);
+    let entries = match rdf_read_all(&f) {
+        Ok(v) => v,
+        Err(e) => {
+            // A failed parse is not fatal here: the source may simply be in
+            // another format. Leave a trace instead of dropping the error.
+            debug!("rdf import source could not be parsed: {:?}", e);
+            return false;
+        },
+    };
+
+    debug!("successfully processed rdf import source");
+
+    for m in entries {
+        info!("importing rdf source {:?}", &m);
+        store(index_path, &m);
+    }
+    true
 }
-fn exec_import_biblatex(f: &Path, index_path: &Path) {
+/// Try to import every entry of the biblatex file at `f` into the index.
+///
+/// The file is read with `biblatex_read_all`; each resulting record is handed
+/// to `store` together with `index_path`.
+///
+/// Returns `true` when the source was read as biblatex and all its records
+/// were passed to `store`. Returns `false` when `biblatex_read_all` reported
+/// an error.
+///
+/// # Panics
+///
+/// Panics if the file at `f` cannot be opened.
+fn exec_import_biblatex(f: &Path, index_path: &Path) -> bool {
     let f = File::open(f).unwrap();
-    let entries = biblatex_read_all(&f);
+    let entries = match biblatex_read_all(&f) {
+        Ok(v) => v,
+        Err(e) => {
+            // A failed parse is not fatal here: the source may simply be in
+            // another format. Leave a trace instead of dropping the error.
+            debug!("biblatex import source could not be parsed: {:?}", e);
+            return false;
+        },
+    };
+
+    debug!("successfully processed biblatex import source");
     for m in entries {
+        info!("importing biblatex source {:?}", &m);
         store(index_path, &m);
     }
+
+    true
 }
fn exec_scan(p: &Path, index_path: &Path) {
@@ -180,8 +207,12 @@ fn main() {
Some(v) => {
let p = str_to_path(v);
info!("have path {:?}", &p);
- //return exec_import_rdf(p.as_path(), index_dir.as_path());
- return exec_import_biblatex(p.as_path(), index_dir.as_path());
+ if exec_import_rdf(p.as_path(), index_dir.as_path()) {
+ return;
+ }
+ if exec_import_biblatex(p.as_path(), index_dir.as_path()) {
+ return;
+ }
},
_ => {},
}
diff --git a/src/meta.rs b/src/meta.rs
@@ -39,7 +39,6 @@ use crate::dc::{
use log::{
debug,
- info,
};
pub type PublishDate = (u8, u8, u32);
@@ -308,27 +307,27 @@ impl MetaData {
match predicate.to_lowercase().as_str() {
"title" => {
self.set_title(object);
- info!("found title: {}", object);
+ debug!("found title: {}", object);
},
"author" => {
self.set_author(object);
- info!("found author: {}", object);
+ debug!("found author: {}", object);
},
"subject" => {
self.set_subject(object);
- info!("found subject: {}", object);
+ debug!("found subject: {}", object);
},
"typ" => {
self.set_typ(object);
- info!("found typ: {}", object);
+ debug!("found typ: {}", object);
},
"language" => {
self.set_language(object);
- info!("found language: {}", object);
+ debug!("found language: {}", object);
},
"mime" => {
self.set_mime_str(object);
- info!("found mime: {}", object);
+ debug!("found mime: {}", object);
},
_ => {
return false;
@@ -353,7 +352,7 @@ impl MetaData {
if self.mime() == None {
let mime = tree_magic::from_filepath(path);
self.set_mime_str(&mime);
- info!("magic set mime {}", mime);
+ debug!("magic set mime {}", mime);
}
}
diff --git a/src/rdf.rs b/src/rdf.rs
@@ -29,10 +29,11 @@ use urn::{
use log::{
debug,
info,
+ error,
};
use crate::meta::MetaData;
-
+use crate::error::ParseError;
use crate::dc::{
DC_IRI_TITLE,
DC_IRI_CREATOR,
@@ -42,6 +43,7 @@ use crate::dc::{
DC_IRI_MEDIATYPE,
};
+#[derive(Debug)]
pub enum RdfError {
UrnError(UrnError),
HashMismatchError,
@@ -125,7 +127,7 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
let v = subject_urn.nss();
let b = hex::decode(&v).unwrap();
if metadata.fingerprint().len() == 0 {
- info!("setting fingerprint {}", v);
+ debug!("setting fingerprint {}", v);
metadata.set_fingerprint(b);
} else if metadata.fingerprint() != v {
return Err(RdfError::HashMismatchError);
@@ -136,32 +138,32 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
DC_IRI_TITLE => {
let title = triple.object.to_string().replace("\"", "");
metadata.set_title(title.as_str());
- info!("found title: {}", title);
+ debug!("found title: {}", title);
},
DC_IRI_CREATOR => {
let author = triple.object.to_string().replace("\"", "");
metadata.set_author(author.as_str());
- info!("found author: {}", author);
+ debug!("found author: {}", author);
},
DC_IRI_SUBJECT => {
let mut subject = triple.object.to_string().replace("\"", "");
metadata.set_subject(subject.as_str());
- info!("found subject: {}", subject);
+ debug!("found subject: {}", subject);
},
DC_IRI_LANGUAGE => {
let mut lang = triple.object.to_string().replace("\"", "");
metadata.set_language(lang.as_str());
- info!("found language: {}", lang);
+ debug!("found language: {}", lang);
},
DC_IRI_TYPE => {
let mut typ = triple.object.to_string().replace("\"", "");
metadata.set_typ(typ.as_str());
- info!("found entry type: {}", typ);
+ debug!("found entry type: {}", typ);
},
DC_IRI_MEDIATYPE => {
let mut mime_type = triple.object.to_string().replace("\"", "");
metadata.set_mime_str(mime_type.as_str());
- info!("found mime type: {}", mime_type);
+ debug!("found mime type: {}", mime_type);
},
_ => {
debug!("skipping unknown predicate: {}", field);
@@ -170,7 +172,40 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
Ok(())
}
+/// Read every metadata record from the turtle source `r`.
+///
+/// Each parsed triple is applied to the record currently being filled using
+/// `handle_parse_match`. When that reports `RdfError::HashMismatchError`, the
+/// triple is taken to belong to a new record: an empty `MetaData` is appended
+/// and the same triple is applied to it instead.
+///
+/// Any other error from `handle_parse_match` is logged and the triple is
+/// skipped; it does not open a new record.
+///
+/// # Errors
+///
+/// Returns `ParseError` if the turtle parser fails, or if the first record
+/// ends up without a fingerprint (no usable record was found in the source).
+pub fn read_all(r: impl Read) -> Result<Vec<MetaData>, ParseError> {
+    // The first record exists up front; further ones are appended on demand.
+    let mut rr: Vec<MetaData> = vec![MetaData::empty()];
+    let mut tp = TurtleParser::new(BufReader::new(r), None);
+    let mut i: usize = 0;
+    let parse_result: Result<(), TurtleError> = tp.parse_all(&mut |triple| {
+        match handle_parse_match(&mut rr[i], triple) {
+            Ok(()) => {},
+            // The variant must be path-qualified: a bare `HashMismatchError`
+            // would be a fresh binding that matches every error.
+            Err(RdfError::HashMismatchError) => {
+                rr.push(MetaData::empty());
+                i += 1;
+                if let Err(e) = handle_parse_match(&mut rr[i], triple) {
+                    error!("{:?}", e);
+                }
+            },
+            Err(e) => {
+                error!("{:?}", e);
+            },
+        };
+        Ok(())
+    });
+    // A source that is not valid turtle must not be reported as a (possibly
+    // truncated) success. Logged at debug level because callers may probe
+    // several formats in turn.
+    if let Err(e) = parse_result {
+        debug!("turtle parse failed: {:?}", e);
+        return Err(ParseError);
+    }
+    // TODO: should check validity of all records
+    if rr[0].fingerprint() == "" {
+        return Err(ParseError);
+    }
+    Ok(rr)
+}
pub fn read(r: impl Read) -> MetaData {
+ let mut rr: Vec<MetaData> = vec!();
let mut metadata = MetaData::empty();
let bf = BufReader::new(r);
let mut tp = TurtleParser::new(bf, None);
diff --git a/testdata/meta.ttl b/testdata/meta.ttl
@@ -8,3 +8,10 @@
dcterms:type "article" ;
dcterms:MediaType "application/pdf" ;
dcterms:language "en-US" .
+
+<urn:sha512:f450b0b35ed8bd1c00b45b4f6ebd645079ae8bf3b8abd28aea62fc2ab3bab2878e021e0b6c182f776e24e5ed956c204d647b4c5b0f64a73e3753f736ffe2818c>
+ dcterms:title "Towards a Decentralized Data Marketplace for Smart Cities";
+ dcterms:creator "Gowri Sankar Ramachandran, Rahul Radhakrishnan and Bhaskar Krishnamachari";
+ dcterms:type "article" ;
+ dcterms:MediaType "application/pdf" ;
+ dcterms:language "en-US" .