commit 526fba3b6631caf3cf1a37e29ec5744fabe6ace0
parent 68a17a5fd159bf8e1cd9be2a492d3a3b157b6986
Author: lash <dev@holbrook.no>
Date:   Mon, 25 Jul 2022 11:02:38 +0000
Add multirecord rdf processing, auto-check parse matches
Diffstat:
7 files changed, 115 insertions(+), 27 deletions(-)
diff --git a/src/biblatex.rs b/src/biblatex.rs
@@ -3,7 +3,10 @@ use std::io::{
 };
 use std::str;
 
-use log::debug;
+use log::{
+    debug,
+    error,
+};
 use biblatex::{
     Bibliography,
     Type,
@@ -11,6 +14,7 @@ use biblatex::{
 };
 
 use crate::meta::MetaData;
+use crate::error::ParseError;
 
 fn parse_digest(entry: &Entry) -> Vec<u8> {
     let note = entry.get("note").unwrap();
@@ -38,10 +42,18 @@ fn parse_digest(entry: &Entry) -> Vec<u8> {
     digest
 }
 
-pub fn read_all(mut r: impl Read) -> Vec<MetaData> {
+pub fn read_all(mut r: impl Read) -> Result<Vec<MetaData>, ParseError> {
     let mut s = String::new();
     let c = r.read_to_string(&mut s);
-    let bib = Bibliography::parse(&s).unwrap();
+    let bib = match Bibliography::parse(&s) {
+        Ok(v) => {
+            v
+        },
+        Err(e) => {
+            error!("parse error for biblatex");
+            return Err(ParseError);
+        },
+    };
 
     let mut rr: Vec<MetaData> = vec!();
 
@@ -83,5 +95,5 @@ pub fn read_all(mut r: impl Read) -> Vec<MetaData> {
         debug!("read metadata {:?}", &m);
         rr.push(m);
     }
-    rr
+    Ok(rr)
 }
diff --git a/src/error.rs b/src/error.rs
@@ -0,0 +1,2 @@
+#[derive(Debug)]
+pub struct ParseError;
diff --git a/src/lib.rs b/src/lib.rs
@@ -10,6 +10,8 @@ pub mod rdf;
 
 pub mod biblatex;
 
+pub mod error;
+
 #[cfg(test)]
 mod tests {
     use env_logger;
diff --git a/src/main.rs b/src/main.rs
@@ -27,6 +27,7 @@ use log::{
 
 use kitab::rdf::{
     read as rdf_read,
+    read_all as rdf_read_all,
     write as rdf_write,
 };
 use kitab::biblatex::{
@@ -129,19 +130,45 @@ fn store(index_path: &Path, m: &MetaData) {
     debug!("stored as rdf {:?}", fp);
 }
 
-fn exec_import_rdf(f: &Path, index_path: &Path) {
+/// Import every rdf record in `f` into the index; `false` when `f` is not valid rdf.
+fn exec_import_rdf(f: &Path, index_path: &Path) -> bool {
     let f = File::open(f).unwrap();
-    let m = rdf_read(&f);
-    store(index_path, &m);    
+    let records = match rdf_read_all(&f) {
+        Ok(records) => records,
+        Err(e) => {
+            // Not fatal: the caller may still try another import format.
+            debug!("source did not parse as rdf: {:?}", e);
+            return false;
+        },
+    };
+    debug!("successfully processed rdf import source");
+
+    for record in records {
+        info!("importing rdf source {:?}", &record);
+        store(index_path, &record);
+    }
+    true
 }
 
-fn exec_import_biblatex(f: &Path, index_path: &Path) {
+/// Import every biblatex entry in `f` into the index; `false` when parsing fails.
+fn exec_import_biblatex(f: &Path, index_path: &Path) -> bool {
     let f = File::open(f).unwrap();
-    let entries = biblatex_read_all(&f);
+    let entries = match biblatex_read_all(&f) {
+        Ok(entries) => entries,
+        Err(e) => {
+            debug!("source did not parse as biblatex: {:?}", e);
+            return false;
+        },
+    };
+
+    debug!("successfully processed biblatex import source");
 
     for m in entries {
+        info!("importing biblatex source {:?}", &m);
         store(index_path, &m);    
     }
+
+    true
 }
 
 fn exec_scan(p: &Path, index_path: &Path) {
@@ -180,8 +207,12 @@ fn main() {
         Some(v) => {
             let p = str_to_path(v);
             info!("have path {:?}", &p);
-            //return exec_import_rdf(p.as_path(), index_dir.as_path());
-            return exec_import_biblatex(p.as_path(), index_dir.as_path());
+            if exec_import_rdf(p.as_path(), index_dir.as_path()) {
+                return;
+            }
+            if exec_import_biblatex(p.as_path(), index_dir.as_path()) {
+                return;
+            }
         },
         _ => {},
     }
diff --git a/src/meta.rs b/src/meta.rs
@@ -39,7 +39,6 @@ use crate::dc::{
 
 use log::{
     debug,
-    info,
 };
 
 pub type PublishDate = (u8, u8, u32);
@@ -308,27 +307,27 @@ impl MetaData {
         match predicate.to_lowercase().as_str() {
             "title" => {
                 self.set_title(object);
-                info!("found title: {}", object);
+                debug!("found title: {}", object);
             },
             "author" => {
                 self.set_author(object);
-                info!("found author: {}", object);
+                debug!("found author: {}", object);
             },
             "subject" => {
                 self.set_subject(object);
-                info!("found subject: {}", object);
+                debug!("found subject: {}", object);
             },
             "typ" => {
                 self.set_typ(object);
-                info!("found typ: {}", object);
+                debug!("found typ: {}", object);
             },
             "language" => {
                 self.set_language(object);
-                info!("found language: {}", object);
+                debug!("found language: {}", object);
             },
             "mime" => {
                 self.set_mime_str(object);
-                info!("found mime: {}", object);
+                debug!("found mime: {}", object);
             },
             _ => {
                 return false;
@@ -353,7 +352,7 @@ impl MetaData {
         if self.mime() == None {
             let mime = tree_magic::from_filepath(path);
             self.set_mime_str(&mime);
-            info!("magic set mime {}", mime);
+            debug!("magic set mime {}", mime);
         }
     }
 
diff --git a/src/rdf.rs b/src/rdf.rs
@@ -29,10 +29,11 @@ use urn::{
 use log::{
     debug,
     info,
+    error,
 };
 
 use crate::meta::MetaData;
-
+use crate::error::ParseError;
 use crate::dc::{
     DC_IRI_TITLE,
     DC_IRI_CREATOR,
@@ -42,6 +43,7 @@ use crate::dc::{
     DC_IRI_MEDIATYPE,
 };
 
+#[derive(Debug)]
 pub enum RdfError {
     UrnError(UrnError),
     HashMismatchError,
@@ -125,7 +127,7 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
     let v = subject_urn.nss();
     let b = hex::decode(&v).unwrap();
     if metadata.fingerprint().len() == 0 {
-        info!("setting fingerprint {}", v);
+        debug!("setting fingerprint {}", v);
         metadata.set_fingerprint(b);
     } else if metadata.fingerprint() != v {
         return Err(RdfError::HashMismatchError);
@@ -136,32 +138,32 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
         DC_IRI_TITLE => {
             let title = triple.object.to_string().replace("\"", "");
             metadata.set_title(title.as_str());
-            info!("found title: {}", title);
+            debug!("found title: {}", title);
         },
         DC_IRI_CREATOR => {
             let author = triple.object.to_string().replace("\"", "");
             metadata.set_author(author.as_str());
-            info!("found author: {}", author);
+            debug!("found author: {}", author);
         },
         DC_IRI_SUBJECT => {
             let mut subject = triple.object.to_string().replace("\"", "");
             metadata.set_subject(subject.as_str());
-            info!("found subject: {}", subject);
+            debug!("found subject: {}", subject);
         },
         DC_IRI_LANGUAGE => {
             let mut lang = triple.object.to_string().replace("\"", "");
             metadata.set_language(lang.as_str());
-            info!("found language: {}", lang);
+            debug!("found language: {}", lang);
         },
         DC_IRI_TYPE => {
             let mut typ = triple.object.to_string().replace("\"", "");
             metadata.set_typ(typ.as_str());
-            info!("found entry type: {}", typ);
+            debug!("found entry type: {}", typ);
         },
         DC_IRI_MEDIATYPE => {
             let mut mime_type = triple.object.to_string().replace("\"", "");
             metadata.set_mime_str(mime_type.as_str());
-            info!("found mime type: {}", mime_type);
+            debug!("found mime type: {}", mime_type);
         },
         _ => {
             debug!("skipping unknown predicate: {}", field);
@@ -170,7 +172,40 @@ fn handle_parse_match(metadata: &mut MetaData, triple: Triple) -> Result<(), Rdf
     Ok(())
 }
 
+/// Parse all Turtle triples from `r`, starting a new `MetaData` whenever the subject digest changes.
+///
+/// Fails with `ParseError` if the first record never received a fingerprint.
+pub fn read_all(r: impl Read) -> Result<Vec<MetaData>, ParseError> {
+    let mut rr: Vec<MetaData> = vec!();
+    let bf = BufReader::new(r);
+    let mut tp = TurtleParser::new(bf, None);
+    rr.push(MetaData::empty());
+    let mut i: usize = 0;
+    let r: Result<_, TurtleError> = tp.parse_all(&mut |r| {
+        match handle_parse_match(&mut rr[i], r) {
+            // The variant must be path-qualified: a bare `HashMismatchError`
+            // is a fresh binding that matches every error, not just this one.
+            Err(RdfError::HashMismatchError) => {
+                // A subject with a different digest begins the next record.
+                rr.push(MetaData::empty());
+                i += 1;
+                if let Err(e) = handle_parse_match(&mut rr[i], r) {
+                    error!("{:?}", e);
+                }
+            },
+            Err(e) => error!("{:?}", e),
+            Ok(()) => {},
+        };
+        Ok(())
+    });
+    // TODO: should check validity of all records
+    if rr[0].fingerprint() == "" {
+        return Err(ParseError);
+    }
+    Ok(rr)
+}
 pub fn read(r: impl Read) -> MetaData {
+    let mut rr: Vec<MetaData> = vec!();
     let mut metadata = MetaData::empty();
     let bf = BufReader::new(r);
     let mut tp = TurtleParser::new(bf, None);
diff --git a/testdata/meta.ttl b/testdata/meta.ttl
@@ -8,3 +8,10 @@
 	dcterms:type "article" ;
 	dcterms:MediaType "application/pdf" ;
 	dcterms:language "en-US" .
+
+<urn:sha512:f450b0b35ed8bd1c00b45b4f6ebd645079ae8bf3b8abd28aea62fc2ab3bab2878e021e0b6c182f776e24e5ed956c204d647b4c5b0f64a73e3753f736ffe2818c>
+	dcterms:title "Towards a Decentralized Data Marketplace for Smart Cities";
+	dcterms:creator "Gowri Sankar Ramachandran, Rahul Radhakrishnan and Bhaskar Krishnamachari";
+	dcterms:type "article" ;
+	dcterms:MediaType "application/pdf" ;
+	dcterms:language "en-US" .