diff --git a/.gitignore b/.gitignore index 9df1afb..45647a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ data +data-test target output diff --git a/pom.xml b/pom.xml index 9e75a55..5192c9c 100644 --- a/pom.xml +++ b/pom.xml @@ -30,9 +30,9 @@ 8.6.3 - com.googlecode.json-simple - json-simple - 1.1.1 + com.google.code.gson + gson + 2.8.6 diff --git a/src/main/java/org/RI/P2/Indexer.java b/src/main/java/org/RI/P2/Indexer.java index 2259f43..2c4d4fb 100644 --- a/src/main/java/org/RI/P2/Indexer.java +++ b/src/main/java/org/RI/P2/Indexer.java @@ -19,16 +19,16 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.JSONValue; +import com.google.gson.Gson; public class Indexer { IndexWriter index; @@ -57,13 +57,12 @@ public class Indexer { return files; } - JSONArray parseJSONFile(File file) throws IOException { + Paper parseJSONFile(File file) throws IOException { InputStream jsonFile = new FileInputStream(file); Reader readerJson = new InputStreamReader(jsonFile); - Object fileObject = JSONValue.parse(readerJson); - JSONArray arrayObject = new JSONArray(); - arrayObject.add(fileObject); - return arrayObject; + Gson gson = new Gson(); + Paper data = gson.fromJson(readerJson, Paper.class); + return data; } void createIndex() throws IOException { @@ -73,11 +72,16 @@ public class Indexer { index = new IndexWriter(dir, config); } - void addDocuments(JSONArray jsonObjects) throws IOException { - for (JSONObject object : (List) jsonObjects) { - Document doc = new Document(); - index.addDocument(doc); + void addDocument(Paper paper) throws IOException { + Document doc = new Document(); + doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES)); + doc.add(new TextField("title", paper.metadata.title, Field.Store.YES)); + for (Author author : paper.metadata.authors) { + String authorName = author.first + " " + author.middle + " " + author.last; + authorName = authorName.replaceAll("\\p{P}", ""); + doc.add(new TextField("authors", authorName, Field.Store.YES)); } + index.addDocument(doc); } void commitChanges() throws IOException { @@ -88,8 +92,8 @@ public class Indexer { void populateIndex() throws IOException, ParseException { createIndex(); for (File file : files) { - JSONArray jsonObjects = parseJSONFile(file); - addDocument(jsonObjects); + Paper paper = parseJSONFile(file); + addDocument(paper); } commitChanges(); } diff --git a/src/main/java/org/RI/P2/Paper.java b/src/main/java/org/RI/P2/Paper.java new file mode 100644 index 0000000..e443c1f --- /dev/null +++ b/src/main/java/org/RI/P2/Paper.java @@ -0,0 +1,45 @@ +package org.RI.P2; + +import java.util.List; + +class Affiliation { + String laboratory; + String institution; +} + +class Location { + String postCode; + String settlement; + String region; + String country; +} + +class Author { + String first; + List middle; + String last; + String suffix; + Affiliation affiliation; + Location location; + String email; +} + +class Metadata { + String title; + List authors; +} + +class Abstract { + String text; +} + +class Body_Text { + String text; +} + +public class Paper { + String paper_id; + Metadata metadata; + List _abstract; + List body_text; +}