diff --git a/.gitignore b/.gitignore
index 9df1afb..45647a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
data
+data-test
target
output
diff --git a/pom.xml b/pom.xml
index 9e75a55..5192c9c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,9 +30,9 @@
8.6.3
- com.googlecode.json-simple
- json-simple
- 1.1.1
+ com.google.code.gson
+ gson
+ 2.8.6
diff --git a/src/main/java/org/RI/P2/Indexer.java b/src/main/java/org/RI/P2/Indexer.java
index 2259f43..2c4d4fb 100644
--- a/src/main/java/org/RI/P2/Indexer.java
+++ b/src/main/java/org/RI/P2/Indexer.java
@@ -19,16 +19,16 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.JSONValue;
+import com.google.gson.Gson;
public class Indexer {
IndexWriter index;
@@ -57,13 +57,12 @@ public class Indexer {
return files;
}
- JSONArray parseJSONFile(File file) throws IOException {
-     InputStream jsonFile = new FileInputStream(file);
-     Reader readerJson = new InputStreamReader(jsonFile);
-     Object fileObject = JSONValue.parse(readerJson);
-     JSONArray arrayObject = new JSONArray();
-     arrayObject.add(fileObject);
-     return arrayObject;
+ /**
+  * Deserializes one JSON file into a {@link Paper} using Gson.
+  *
+  * @param file the JSON file to read
+  * @return the parsed paper, or {@code null} when the file holds no JSON value
+  *         (Gson returns {@code null} at end of input)
+  * @throws IOException if the file cannot be opened or the reader fails to close;
+  *         malformed JSON surfaces as Gson's unchecked JsonSyntaxException / JsonIOException
+  */
+ Paper parseJSONFile(File file) throws IOException {
+     // try-with-resources closes the reader (and the wrapped stream) even when parsing
+     // throws, so indexing many files does not leak file handles.
+     // UTF-8 is spelled out because InputStreamReader otherwise falls back to the
+     // platform default charset, which can corrupt non-ASCII titles and author names.
+     try (Reader readerJson = new InputStreamReader(
+             new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
+         return new Gson().fromJson(readerJson, Paper.class);
+     }
}
void createIndex() throws IOException {
@@ -73,11 +72,16 @@ public class Indexer {
index = new IndexWriter(dir, config);
}
- void addDocuments(JSONArray jsonObjects) throws IOException {
-     for (JSONObject object : (List) jsonObjects) {
-         Document doc = new Document();
-         index.addDocument(doc);
-     }
+ /**
+  * Adds one paper to the index as a single Lucene document.
+  *
+  * Fields written (all stored): "document_id" (StringField) from paper_id,
+  * "title" (TextField), and one "authors" TextField per author.
+  * Missing values are skipped instead of being indexed or causing a failure.
+  *
+  * @param paper the parsed paper; {@code null} (e.g. from an empty input file) is ignored
+  * @throws IOException if the index writer fails to add the document
+  */
+ void addDocument(Paper paper) throws IOException {
+     if (paper == null) {
+         return;
+     }
+     Document doc = new Document();
+     // Lucene field constructors reject null values, so absent fields are left out.
+     if (paper.paper_id != null) {
+         doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES));
+     }
+     Metadata metadata = paper.metadata;
+     if (metadata != null && metadata.title != null) {
+         doc.add(new TextField("title", metadata.title, Field.Store.YES));
+     }
+     if (metadata != null && metadata.authors != null) {
+         for (Author author : metadata.authors) {
+             if (author == null) {
+                 continue;
+             }
+             // Join the name parts explicitly: concatenating the middle-name list
+             // directly would embed its "[a, b]" rendering, and null parts would
+             // be indexed as the literal text "null".
+             StringBuilder fullName = new StringBuilder();
+             if (author.first != null) {
+                 fullName.append(author.first);
+             }
+             if (author.middle != null) {
+                 for (Object middleName : author.middle) {
+                     if (middleName != null) {
+                         fullName.append(' ').append(middleName);
+                     }
+                 }
+             }
+             if (author.last != null) {
+                 fullName.append(' ').append(author.last);
+             }
+             // Same punctuation stripping as before, then collapse the gaps that
+             // empty or punctuation-only parts leave behind.
+             String authorName = fullName.toString()
+                     .replaceAll("\\p{P}", "")
+                     .replaceAll("\\s+", " ")
+                     .trim();
+             if (!authorName.isEmpty()) {
+                 doc.add(new TextField("authors", authorName, Field.Store.YES));
+             }
+         }
+     }
+     index.addDocument(doc);
}
void commitChanges() throws IOException {
@@ -88,8 +92,8 @@ public class Indexer {
void populateIndex() throws IOException, ParseException {
    createIndex();
    for (File file : files) {
-         JSONArray jsonObjects = parseJSONFile(file);
-         addDocument(jsonObjects);
+         // Parse each input file and hand the result straight to the index writer.
+         addDocument(parseJSONFile(file));
    }
    commitChanges();
}
diff --git a/src/main/java/org/RI/P2/Paper.java b/src/main/java/org/RI/P2/Paper.java
new file mode 100644
index 0000000..e443c1f
--- /dev/null
+++ b/src/main/java/org/RI/P2/Paper.java
@@ -0,0 +1,45 @@
+package org.RI.P2;
+
+import java.util.List;
+
+class Affiliation { // Plain data holder; Author declares a field of this type named "affiliation".
+ String laboratory; // NOTE(review): presumably bound by Gson to the JSON key of the same name - confirm against the input schema
+ String institution;
+}
+
+class Location { // Plain data holder; Author declares a field of this type named "location".
+ String postCode; // NOTE(review): camelCase, unlike paper_id / body_text in Paper - confirm the input JSON really spells this key "postCode"
+ String settlement;
+ String region;
+ String country;
+}
+
+/**
+ * One entry of a paper's author list. Indexer.addDocument reads
+ * {@code first}, {@code middle} and {@code last} to build the indexed author name.
+ */
+class Author {
+    String first;
+    // Typed instead of a raw List so callers get compile-time element types.
+    // NOTE(review): assumes middle names arrive as a JSON array of strings - confirm against the input schema.
+    List<String> middle;
+    String last;
+    String suffix;
+    Affiliation affiliation;
+    // NOTE(review): Gson leaves this null unless "location" is a direct key of the
+    // author object; confirm it is not nested under "affiliation" in the input JSON.
+    Location location;
+    String email;
+}
+
+/** Title and author list of a paper; Indexer.addDocument reads both fields. */
+class Metadata {
+    String title;
+    // The element type is required: iterating a raw List as Author does not compile,
+    // and without it Gson builds generic maps rather than Author objects.
+    List<Author> authors;
+}
+
+class Abstract { // Plain data holder with a single text field; presumably one abstract paragraph of a Paper - TODO confirm
+ String text;
+}
+
+class Body_Text { // Plain data holder with a single text field; presumably one body paragraph of a Paper - TODO confirm
+ String text;
+}
+
+/**
+ * Root object of one parsed paper file; Indexer.parseJSONFile creates it with
+ * {@code gson.fromJson(reader, Paper.class)}. Field names double as JSON keys
+ * under Gson's default naming, so renaming a field changes what gets parsed.
+ */
+public class Paper {
+    String paper_id;
+    Metadata metadata;
+    // "abstract" is a reserved word in Java, hence the underscore. Without an
+    // alternate name Gson only fills this from a key literally called "_abstract".
+    // The primary name is unchanged, so existing input and any serialized output stay the same.
+    // NOTE(review): assumes the input files use the key "abstract" - confirm against the data.
+    @com.google.gson.annotations.SerializedName(value = "_abstract", alternate = {"abstract"})
+    List<Abstract> _abstract;
+    // Element types declared so Gson builds Abstract / Body_Text objects instead of generic maps.
+    List<Body_Text> body_text;
+}