diff --git a/pom.xml b/pom.xml
index b75f0cb..e6a0303 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,6 +30,16 @@
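       <artifactId>tika-core</artifactId>
       <version>1.22</version>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-langdetect</artifactId>
+      <version>1.22</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>1.21</version>
+    </dependency>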
diff --git a/src/main/java/org/RI/P1/File.java b/src/main/java/org/RI/P1/File.java
deleted file mode 100644
index 6c96be1..0000000
--- a/src/main/java/org/RI/P1/File.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package org.RI.P1;
-
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.*;
-import org.apache.tika.parser.*;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class File {
- private String filename;
- private String type;
- private String encoding;
- private String language;
- private InputStream inputStream;
- private Metadata metadata;
- private ContentHandler contentHandler;
- private ParseContext parseContext;
- private AutoDetectParser parser;
-
- File() {
- }
-
- File(String file) throws FileNotFoundException {
- inputStream = new FileInputStream(file);
- metadata = new Metadata();
- parser = new AutoDetectParser();
- contentHandler = new BodyContentHandler();
- parseContext = new ParseContext();
- }
-
- private void setAttributes() throws IOException, TikaException, SAXException {
- parser.parse(inputStream, contentHandler, metadata, parseContext);
- filename = metadata.get(TikaCoreProperties.TITLE);
- type = metadata.get(Metadata.CONTENT_TYPE);
- encoding = metadata.get(Metadata.CONTENT_ENCODING);
- language = metadata.get(Metadata.CONTENT_LANGUAGE);
- }
-
-}
diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java
new file mode 100644
index 0000000..4b33451
--- /dev/null
+++ b/src/main/java/org/RI/P1/FileData.java
@@ -0,0 +1,115 @@
+package org.RI.P1;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.*;
+import org.apache.tika.parser.*;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Collects the descriptive fields of a single document (file name, media type,
+ * character encoding and natural language) by parsing it with Apache Tika.
+ */
+public class FileData {
+    private String filename;
+    private String type;
+    private String encoding;
+    private LanguageResult language;
+    private InputStream inputStream;
+    private Metadata metadata;
+    private BodyContentHandler contentHandler;
+    private ParseContext parseContext;
+    private AutoDetectParser parser;
+    private LanguageDetector langIdentifier;
+
+    /** Creates an empty instance with no backing file; every reported field stays {@code null}. */
+    FileData() {
+    }
+
+    /**
+     * Prepares the Tika objects needed to analyse {@code file} and opens it for reading.
+     *
+     * @param file the document to analyse
+     * @throws FileNotFoundException if {@code file} cannot be opened for reading
+     */
+    FileData(File file) throws FileNotFoundException {
+        // The file's own name is the "filename"; the document title stored in the
+        // metadata is a different thing and is absent for most formats.
+        filename = file.getName();
+        metadata = new Metadata();
+        parser = new AutoDetectParser();
+        // -1 disables BodyContentHandler's write limit (100,000 characters by default).
+        contentHandler = new BodyContentHandler(-1);
+        parseContext = new ParseContext();
+        langIdentifier = new OptimaizeLangDetector().loadModels();
+        // Opened last so that a failure in the setup above cannot leak an open stream.
+        inputStream = new FileInputStream(file);
+    }
+
+    /**
+     * Parses the document and fills in the type, encoding and language fields.
+     * The input stream is closed once parsing finishes, whether it succeeds or fails,
+     * so this method can run only once per instance.
+     *
+     * @throws IllegalStateException if there is no open document to parse
+     * @throws IOException if the document cannot be read
+     * @throws TikaException if Tika fails to parse the document
+     * @throws SAXException if the SAX events produced while parsing cannot be processed
+     */
+    private void setMetadata() throws IOException, TikaException, SAXException {
+        if (inputStream == null) {
+            throw new IllegalStateException("No open document to parse");
+        }
+        try (InputStream in = inputStream) {
+            parser.parse(in, contentHandler, metadata, parseContext);
+        } finally {
+            inputStream = null;
+        }
+        type = metadata.get(Metadata.CONTENT_TYPE);
+        encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        language = langIdentifier.detect(contentHandler.toString());
+    }
+
+    /**
+     * Renders the collected fields one per line. Fields that have not been
+     * determined are printed as {@code null}.
+     */
+    @Override
+    public String toString() {
+        // language stays null until setMetadata() has run; avoid a NullPointerException here.
+        String languageCode = (language == null) ? null : language.getLanguage();
+        return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n"
+                + "Language: " + languageCode + "\n";
+    }
+
+    /**
+     * Prints the metadata of the document whose path is given as the first argument.
+     *
+     * @param args command-line arguments; {@code args[0]} is the path of the document
+     * @throws IOException if the document cannot be read
+     * @throws TikaException if Tika fails to parse the document
+     * @throws SAXException if the SAX events produced while parsing cannot be processed
+     */
+    public static void main(String[] args) throws IOException, TikaException, SAXException {
+        if (args.length < 1) {
+            System.err.println("Usage: FileData <file>");
+            return;
+        }
+        try {
+            FileData data = new FileData(new File(args[0]));
+            data.setMetadata();
+            System.out.println(data);
+        } catch (FileNotFoundException exp) {
+            System.out.println("The file " + args[0] + " could not be found");
+        }
+    }
+}
diff --git a/src/main/java/org/RI/P1/TODO.org b/src/main/java/org/RI/P1/TODO.org
index dc665ad..7d0ef6c 100644
--- a/src/main/java/org/RI/P1/TODO.org
+++ b/src/main/java/org/RI/P1/TODO.org
@@ -1,9 +1,10 @@
* P1
-** TODO Create a table with information of all documents
-| filename | type | encoding | language |
** TODO Extract all URLs
** TODO Write to a file all word occurrences and frequencies
Sorted in a decreasing manner
** TODO Plot word frequencies
With gnuplot, with documents of at least 3 different languages.
We'll fit this to the Booth and Federowicz equation
+** DONE Create a table with information of all documents
+CLOSED: [2020-10-25 Sun 19:58]
+| filename | type | encoding | language |