diff --git a/pom.xml b/pom.xml index b75f0cb..e6a0303 100644 --- a/pom.xml +++ b/pom.xml @@ -30,6 +30,16 @@ tika-core 1.22 + + org.apache.tika + tika-langdetect + 1.22 + + + org.apache.tika + tika-parsers + 1.21 + diff --git a/src/main/java/org/RI/P1/File.java b/src/main/java/org/RI/P1/File.java deleted file mode 100644 index 6c96be1..0000000 --- a/src/main/java/org/RI/P1/File.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.RI.P1; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.*; -import org.apache.tika.parser.*; -import org.apache.tika.sax.BodyContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -public class File { - private String filename; - private String type; - private String encoding; - private String language; - private InputStream inputStream; - private Metadata metadata; - private ContentHandler contentHandler; - private ParseContext parseContext; - private AutoDetectParser parser; - - File() { - } - - File(String file) throws FileNotFoundException { - inputStream = new FileInputStream(file); - metadata = new Metadata(); - parser = new AutoDetectParser(); - contentHandler = new BodyContentHandler(); - parseContext = new ParseContext(); - } - - private void setAttributes() throws IOException, TikaException, SAXException { - parser.parse(inputStream, contentHandler, metadata, parseContext); - filename = metadata.get(TikaCoreProperties.TITLE); - type = metadata.get(Metadata.CONTENT_TYPE); - encoding = metadata.get(Metadata.CONTENT_ENCODING); - language = metadata.get(Metadata.CONTENT_LANGUAGE); - } - -} diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java new file mode 100644 index 0000000..4b33451 --- /dev/null +++ b/src/main/java/org/RI/P1/FileData.java @@ -0,0 +1,66 @@ +package org.RI.P1; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageResult; +import org.apache.tika.metadata.*; +import org.apache.tika.parser.*; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.SAXException; + +public class FileData { + private String filename; + private String type; + private String encoding; + private LanguageResult language; + private InputStream inputStream; + private Metadata metadata; + private BodyContentHandler contentHandler; + private ParseContext parseContext; + private AutoDetectParser parser; + private LanguageDetector langIdentifier; + + FileData() { + } + + FileData(File file) throws FileNotFoundException { + inputStream = new FileInputStream(file); + metadata = new Metadata(); + parser = new AutoDetectParser(); + contentHandler = new BodyContentHandler(-1); + parseContext = new ParseContext(); + langIdentifier = new OptimaizeLangDetector().loadModels(); + } + + private void setMetadata() throws IOException, TikaException, SAXException { + parser.parse(inputStream, contentHandler, metadata, parseContext); + filename = metadata.get(TikaCoreProperties.TITLE); + type = metadata.get(Metadata.CONTENT_TYPE); + encoding = metadata.get(Metadata.CONTENT_ENCODING); + language = langIdentifier.detect(contentHandler.toString()); + } + + @Override + public String toString() { + return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: " + + language.getLanguage() + "\n"; + } + + public static void main(String[] args) throws IOException, TikaException, SAXException { + try { + File file = new File(args[0]); + FileData data = new FileData(file); + data.setMetadata(); + System.out.println(data); + } catch (FileNotFoundException exp) { + System.out.println("The file " + args[0] + " could not be found"); + } + } +} diff --git a/src/main/java/org/RI/P1/TODO.org b/src/main/java/org/RI/P1/TODO.org index dc665ad..7d0ef6c 100644 --- a/src/main/java/org/RI/P1/TODO.org +++ b/src/main/java/org/RI/P1/TODO.org @@ -1,9 +1,10 @@ * P1 -** TODO Create a table with information of all documents -| filename | type | encoding | language | ** TODO Extract all URLs ** TODO Write to a file all word occurrences and frequencies Sorted in a decreasing manner ** TODO Plot word frequencies With gnuplot, with documents of at least 3 different languages. We'll fit this to the Booth and Federowicz equation +** DONE Create a table with information of all documents +CLOSED: [2020-10-25 Sun 19:58] +| filename | type | encoding | language |