diff --git a/.gitignore b/.gitignore index eb5a316..73df60d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ target +.classpath +.project +.settings diff --git a/shell.nix b/shell.nix index d78b1ca..8295bf5 100644 --- a/shell.nix +++ b/shell.nix @@ -2,4 +2,4 @@ with pkgs; -mkShell { buildInputs = [ jdk11 maven ]; } +mkShell { buildInputs = [ jdk11 maven gnuplot ]; } diff --git a/src/main/java/org/RI/P1/EjemploSimple.java b/src/main/java/org/RI/P1/EjemploSimple.java deleted file mode 100644 index dba1be8..0000000 --- a/src/main/java/org/RI/P1/EjemploSimple.java +++ /dev/null @@ -1,22 +0,0 @@ -import java.io.File; -import org.apache.tika.Tika ; - -public class EjemploSimple { - - public static void main(String[] args) throws Exception { - - // Creamos una instancia de Tika con la configuracion por defecto - Tika tika = new Tika(); - // Se parsean los ficheros pasados como argumento y se extrae el contenido - for (String file : args) { - File f = new File(file); - // Detectamos el MIME tipo del fichero - String type = tika.detect(f); - System.out.println(file +":"+type); - // Extraemos el texto plano en un string - String text = tika.parseToString(f); - System.out.print(text); - } - } -} - diff --git a/src/main/java/org/RI/P1/File.java b/src/main/java/org/RI/P1/File.java new file mode 100644 index 0000000..6c96be1 --- /dev/null +++ b/src/main/java/org/RI/P1/File.java @@ -0,0 +1,45 @@ +package org.RI.P1; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.*; +import org.apache.tika.parser.*; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class File { + private String filename; + private String type; + private String encoding; + private String language; + private InputStream inputStream; + private Metadata metadata; + private ContentHandler contentHandler; + private ParseContext parseContext; + private AutoDetectParser parser; + + File() { + } + + File(String file) throws FileNotFoundException { + inputStream = new FileInputStream(file); + metadata = new Metadata(); + parser = new AutoDetectParser(); + contentHandler = new BodyContentHandler(); + parseContext = new ParseContext(); + } + + private void setAttributes() throws IOException, TikaException, SAXException { + parser.parse(inputStream, contentHandler, metadata, parseContext); + filename = metadata.get(TikaCoreProperties.TITLE); + type = metadata.get(Metadata.CONTENT_TYPE); + encoding = metadata.get(Metadata.CONTENT_ENCODING); + language = metadata.get(Metadata.CONTENT_LANGUAGE); + } + +} diff --git a/src/main/java/org/RI/P1/TODO.org b/src/main/java/org/RI/P1/TODO.org new file mode 100644 index 0000000..dc665ad --- /dev/null +++ b/src/main/java/org/RI/P1/TODO.org @@ -0,0 +1,9 @@ +* P1 +** TODO Create a table with information of all documents +| filename | type | encoding | language | +** TODO Extract all URLs +** TODO Write to a file all word occurrences and frequencies +Sorted in a decreasing manner +** TODO Plot word frequencies +With gnuplot, with documents of at least 3 different languages. +We'll fit this to the Booth and Federowicz equation