Add File class and TODOs

2020-10-25 13:51:13 +01:00 · 2020-10-25 13:51:13 +01:00 · 69a5fbc678
parent 6fd237612a
commit 69a5fbc678
5 changed files with 58 additions and 23 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,4 @@
 target
+.classpath
+.project
+.settings
--- a/shell.nix
+++ b/shell.nix
@ -2,4 +2,4 @@

 with pkgs;

-mkShell { buildInputs = [ jdk11 maven ]; }
+mkShell { buildInputs = [ jdk11 maven gnuplot ]; }
--- a/src/main/java/org/RI/P1/EjemploSimple.java
+++ b/src/main/java/org/RI/P1/EjemploSimple.java
@ -1,22 +0,0 @@
-import java.io.File;
-import org.apache.tika.Tika ;
-
-public class EjemploSimple { // Minimal Tika demo: for each path in args, prints its detected MIME type and its extracted text
-
-    public static void main(String[] args) throws Exception {
-    
-        // Create a Tika instance with the default configuration
-        Tika tika = new Tika();
-        // Parse the files passed as arguments and extract their content
-        for (String file : args) { 
-            File f = new File(file);
-            // Detect the MIME type of the file
-            String type = tika.detect(f); 
-            System.out.println(file +":"+type);
-            // Extract the plain text into a string
-            String text = tika.parseToString(f); 
-            System.out.print(text);
-        }
-    } 
-}
-
--- a/src/main/java/org/RI/P1/File.java
+++ b/src/main/java/org/RI/P1/File.java
@ -0,0 +1,45 @@
+package org.RI.P1;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.*;
+import org.apache.tika.parser.*;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class File { // Bundles an input stream with the Tika objects used to parse it and the metadata values read back
+    private String filename; // assigned in setAttributes() from TikaCoreProperties.TITLE
+    private String type; // assigned in setAttributes() from Metadata.CONTENT_TYPE
+    private String encoding; // assigned in setAttributes() from Metadata.CONTENT_ENCODING
+    private String language; // assigned in setAttributes() from Metadata.CONTENT_LANGUAGE
+    private InputStream inputStream; // opened in File(String); NOTE(review): never closed anywhere in this class
+    private Metadata metadata; // passed to parser.parse(...) and then read in setAttributes()
+    private ContentHandler contentHandler; // a BodyContentHandler when built via File(String)
+    private ParseContext parseContext;
+    private AutoDetectParser parser;
+
+    File() { // assigns nothing, so every field stays null
+    }
+
+    File(String file) throws FileNotFoundException { // file: path handed to FileInputStream, whose FileNotFoundException propagates
+        inputStream = new FileInputStream(file);
+        metadata = new Metadata();
+        parser = new AutoDetectParser();
+        contentHandler = new BodyContentHandler();
+        parseContext = new ParseContext();
+    }
+
+    private void setAttributes() throws IOException, TikaException, SAXException { // NOTE(review): private and not called anywhere in this class
+        parser.parse(inputStream, contentHandler, metadata, parseContext); // runs before the metadata.get(...) reads below
+        filename = metadata.get(TikaCoreProperties.TITLE); // NOTE(review): TITLE is presumably the document title, not the file name; confirm
+        type = metadata.get(Metadata.CONTENT_TYPE);
+        encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        language = metadata.get(Metadata.CONTENT_LANGUAGE); // NOTE(review): verify the parser actually populates this key
+    }
+
+}
--- a/src/main/java/org/RI/P1/TODO.org
+++ b/src/main/java/org/RI/P1/TODO.org
@ -0,0 +1,9 @@
+* P1
+** TODO Create a table with information of all documents
+| filename | type | encoding | language |
+** TODO Extract all URLs
+** TODO Write to a file all word occurrences and frequencies
+Sorted in decreasing order of frequency
+** TODO Plot word frequencies
+With gnuplot, with documents of at least 3 different languages.
+We'll fit this to the Booth and Federowicz equation