Add File class and TODOs

This commit is contained in:
coolneng 2020-10-25 13:51:13 +01:00
parent 6fd237612a
commit 69a5fbc678
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
5 changed files with 58 additions and 23 deletions

3
.gitignore vendored
View File

@ -1 +1,4 @@
target
.classpath
.project
.settings

View File

@ -2,4 +2,4 @@
with pkgs;
mkShell { buildInputs = [ jdk11 maven ]; }
mkShell { buildInputs = [ jdk11 maven gnuplot ]; }

View File

@ -1,22 +0,0 @@
import java.io.File;
import org.apache.tika.Tika ;
/**
 * Minimal Apache Tika example: reports the detected MIME type and the
 * extracted plain text of every file named on the command line.
 */
public class EjemploSimple {

    /**
     * For each path in {@code args}, prints {@code <path>:<mime type>} on its own
     * line, followed by the text Tika extracts from that file.
     *
     * @param args paths of the files to inspect
     * @throws Exception propagated from Tika when detection or extraction fails
     */
    public static void main(String[] args) throws Exception {
        // One Tika facade with the default configuration is shared by all files.
        final Tika facade = new Tika();

        for (int i = 0; i < args.length; i++) {
            final String path = args[i];
            final File document = new File(path);

            // Report the detected MIME type first ...
            final String mimeType = facade.detect(document);
            System.out.println(path + ":" + mimeType);

            // ... then write out the plain text extracted from the file.
            System.out.print(facade.parseToString(document));
        }
    }
}

View File

@ -0,0 +1,45 @@
package org.RI.P1;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.*;
import org.apache.tika.parser.*;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Wraps a document's input stream together with the Tika objects used to parse
 * it, and stores the descriptive attributes read from the parsed metadata.
 */
public class File {
    // NOTE(review): populated from the metadata title, not from the path given
    // to the constructor; confirm this is the value wanted as "filename".
    private String filename;
    private String type;
    private String encoding;
    private String language;
    private InputStream inputStream;
    private Metadata metadata;
    private ContentHandler contentHandler;
    private ParseContext parseContext;
    private AutoDetectParser parser;

    /** Creates an empty instance; no stream or Tika helper is set up. */
    File() {
    }

    /**
     * Prepares the Tika metadata, parser, content handler and parse context, and
     * opens {@code file} for reading.
     *
     * @param file path of the document to open
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    File(String file) throws FileNotFoundException {
        metadata = new Metadata();
        parser = new AutoDetectParser();
        contentHandler = new BodyContentHandler();
        parseContext = new ParseContext();
        // Open the stream last so that a failure while building the Tika helpers
        // cannot leave an open file handle behind.
        inputStream = new FileInputStream(file);
    }

    /**
     * Parses the document and copies title, content type, content encoding and
     * content language from the resulting metadata into this object's fields.
     * The input stream is closed once parsing finishes, whether or not it
     * succeeded, so this method can only be run once per instance.
     *
     * @throws IllegalStateException if this instance was created without a document
     * @throws IOException if the stream cannot be read or closed
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException if the content handler fails to process the parse events
     */
    private void setAttributes() throws IOException, TikaException, SAXException {
        if (inputStream == null || parser == null) {
            throw new IllegalStateException("No document has been opened; use File(String)");
        }
        // The parser consumes the stream but leaves closing it to the caller.
        try (InputStream stream = inputStream) {
            parser.parse(stream, contentHandler, metadata, parseContext);
        }
        filename = metadata.get(TikaCoreProperties.TITLE);
        type = metadata.get(Metadata.CONTENT_TYPE);
        encoding = metadata.get(Metadata.CONTENT_ENCODING);
        language = metadata.get(Metadata.CONTENT_LANGUAGE);
    }
}

View File

@ -0,0 +1,9 @@
* P1
** TODO Create a table with information about all documents
| filename | type | encoding | language |
** TODO Extract all URLs
** TODO Write to a file all word occurrences and frequencies
Sorted in decreasing order of frequency
** TODO Plot word frequencies
With gnuplot, with documents of at least 3 different languages.
We'll fit this to the Booth and Federowicz equation