Print metadata of a given file
This commit is contained in:
parent
69a5fbc678
commit
0620f42fa4
10
pom.xml
10
pom.xml
|
@ -30,6 +30,16 @@
|
||||||
<artifactId>tika-core</artifactId>
|
<artifactId>tika-core</artifactId>
|
||||||
<version>1.22</version>
|
<version>1.22</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.tika</groupId>
|
||||||
|
<artifactId>tika-langdetect</artifactId>
|
||||||
|
<version>1.22</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.tika</groupId>
|
||||||
|
<artifactId>tika-parsers</artifactId>
|
||||||
|
<version>1.21</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -1,45 +0,0 @@
|
||||||
package org.RI.P1;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
|
|
||||||
import org.apache.tika.exception.TikaException;
|
|
||||||
import org.apache.tika.metadata.*;
|
|
||||||
import org.apache.tika.parser.*;
|
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
|
||||||
import org.xml.sax.ContentHandler;
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
public class File {
|
|
||||||
private String filename;
|
|
||||||
private String type;
|
|
||||||
private String encoding;
|
|
||||||
private String language;
|
|
||||||
private InputStream inputStream;
|
|
||||||
private Metadata metadata;
|
|
||||||
private ContentHandler contentHandler;
|
|
||||||
private ParseContext parseContext;
|
|
||||||
private AutoDetectParser parser;
|
|
||||||
|
|
||||||
File() {
|
|
||||||
}
|
|
||||||
|
|
||||||
File(String file) throws FileNotFoundException {
|
|
||||||
inputStream = new FileInputStream(file);
|
|
||||||
metadata = new Metadata();
|
|
||||||
parser = new AutoDetectParser();
|
|
||||||
contentHandler = new BodyContentHandler();
|
|
||||||
parseContext = new ParseContext();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setAttributes() throws IOException, TikaException, SAXException {
|
|
||||||
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
|
||||||
filename = metadata.get(TikaCoreProperties.TITLE);
|
|
||||||
type = metadata.get(Metadata.CONTENT_TYPE);
|
|
||||||
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
|
||||||
language = metadata.get(Metadata.CONTENT_LANGUAGE);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,66 @@
|
||||||
|
package org.RI.P1;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.apache.tika.language.detect.LanguageDetector;
|
||||||
|
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
||||||
|
import org.apache.tika.language.detect.LanguageResult;
|
||||||
|
import org.apache.tika.metadata.*;
|
||||||
|
import org.apache.tika.parser.*;
|
||||||
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
public class FileData {
|
||||||
|
private String filename;
|
||||||
|
private String type;
|
||||||
|
private String encoding;
|
||||||
|
private LanguageResult language;
|
||||||
|
private InputStream inputStream;
|
||||||
|
private Metadata metadata;
|
||||||
|
private BodyContentHandler contentHandler;
|
||||||
|
private ParseContext parseContext;
|
||||||
|
private AutoDetectParser parser;
|
||||||
|
private LanguageDetector langIdentifier;
|
||||||
|
|
||||||
|
FileData() {
|
||||||
|
}
|
||||||
|
|
||||||
|
FileData(File file) throws FileNotFoundException {
|
||||||
|
inputStream = new FileInputStream(file);
|
||||||
|
metadata = new Metadata();
|
||||||
|
parser = new AutoDetectParser();
|
||||||
|
contentHandler = new BodyContentHandler(-1);
|
||||||
|
parseContext = new ParseContext();
|
||||||
|
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setMetadata() throws IOException, TikaException, SAXException {
|
||||||
|
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
||||||
|
filename = metadata.get(TikaCoreProperties.TITLE);
|
||||||
|
type = metadata.get(Metadata.CONTENT_TYPE);
|
||||||
|
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
||||||
|
language = langIdentifier.detect(contentHandler.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
||||||
|
+ language.getLanguage() + "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, TikaException, SAXException {
|
||||||
|
try {
|
||||||
|
File file = new File(args[0]);
|
||||||
|
FileData data = new FileData(file);
|
||||||
|
data.setMetadata();
|
||||||
|
System.out.println(data);
|
||||||
|
} catch (FileNotFoundException exp) {
|
||||||
|
System.out.println("The file " + args[0] + " could not be found");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,9 +1,10 @@
|
||||||
* P1
|
* P1
|
||||||
** TODO Create a table with information of all documents
|
|
||||||
| filename | type | encoding | language |
|
|
||||||
** TODO Extract all URLs
|
** TODO Extract all URLs
|
||||||
** TODO Write to a file all word occurrences and frequencies
|
** TODO Write to a file all word occurrences and frequencies
|
||||||
Sorted in a decreasing manner
|
Sorted in a decreasing manner
|
||||||
** TODO Plot word frequencies
|
** TODO Plot word frequencies
|
||||||
With gnuplot, with documents of at least 3 different languages.
|
With gnuplot, with documents of at least 3 different languages.
|
||||||
We'll fit this to the Booth and Federowicz equation
|
We'll fit this to the Booth and Federowicz equation
|
||||||
|
** DONE Create a table with information of all documents
|
||||||
|
CLOSED: [2020-10-25 Sun 19:58]
|
||||||
|
| filename | type | encoding | language |
|
||||||
|
|
Loading…
Reference in New Issue