Print metadata of a given file

This commit is contained in:
coolneng 2020-10-25 19:58:54 +01:00
parent 69a5fbc678
commit 0620f42fa4
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
4 changed files with 79 additions and 47 deletions

10
pom.xml
View File

@ -30,6 +30,16 @@
<artifactId>tika-core</artifactId>
<version>1.22</version>
</dependency>
<!-- Language detection (OptimaizeLangDetector) used by FileData. -->
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-langdetect</artifactId>
  <version>1.22</version>
</dependency>
<!-- Format-specific parsers backing AutoDetectParser. Kept on the same
     release as tika-core/tika-langdetect: Tika modules are released in
     lockstep and mixing versions risks binary incompatibilities. -->
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-parsers</artifactId>
  <version>1.22</version>
</dependency>
</dependencies>
<build>

View File

@ -1,45 +0,0 @@
package org.RI.P1;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.*;
import org.apache.tika.parser.*;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Holds one document's input stream together with the Tika objects needed to
 * parse it, and the metadata values read from the parse result.
 */
public class File {
    private String filename;
    private String type;
    private String encoding;
    private String language;
    private InputStream inputStream;
    private Metadata metadata;
    private ContentHandler contentHandler;
    private ParseContext parseContext;
    private AutoDetectParser parser;

    /** Creates an empty instance; none of the parsing collaborators are initialised. */
    File() {
    }

    /**
     * Opens the given path for reading and prepares the Tika parsing objects.
     *
     * @param file path of the document to open
     * @throws FileNotFoundException if the path cannot be opened for reading
     */
    File(String file) throws FileNotFoundException {
        inputStream = new FileInputStream(file);
        metadata = new Metadata();
        parser = new AutoDetectParser();
        contentHandler = new BodyContentHandler();
        parseContext = new ParseContext();
    }

    /**
     * Parses the document and copies the title, content type, content encoding
     * and content language metadata entries into the corresponding fields.
     * The input stream is closed once parsing finishes, whether or not it
     * succeeded.
     *
     * @throws IOException   if the document cannot be read or the stream cannot be closed
     * @throws TikaException if the parser fails on the document
     * @throws SAXException  if the content handler rejects the SAX events
     */
    private void setAttributes() throws IOException, TikaException, SAXException {
        // The parser does not close the stream it is given, so close it here
        // to avoid leaking the file descriptor opened in the constructor.
        try (InputStream stream = inputStream) {
            parser.parse(stream, contentHandler, metadata, parseContext);
        }
        // NOTE(review): TITLE is the document's title property, which may be
        // absent and is not necessarily the on-disk file name.
        filename = metadata.get(TikaCoreProperties.TITLE);
        type = metadata.get(Metadata.CONTENT_TYPE);
        encoding = metadata.get(Metadata.CONTENT_ENCODING);
        language = metadata.get(Metadata.CONTENT_LANGUAGE);
    }
}

View File

@ -0,0 +1,66 @@
package org.RI.P1;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.*;
import org.apache.tika.parser.*;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Collects basic information about a single file (name, content type,
 * encoding and detected language) using Apache Tika, and renders it as text.
 */
public class FileData {
    private String filename;
    private String type;
    private String encoding;
    private LanguageResult language;
    private InputStream inputStream;
    private Metadata metadata;
    private BodyContentHandler contentHandler;
    private ParseContext parseContext;
    private AutoDetectParser parser;
    private LanguageDetector langIdentifier;

    /** Creates an empty instance; none of the parsing collaborators are initialised. */
    FileData() {
    }

    /**
     * Opens the given file for reading and prepares the Tika parsing and
     * language detection objects.
     *
     * @param file the document to inspect
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    FileData(File file) throws FileNotFoundException {
        inputStream = new FileInputStream(file);
        // Start from the on-disk name so that documents without an embedded
        // title still report something meaningful.
        filename = file.getName();
        metadata = new Metadata();
        parser = new AutoDetectParser();
        // -1 disables the handler's write limit so the whole body text is kept.
        contentHandler = new BodyContentHandler(-1);
        parseContext = new ParseContext();
        langIdentifier = new OptimaizeLangDetector().loadModels();
    }

    /**
     * Parses the document, then fills in the name, content type, content
     * encoding and detected language. The input stream is closed once parsing
     * finishes, whether or not it succeeded.
     *
     * @throws IOException           if the document cannot be read or the stream cannot be closed
     * @throws TikaException         if the parser fails on the document
     * @throws SAXException          if the content handler rejects the SAX events
     * @throws IllegalStateException if the instance was created without a file
     */
    private void setMetadata() throws IOException, TikaException, SAXException {
        if (parser == null || inputStream == null) {
            throw new IllegalStateException("No file has been associated with this FileData");
        }
        // The parser does not close the stream it is given, so close it here
        // to avoid leaking the file descriptor opened in the constructor.
        try (InputStream stream = inputStream) {
            parser.parse(stream, contentHandler, metadata, parseContext);
        }
        // Prefer the document's own title; keep the file name when it has none.
        String title = metadata.get(TikaCoreProperties.TITLE);
        if (title != null && !title.trim().isEmpty()) {
            filename = title;
        }
        type = metadata.get(Metadata.CONTENT_TYPE);
        encoding = metadata.get(Metadata.CONTENT_ENCODING);
        language = langIdentifier.detect(contentHandler.toString());
    }

    /**
     * Renders the collected values, one labelled line per value. Values that
     * have not been determined are printed as {@code null}.
     */
    @Override
    public String toString() {
        // language stays null until setMetadata() has run; avoid dereferencing it.
        String detectedLanguage = (language == null) ? null : language.getLanguage();
        return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
                + detectedLanguage + "\n";
    }

    /**
     * Prints the metadata of the file named by the first command-line argument.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
     * @throws IOException   if the file cannot be read
     * @throws TikaException if the parser fails on the file
     * @throws SAXException  if the content handler rejects the SAX events
     */
    public static void main(String[] args) throws IOException, TikaException, SAXException {
        if (args.length < 1) {
            System.err.println("Usage: FileData <file>");
            return;
        }
        try {
            File file = new File(args[0]);
            FileData data = new FileData(file);
            data.setMetadata();
            System.out.println(data);
        } catch (FileNotFoundException exp) {
            System.out.println("The file " + args[0] + " could not be found");
        }
    }
}

View File

@ -1,9 +1,10 @@
* P1
** TODO Create a table with information of all documents
| filename | type | encoding | language |
** TODO Extract all URLs
** TODO Write to a file all word occurrences and frequencies
Sorted in decreasing order of frequency
** TODO Plot word frequencies
With gnuplot, with documents of at least 3 different languages.
We'll fit this to the Booth and Federowicz equation
** DONE Create a table with information of all documents
CLOSED: [2020-10-25 Sun 19:58]
| filename | type | encoding | language |