RI-P1/src/main/java/org/RI/P1/FileData.java

58 lines
1.9 KiB
Java
Raw Normal View History

2020-10-25 19:58:54 +01:00
package org.RI.P1;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.*;
import org.apache.tika.parser.*;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
public class FileData {
private String filename;
private String type;
private String encoding;
private LanguageResult language;
private InputStream inputStream;
private Metadata metadata;
private BodyContentHandler contentHandler;
private ParseContext parseContext;
private AutoDetectParser parser;
private LanguageDetector langIdentifier;
FileData() {
}
2020-10-25 20:24:59 +01:00
FileData(File file) throws IOException, TikaException, SAXException {
2020-10-25 19:58:54 +01:00
inputStream = new FileInputStream(file);
metadata = new Metadata();
parser = new AutoDetectParser();
contentHandler = new BodyContentHandler(-1);
parseContext = new ParseContext();
langIdentifier = new OptimaizeLangDetector().loadModels();
2020-10-25 20:24:59 +01:00
setMetadata();
2020-10-25 19:58:54 +01:00
}
private void setMetadata() throws IOException, TikaException, SAXException {
parser.parse(inputStream, contentHandler, metadata, parseContext);
filename = metadata.get(TikaCoreProperties.TITLE);
type = metadata.get(Metadata.CONTENT_TYPE);
encoding = metadata.get(Metadata.CONTENT_ENCODING);
language = langIdentifier.detect(contentHandler.toString());
}
@Override
public String toString() {
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
+ language.getLanguage() + "\n";
}
}