2020-10-25 19:58:54 +01:00
|
|
|
package org.RI.P1;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
|
|
|
import org.apache.tika.exception.TikaException;
|
|
|
|
import org.apache.tika.language.detect.LanguageDetector;
|
|
|
|
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
|
|
|
import org.apache.tika.language.detect.LanguageResult;
|
|
|
|
import org.apache.tika.metadata.*;
|
|
|
|
import org.apache.tika.parser.*;
|
|
|
|
import org.apache.tika.sax.BodyContentHandler;
|
|
|
|
import org.xml.sax.SAXException;
|
|
|
|
|
|
|
|
public class FileData {
|
|
|
|
private String filename;
|
|
|
|
private String type;
|
|
|
|
private String encoding;
|
|
|
|
private LanguageResult language;
|
|
|
|
private InputStream inputStream;
|
|
|
|
private Metadata metadata;
|
|
|
|
private BodyContentHandler contentHandler;
|
|
|
|
private ParseContext parseContext;
|
|
|
|
private AutoDetectParser parser;
|
|
|
|
private LanguageDetector langIdentifier;
|
|
|
|
|
|
|
|
FileData() {
|
|
|
|
}
|
|
|
|
|
2020-10-25 20:24:59 +01:00
|
|
|
FileData(File file) throws IOException, TikaException, SAXException {
|
2020-10-25 19:58:54 +01:00
|
|
|
inputStream = new FileInputStream(file);
|
|
|
|
metadata = new Metadata();
|
|
|
|
parser = new AutoDetectParser();
|
|
|
|
contentHandler = new BodyContentHandler(-1);
|
|
|
|
parseContext = new ParseContext();
|
|
|
|
langIdentifier = new OptimaizeLangDetector().loadModels();
|
2020-10-25 20:24:59 +01:00
|
|
|
setMetadata();
|
2020-10-25 19:58:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
private void setMetadata() throws IOException, TikaException, SAXException {
|
|
|
|
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
|
|
|
filename = metadata.get(TikaCoreProperties.TITLE);
|
|
|
|
type = metadata.get(Metadata.CONTENT_TYPE);
|
|
|
|
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
|
|
|
language = langIdentifier.detect(contentHandler.toString());
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public String toString() {
|
|
|
|
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
|
|
|
+ language.getLanguage() + "\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|