package org.RI.P1; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.*; import org.apache.tika.parser.*; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; public class FileData { private String filename; private String type; private String encoding; private LanguageResult language; private InputStream inputStream; private Metadata metadata; private BodyContentHandler contentHandler; private ParseContext parseContext; private AutoDetectParser parser; private LanguageDetector langIdentifier; FileData() { } FileData(File file) throws IOException, TikaException, SAXException { inputStream = new FileInputStream(file); metadata = new Metadata(); parser = new AutoDetectParser(); contentHandler = new BodyContentHandler(-1); parseContext = new ParseContext(); langIdentifier = new OptimaizeLangDetector().loadModels(); setMetadata(); } private void setMetadata() throws IOException, TikaException, SAXException { parser.parse(inputStream, contentHandler, metadata, parseContext); filename = metadata.get(TikaCoreProperties.TITLE); type = metadata.get(Metadata.CONTENT_TYPE); encoding = metadata.get(Metadata.CONTENT_ENCODING); language = langIdentifier.detect(contentHandler.toString()); } @Override public String toString() { return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: " + language.getLanguage() + "\n"; } }