package org.RI.P1; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.*; import org.apache.tika.parser.*; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; public class FileData { private String filename; private String type; private String encoding; private LanguageResult language; private InputStream inputStream; private Metadata metadata; private BodyContentHandler contentHandler; private ParseContext parseContext; private AutoDetectParser parser; private LanguageDetector langIdentifier; FileData() { } FileData(File file) throws FileNotFoundException { inputStream = new FileInputStream(file); metadata = new Metadata(); parser = new AutoDetectParser(); contentHandler = new BodyContentHandler(-1); parseContext = new ParseContext(); langIdentifier = new OptimaizeLangDetector().loadModels(); } private void setMetadata() throws IOException, TikaException, SAXException { parser.parse(inputStream, contentHandler, metadata, parseContext); filename = metadata.get(TikaCoreProperties.TITLE); type = metadata.get(Metadata.CONTENT_TYPE); encoding = metadata.get(Metadata.CONTENT_ENCODING); language = langIdentifier.detect(contentHandler.toString()); } @Override public String toString() { return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: " + language.getLanguage() + "\n"; } public static void main(String[] args) throws IOException, TikaException, SAXException { try { File file = new File(args[0]); FileData data = new FileData(file); data.setMetadata(); System.out.println(data); } catch (FileNotFoundException exp) { System.out.println("The file " + args[0] + " could not be found"); } } }