67 lines
2.3 KiB
Java
67 lines
2.3 KiB
Java
|
package org.RI.P1;
|
||
|
|
||
|
import java.io.File;
|
||
|
import java.io.FileInputStream;
|
||
|
import java.io.FileNotFoundException;
|
||
|
import java.io.IOException;
|
||
|
import java.io.InputStream;
|
||
|
|
||
|
import org.apache.tika.exception.TikaException;
|
||
|
import org.apache.tika.language.detect.LanguageDetector;
|
||
|
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
||
|
import org.apache.tika.language.detect.LanguageResult;
|
||
|
import org.apache.tika.metadata.*;
|
||
|
import org.apache.tika.parser.*;
|
||
|
import org.apache.tika.sax.BodyContentHandler;
|
||
|
import org.xml.sax.SAXException;
|
||
|
|
||
|
public class FileData {
|
||
|
private String filename;
|
||
|
private String type;
|
||
|
private String encoding;
|
||
|
private LanguageResult language;
|
||
|
private InputStream inputStream;
|
||
|
private Metadata metadata;
|
||
|
private BodyContentHandler contentHandler;
|
||
|
private ParseContext parseContext;
|
||
|
private AutoDetectParser parser;
|
||
|
private LanguageDetector langIdentifier;
|
||
|
|
||
|
FileData() {
|
||
|
}
|
||
|
|
||
|
FileData(File file) throws FileNotFoundException {
|
||
|
inputStream = new FileInputStream(file);
|
||
|
metadata = new Metadata();
|
||
|
parser = new AutoDetectParser();
|
||
|
contentHandler = new BodyContentHandler(-1);
|
||
|
parseContext = new ParseContext();
|
||
|
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||
|
}
|
||
|
|
||
|
private void setMetadata() throws IOException, TikaException, SAXException {
|
||
|
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
||
|
filename = metadata.get(TikaCoreProperties.TITLE);
|
||
|
type = metadata.get(Metadata.CONTENT_TYPE);
|
||
|
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
||
|
language = langIdentifier.detect(contentHandler.toString());
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public String toString() {
|
||
|
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
||
|
+ language.getLanguage() + "\n";
|
||
|
}
|
||
|
|
||
|
public static void main(String[] args) throws IOException, TikaException, SAXException {
|
||
|
try {
|
||
|
File file = new File(args[0]);
|
||
|
FileData data = new FileData(file);
|
||
|
data.setMetadata();
|
||
|
System.out.println(data);
|
||
|
} catch (FileNotFoundException exp) {
|
||
|
System.out.println("The file " + args[0] + " could not be found");
|
||
|
}
|
||
|
}
|
||
|
}
|