diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java index de9c07d..eb7464c 100644 --- a/src/main/java/org/RI/P1/FileData.java +++ b/src/main/java/org/RI/P1/FileData.java @@ -22,12 +22,15 @@ import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.*; import org.apache.tika.parser.*; +import org.apache.tika.parser.txt.CharsetDetector; +import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.ToHTMLContentHandler; import org.apache.tika.parser.html.HtmlParser; import org.xml.sax.SAXException; +import org.apache.commons.compress.utils.IOUtils; public class FileData { private File file; @@ -67,11 +70,20 @@ public class FileData { setMetadata(); } + private String detectEncoding(InputStream inputStream) throws IOException { + CharsetDetector detector = new CharsetDetector(); + byte[] data = IOUtils.toByteArray(inputStream); + detector.setText(data); + CharsetMatch match = detector.detect(); + + return match.getName(); + } + private void setMetadata() throws IOException, TikaException, SAXException { parser.parse(inputStream, contentHandler, metadata, parseContext); filename = metadata.get(TikaCoreProperties.TITLE); type = metadata.get(Metadata.CONTENT_TYPE); - encoding = metadata.get(Metadata.CONTENT_ENCODING); + encoding = detectEncoding(inputStream); language = langIdentifier.detect(contentHandler.toString()); }