From 6a30e7b532361cb8758c955a60decd507f520a4e Mon Sep 17 00:00:00 2001
From: coolneng
Date: Thu, 29 Oct 2020 13:26:04 +0100
Subject: [PATCH] Fix encoding detector

Detect the charset from the raw document bytes with Tika's
CharsetDetector instead of reading Metadata.CONTENT_ENCODING.

Parser.parse() consumes the stream it is given, so the document is
buffered once and the parser and the detector each read their own
ByteArrayInputStream over that buffer. detectEncoding() returns null
when the detector finds no matching charset.
---
 src/main/java/org/RI/P1/FileData.java | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java
index de9c07d..eb7464c 100644
--- a/src/main/java/org/RI/P1/FileData.java
+++ b/src/main/java/org/RI/P1/FileData.java
@@ -22,12 +22,15 @@ import org.apache.tika.langdetect.OptimaizeLangDetector;
 import org.apache.tika.language.detect.LanguageResult;
 import org.apache.tika.metadata.*;
 import org.apache.tika.parser.*;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.ToHTMLContentHandler;
 import org.apache.tika.parser.html.HtmlParser;
 import org.xml.sax.SAXException;
+import org.apache.commons.compress.utils.IOUtils;
 
 public class FileData {
     private File file;
@@ -67,11 +70,31 @@ public class FileData {
         setMetadata();
     }
 
+    /**
+     * Guesses the character encoding from the raw bytes of a stream.
+     *
+     * @param inputStream stream to read to the end; must not have been consumed yet
+     * @return name of the best-matching charset, or null if nothing matched
+     * @throws IOException if reading the stream fails
+     */
+    private String detectEncoding(InputStream inputStream) throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        byte[] data = IOUtils.toByteArray(inputStream);
+        detector.setText(data);
+        CharsetMatch match = detector.detect();
+
+        // detect() returns null when no charset matches the input.
+        return match == null ? null : match.getName();
+    }
+
     private void setMetadata() throws IOException, TikaException, SAXException {
-        parser.parse(inputStream, contentHandler, metadata, parseContext);
+        // parse() consumes the stream, so buffer the bytes once and give the
+        // parser and the charset detector each their own view of them.
+        byte[] content = IOUtils.toByteArray(inputStream);
+        parser.parse(new java.io.ByteArrayInputStream(content), contentHandler, metadata, parseContext);
         filename = metadata.get(TikaCoreProperties.TITLE);
         type = metadata.get(Metadata.CONTENT_TYPE);
-        encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        encoding = detectEncoding(new java.io.ByteArrayInputStream(content));
         language = langIdentifier.detect(contentHandler.toString());
     }
 