Fix encoding detector

This commit is contained in:
coolneng 2020-10-29 13:26:04 +01:00
parent dd1c5f92d8
commit 6a30e7b532
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 13 additions and 1 deletion

View File

@ -22,12 +22,15 @@ import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.*; import org.apache.tika.metadata.*;
import org.apache.tika.parser.*; import org.apache.tika.parser.*;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler; import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.apache.commons.compress.utils.IOUtils;
public class FileData { public class FileData {
private File file; private File file;
@ -67,11 +70,20 @@ public class FileData {
setMetadata(); setMetadata();
} }
/**
 * Guesses the character encoding of the bytes remaining in the given stream.
 *
 * <p>The stream is read to its end and buffered in memory before detection,
 * so nothing is left to read from it afterwards.
 *
 * @param inputStream stream whose remaining bytes are handed to the detector
 * @return the name of the best-matching charset, or {@code null} when the
 *         detector reports no match
 * @throws IOException if reading the stream fails
 */
private String detectEncoding(InputStream inputStream) throws IOException {
    byte[] data = IOUtils.toByteArray(inputStream);
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    // detect() is documented to return null when no charset matches; guard
    // against that instead of dereferencing the result unconditionally.
    CharsetMatch match = detector.detect();
    return match == null ? null : match.getName();
}
private void setMetadata() throws IOException, TikaException, SAXException { private void setMetadata() throws IOException, TikaException, SAXException {
parser.parse(inputStream, contentHandler, metadata, parseContext); parser.parse(inputStream, contentHandler, metadata, parseContext);
filename = metadata.get(TikaCoreProperties.TITLE); filename = metadata.get(TikaCoreProperties.TITLE);
type = metadata.get(Metadata.CONTENT_TYPE); type = metadata.get(Metadata.CONTENT_TYPE);
encoding = metadata.get(Metadata.CONTENT_ENCODING); encoding = detectEncoding(inputStream);
language = langIdentifier.detect(contentHandler.toString()); language = langIdentifier.detect(contentHandler.toString());
} }