Fix encoding detector
This commit is contained in:
parent
dd1c5f92d8
commit
6a30e7b532
|
@ -22,12 +22,15 @@ import org.apache.tika.langdetect.OptimaizeLangDetector;
|
|||
import org.apache.tika.language.detect.LanguageResult;
|
||||
import org.apache.tika.metadata.*;
|
||||
import org.apache.tika.parser.*;
|
||||
import org.apache.tika.parser.txt.CharsetDetector;
|
||||
import org.apache.tika.parser.txt.CharsetMatch;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.LinkContentHandler;
|
||||
import org.apache.tika.sax.TeeContentHandler;
|
||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||
import org.apache.tika.parser.html.HtmlParser;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
|
||||
public class FileData {
|
||||
private File file;
|
||||
|
@ -67,11 +70,20 @@ public class FileData {
|
|||
setMetadata();
|
||||
}
|
||||
|
||||
private String detectEncoding(InputStream inputStream) throws IOException {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
byte[] data = IOUtils.toByteArray(inputStream);
|
||||
detector.setText(data);
|
||||
CharsetMatch match = detector.detect();
|
||||
|
||||
return match.getName();
|
||||
}
|
||||
|
||||
/**
 * Parses {@code inputStream} with {@code parser} and fills the
 * {@code filename}, {@code type}, {@code encoding} and {@code language}
 * fields from the resulting metadata, the charset detector and the
 * language identifier.
 *
 * @throws IOException   declared for the parse and encoding-detection calls
 * @throws TikaException declared for the parse call
 * @throws SAXException  declared for the parse call
 */
private void setMetadata() throws IOException, TikaException, SAXException {
|
||||
// Runs the parser; contentHandler and metadata are the objects read below.
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
||||
filename = metadata.get(TikaCoreProperties.TITLE);
|
||||
type = metadata.get(Metadata.CONTENT_TYPE);
|
||||
// NOTE(review): encoding is assigned here and again immediately below, so this
// value is always overwritten. This text appears to be a diff view with its
// +/- markers stripped; presumably this is the line the commit removed — confirm.
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
||||
// NOTE(review): inputStream was already passed to parser.parse above. If the
// parser consumed it, detectEncoding may see no remaining bytes — verify that
// the stream is reset or reopened before this call.
encoding = detectEncoding(inputStream);
|
||||
// Language detection runs on the text form of the content handler.
language = langIdentifier.detect(contentHandler.toString());
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue