Fix encoding detector
This commit is contained in:
parent
dd1c5f92d8
commit
6a30e7b532
|
@ -22,12 +22,15 @@ import org.apache.tika.langdetect.OptimaizeLangDetector;
|
||||||
import org.apache.tika.language.detect.LanguageResult;
|
import org.apache.tika.language.detect.LanguageResult;
|
||||||
import org.apache.tika.metadata.*;
|
import org.apache.tika.metadata.*;
|
||||||
import org.apache.tika.parser.*;
|
import org.apache.tika.parser.*;
|
||||||
|
import org.apache.tika.parser.txt.CharsetDetector;
|
||||||
|
import org.apache.tika.parser.txt.CharsetMatch;
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.apache.tika.sax.LinkContentHandler;
|
import org.apache.tika.sax.LinkContentHandler;
|
||||||
import org.apache.tika.sax.TeeContentHandler;
|
import org.apache.tika.sax.TeeContentHandler;
|
||||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||||
import org.apache.tika.parser.html.HtmlParser;
|
import org.apache.tika.parser.html.HtmlParser;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
import org.apache.commons.compress.utils.IOUtils;
|
||||||
|
|
||||||
public class FileData {
|
public class FileData {
|
||||||
private File file;
|
private File file;
|
||||||
|
@ -67,11 +70,20 @@ public class FileData {
|
||||||
setMetadata();
|
setMetadata();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String detectEncoding(InputStream inputStream) throws IOException {
|
||||||
|
CharsetDetector detector = new CharsetDetector();
|
||||||
|
byte[] data = IOUtils.toByteArray(inputStream);
|
||||||
|
detector.setText(data);
|
||||||
|
CharsetMatch match = detector.detect();
|
||||||
|
|
||||||
|
return match.getName();
|
||||||
|
}
|
||||||
|
|
||||||
private void setMetadata() throws IOException, TikaException, SAXException {
|
private void setMetadata() throws IOException, TikaException, SAXException {
|
||||||
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
parser.parse(inputStream, contentHandler, metadata, parseContext);
|
||||||
filename = metadata.get(TikaCoreProperties.TITLE);
|
filename = metadata.get(TikaCoreProperties.TITLE);
|
||||||
type = metadata.get(Metadata.CONTENT_TYPE);
|
type = metadata.get(Metadata.CONTENT_TYPE);
|
||||||
encoding = metadata.get(Metadata.CONTENT_ENCODING);
|
encoding = detectEncoding(inputStream);
|
||||||
language = langIdentifier.detect(contentHandler.toString());
|
language = langIdentifier.detect(contentHandler.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue