Extract all links from file
This commit is contained in:
parent
306bb78c61
commit
0f688eaf42
3
TODO.org
3
TODO.org
|
@ -1,5 +1,4 @@
|
||||||
* P1
|
* P1
|
||||||
** TODO Extract all URLs
|
|
||||||
** TODO Write to a file all word occurrences and frequencies
|
** TODO Write to a file all word occurrences and frequencies
|
||||||
Sorted in a decreasing manner
|
Sorted in a decreasing manner
|
||||||
** TODO Plot word frequencies
|
** TODO Plot word frequencies
|
||||||
|
@ -8,3 +7,5 @@ We'll fit this to the Booth and Federowicz equation
|
||||||
** DONE Create a table with information of all documents
|
** DONE Create a table with information of all documents
|
||||||
CLOSED: [2020-10-25 Sun 19:58]
|
CLOSED: [2020-10-25 Sun 19:58]
|
||||||
| filename | type | encoding | language |
|
| filename | type | encoding | language |
|
||||||
|
** DONE Extract all URLs
|
||||||
|
CLOSED: [2020-10-25 Sun 22:14]
|
||||||
|
|
|
@ -2,7 +2,6 @@ package org.RI.P1;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
@ -13,9 +12,14 @@ import org.apache.tika.language.detect.LanguageResult;
|
||||||
import org.apache.tika.metadata.*;
|
import org.apache.tika.metadata.*;
|
||||||
import org.apache.tika.parser.*;
|
import org.apache.tika.parser.*;
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
|
import org.apache.tika.sax.LinkContentHandler;
|
||||||
|
import org.apache.tika.sax.TeeContentHandler;
|
||||||
|
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||||
|
import org.apache.tika.parser.html.HtmlParser;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
public class FileData {
|
public class FileData {
|
||||||
|
private File file;
|
||||||
private String filename;
|
private String filename;
|
||||||
private String type;
|
private String type;
|
||||||
private String encoding;
|
private String encoding;
|
||||||
|
@ -26,17 +30,23 @@ public class FileData {
|
||||||
private ParseContext parseContext;
|
private ParseContext parseContext;
|
||||||
private AutoDetectParser parser;
|
private AutoDetectParser parser;
|
||||||
private LanguageDetector langIdentifier;
|
private LanguageDetector langIdentifier;
|
||||||
|
private LinkContentHandler linkHandler;
|
||||||
|
private TeeContentHandler teeHandler;
|
||||||
|
private ToHTMLContentHandler toHTMLhandler;
|
||||||
|
|
||||||
FileData() {
|
FileData() {
|
||||||
}
|
}
|
||||||
|
|
||||||
FileData(File file) throws IOException, TikaException, SAXException {
|
FileData(File file) throws IOException, TikaException, SAXException {
|
||||||
|
this.file = file;
|
||||||
inputStream = new FileInputStream(file);
|
inputStream = new FileInputStream(file);
|
||||||
metadata = new Metadata();
|
metadata = new Metadata();
|
||||||
parser = new AutoDetectParser();
|
parser = new AutoDetectParser();
|
||||||
contentHandler = new BodyContentHandler(-1);
|
contentHandler = new BodyContentHandler(-1);
|
||||||
parseContext = new ParseContext();
|
parseContext = new ParseContext();
|
||||||
langIdentifier = new OptimaizeLangDetector().loadModels();
|
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||||||
|
linkHandler = new LinkContentHandler();
|
||||||
|
toHTMLhandler = new ToHTMLContentHandler();
|
||||||
setMetadata();
|
setMetadata();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,6 +58,15 @@ public class FileData {
|
||||||
language = langIdentifier.detect(contentHandler.toString());
|
language = langIdentifier.detect(contentHandler.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void getLinks() throws IOException, TikaException, SAXException {
|
||||||
|
HtmlParser htmlParser = new HtmlParser();
|
||||||
|
inputStream = new FileInputStream(file);
|
||||||
|
teeHandler = new TeeContentHandler(linkHandler, contentHandler, toHTMLhandler);
|
||||||
|
htmlParser.parse(inputStream, teeHandler, metadata, parseContext);
|
||||||
|
System.out.println("Filename: " + filename);
|
||||||
|
System.out.println("Links: " + linkHandler.getLinks());
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
||||||
|
|
Loading…
Reference in New Issue