diff --git a/TODO.org b/TODO.org index 7d0ef6c..5a305ab 100644 --- a/TODO.org +++ b/TODO.org @@ -1,5 +1,4 @@ * P1 -** TODO Extract all URLs ** TODO Write to a file all word occurrences and frequencies Sorted in a decreasing manner ** TODO Plot word frequencies @@ -8,3 +7,5 @@ We'll fit this to the Booth and Federowicz equation ** DONE Create a table with information of all documents CLOSED: [2020-10-25 Sun 19:58] | filename | type | encoding | language | +** DONE Extract all URLs +CLOSED: [2020-10-25 Sun 22:14] diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java index 229e590..3fd9fdd 100644 --- a/src/main/java/org/RI/P1/FileData.java +++ b/src/main/java/org/RI/P1/FileData.java @@ -2,7 +2,6 @@ package org.RI.P1; import java.io.File; import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -13,9 +12,14 @@ import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.*; import org.apache.tika.parser.*; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.ToHTMLContentHandler; +import org.apache.tika.parser.html.HtmlParser; import org.xml.sax.SAXException; public class FileData { + private File file; private String filename; private String type; private String encoding; @@ -26,17 +30,23 @@ public class FileData { private ParseContext parseContext; private AutoDetectParser parser; private LanguageDetector langIdentifier; + private LinkContentHandler linkHandler; + private TeeContentHandler teeHandler; + private ToHTMLContentHandler toHTMLhandler; FileData() { } FileData(File file) throws IOException, TikaException, SAXException { + this.file = file; inputStream = new FileInputStream(file); metadata = new Metadata(); parser = new AutoDetectParser(); contentHandler = new BodyContentHandler(-1); parseContext = new ParseContext(); langIdentifier = new OptimaizeLangDetector().loadModels(); + linkHandler = new LinkContentHandler(); + toHTMLhandler = new ToHTMLContentHandler(); setMetadata(); } @@ -48,6 +58,15 @@ public class FileData { language = langIdentifier.detect(contentHandler.toString()); } + void getLinks() throws IOException, TikaException, SAXException { + HtmlParser htmlParser = new HtmlParser(); + inputStream = new FileInputStream(file); + teeHandler = new TeeContentHandler(linkHandler, contentHandler, toHTMLhandler); + htmlParser.parse(inputStream, teeHandler, metadata, parseContext); + System.out.println("Filename: " + filename); + System.out.println("Links: " + linkHandler.getLinks()); + } + @Override public String toString() { return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "