Extract all links from file

This commit is contained in:
coolneng 2020-10-25 22:14:20 +01:00
parent 306bb78c61
commit 0f688eaf42
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 22 additions and 2 deletions

View File

@ -1,5 +1,4 @@
* P1
** TODO Extract all URLs
** TODO Write to a file all word occurrences and frequencies
Sorted in decreasing order
** TODO Plot word frequencies
@ -8,3 +7,5 @@ We'll fit this to the Booth and Federowicz equation
** DONE Create a table with information of all documents
CLOSED: [2020-10-25 Sun 19:58]
| filename | type | encoding | language |
** DONE Extract all URLs
CLOSED: [2020-10-25 Sun 22:14]

View File

@ -2,7 +2,6 @@ package org.RI.P1;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
@ -13,9 +12,14 @@ import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.*;
import org.apache.tika.parser.*;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.SAXException;
public class FileData {
private File file;
private String filename;
private String type;
private String encoding;
@ -26,17 +30,23 @@ public class FileData {
// Context object handed to the parser on parse calls (see getLinks()).
private ParseContext parseContext;
// Auto-detecting Tika parser; instantiated in the FileData(File) constructor.
private AutoDetectParser parser;
// Language detector; its models are loaded in the FileData(File) constructor.
private LanguageDetector langIdentifier;
// Collects the links seen during parsing; read back in getLinks().
private LinkContentHandler linkHandler;
// Fans parse events out to several handlers at once; built in getLinks().
private TeeContentHandler teeHandler;
// Receives the same parse events as the other handlers in getLinks().
private ToHTMLContentHandler toHTMLhandler;
/** Creates an instance without a backing file; this constructor performs no initialisation. */
FileData() { }
/**
 * Creates the Tika helpers needed to inspect {@code file} and then calls
 * {@link #setMetadata()}.
 *
 * @param file the document to inspect; it is opened for reading here
 * @throws IOException   if the file cannot be opened or the language models cannot be loaded
 * @throws TikaException declared for the {@code setMetadata()} call
 * @throws SAXException  declared for the {@code setMetadata()} call
 */
FileData(File file) throws IOException, TikaException, SAXException {
    // Source document and the stream it is read from.
    this.file = file;
    inputStream = new FileInputStream(file);

    // Parsing machinery.
    parser = new AutoDetectParser();
    parseContext = new ParseContext();
    metadata = new Metadata();

    // Content handlers; a write limit of -1 lifts the cap on extracted body text.
    contentHandler = new BodyContentHandler(-1);
    linkHandler = new LinkContentHandler();
    toHTMLhandler = new ToHTMLContentHandler();

    // Language detection.
    langIdentifier = new OptimaizeLangDetector().loadModels();

    // Must run last: every helper above has to exist before it is called.
    setMetadata();
}
@ -48,6 +58,15 @@ public class FileData {
language = langIdentifier.detect(contentHandler.toString());
}
/**
 * Parses the file as HTML and prints its filename and the links found in it
 * to standard output.
 *
 * The parse events are sent through a single {@link TeeContentHandler} to the
 * link handler, the body content handler and the HTML handler.
 *
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException if the HTML parser fails on the document
 * @throws SAXException  if a content handler fails while processing parse events
 */
void getLinks() throws IOException, TikaException, SAXException {
    HtmlParser htmlParser = new HtmlParser();
    // The parser consumes the stream but leaves closing it to the caller, so a
    // locally scoped stream is used and released even when parsing throws.
    try (FileInputStream htmlStream = new FileInputStream(file)) {
        // NOTE(review): the handlers are instance fields that are reused here, so
        // repeated calls presumably accumulate links and body text; confirm.
        teeHandler = new TeeContentHandler(linkHandler, contentHandler, toHTMLhandler);
        htmlParser.parse(htmlStream, teeHandler, metadata, parseContext);
    }
    System.out.println("Filename: " + filename);
    System.out.println("Links: " + linkHandler.getLinks());
}
@Override
public String toString() {
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "