Poorly implement word frequency reader

This commit is contained in:
coolneng 2020-10-25 23:40:20 +01:00
parent bada55444e
commit ecbe3349ce
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
4 changed files with 81 additions and 2 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@ target
.classpath .classpath
.project .project
.settings .settings
output

View File

@ -1,6 +1,4 @@
* P1 * P1
** TODO Write to a file all word occurrences and frequencies
Sorted in a decreasing manner
** TODO Plot word frequencies ** TODO Plot word frequencies
With gnuplot, with documents of at least 3 different languages. With gnuplot, with documents of at least 3 different languages.
We'll fit this to the Booth and Federowicz equation We'll fit this to the Booth and Federowicz equation
@ -9,3 +7,6 @@ CLOSED: [2020-10-25 Sun 19:58]
| filename | type | encoding | language | | filename | type | encoding | language |
** DONE Extract all URLs ** DONE Extract all URLs
CLOSED: [2020-10-25 Sun 22:14] CLOSED: [2020-10-25 Sun 22:14]
** DONE Write to a file all word occurrences and frequencies
CLOSED: [2020-10-25 Sun 23:40]
Sorted in a decreasing manner

View File

@ -23,6 +23,7 @@ public class AnalyzeDirectory {
System.out.println("Usage: AnalyzeDirectory <directory> <option>"); System.out.println("Usage: AnalyzeDirectory <directory> <option>");
System.out.println("option metadata: shows the filename/file type/encoding and language of the files"); System.out.println("option metadata: shows the filename/file type/encoding and language of the files");
System.out.println("option links: shows all the links contained in each file"); System.out.println("option links: shows all the links contained in each file");
System.out.println("option frequency: saves word frequency to a file");
System.exit(1); System.exit(1);
} }
@ -42,11 +43,20 @@ public class AnalyzeDirectory {
} }
} }
/**
 * Runs the word-frequency step for every entry of {@code files}: each one is
 * wrapped in a fresh {@link FileData} and asked to compute and save its word
 * occurrences.
 *
 * @throws IOException   propagated from {@link FileData}
 * @throws TikaException propagated from {@link FileData}
 * @throws SAXException  propagated from {@link FileData}
 */
private static void getWordOcurrences() throws IOException, TikaException, SAXException {
    for (File current : files) {
        new FileData(current).setWordOcurrences();
    }
}
private static void chooseAction(String action) throws IOException, TikaException, SAXException { private static void chooseAction(String action) throws IOException, TikaException, SAXException {
if (action.equals("metadata")) { if (action.equals("metadata")) {
getMetadata(); getMetadata();
} else if (action.equals("links")) { } else if (action.equals("links")) {
getAllLinks(); getAllLinks();
} else if (action.equals("frequency")) {
getWordOcurrences();
} else { } else {
usage(); usage();
} }

View File

@ -2,9 +2,20 @@ package org.RI.P1;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Scanner;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Comparator;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.langdetect.OptimaizeLangDetector;
@ -15,6 +26,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler; import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tools.ant.FileScanner;
import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
@ -33,6 +45,9 @@ public class FileData {
private LinkContentHandler linkHandler; private LinkContentHandler linkHandler;
private TeeContentHandler teeHandler; private TeeContentHandler teeHandler;
private ToHTMLContentHandler toHTMLhandler; private ToHTMLContentHandler toHTMLhandler;
private HashMap<String, Integer> wordFrequency;
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
private Tika tika;
FileData() { FileData() {
} }
@ -47,6 +62,9 @@ public class FileData {
langIdentifier = new OptimaizeLangDetector().loadModels(); langIdentifier = new OptimaizeLangDetector().loadModels();
linkHandler = new LinkContentHandler(); linkHandler = new LinkContentHandler();
toHTMLhandler = new ToHTMLContentHandler(); toHTMLhandler = new ToHTMLContentHandler();
wordFrequency = new HashMap<>();
orderedFrequencies = new ArrayList<>();
tika = new Tika();
setMetadata(); setMetadata();
} }
@ -67,6 +85,55 @@ public class FileData {
System.out.println("Links: " + linkHandler.getLinks()); System.out.println("Links: " + linkHandler.getLinks());
} }
/**
 * Extracts the text of {@code file} with Tika, lower-cases it, splits it into
 * words and accumulates the number of occurrences of each word in
 * {@code wordFrequency}.
 *
 * @throws FileNotFoundException declared for callers; kept from the original signature
 * @throws IOException           propagated from {@code tika.parseToString}
 * @throws TikaException         propagated from {@code tika.parseToString}
 */
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
    // Strings are immutable, so the lower-cased copy must be kept; Locale.ROOT
    // keeps the folding independent of the machine's default locale.
    String fileContent = tika.parseToString(file).toLowerCase(java.util.Locale.ROOT);
    String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
    try (Scanner fileScanner = new Scanner(fileContent)) {
        fileScanner.useDelimiter(delimiters);
        while (fileScanner.hasNext()) {
            String word = fileScanner.next();
            // Two adjacent single-character delimiters (e.g. ") ") yield an
            // empty token, which is not a word.
            if (word.isEmpty()) {
                continue;
            }
            // Look the count up by the word itself (not by the scanner) and
            // increment it, starting from 1 on first sight.
            wordFrequency.merge(word, 1, Integer::sum);
        }
    }
}
/**
 * Rebuilds {@code orderedFrequencies} from {@code wordFrequency} as
 * (count, word) entries sorted by decreasing count. Words with the same
 * count are ordered alphabetically so the output is deterministic.
 */
private void sortWords() {
    // Start from scratch so a repeated call does not append duplicates.
    orderedFrequencies.clear();
    for (Entry<String, Integer> item : wordFrequency.entrySet()) {
        orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
    }
    orderedFrequencies.sort((m1, m2) -> {
        // The entry key is the count: compare m2 against m1 for descending order.
        int byCount = m2.getKey().compareTo(m1.getKey());
        if (byCount != 0) {
            return byCount;
        }
        // The entry value is the word: alphabetical tie-break.
        return m1.getValue().compareTo(m2.getValue());
    });
}
/**
 * Writes every entry of {@code orderedFrequencies} to
 * {@code output/output_<filename>.dat}, one {@code "<word> <count>"} pair per
 * line, in list order.
 *
 * The file is written through its own stream instead of redirecting
 * {@code System.out}, so console output of the rest of the program is left
 * untouched, and the stream is closed when writing finishes.
 *
 * @throws FileNotFoundException if the output file cannot be opened for writing
 */
private void saveFrequency() throws FileNotFoundException {
    // Make sure the target directory exists; FileOutputStream does not create it.
    new File("output").mkdirs();
    FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
    try (PrintStream output = new PrintStream(outputFile)) {
        for (Entry<Integer, String> item : orderedFrequencies) {
            // Entries are (count, word); print the word first, then its count.
            output.println(item.getValue() + " " + item.getKey());
        }
    }
}
/**
 * Runs the word-frequency pipeline for this file, in order: tokenize the
 * content, sort the collected counts, then save the result.
 *
 * @throws FileNotFoundException propagated from the tokenizing or saving step
 * @throws IOException           propagated from the tokenizing step
 * @throws TikaException         propagated from the tokenizing step
 */
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
    this.tokenizeFile();
    this.sortWords();
    this.saveFrequency();
}
@Override @Override
public String toString() { public String toString() {
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: " return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "