Poorly implement word frequency reader
This commit is contained in:
parent
bada55444e
commit
ecbe3349ce
|
@ -2,3 +2,4 @@ target
|
||||||
.classpath
|
.classpath
|
||||||
.project
|
.project
|
||||||
.settings
|
.settings
|
||||||
|
output
|
||||||
|
|
5
TODO.org
5
TODO.org
|
@ -1,6 +1,4 @@
|
||||||
* P1
|
* P1
|
||||||
** TODO Write to a file all word occurrences and frequencies
|
|
||||||
Sorted in a decreasing manner
|
|
||||||
** TODO Plot word frequencies
|
** TODO Plot word frequencies
|
||||||
With gnuplot, with documents of at least 3 different languages.
|
With gnuplot, with documents of at least 3 different languages.
|
||||||
We'll fit this to the Booth and Federowicz equation
|
We'll fit this to the Booth and Federowicz equation
|
||||||
|
@ -9,3 +7,6 @@ CLOSED: [2020-10-25 Sun 19:58]
|
||||||
| filename | type | encoding | language |
|
| filename | type | encoding | language |
|
||||||
** DONE Extract all URLs
|
** DONE Extract all URLs
|
||||||
CLOSED: [2020-10-25 Sun 22:14]
|
CLOSED: [2020-10-25 Sun 22:14]
|
||||||
|
** DONE Write to a file all word occurrences and frequencies
|
||||||
|
CLOSED: [2020-10-25 Sun 23:40]
|
||||||
|
Sorted in a decreasing manner
|
||||||
|
|
|
@ -23,6 +23,7 @@ public class AnalyzeDirectory {
|
||||||
System.out.println("Usage: AnalyzeDirectory <directory> <option>");
|
System.out.println("Usage: AnalyzeDirectory <directory> <option>");
|
||||||
System.out.println("option metadata: shows the filename/file type/encoding and language of the files");
|
System.out.println("option metadata: shows the filename/file type/encoding and language of the files");
|
||||||
System.out.println("option links: shows all the links contained in each file");
|
System.out.println("option links: shows all the links contained in each file");
|
||||||
|
System.out.println("option frequency: saves word frequency to a file");
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,11 +43,20 @@ public class AnalyzeDirectory {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void getWordOcurrences() throws IOException, TikaException, SAXException {
|
||||||
|
for (File file : files) {
|
||||||
|
FileData data = new FileData(file);
|
||||||
|
data.setWordOcurrences();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static void chooseAction(String action) throws IOException, TikaException, SAXException {
|
private static void chooseAction(String action) throws IOException, TikaException, SAXException {
|
||||||
if (action.equals("metadata")) {
|
if (action.equals("metadata")) {
|
||||||
getMetadata();
|
getMetadata();
|
||||||
} else if (action.equals("links")) {
|
} else if (action.equals("links")) {
|
||||||
getAllLinks();
|
getAllLinks();
|
||||||
|
} else if (action.equals("frequency")) {
|
||||||
|
getWordOcurrences();
|
||||||
} else {
|
} else {
|
||||||
usage();
|
usage();
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,9 +2,20 @@ package org.RI.P1;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Scanner;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.tika.language.detect.LanguageDetector;
|
import org.apache.tika.language.detect.LanguageDetector;
|
||||||
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
||||||
|
@ -15,6 +26,7 @@ import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.apache.tika.sax.LinkContentHandler;
|
import org.apache.tika.sax.LinkContentHandler;
|
||||||
import org.apache.tika.sax.TeeContentHandler;
|
import org.apache.tika.sax.TeeContentHandler;
|
||||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||||
|
import org.apache.tools.ant.FileScanner;
|
||||||
import org.apache.tika.parser.html.HtmlParser;
|
import org.apache.tika.parser.html.HtmlParser;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
@ -33,6 +45,9 @@ public class FileData {
|
||||||
private LinkContentHandler linkHandler;
|
private LinkContentHandler linkHandler;
|
||||||
private TeeContentHandler teeHandler;
|
private TeeContentHandler teeHandler;
|
||||||
private ToHTMLContentHandler toHTMLhandler;
|
private ToHTMLContentHandler toHTMLhandler;
|
||||||
|
private HashMap<String, Integer> wordFrequency;
|
||||||
|
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
|
||||||
|
private Tika tika;
|
||||||
|
|
||||||
FileData() {
|
FileData() {
|
||||||
}
|
}
|
||||||
|
@ -47,6 +62,9 @@ public class FileData {
|
||||||
langIdentifier = new OptimaizeLangDetector().loadModels();
|
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||||||
linkHandler = new LinkContentHandler();
|
linkHandler = new LinkContentHandler();
|
||||||
toHTMLhandler = new ToHTMLContentHandler();
|
toHTMLhandler = new ToHTMLContentHandler();
|
||||||
|
wordFrequency = new HashMap<>();
|
||||||
|
orderedFrequencies = new ArrayList<>();
|
||||||
|
tika = new Tika();
|
||||||
setMetadata();
|
setMetadata();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,6 +85,55 @@ public class FileData {
|
||||||
System.out.println("Links: " + linkHandler.getLinks());
|
System.out.println("Links: " + linkHandler.getLinks());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
|
||||||
|
String fileContent = tika.parseToString(file);
|
||||||
|
fileContent.toLowerCase();
|
||||||
|
Scanner fileScanner = new Scanner(fileContent);
|
||||||
|
Integer defaultValue = 0;
|
||||||
|
Integer ocurrence;
|
||||||
|
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
|
||||||
|
fileScanner.useDelimiter(delimiters);
|
||||||
|
|
||||||
|
while (fileScanner.hasNext()) {
|
||||||
|
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue);
|
||||||
|
if (ocurrence == defaultValue) {
|
||||||
|
wordFrequency.put(fileScanner.next(), defaultValue + 1);
|
||||||
|
} else {
|
||||||
|
wordFrequency.put(fileScanner.next(), ocurrence += 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void sortWords() {
|
||||||
|
for (Entry<String, Integer> item : wordFrequency.entrySet()) {
|
||||||
|
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() {
|
||||||
|
@Override
|
||||||
|
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) {
|
||||||
|
return (m1.getValue()).compareTo(m2.getValue());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private void saveFrequency() throws FileNotFoundException {
|
||||||
|
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
|
||||||
|
PrintStream output = new PrintStream(outputFile);
|
||||||
|
System.setOut(output);
|
||||||
|
|
||||||
|
for (Entry<Integer, String> item : orderedFrequencies) {
|
||||||
|
System.out.println(item.getValue() + " " + item.getKey());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
|
||||||
|
tokenizeFile();
|
||||||
|
sortWords();
|
||||||
|
saveFrequency();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
||||||
|
|
Loading…
Reference in New Issue