Poorly implement a word-frequency reader
This commit is contained in:
parent
bada55444e
commit
ecbe3349ce
|
@ -2,3 +2,4 @@ target
|
|||
.classpath
|
||||
.project
|
||||
.settings
|
||||
output
|
||||
|
|
5
TODO.org
5
TODO.org
|
@ -1,6 +1,4 @@
|
|||
* P1
|
||||
** TODO Write to a file all word occurrences and frequencies
|
||||
Sorted in a decreasing manner
|
||||
** TODO Plot word frequencies
|
||||
Using gnuplot, with documents in at least 3 different languages.
|
||||
We'll fit this to the Booth and Federowicz equation
|
||||
|
@ -9,3 +7,6 @@ CLOSED: [2020-10-25 Sun 19:58]
|
|||
| filename | type | encoding | language |
|
||||
** DONE Extract all URLs
|
||||
CLOSED: [2020-10-25 Sun 22:14]
|
||||
** DONE Write to a file all word occurrences and frequencies
|
||||
CLOSED: [2020-10-25 Sun 23:40]
|
||||
Sorted in a decreasing manner
|
||||
|
|
|
@ -23,6 +23,7 @@ public class AnalyzeDirectory {
|
|||
System.out.println("Usage: AnalyzeDirectory <directory> <option>");
|
||||
System.out.println("option metadata: shows the filename/file type/encoding and language of the files");
|
||||
System.out.println("option links: shows all the links contained in each file");
|
||||
System.out.println("option frequency: saves word frequency to a file");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
|
@ -42,11 +43,20 @@ public class AnalyzeDirectory {
|
|||
}
|
||||
}
|
||||
|
||||
private static void getWordOcurrences() throws IOException, TikaException, SAXException {
|
||||
for (File file : files) {
|
||||
FileData data = new FileData(file);
|
||||
data.setWordOcurrences();
|
||||
}
|
||||
}
|
||||
|
||||
private static void chooseAction(String action) throws IOException, TikaException, SAXException {
|
||||
if (action.equals("metadata")) {
|
||||
getMetadata();
|
||||
} else if (action.equals("links")) {
|
||||
getAllLinks();
|
||||
} else if (action.equals("frequency")) {
|
||||
getWordOcurrences();
|
||||
} else {
|
||||
usage();
|
||||
}
|
||||
|
|
|
@ -2,9 +2,20 @@ package org.RI.P1;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Scanner;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.language.detect.LanguageDetector;
|
||||
import org.apache.tika.langdetect.OptimaizeLangDetector;
|
||||
|
@ -15,6 +26,7 @@ import org.apache.tika.sax.BodyContentHandler;
|
|||
import org.apache.tika.sax.LinkContentHandler;
|
||||
import org.apache.tika.sax.TeeContentHandler;
|
||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||
import org.apache.tools.ant.FileScanner;
|
||||
import org.apache.tika.parser.html.HtmlParser;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
@ -33,6 +45,9 @@ public class FileData {
|
|||
private LinkContentHandler linkHandler;
|
||||
private TeeContentHandler teeHandler;
|
||||
private ToHTMLContentHandler toHTMLhandler;
|
||||
private HashMap<String, Integer> wordFrequency;
|
||||
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
|
||||
private Tika tika;
|
||||
|
||||
// No-op default constructor; leaves every field uninitialized.
// NOTE(review): presumably used only as a placeholder — confirm callers
// always go through the File-based constructor before reading state.
FileData() {
}
|
||||
|
@ -47,6 +62,9 @@ public class FileData {
|
|||
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||||
linkHandler = new LinkContentHandler();
|
||||
toHTMLhandler = new ToHTMLContentHandler();
|
||||
wordFrequency = new HashMap<>();
|
||||
orderedFrequencies = new ArrayList<>();
|
||||
tika = new Tika();
|
||||
setMetadata();
|
||||
}
|
||||
|
||||
|
@ -67,6 +85,55 @@ public class FileData {
|
|||
System.out.println("Links: " + linkHandler.getLinks());
|
||||
}
|
||||
|
||||
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
|
||||
String fileContent = tika.parseToString(file);
|
||||
fileContent.toLowerCase();
|
||||
Scanner fileScanner = new Scanner(fileContent);
|
||||
Integer defaultValue = 0;
|
||||
Integer ocurrence;
|
||||
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
|
||||
fileScanner.useDelimiter(delimiters);
|
||||
|
||||
while (fileScanner.hasNext()) {
|
||||
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue);
|
||||
if (ocurrence == defaultValue) {
|
||||
wordFrequency.put(fileScanner.next(), defaultValue + 1);
|
||||
} else {
|
||||
wordFrequency.put(fileScanner.next(), ocurrence += 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void sortWords() {
|
||||
for (Entry<String, Integer> item : wordFrequency.entrySet()) {
|
||||
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
|
||||
}
|
||||
|
||||
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() {
|
||||
@Override
|
||||
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) {
|
||||
return (m1.getValue()).compareTo(m2.getValue());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void saveFrequency() throws FileNotFoundException {
|
||||
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
|
||||
PrintStream output = new PrintStream(outputFile);
|
||||
System.setOut(output);
|
||||
|
||||
for (Entry<Integer, String> item : orderedFrequencies) {
|
||||
System.out.println(item.getValue() + " " + item.getKey());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
|
||||
tokenizeFile();
|
||||
sortWords();
|
||||
saveFrequency();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Filename: " + filename + "\n" + "Type: " + type + "\n" + "Encoding: " + encoding + "\n" + "Language: "
|
||||
|
|
Loading…
Reference in New Issue