Fix word frequency reader

This commit is contained in:
coolneng 2020-10-28 13:32:23 +01:00
parent 5db622e5c1
commit fba0ac2473
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 22 additions and 27 deletions

View File

@ -8,12 +8,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Scanner;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.AbstractMap;
import java.util.ArrayList;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
@ -26,7 +26,6 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tools.ant.FileScanner;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.SAXException;
@ -46,7 +45,7 @@ public class FileData {
private TeeContentHandler teeHandler;
private ToHTMLContentHandler toHTMLhandler;
private HashMap<String, Integer> wordFrequency;
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
private ArrayList<AbstractMap.SimpleEntry<String, Integer>> sortedWordFrequency;
private Tika tika;
FileData() {
@ -63,7 +62,7 @@ public class FileData {
linkHandler = new LinkContentHandler();
toHTMLhandler = new ToHTMLContentHandler();
wordFrequency = new HashMap<>();
orderedFrequencies = new ArrayList<>();
sortedWordFrequency = new ArrayList<>();
tika = new Tika();
setMetadata();
}
@ -87,42 +86,38 @@ public class FileData {
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
String fileContent = tika.parseToString(file);
fileContent.toLowerCase();
Scanner fileScanner = new Scanner(fileContent);
Integer defaultValue = 0;
Integer ocurrence;
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
fileScanner.useDelimiter(delimiters);
fileContent = fileContent.toLowerCase();
fileContent = fileContent.replaceAll("[\\(\\)¡!¿?→,.:;\\-—\"«»“”]", "");
while (fileScanner.hasNext()) {
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue);
if (ocurrence == defaultValue) {
wordFrequency.put(fileScanner.next(), defaultValue + 1);
} else {
wordFrequency.put(fileScanner.next(), ocurrence += 1);
}
for (String token : fileContent.split("\\s+")) {
wordFrequency.compute(token, (key, val) -> (val == null) ? 1 : val + 1);
}
}
private void sortWords() {
for (Entry<String, Integer> item : wordFrequency.entrySet()) {
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
Set<Map.Entry<String, Integer>> entrySet = wordFrequency.entrySet();
for (Map.Entry<String, Integer> item : entrySet) {
sortedWordFrequency.add(new AbstractMap.SimpleEntry<String, Integer>(item.getKey(), item.getValue()));
}
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() {
Collections.sort(sortedWordFrequency, new Comparator<AbstractMap.SimpleEntry<String, Integer>>() {
@Override
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) {
public int compare(AbstractMap.SimpleEntry<String, Integer> m1,
AbstractMap.SimpleEntry<String, Integer> m2) {
return (m1.getValue()).compareTo(m2.getValue());
}
});
Collections.reverse(sortedWordFrequency);
}
private void saveFrequency() throws FileNotFoundException {
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
private void saveWordFrequency() throws FileNotFoundException {
FileOutputStream outputFile = new FileOutputStream("output/" + filename + ".dat");
PrintStream output = new PrintStream(outputFile);
System.setOut(output);
for (Entry<Integer, String> item : orderedFrequencies) {
for (Map.Entry<String, Integer> item : sortedWordFrequency) {
System.out.println(item.getValue() + " " + item.getKey());
}
@ -131,7 +126,7 @@ public class FileData {
/**
 * Runs the word-frequency steps in order: {@code tokenizeFile()}, then
 * {@code sortWords()}, then {@code saveWordFrequency()}.
 *
 * @throws FileNotFoundException propagated from the steps above
 * @throws IOException           propagated from the steps above
 * @throws TikaException         propagated from the steps above
 */
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
    tokenizeFile();
    sortWords();
    // The writer step was renamed from saveFrequency(); only the new name exists now.
    saveWordFrequency();
}
@Override