Fix word frequency reader

This commit is contained in:
coolneng 2020-10-28 13:32:23 +01:00
parent 5db622e5c1
commit fba0ac2473
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 22 additions and 27 deletions

View File

@ -8,12 +8,12 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.Scanner;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Comparator; import java.util.Comparator;
import java.util.Set;
import java.util.AbstractMap;
import java.util.ArrayList;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
@ -26,7 +26,6 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler; import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tools.ant.FileScanner;
import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
@ -46,7 +45,7 @@ public class FileData {
private TeeContentHandler teeHandler; private TeeContentHandler teeHandler;
private ToHTMLContentHandler toHTMLhandler; private ToHTMLContentHandler toHTMLhandler;
private HashMap<String, Integer> wordFrequency; private HashMap<String, Integer> wordFrequency;
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies; private ArrayList<AbstractMap.SimpleEntry<String, Integer>> sortedWordFrequency;
private Tika tika; private Tika tika;
FileData() { FileData() {
@ -63,7 +62,7 @@ public class FileData {
linkHandler = new LinkContentHandler(); linkHandler = new LinkContentHandler();
toHTMLhandler = new ToHTMLContentHandler(); toHTMLhandler = new ToHTMLContentHandler();
wordFrequency = new HashMap<>(); wordFrequency = new HashMap<>();
orderedFrequencies = new ArrayList<>(); sortedWordFrequency = new ArrayList<>();
tika = new Tika(); tika = new Tika();
setMetadata(); setMetadata();
} }
@ -87,42 +86,38 @@ public class FileData {
// NOTE(review): this text appears to be a side-by-side rendering of a diff. On
// rows holding two statements, the first looks like the pre-commit code and the
// second the post-commit code; rows with one statement exist on one side only.
//
// tokenizeFile: obtains the text of `file` via tika.parseToString, lower-cases
// it, and records in wordFrequency how many times each token occurs.
// Declared to throw FileNotFoundException, IOException and TikaException.
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException { private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
String fileContent = tika.parseToString(file); String fileContent = tika.parseToString(file);
// The first form discards the result of toLowerCase(), leaving fileContent
// unchanged; the second form assigns the lower-cased text back.
fileContent.toLowerCase(); fileContent = fileContent.toLowerCase();
// The first form wraps the text in a Scanner; the second form instead deletes
// every character listed in the character class before tokenizing.
Scanner fileScanner = new Scanner(fileContent); fileContent = fileContent.replaceAll("[\\(\\)¡!¿?→,.:;\\-—\"«»“”]", "");
Integer defaultValue = 0;
Integer ocurrence;
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
fileScanner.useDelimiter(delimiters);
// The second form tokenizes by splitting on runs of whitespace.
// NOTE(review): String.split keeps a leading empty token when the text starts
// with whitespace, so an empty word may be counted - confirm against real input.
while (fileScanner.hasNext()) { for (String token : fileContent.split("\\s+")) {
// The first form passes the Scanner object itself (not a token) as the lookup
// key; the second form uses Map.compute to start a token at 1 or add 1 to it.
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue); wordFrequency.compute(token, (key, val) -> (val == null) ? 1 : val + 1);
// The comparison below uses == on two Integer references.
if (ocurrence == defaultValue) {
wordFrequency.put(fileScanner.next(), defaultValue + 1);
} else {
wordFrequency.put(fileScanner.next(), ocurrence += 1);
}
} }
} }
// NOTE(review): this text appears to be a side-by-side rendering of a diff. On
// rows holding two statements, the first looks like the pre-commit code and the
// second the post-commit code; rows with one statement exist on one side only.
//
// sortWords: copies every entry of wordFrequency into a list and sorts that list.
// - First form: stores (count, word) pairs in orderedFrequencies and compares
//   getValue(), which for those pairs is the word, so the order is by word.
// - Second form: stores (word, count) pairs in sortedWordFrequency and compares
//   getValue(), which is the count, then reverses the list so the highest
//   counts come first.
// NOTE(review): the list is appended to without being cleared in the visible
// code, so calling this twice on one instance would duplicate entries - confirm
// it is only invoked once per instance.
private void sortWords() { private void sortWords() {
for (Entry<String, Integer> item : wordFrequency.entrySet()) { Set<Map.Entry<String, Integer>> entrySet = wordFrequency.entrySet();
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
for (Map.Entry<String, Integer> item : entrySet) {
// Copy each map entry into a standalone SimpleEntry held by the list.
sortedWordFrequency.add(new AbstractMap.SimpleEntry<String, Integer>(item.getKey(), item.getValue()));
} }
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() { Collections.sort(sortedWordFrequency, new Comparator<AbstractMap.SimpleEntry<String, Integer>>() {
@Override @Override
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) { public int compare(AbstractMap.SimpleEntry<String, Integer> m1,
AbstractMap.SimpleEntry<String, Integer> m2) {
// Natural ascending order of the entry values.
return (m1.getValue()).compareTo(m2.getValue()); return (m1.getValue()).compareTo(m2.getValue());
} }
}); });
// Flip the ascending result into descending order.
Collections.reverse(sortedWordFrequency);
} }
private void saveFrequency() throws FileNotFoundException { private void saveWordFrequency() throws FileNotFoundException {
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat"); FileOutputStream outputFile = new FileOutputStream("output/" + filename + ".dat");
PrintStream output = new PrintStream(outputFile); PrintStream output = new PrintStream(outputFile);
System.setOut(output); System.setOut(output);
for (Entry<Integer, String> item : orderedFrequencies) { for (Map.Entry<String, Integer> item : sortedWordFrequency) {
System.out.println(item.getValue() + " " + item.getKey()); System.out.println(item.getValue() + " " + item.getKey());
} }
@ -131,7 +126,7 @@ public class FileData {
// NOTE(review): this text appears to be a side-by-side rendering of a diff; each
// row shows what looks like the pre-commit statement followed by the post-commit one.
//
// setWordOcurrences: runs the word-count steps in order - tokenizeFile, then
// sortWords, then the save step. The only difference between the two forms is
// the name of the save method (saveFrequency versus saveWordFrequency).
// Declared to throw FileNotFoundException, IOException and TikaException.
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException { void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
tokenizeFile(); tokenizeFile();
sortWords(); sortWords();
saveFrequency(); saveWordFrequency();
} }
@Override @Override