Fix word frequency reader
This commit is contained in:
parent
5db622e5c1
commit
fba0ac2473
|
@ -8,12 +8,12 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Scanner;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Comparator;
|
||||
import java.util.Set;
|
||||
import java.util.AbstractMap;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
|
@ -26,7 +26,6 @@ import org.apache.tika.sax.BodyContentHandler;
|
|||
import org.apache.tika.sax.LinkContentHandler;
|
||||
import org.apache.tika.sax.TeeContentHandler;
|
||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||
import org.apache.tools.ant.FileScanner;
|
||||
import org.apache.tika.parser.html.HtmlParser;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
@ -46,7 +45,7 @@ public class FileData {
|
|||
private TeeContentHandler teeHandler;
|
||||
private ToHTMLContentHandler toHTMLhandler;
|
||||
private HashMap<String, Integer> wordFrequency;
|
||||
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
|
||||
private ArrayList<AbstractMap.SimpleEntry<String, Integer>> sortedWordFrequency;
|
||||
private Tika tika;
|
||||
|
||||
FileData() {
|
||||
|
@ -63,7 +62,7 @@ public class FileData {
|
|||
linkHandler = new LinkContentHandler();
|
||||
toHTMLhandler = new ToHTMLContentHandler();
|
||||
wordFrequency = new HashMap<>();
|
||||
orderedFrequencies = new ArrayList<>();
|
||||
sortedWordFrequency = new ArrayList<>();
|
||||
tika = new Tika();
|
||||
setMetadata();
|
||||
}
|
||||
|
@ -87,42 +86,38 @@ public class FileData {
|
|||
|
||||
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
|
||||
String fileContent = tika.parseToString(file);
|
||||
fileContent.toLowerCase();
|
||||
Scanner fileScanner = new Scanner(fileContent);
|
||||
Integer defaultValue = 0;
|
||||
Integer ocurrence;
|
||||
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
|
||||
fileScanner.useDelimiter(delimiters);
|
||||
fileContent = fileContent.toLowerCase();
|
||||
fileContent = fileContent.replaceAll("[\\(\\)¡!¿?→,.:;\\-—\"«»“”]", "");
|
||||
|
||||
while (fileScanner.hasNext()) {
|
||||
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue);
|
||||
if (ocurrence == defaultValue) {
|
||||
wordFrequency.put(fileScanner.next(), defaultValue + 1);
|
||||
} else {
|
||||
wordFrequency.put(fileScanner.next(), ocurrence += 1);
|
||||
}
|
||||
for (String token : fileContent.split("\\s+")) {
|
||||
wordFrequency.compute(token, (key, val) -> (val == null) ? 1 : val + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private void sortWords() {
|
||||
for (Entry<String, Integer> item : wordFrequency.entrySet()) {
|
||||
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
|
||||
Set<Map.Entry<String, Integer>> entrySet = wordFrequency.entrySet();
|
||||
|
||||
for (Map.Entry<String, Integer> item : entrySet) {
|
||||
sortedWordFrequency.add(new AbstractMap.SimpleEntry<String, Integer>(item.getKey(), item.getValue()));
|
||||
}
|
||||
|
||||
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() {
|
||||
Collections.sort(sortedWordFrequency, new Comparator<AbstractMap.SimpleEntry<String, Integer>>() {
|
||||
@Override
|
||||
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) {
|
||||
public int compare(AbstractMap.SimpleEntry<String, Integer> m1,
|
||||
AbstractMap.SimpleEntry<String, Integer> m2) {
|
||||
return (m1.getValue()).compareTo(m2.getValue());
|
||||
}
|
||||
});
|
||||
|
||||
Collections.reverse(sortedWordFrequency);
|
||||
}
|
||||
|
||||
private void saveFrequency() throws FileNotFoundException {
|
||||
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
|
||||
private void saveWordFrequency() throws FileNotFoundException {
|
||||
FileOutputStream outputFile = new FileOutputStream("output/" + filename + ".dat");
|
||||
PrintStream output = new PrintStream(outputFile);
|
||||
System.setOut(output);
|
||||
|
||||
for (Entry<Integer, String> item : orderedFrequencies) {
|
||||
for (Map.Entry<String, Integer> item : sortedWordFrequency) {
|
||||
System.out.println(item.getValue() + " " + item.getKey());
|
||||
}
|
||||
|
||||
|
@ -131,7 +126,7 @@ public class FileData {
|
|||
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
|
||||
tokenizeFile();
|
||||
sortWords();
|
||||
saveFrequency();
|
||||
saveWordFrequency();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue