Fix word frequency reader
This commit is contained in:
parent
5db622e5c1
commit
fba0ac2473
|
@ -8,12 +8,12 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Scanner;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.AbstractMap;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
import org.apache.tika.Tika;
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
|
@ -26,7 +26,6 @@ import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.apache.tika.sax.LinkContentHandler;
|
import org.apache.tika.sax.LinkContentHandler;
|
||||||
import org.apache.tika.sax.TeeContentHandler;
|
import org.apache.tika.sax.TeeContentHandler;
|
||||||
import org.apache.tika.sax.ToHTMLContentHandler;
|
import org.apache.tika.sax.ToHTMLContentHandler;
|
||||||
import org.apache.tools.ant.FileScanner;
|
|
||||||
import org.apache.tika.parser.html.HtmlParser;
|
import org.apache.tika.parser.html.HtmlParser;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
@ -46,7 +45,7 @@ public class FileData {
|
||||||
private TeeContentHandler teeHandler;
|
private TeeContentHandler teeHandler;
|
||||||
private ToHTMLContentHandler toHTMLhandler;
|
private ToHTMLContentHandler toHTMLhandler;
|
||||||
private HashMap<String, Integer> wordFrequency;
|
private HashMap<String, Integer> wordFrequency;
|
||||||
private ArrayList<Map.Entry<Integer, String>> orderedFrequencies;
|
private ArrayList<AbstractMap.SimpleEntry<String, Integer>> sortedWordFrequency;
|
||||||
private Tika tika;
|
private Tika tika;
|
||||||
|
|
||||||
FileData() {
|
FileData() {
|
||||||
|
@ -63,7 +62,7 @@ public class FileData {
|
||||||
linkHandler = new LinkContentHandler();
|
linkHandler = new LinkContentHandler();
|
||||||
toHTMLhandler = new ToHTMLContentHandler();
|
toHTMLhandler = new ToHTMLContentHandler();
|
||||||
wordFrequency = new HashMap<>();
|
wordFrequency = new HashMap<>();
|
||||||
orderedFrequencies = new ArrayList<>();
|
sortedWordFrequency = new ArrayList<>();
|
||||||
tika = new Tika();
|
tika = new Tika();
|
||||||
setMetadata();
|
setMetadata();
|
||||||
}
|
}
|
||||||
|
@ -87,42 +86,38 @@ public class FileData {
|
||||||
|
|
||||||
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
|
private void tokenizeFile() throws FileNotFoundException, IOException, TikaException {
|
||||||
String fileContent = tika.parseToString(file);
|
String fileContent = tika.parseToString(file);
|
||||||
fileContent.toLowerCase();
|
fileContent = fileContent.toLowerCase();
|
||||||
Scanner fileScanner = new Scanner(fileContent);
|
fileContent = fileContent.replaceAll("[\\(\\)¡!¿?→,.:;\\-—\"«»“”]", "");
|
||||||
Integer defaultValue = 0;
|
|
||||||
Integer ocurrence;
|
|
||||||
String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]";
|
|
||||||
fileScanner.useDelimiter(delimiters);
|
|
||||||
|
|
||||||
while (fileScanner.hasNext()) {
|
for (String token : fileContent.split("\\s+")) {
|
||||||
ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue);
|
wordFrequency.compute(token, (key, val) -> (val == null) ? 1 : val + 1);
|
||||||
if (ocurrence == defaultValue) {
|
|
||||||
wordFrequency.put(fileScanner.next(), defaultValue + 1);
|
|
||||||
} else {
|
|
||||||
wordFrequency.put(fileScanner.next(), ocurrence += 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void sortWords() {
|
private void sortWords() {
|
||||||
for (Entry<String, Integer> item : wordFrequency.entrySet()) {
|
Set<Map.Entry<String, Integer>> entrySet = wordFrequency.entrySet();
|
||||||
orderedFrequencies.add(Map.entry(item.getValue(), item.getKey()));
|
|
||||||
|
for (Map.Entry<String, Integer> item : entrySet) {
|
||||||
|
sortedWordFrequency.add(new AbstractMap.SimpleEntry<String, Integer>(item.getKey(), item.getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
Collections.sort(orderedFrequencies, new Comparator<Map.Entry<Integer, String>>() {
|
Collections.sort(sortedWordFrequency, new Comparator<AbstractMap.SimpleEntry<String, Integer>>() {
|
||||||
@Override
|
@Override
|
||||||
public int compare(Map.Entry<Integer, String> m1, Map.Entry<Integer, String> m2) {
|
public int compare(AbstractMap.SimpleEntry<String, Integer> m1,
|
||||||
|
AbstractMap.SimpleEntry<String, Integer> m2) {
|
||||||
return (m1.getValue()).compareTo(m2.getValue());
|
return (m1.getValue()).compareTo(m2.getValue());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Collections.reverse(sortedWordFrequency);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void saveFrequency() throws FileNotFoundException {
|
private void saveWordFrequency() throws FileNotFoundException {
|
||||||
FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat");
|
FileOutputStream outputFile = new FileOutputStream("output/" + filename + ".dat");
|
||||||
PrintStream output = new PrintStream(outputFile);
|
PrintStream output = new PrintStream(outputFile);
|
||||||
System.setOut(output);
|
System.setOut(output);
|
||||||
|
|
||||||
for (Entry<Integer, String> item : orderedFrequencies) {
|
for (Map.Entry<String, Integer> item : sortedWordFrequency) {
|
||||||
System.out.println(item.getValue() + " " + item.getKey());
|
System.out.println(item.getValue() + " " + item.getKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -131,7 +126,7 @@ public class FileData {
|
||||||
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
|
void setWordOcurrences() throws FileNotFoundException, IOException, TikaException {
|
||||||
tokenizeFile();
|
tokenizeFile();
|
||||||
sortWords();
|
sortWords();
|
||||||
saveFrequency();
|
saveWordFrequency();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in New Issue