From fba0ac2473b09033eb448e9d6f12e37ba513718d Mon Sep 17 00:00:00 2001 From: coolneng Date: Wed, 28 Oct 2020 13:32:23 +0100 Subject: [PATCH] Fix word frequency reader --- src/main/java/org/RI/P1/FileData.java | 49 ++++++++++++--------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/RI/P1/FileData.java b/src/main/java/org/RI/P1/FileData.java index b38f761..de9c07d 100644 --- a/src/main/java/org/RI/P1/FileData.java +++ b/src/main/java/org/RI/P1/FileData.java @@ -8,12 +8,12 @@ import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.util.HashMap; -import java.util.Scanner; -import java.util.ArrayList; import java.util.Collections; import java.util.Map; -import java.util.Map.Entry; import java.util.Comparator; +import java.util.Set; +import java.util.AbstractMap; +import java.util.ArrayList; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; @@ -26,7 +26,6 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.ToHTMLContentHandler; -import org.apache.tools.ant.FileScanner; import org.apache.tika.parser.html.HtmlParser; import org.xml.sax.SAXException; @@ -46,7 +45,7 @@ public class FileData { private TeeContentHandler teeHandler; private ToHTMLContentHandler toHTMLhandler; private HashMap wordFrequency; - private ArrayList> orderedFrequencies; + private ArrayList> sortedWordFrequency; private Tika tika; FileData() { @@ -63,7 +62,7 @@ public class FileData { linkHandler = new LinkContentHandler(); toHTMLhandler = new ToHTMLContentHandler(); wordFrequency = new HashMap<>(); - orderedFrequencies = new ArrayList<>(); + sortedWordFrequency = new ArrayList<>(); tika = new Tika(); setMetadata(); } @@ -87,42 +86,38 @@ public class FileData { private void tokenizeFile() throws FileNotFoundException, IOException, TikaException { String fileContent = tika.parseToString(file); - fileContent.toLowerCase(); - Scanner fileScanner = new Scanner(fileContent); - Integer defaultValue = 0; - Integer ocurrence; - String delimiters = "\\s+|[\\.\\;\\:\\,]\\s+|[()¿?¡!]"; - fileScanner.useDelimiter(delimiters); + fileContent = fileContent.toLowerCase(); + fileContent = fileContent.replaceAll("[\\(\\)¡!¿?→,.:;\\-—\"«»“”]", ""); - while (fileScanner.hasNext()) { - ocurrence = wordFrequency.getOrDefault(fileScanner, defaultValue); - if (ocurrence == defaultValue) { - wordFrequency.put(fileScanner.next(), defaultValue + 1); - } else { - wordFrequency.put(fileScanner.next(), ocurrence += 1); - } + for (String token : fileContent.split("\\s+")) { + wordFrequency.compute(token, (key, val) -> (val == null) ? 1 : val + 1); } } private void sortWords() { - for (Entry item : wordFrequency.entrySet()) { - orderedFrequencies.add(Map.entry(item.getValue(), item.getKey())); + Set> entrySet = wordFrequency.entrySet(); + + for (Map.Entry item : entrySet) { + sortedWordFrequency.add(new AbstractMap.SimpleEntry(item.getKey(), item.getValue())); } - Collections.sort(orderedFrequencies, new Comparator>() { + Collections.sort(sortedWordFrequency, new Comparator>() { @Override - public int compare(Map.Entry m1, Map.Entry m2) { + public int compare(AbstractMap.SimpleEntry m1, + AbstractMap.SimpleEntry m2) { return (m1.getValue()).compareTo(m2.getValue()); } }); + + Collections.reverse(sortedWordFrequency); } - private void saveFrequency() throws FileNotFoundException { - FileOutputStream outputFile = new FileOutputStream("output/output_" + filename + ".dat"); + private void saveWordFrequency() throws FileNotFoundException { + FileOutputStream outputFile = new FileOutputStream("output/" + filename + ".dat"); PrintStream output = new PrintStream(outputFile); System.setOut(output); - for (Entry item : orderedFrequencies) { + for (Map.Entry item : sortedWordFrequency) { System.out.println(item.getValue() + " " + item.getKey()); } @@ -131,7 +126,7 @@ public class FileData { void setWordOcurrences() throws FileNotFoundException, IOException, TikaException { tokenizeFile(); sortWords(); - saveFrequency(); + saveWordFrequency(); } @Override