From 0a54f7403b3f4d1c78483cb2b2162ac2312d5491 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Wed, 9 Dec 2020 22:49:18 +0100
Subject: [PATCH] Add incomplete processing module

---
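Notes (kept out of the commit message): an example run of the module once the
patch is applied, assuming the script is launched from a directory where
data/accidentes_2013.csv and docs/assets/ resolve and preprocessing.py is
importable; the sample size of 500 is only an illustrative value:

    python processing.py fill case1 500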
["kmeans", "birch", "affinity", "meanshift", "dbscan"] + models = [ + "KMean(random_state=42)", + "AffinityPropagation(random_state=42)", + "MeanShift()", + "DBSCAN()", + ] + mapping = dict(zip(models, short_name)) + return mapping[model] + + +def construct_case(df, choice): + cases = { + "case1": df.loc[(df["LUMINOSIDAD"].str.contains("NOCHE"))], + "case2": df.loc[ + (df["ISLA"].str.contains("NO_ES_ISLA") == False) + & (df["FACTORES_ATMOSFERICOS"].str.contains("LLUVIA|LLOVIZNA")) + ], + "case3": df.loc[(df["HORA"] > 19) & (df["TIPO_VIA"] == "AUTOPISTA")], + "case4": df.loc[ + (df["COMUNIDAD_AUTONOMA"] == "Andalucía") + & (df["LUMINOSIDAD"].str.contains("SIN ILUMINACIÓN")) + ], + "case5": df.loc[ + (df["DIASEMANA"] == 7) + & (df["COMUNIDAD_AUTONOMA"] == "Madrid, Comunidad de") + ], + } + return cases[choice] + + +def usage(): + print("Usage: " + argv[0] + " ") + print("preprocessing actions:") + print("fill: fills the na values with the mean") + print("drop: drops the na values") + print("cases: choice of case study") + print("sample size: size of the sample when computing the Silhouette Coefficient") + exit() + + +def main(): + models = ["kmeans", "birch", "affinity", "meanshift", "dbscan"] + if len(argv) != 4: + usage() + case, sample = argv[2], argv[3] + data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1])) + individual_result, complete_results = create_result_dataframes() + case_data = construct_case(df=data, choice=case) + for model in models: + model_results = predict_data( + data=case_data, + model=model, + results=individual_result, + sample=sample, + ) + complete_results = complete_results.append( + individual_result.append(model_results) + ) + indexed_results = complete_results.set_index("model") + show_results(results=indexed_results) + + +if __name__ == "__main__": + main()