From 0a54f7403b3f4d1c78483cb2b2162ac2312d5491 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Wed, 9 Dec 2020 22:49:18 +0100
Subject: [PATCH] Add incomplete processing module

---
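Notes (kept out of the commit message): an example run of the module once the
patch is applied, assuming the script is launched from a directory where
data/accidentes_2013.csv and docs/assets/ resolve and preprocessing.py is
importable; the sample size of 500 is only an illustrative value:

    python processing.py fill case1 500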
["kmeans", "birch", "affinity", "meanshift", "dbscan"] + models = [ + "KMean(random_state=42)", + "AffinityPropagation(random_state=42)", + "MeanShift()", + "DBSCAN()", + ] + mapping = dict(zip(models, short_name)) + return mapping[model] + + +def construct_case(df, choice): + cases = { + "case1": df.loc[(df["LUMINOSIDAD"].str.contains("NOCHE"))], + "case2": df.loc[ + (df["ISLA"].str.contains("NO_ES_ISLA") == False) + & (df["FACTORES_ATMOSFERICOS"].str.contains("LLUVIA|LLOVIZNA")) + ], + "case3": df.loc[(df["HORA"] > 19) & (df["TIPO_VIA"] == "AUTOPISTA")], + "case4": df.loc[ + (df["COMUNIDAD_AUTONOMA"] == "Andalucía") + & (df["LUMINOSIDAD"].str.contains("SIN ILUMINACIÓN")) + ], + "case5": df.loc[ + (df["DIASEMANA"] == 7) + & (df["COMUNIDAD_AUTONOMA"] == "Madrid, Comunidad de") + ], + } + return cases[choice] + + +def usage(): + print("Usage: " + argv[0] + " ") + print("preprocessing actions:") + print("fill: fills the na values with the mean") + print("drop: drops the na values") + print("cases: choice of case study") + print("sample size: size of the sample when computing the Silhouette Coefficient") + exit() + + +def main(): + models = ["kmeans", "birch", "affinity", "meanshift", "dbscan"] + if len(argv) != 4: + usage() + case, sample = argv[2], argv[3] + data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1])) + individual_result, complete_results = create_result_dataframes() + case_data = construct_case(df=data, choice=case) + for model in models: + model_results = predict_data( + data=case_data, + model=model, + results=individual_result, + sample=sample, + ) + complete_results = complete_results.append( + individual_result.append(model_results) + ) + indexed_results = complete_results.set_index("model") + show_results(results=indexed_results) + + +if __name__ == "__main__": + main()