From 8bcc7fa7bcd1261fafa20200ff4662e75c15a7e4 Mon Sep 17 00:00:00 2001 From: coolneng Date: Fri, 11 Dec 2020 13:55:27 +0100 Subject: [PATCH] Subset the dataframe columns after case filtering --- src/P2/preprocessing.py | 18 ++---------------- src/P2/processing.py | 5 +++-- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/P2/preprocessing.py b/src/P2/preprocessing.py index e7ec4e1..c68f8eb 100644 --- a/src/P2/preprocessing.py +++ b/src/P2/preprocessing.py @@ -22,30 +22,17 @@ def process_na(df, action): def filter_dataframe(df): relevant_columns = [ - "HORA", - "DIASEMANA", - "COMUNIDAD_AUTONOMA", - "ISLA", "TOT_HERIDOS_LEVES", "TOT_HERIDOS_GRAVES", "TOT_VEHICULOS_IMPLICADOS", "TOT_MUERTOS", - "TIPO_VIA", - "LUMINOSIDAD", - "FACTORES_ATMOSFERICOS", ] filtered_df = df.filter(items=relevant_columns) return filtered_df def normalize_numerical_values(df): - cols = [ - "TOT_HERIDOS_LEVES", - "TOT_HERIDOS_GRAVES", - "TOT_VEHICULOS_IMPLICADOS", - "TOT_MUERTOS", - ] - filtered_df = df.filter(items=cols) + filtered_df = filter_dataframe(df=df) normalized_data = normalize(X=filtered_df) normalized_df = DataFrame(data=normalized_data, columns=filtered_df.columns) df.update(normalized_df) @@ -55,6 +42,5 @@ def normalize_numerical_values(df): def parse_data(source, action): df = read_csv(filepath_or_buffer=source, na_values="?") processed_df = process_na(df=df, action=action) - filtered_df = filter_dataframe(df=processed_df) - normalized_df = normalize_numerical_values(df=filtered_df) + normalized_df = normalize_numerical_values(df=processed_df) return normalized_df diff --git a/src/P2/processing.py b/src/P2/processing.py index 2364651..39db65e 100644 --- a/src/P2/processing.py +++ b/src/P2/processing.py @@ -7,7 +7,7 @@ from seaborn import heatmap, set_style, set_theme, pairplot from sklearn.metrics import silhouette_score, calinski_harabasz_score from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN -from preprocessing import parse_data +from preprocessing import parse_data, filter_dataframe def choose_model(model): @@ -160,9 +160,10 @@ def main(): data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1])) individual_result, complete_results = create_result_dataframes() case_data = construct_case(df=data, choice=case) + filtered_data = filter_dataframe(df=case_data) for model in models: model_results = predict_data( - data=case_data, + data=filtered_data, model=model, results=individual_result, sample=sample,