diff --git a/src/P1/processing.py b/src/P1/processing.py index c2c0521..fd41bdf 100644 --- a/src/P1/processing.py +++ b/src/P1/processing.py @@ -29,8 +29,9 @@ def choose_model(model): def predict_data(data, target, model, results): - model = choose_model(model) - if model == "knn": + model_name = model + model = choose_model(model=model) + if model_name == "knn": data = scale(data) confusion_matrices, auc, fpr, tpr = [], [], [], [] for train_index, test_index in split_k_sets(data): @@ -43,7 +44,7 @@ def predict_data(data, target, model, results): tpr.append(tpr_item) populated_results = populate_results( df=results, - model=model, + model=model_name, fpr=mean(fpr, axis=0), tpr=mean(tpr, axis=0), auc=mean(auc), @@ -122,27 +123,13 @@ def create_result_dataframes(): def populate_results(df, model, fpr, tpr, auc, confusion_matrix): - renamed_model = rename_model(model=f"{model}") columns = ["model", "fpr", "tpr", "auc", "confusion_matrix"] - values = [renamed_model, fpr, tpr, auc, confusion_matrix] + values = [model, fpr, tpr, auc, confusion_matrix] dictionary = dict(zip(columns, values)) populated_df = df.append(dictionary, ignore_index=True) return populated_df -def rename_model(model): - short_name = ["gnb", "svc", "knn", "tree", "neuralnet"] - models = [ - "GaussianNB()", - "LinearSVC(random_state=42)", - "KNeighborsClassifier(n_neighbors=10)", - "DecisionTreeClassifier(random_state=42)", - "MLPClassifier(hidden_layer_sizes=10)", - ] - mapping = dict(zip(models, short_name)) - return mapping[model] - - def transform_dataframe(data, target): joined_df = data.join(target) binned_df = joined_df.copy() diff --git a/src/P2/preprocessing.py b/src/P2/preprocessing.py index c780168..e7ec4e1 100644 --- a/src/P2/preprocessing.py +++ b/src/P2/preprocessing.py @@ -2,13 +2,13 @@ from pandas import DataFrame, read_csv from sklearn.preprocessing import normalize -def replace_values(df) -> DataFrame: +def replace_values(df): for column in df.columns: df[column].fillna(value=df[column].mean(), inplace=True) return df -def process_na(df, action) -> DataFrame: +def process_na(df, action): if action == "drop": return df.dropna() elif action == "fill": @@ -20,7 +20,7 @@ def process_na(df, action) -> DataFrame: exit() -def filter_dataframe(df) -> DataFrame: +def filter_dataframe(df): relevant_columns = [ "HORA", "DIASEMANA", @@ -38,18 +38,7 @@ def filter_dataframe(df) -> DataFrame: return filtered_df -def choose_numerical_values(df): - cols = [ - "TOT_HERIDOS_LEVES", - "TOT_HERIDOS_GRAVES", - "TOT_VEHICULOS_IMPLICADOS", - "TOT_MUERTOS", - ] - filtered_df = df.filter(items=cols) - return filtered_df - - -def normalize_numerical_values(df) -> DataFrame: +def normalize_numerical_values(df): cols = [ "TOT_HERIDOS_LEVES", "TOT_HERIDOS_GRAVES", @@ -63,7 +52,7 @@ def normalize_numerical_values(df) -> DataFrame: return df -def parse_data(source, action) -> DataFrame: +def parse_data(source, action): df = read_csv(filepath_or_buffer=source, na_values="?") processed_df = process_na(df=df, action=action) filtered_df = filter_dataframe(df=processed_df) diff --git a/src/P2/processing.py b/src/P2/processing.py index 1b91a52..2364651 100644 --- a/src/P2/processing.py +++ b/src/P2/processing.py @@ -1,5 +1,4 @@ import time -from typing import Union from sys import argv from matplotlib.pyplot import * @@ -8,12 +7,10 @@ from seaborn import heatmap, set_style, set_theme, pairplot from sklearn.metrics import silhouette_score, calinski_harabasz_score from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN -from preprocessing import parse_data, choose_numerical_values +from preprocessing import parse_data -def choose_model( - model, -) -> Union[KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN, None]: +def choose_model(model): if model == "kmeans": return KMeans(random_state=42) elif model == "birch": @@ -26,22 +23,22 @@ def choose_model( return DBSCAN() -def predict_data(data, model, results, sample) -> DataFrame: +def predict_data(data, model, results, sample): + model_name = model model = choose_model(model) start_time = time.time() - numerical_data = choose_numerical_values(df=data) - prediction = model.fit_predict(numerical_data) + prediction = model.fit_predict(data) execution_time = time.time() - start_time - calinski = calinski_harabasz_score(X=numerical_data, labels=prediction) + calinski = calinski_harabasz_score(X=data, labels=prediction) silhouette = silhouette_score( - X=numerical_data, + X=data, labels=prediction, metric="euclidean", sample_size=sample, ) populated_results = populate_results( df=results, - model=model, + model=model_name, prediction=prediction, clusters=len(prediction), calinski=calinski, @@ -110,10 +107,7 @@ def create_result_dataframes(): return indexed_results, indexed_results -def populate_results( - df, model, clusters, prediction, calinski, silhouette, time -) -> DataFrame: - renamed_model = rename_model(model=f"{model}") +def populate_results(df, model, clusters, prediction, calinski, silhouette, time): columns = [ "model", "clusters", @@ -122,25 +116,12 @@ def populate_results( "calinski-harabasz", "time", ] - values = [renamed_model, clusters, prediction, silhouette, calinski, time] + values = [model, clusters, prediction, silhouette, calinski, time] dictionary = dict(zip(columns, values)) populated_df = df.append(dictionary, ignore_index=True) return populated_df -def rename_model(model) -> str: - short_name = ["kmeans", "birch", "affinity", "meanshift", "dbscan"] - models = [ - "KMeans(random_state=42)", - "Birch()", - "AffinityPropagation(random_state=42)", - "MeanShift()", - "DBSCAN()", - ] - mapping = dict(zip(models, short_name)) - return mapping[model] - - def construct_case(df, choice): cases = { "case1": df.loc[(df["LUMINOSIDAD"].str.contains("NOCHE"))],