Remove type hints and the rename_model and choose_numerical_values functions

2020-12-11 13:08:37 +01:00 · 2020-12-11 13:08:37 +01:00 · e15685d575
parent a3798a781f
commit e15685d575
3 changed files with 20 additions and 63 deletions
--- a/src/P1/processing.py
+++ b/src/P1/processing.py
@@ -29,8 +29,9 @@ def choose_model(model):


 def predict_data(data, target, model, results):
-    model = choose_model(model)
-    if model == "knn":
+    model_name = model
+    model = choose_model(model=model)
+    if model_name == "knn":
        data = scale(data)
    confusion_matrices, auc, fpr, tpr = [], [], [], []
    for train_index, test_index in split_k_sets(data):
@@ -43,7 +44,7 @@ def predict_data(data, target, model, results):
        tpr.append(tpr_item)
    populated_results = populate_results(
        df=results,
-        model=model,
+        model=model_name,
        fpr=mean(fpr, axis=0),
        tpr=mean(tpr, axis=0),
        auc=mean(auc),
@@ -122,27 +123,13 @@ def create_result_dataframes():


 def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
-    renamed_model = rename_model(model=f"{model}")
    columns = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
-    values = [renamed_model, fpr, tpr, auc, confusion_matrix]
+    values = [model, fpr, tpr, auc, confusion_matrix]
    dictionary = dict(zip(columns, values))
    populated_df = df.append(dictionary, ignore_index=True)
    return populated_df


-def rename_model(model):
-    short_name = ["gnb", "svc", "knn", "tree", "neuralnet"]
-    models = [
-        "GaussianNB()",
-        "LinearSVC(random_state=42)",
-        "KNeighborsClassifier(n_neighbors=10)",
-        "DecisionTreeClassifier(random_state=42)",
-        "MLPClassifier(hidden_layer_sizes=10)",
-    ]
-    mapping = dict(zip(models, short_name))
-    return mapping[model]
-
-
 def transform_dataframe(data, target):
    joined_df = data.join(target)
    binned_df = joined_df.copy()
--- a/src/P2/preprocessing.py
+++ b/src/P2/preprocessing.py
@@ -2,13 +2,13 @@ from pandas import DataFrame, read_csv
 from sklearn.preprocessing import normalize


-def replace_values(df) -> DataFrame:
+def replace_values(df):
    for column in df.columns:
        df[column].fillna(value=df[column].mean(), inplace=True)
    return df


-def process_na(df, action) -> DataFrame:
+def process_na(df, action):
    if action == "drop":
        return df.dropna()
    elif action == "fill":
@@ -20,7 +20,7 @@ def process_na(df, action) -> DataFrame:
        exit()


-def filter_dataframe(df) -> DataFrame:
+def filter_dataframe(df):
    relevant_columns = [
        "HORA",
        "DIASEMANA",
@@ -38,18 +38,7 @@ def filter_dataframe(df) -> DataFrame:
    return filtered_df


-def choose_numerical_values(df):
-    cols = [
-        "TOT_HERIDOS_LEVES",
-        "TOT_HERIDOS_GRAVES",
-        "TOT_VEHICULOS_IMPLICADOS",
-        "TOT_MUERTOS",
-    ]
-    filtered_df = df.filter(items=cols)
-    return filtered_df
-
-
-def normalize_numerical_values(df) -> DataFrame:
+def normalize_numerical_values(df):
    cols = [
        "TOT_HERIDOS_LEVES",
        "TOT_HERIDOS_GRAVES",
@@ -63,7 +52,7 @@ def normalize_numerical_values(df) -> DataFrame:
    return df


-def parse_data(source, action) -> DataFrame:
+def parse_data(source, action):
    df = read_csv(filepath_or_buffer=source, na_values="?")
    processed_df = process_na(df=df, action=action)
    filtered_df = filter_dataframe(df=processed_df)
--- a/src/P2/processing.py
+++ b/src/P2/processing.py
@@ -1,5 +1,4 @@
 import time
-from typing import Union
 from sys import argv

 from matplotlib.pyplot import *
@@ -8,12 +7,10 @@ from seaborn import heatmap, set_style, set_theme, pairplot
 from sklearn.metrics import silhouette_score, calinski_harabasz_score
 from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN

-from preprocessing import parse_data, choose_numerical_values
+from preprocessing import parse_data


-def choose_model(
-    model,
-) -> Union[KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN, None]:
+def choose_model(model):
    if model == "kmeans":
        return KMeans(random_state=42)
    elif model == "birch":
@@ -26,22 +23,22 @@ def choose_model(
        return DBSCAN()


-def predict_data(data, model, results, sample) -> DataFrame:
+def predict_data(data, model, results, sample):
+    model_name = model
    model = choose_model(model)
    start_time = time.time()
-    numerical_data = choose_numerical_values(df=data)
-    prediction = model.fit_predict(numerical_data)
+    prediction = model.fit_predict(data)
    execution_time = time.time() - start_time
-    calinski = calinski_harabasz_score(X=numerical_data, labels=prediction)
+    calinski = calinski_harabasz_score(X=data, labels=prediction)
    silhouette = silhouette_score(
-        X=numerical_data,
+        X=data,
        labels=prediction,
        metric="euclidean",
        sample_size=sample,
    )
    populated_results = populate_results(
        df=results,
-        model=model,
+        model=model_name,
        prediction=prediction,
        clusters=len(prediction),
        calinski=calinski,
@@ -110,10 +107,7 @@ def create_result_dataframes():
    return indexed_results, indexed_results


-def populate_results(
-    df, model, clusters, prediction, calinski, silhouette, time
-) -> DataFrame:
-    renamed_model = rename_model(model=f"{model}")
+def populate_results(df, model, clusters, prediction, calinski, silhouette, time):
    columns = [
        "model",
        "clusters",
@@ -122,25 +116,12 @@ def populate_results(
        "calinski-harabasz",
        "time",
    ]
-    values = [renamed_model, clusters, prediction, silhouette, calinski, time]
+    values = [model, clusters, prediction, silhouette, calinski, time]
    dictionary = dict(zip(columns, values))
    populated_df = df.append(dictionary, ignore_index=True)
    return populated_df


-def rename_model(model) -> str:
-    short_name = ["kmeans", "birch", "affinity", "meanshift", "dbscan"]
-    models = [
-        "KMeans(random_state=42)",
-        "Birch()",
-        "AffinityPropagation(random_state=42)",
-        "MeanShift()",
-        "DBSCAN()",
-    ]
-    mapping = dict(zip(models, short_name))
-    return mapping[model]
-
-
 def construct_case(df, choice):
    cases = {
        "case1": df.loc[(df["LUMINOSIDAD"].str.contains("NOCHE"))],