Remove type hints and the rename_model function

coolneng 2020-12-11 13:08:37 +01:00
parent a3798a781f
commit e15685d575
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
3 changed files with 20 additions and 63 deletions
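The change follows the same pattern in both scripts: the caller-supplied short name ("knn", "kmeans", ...) is kept in a model_name variable for reporting, and model is overwritten with the estimator returned by choose_model, so the reverse lookup that rename_model performed on the estimator's string representation is no longer needed. A minimal sketch of that pattern, with GaussianNB standing in for whichever scikit-learn estimator choose_model would return and a stripped-down choose_model as a stand-in for the project's function:

from sklearn.naive_bayes import GaussianNB

def choose_model(model):
    # stand-in for the project's choose_model
    if model == "gnb":
        return GaussianNB()

model_name = "gnb"                      # short name survives for the results table
model = choose_model(model=model_name)  # becomes the estimator object
print(model_name, model)                # gnb GaussianNB()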

View File

@@ -29,8 +29,9 @@ def choose_model(model):
 def predict_data(data, target, model, results):
-    model = choose_model(model)
-    if model == "knn":
+    model_name = model
+    model = choose_model(model=model)
+    if model_name == "knn":
         data = scale(data)
     confusion_matrices, auc, fpr, tpr = [], [], [], []
     for train_index, test_index in split_k_sets(data):
@@ -43,7 +44,7 @@ def predict_data(data, target, model, results):
         tpr.append(tpr_item)
     populated_results = populate_results(
         df=results,
-        model=model,
+        model=model_name,
         fpr=mean(fpr, axis=0),
         tpr=mean(tpr, axis=0),
         auc=mean(auc),
@@ -122,27 +123,13 @@ def create_result_dataframes():
 def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
-    renamed_model = rename_model(model=f"{model}")
     columns = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
-    values = [renamed_model, fpr, tpr, auc, confusion_matrix]
+    values = [model, fpr, tpr, auc, confusion_matrix]
     dictionary = dict(zip(columns, values))
     populated_df = df.append(dictionary, ignore_index=True)
     return populated_df

-def rename_model(model):
-    short_name = ["gnb", "svc", "knn", "tree", "neuralnet"]
-    models = [
-        "GaussianNB()",
-        "LinearSVC(random_state=42)",
-        "KNeighborsClassifier(n_neighbors=10)",
-        "DecisionTreeClassifier(random_state=42)",
-        "MLPClassifier(hidden_layer_sizes=10)",
-    ]
-    mapping = dict(zip(models, short_name))
-    return mapping[model]

 def transform_dataframe(data, target):
     joined_df = data.join(target)
     binned_df = joined_df.copy()
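For context, the k-fold evaluation that surrounds this populate_results call can be sketched in a self-contained way; KFold, GaussianNB and load_breast_cancer below are stand-ins for the project's split_k_sets helper, its choose_model result and its dataset, so treat this as an illustration of the pattern rather than the project's code:

from numpy import mean
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

data, target = load_breast_cancer(return_X_y=True)
model_name = "gnb"                    # short name reported in the results table
model = GaussianNB()                  # stand-in for choose_model(model="gnb")
auc = []
for train_index, test_index in KFold(n_splits=10).split(data):
    model.fit(data[train_index], target[train_index])
    scores = model.predict_proba(data[test_index])[:, 1]
    auc.append(roc_auc_score(target[test_index], scores))
print({"model": model_name, "auc": mean(auc)})    # mean over folds, as in predict_data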

View File

@@ -2,13 +2,13 @@ from pandas import DataFrame, read_csv
 from sklearn.preprocessing import normalize

-def replace_values(df) -> DataFrame:
+def replace_values(df):
     for column in df.columns:
         df[column].fillna(value=df[column].mean(), inplace=True)
     return df

-def process_na(df, action) -> DataFrame:
+def process_na(df, action):
     if action == "drop":
         return df.dropna()
     elif action == "fill":
@@ -20,7 +20,7 @@ def process_na(df, action) -> DataFrame:
         exit()

-def filter_dataframe(df) -> DataFrame:
+def filter_dataframe(df):
     relevant_columns = [
         "HORA",
         "DIASEMANA",
@@ -38,18 +38,7 @@ def filter_dataframe(df) -> DataFrame:
     return filtered_df

-def choose_numerical_values(df):
-    cols = [
-        "TOT_HERIDOS_LEVES",
-        "TOT_HERIDOS_GRAVES",
-        "TOT_VEHICULOS_IMPLICADOS",
-        "TOT_MUERTOS",
-    ]
-    filtered_df = df.filter(items=cols)
-    return filtered_df
-
-def normalize_numerical_values(df) -> DataFrame:
+def normalize_numerical_values(df):
     cols = [
         "TOT_HERIDOS_LEVES",
         "TOT_HERIDOS_GRAVES",
@@ -63,7 +52,7 @@ def normalize_numerical_values(df) -> DataFrame:
     return df

-def parse_data(source, action) -> DataFrame:
+def parse_data(source, action):
     df = read_csv(filepath_or_buffer=source, na_values="?")
     processed_df = process_na(df=df, action=action)
     filtered_df = filter_dataframe(df=processed_df)
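After this commit the preprocessing pipeline reads the CSV with "?" as the missing-value marker, drops or imputes NaNs, keeps the relevant columns and normalizes the numeric totals. A self-contained sketch of those steps on an invented two-row CSV (the inline data and the axis used for normalize are assumptions for illustration only):

from io import StringIO
from pandas import read_csv
from sklearn.preprocessing import normalize

csv = StringIO("HORA,TOT_MUERTOS,TOT_HERIDOS_GRAVES\n10,?,2\n22,1,0\n")
df = read_csv(filepath_or_buffer=csv, na_values="?")   # "?" becomes NaN
df = df.fillna(value=df.mean())                        # column-mean imputation, as replace_values does
cols = ["TOT_MUERTOS", "TOT_HERIDOS_GRAVES"]
df[cols] = normalize(X=df[cols])                       # unit-norm scaling; the axis choice is assumed
print(df)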

View File

@@ -1,5 +1,4 @@
 import time
-from typing import Union
 from sys import argv

 from matplotlib.pyplot import *
@@ -8,12 +7,10 @@ from seaborn import heatmap, set_style, set_theme, pairplot
 from sklearn.metrics import silhouette_score, calinski_harabasz_score
 from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN

-from preprocessing import parse_data, choose_numerical_values
+from preprocessing import parse_data

-def choose_model(
-    model,
-) -> Union[KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN, None]:
+def choose_model(model):
     if model == "kmeans":
         return KMeans(random_state=42)
     elif model == "birch":
@@ -26,22 +23,22 @@ def choose_model(
         return DBSCAN()

-def predict_data(data, model, results, sample) -> DataFrame:
+def predict_data(data, model, results, sample):
+    model_name = model
     model = choose_model(model)
     start_time = time.time()
-    numerical_data = choose_numerical_values(df=data)
-    prediction = model.fit_predict(numerical_data)
+    prediction = model.fit_predict(data)
     execution_time = time.time() - start_time
-    calinski = calinski_harabasz_score(X=numerical_data, labels=prediction)
+    calinski = calinski_harabasz_score(X=data, labels=prediction)
     silhouette = silhouette_score(
-        X=numerical_data,
+        X=data,
         labels=prediction,
         metric="euclidean",
         sample_size=sample,
     )
     populated_results = populate_results(
         df=results,
-        model=model,
+        model=model_name,
         prediction=prediction,
         clusters=len(prediction),
         calinski=calinski,
@@ -110,10 +107,7 @@ def create_result_dataframes():
     return indexed_results, indexed_results

-def populate_results(
-    df, model, clusters, prediction, calinski, silhouette, time
-) -> DataFrame:
-    renamed_model = rename_model(model=f"{model}")
+def populate_results(df, model, clusters, prediction, calinski, silhouette, time):
     columns = [
         "model",
         "clusters",
@@ -122,25 +116,12 @@ def populate_results(
         "calinski-harabasz",
         "time",
     ]
-    values = [renamed_model, clusters, prediction, silhouette, calinski, time]
+    values = [model, clusters, prediction, silhouette, calinski, time]
     dictionary = dict(zip(columns, values))
     populated_df = df.append(dictionary, ignore_index=True)
     return populated_df

-def rename_model(model) -> str:
-    short_name = ["kmeans", "birch", "affinity", "meanshift", "dbscan"]
-    models = [
-        "KMeans(random_state=42)",
-        "Birch()",
-        "AffinityPropagation(random_state=42)",
-        "MeanShift()",
-        "DBSCAN()",
-    ]
-    mapping = dict(zip(models, short_name))
-    return mapping[model]

 def construct_case(df, choice):
     cases = {
         "case1": df.loc[(df["LUMINOSIDAD"].str.contains("NOCHE"))],