Subset the dataframe columns after case filtering
This commit is contained in:
parent
e15685d575
commit
8bcc7fa7bc
|
@ -22,30 +22,17 @@ def process_na(df, action):
|
||||||
|
|
||||||
def filter_dataframe(df):
|
def filter_dataframe(df):
|
||||||
relevant_columns = [
|
relevant_columns = [
|
||||||
"HORA",
|
|
||||||
"DIASEMANA",
|
|
||||||
"COMUNIDAD_AUTONOMA",
|
|
||||||
"ISLA",
|
|
||||||
"TOT_HERIDOS_LEVES",
|
"TOT_HERIDOS_LEVES",
|
||||||
"TOT_HERIDOS_GRAVES",
|
"TOT_HERIDOS_GRAVES",
|
||||||
"TOT_VEHICULOS_IMPLICADOS",
|
"TOT_VEHICULOS_IMPLICADOS",
|
||||||
"TOT_MUERTOS",
|
"TOT_MUERTOS",
|
||||||
"TIPO_VIA",
|
|
||||||
"LUMINOSIDAD",
|
|
||||||
"FACTORES_ATMOSFERICOS",
|
|
||||||
]
|
]
|
||||||
filtered_df = df.filter(items=relevant_columns)
|
filtered_df = df.filter(items=relevant_columns)
|
||||||
return filtered_df
|
return filtered_df
|
||||||
|
|
||||||
|
|
||||||
def normalize_numerical_values(df):
|
def normalize_numerical_values(df):
|
||||||
cols = [
|
filtered_df = filter_dataframe(df=df)
|
||||||
"TOT_HERIDOS_LEVES",
|
|
||||||
"TOT_HERIDOS_GRAVES",
|
|
||||||
"TOT_VEHICULOS_IMPLICADOS",
|
|
||||||
"TOT_MUERTOS",
|
|
||||||
]
|
|
||||||
filtered_df = df.filter(items=cols)
|
|
||||||
normalized_data = normalize(X=filtered_df)
|
normalized_data = normalize(X=filtered_df)
|
||||||
normalized_df = DataFrame(data=normalized_data, columns=filtered_df.columns)
|
normalized_df = DataFrame(data=normalized_data, columns=filtered_df.columns)
|
||||||
df.update(normalized_df)
|
df.update(normalized_df)
|
||||||
|
@ -55,6 +42,5 @@ def normalize_numerical_values(df):
|
||||||
def parse_data(source, action):
|
def parse_data(source, action):
|
||||||
df = read_csv(filepath_or_buffer=source, na_values="?")
|
df = read_csv(filepath_or_buffer=source, na_values="?")
|
||||||
processed_df = process_na(df=df, action=action)
|
processed_df = process_na(df=df, action=action)
|
||||||
filtered_df = filter_dataframe(df=processed_df)
|
normalized_df = normalize_numerical_values(df=processed_df)
|
||||||
normalized_df = normalize_numerical_values(df=filtered_df)
|
|
||||||
return normalized_df
|
return normalized_df
|
||||||
|
|
|
@ -7,7 +7,7 @@ from seaborn import heatmap, set_style, set_theme, pairplot
|
||||||
from sklearn.metrics import silhouette_score, calinski_harabasz_score
|
from sklearn.metrics import silhouette_score, calinski_harabasz_score
|
||||||
from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN
|
from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN
|
||||||
|
|
||||||
from preprocessing import parse_data
|
from preprocessing import parse_data, filter_dataframe
|
||||||
|
|
||||||
|
|
||||||
def choose_model(model):
|
def choose_model(model):
|
||||||
|
@ -160,9 +160,10 @@ def main():
|
||||||
data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1]))
|
data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1]))
|
||||||
individual_result, complete_results = create_result_dataframes()
|
individual_result, complete_results = create_result_dataframes()
|
||||||
case_data = construct_case(df=data, choice=case)
|
case_data = construct_case(df=data, choice=case)
|
||||||
|
filtered_data = filter_dataframe(df=case_data)
|
||||||
for model in models:
|
for model in models:
|
||||||
model_results = predict_data(
|
model_results = predict_data(
|
||||||
data=case_data,
|
data=filtered_data,
|
||||||
model=model,
|
model=model,
|
||||||
results=individual_result,
|
results=individual_result,
|
||||||
sample=sample,
|
sample=sample,
|
||||||
|
|
Loading…
Reference in New Issue