Subset the dataframe columns after case filtering

This commit is contained in:
coolneng 2020-12-11 13:55:27 +01:00
parent e15685d575
commit 8bcc7fa7bc
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 5 additions and 18 deletions

View File

@ -22,30 +22,17 @@ def process_na(df, action):
def filter_dataframe(df):
    """Return a new dataframe holding only the columns of interest.

    The selection is done with ``DataFrame.filter(items=...)``, which keeps
    the labels that exist in ``df`` and silently ignores the ones that do
    not, so a frame lacking some (or all) of these columns does not raise.
    The input frame is left untouched.

    NOTE(review): this listing was recovered from a diff view whose +/-
    markers were lost; some of the entries below may be lines the commit
    removed. Confirm the final column list against the repository.
    """
    relevant_columns = [
        "HORA",
        "DIASEMANA",
        "COMUNIDAD_AUTONOMA",
        "ISLA",
        "TOT_HERIDOS_LEVES",
        "TOT_HERIDOS_GRAVES",
        "TOT_VEHICULOS_IMPLICADOS",
        "TOT_MUERTOS",
        "TIPO_VIA",
        "LUMINOSIDAD",
        "FACTORES_ATMOSFERICOS",
    ]
    return df.filter(items=relevant_columns)
def normalize_numerical_values(df):
cols = [
"TOT_HERIDOS_LEVES",
"TOT_HERIDOS_GRAVES",
"TOT_VEHICULOS_IMPLICADOS",
"TOT_MUERTOS",
]
filtered_df = df.filter(items=cols)
filtered_df = filter_dataframe(df=df)
normalized_data = normalize(X=filtered_df)
normalized_df = DataFrame(data=normalized_data, columns=filtered_df.columns)
df.update(normalized_df)
@ -55,6 +42,5 @@ def normalize_numerical_values(df):
def parse_data(source, action):
    """Read the CSV at ``source``, handle missing values and normalize it.

    ``source`` is handed to ``read_csv`` as ``filepath_or_buffer``; cells
    containing ``"?"`` are read as missing values. ``action`` is forwarded
    unchanged to ``process_na``. The return value is whatever
    ``normalize_numerical_values`` returns for the NA-processed frame.
    """
    df = read_csv(filepath_or_buffer=source, na_values="?")
    processed_df = process_na(df=df, action=action)
    # Normalize the NA-processed frame directly. No column subsetting is done
    # here, so callers receive every column and can apply filter_dataframe
    # themselves when they need the reduced set.
    normalized_df = normalize_numerical_values(df=processed_df)
    return normalized_df

View File

@ -7,7 +7,7 @@ from seaborn import heatmap, set_style, set_theme, pairplot
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.cluster import KMeans, Birch, AffinityPropagation, MeanShift, DBSCAN
from preprocessing import parse_data
from preprocessing import parse_data, filter_dataframe
def choose_model(model):
@ -160,9 +160,10 @@ def main():
data = parse_data(source="data/accidentes_2013.csv", action=str(argv[1]))
individual_result, complete_results = create_result_dataframes()
case_data = construct_case(df=data, choice=case)
filtered_data = filter_dataframe(df=case_data)
for model in models:
model_results = predict_data(
data=case_data,
data=filtered_data,
model=model,
results=individual_result,
sample=sample,