Complete missing values in columns with less data

This commit is contained in:
coolneng 2021-01-01 23:52:43 +01:00
parent 9a8944cb5d
commit 3b08cb73f5
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 10 additions and 2 deletions

View File

@ -19,9 +19,17 @@ def rename_columns(df_list) -> DataFrame:
return df_list return df_list
def process_null_values(df_list):
    """Fill nulls in selected columns, drop unused columns, then drop null rows.

    For every frame in ``df_list`` (each one is modified in place):

    * nulls in ``asientos`` are replaced by the column median;
    * nulls in ``motor_cc`` and ``potencia`` are replaced by the mean of the
      first run of digits found in each non-null entry;
    * the columns ``tipo_marchas``, ``descuento`` and ``ciudad`` are removed;
    * any row that still contains a null is removed.

    Args:
        df_list: iterable of DataFrames containing the columns named above.

    Returns:
        The same ``df_list`` object, with its frames updated.
    """
    # Imported locally so the block does not depend on how the module
    # happens to import pandas at file level.
    from pandas import to_numeric

    drop_columns = ["tipo_marchas", "descuento", "ciudad"]
    fill_columns = ["asientos", "motor_cc", "potencia"]
    for df in df_list:
        for column in fill_columns:
            if column == "asientos":
                fill_value = df[column].median()
            else:
                # NOTE(review): presumably these columns hold text such as
                # "<number> <unit>" -- confirm against the dataset.
                # expand=False yields a Series (not a one-column DataFrame),
                # and to_numeric turns the digit strings into real numbers so
                # that .mean() returns a scalar arithmetic mean.
                digits = df[column].astype(str).str.extract(r"(\d+)", expand=False)
                fill_value = to_numeric(digits, errors="coerce").mean()
            # Assign back instead of calling fillna(inplace=True) on the
            # selected column, which is not guaranteed to write through to df.
            df[column] = df[column].fillna(fill_value)
        df.drop(columns=drop_columns, inplace=True)
        df.dropna(inplace=True)
    return df_list
@ -80,7 +88,7 @@ def split_k_sets(df):
def parse_data(train, test): def parse_data(train, test):
df_list = construct_dataframes(train=train, test=test) df_list = construct_dataframes(train=train, test=test)
renamed_df_list = rename_columns(df_list) renamed_df_list = rename_columns(df_list)
processed_df_list = drop_null_values(renamed_df_list) processed_df_list = process_null_values(renamed_df_list)
encoded_df_list = encode_columns(processed_df_list) encoded_df_list = encode_columns(processed_df_list)
train_data, train_target = balance_training_data(encoded_df_list[0]) train_data, train_target = balance_training_data(encoded_df_list[0])
test_data, test_ids = split_data_target(encoded_df_list[1], dataset="test") test_data, test_ids = split_data_target(encoded_df_list[1], dataset="test")