Translate summary to spanish

2020-11-10 22:28:40 +01:00 · 2020-11-10 22:28:40 +01:00 · bb67366284
parent dfe85ab6ce
commit bb67366284
3 changed files with 158 additions and 71 deletions
--- a/docs/Experiments.org
+++ b/docs/Experiments.org
@ -1,71 +0,0 @@
-* Experiments
-
-We will first try to gather information about our dataset, by evaluating the statistics of our attributes.
-
-#+BEGIN_SRC python
-from pandas import read_csv
-from sklearn.preprocessing import LabelEncoder
-
-
-def replace_values(df):
-    columns = ["BI-RADS", "Margin", "Density", "Age"]
-    for column in columns:
-        df[column].fillna(value=df[column].mean(), inplace=True)
-    return df
-
-
-def process_na(df, action):
-    if action == "drop":
-        return df.dropna()
-    return replace_values(df)
-
-
-def encode_columns(df):
-    encoder = LabelEncoder()
-    encoder.fit(df["Shape"])
-
-
-def parse_data(source, action):
-    df = read_csv(filepath_or_buffer=source, na_values="?")
-    processed_df = process_na(df, action)
-    return processed_df
-#+END_SRC
-
-#+RESULTS:
-
-
-#+BEGIN_SRC python
-df = parse_data("../data/mamografia.csv", "drop")
-print(df.describe())
-#+END_SRC
-
-#+RESULTS:
-:           BI-RADS         Age      Margin     Density
-: count  847.000000  847.000000  847.000000  847.000000
-: mean     4.322314   55.842975    2.833530    2.909091
-: std      0.703762   14.603754    1.564049    0.370292
-: min      0.000000   18.000000    1.000000    1.000000
-: 25%      4.000000   46.000000    1.000000    3.000000
-: 50%      4.000000   57.000000    3.000000    3.000000
-: 75%      5.000000   66.000000    4.000000    3.000000
-: max      6.000000   96.000000    5.000000    4.000000
-
-We observe that *margin* and *density* are the columns with the most unknown values. The age group of our cohort is middle aged, the BI-RADS score is mostly in the suspicious category, the density is mostly low and the margin belongs to the microlobulated/obscured category.
-
-We'll try to impute values, instead of dropping them, when they're invalid.
-
-#+BEGIN_SRC python
-df = parse_data("../data/mamografia.csv", "replace")
-print(df.describe())
-#+END_SRC
-
-#+RESULTS:
-:           BI-RADS         Age      Margin     Density
-: count  961.000000  961.000000  961.000000  961.000000
-: mean     4.296142   55.487448    2.796276    2.910734
-: std      0.705555   14.442373    1.526880    0.365074
-: min      0.000000   18.000000    1.000000    1.000000
-: 25%      4.000000   45.000000    1.000000    3.000000
-: 50%      4.000000   57.000000    3.000000    3.000000
-: 75%      5.000000   66.000000    4.000000    3.000000
-: max      6.000000   96.000000    5.000000    4.000000
--- a/docs/Summary.org
+++ b/docs/Summary.org
@ -0,0 +1,158 @@
+#+TITLE: Práctica 1
+#+SUBTITLE: Inteligencia de Negocio
+#+AUTHOR: Amin Kasrou Aouam
+#+DATE: 2020-11-10
+#+PANDOC_OPTIONS: template:~/.pandoc/templates/eisvogel.latex
+#+PANDOC_OPTIONS: listings:t
+#+PANDOC_OPTIONS: toc:t
+#+PANDOC_METADATA: lang=es
+#+PANDOC_METADATA: titlepage:t
+#+PANDOC_METADATA: listings-no-page-break:t
+#+PANDOC_METADATA: toc-own-page:t
+#+PANDOC_METADATA: table-use-row-colors:t
+#+PANDOC_METADATA: logo:/home/coolneng/Photos/Logos/UGR.png
+* Práctica 1
+
+** Introducción
+
+En esta práctica, usaremos distintos algoritmos de aprendizaje automático para resolver un problema de clasificación.
+
+** Procesado de datos
+
+Antes de proceder con el entrenamiento de los distintos modelos, debemos realizar un preprocesado de los datos, para asegurarnos de que nuestros modelos aprendan de un /dataset/ congruente.
+
+La totalidad de la lógica del preprocesado se encuentra en el archivo /preprocessing.py/, cuyo contenido mostramos aquí:
+
+#+begin_src python
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import KFold
+
+
+def replace_values(df):
+    """Fill missing values in the numeric columns with each column's mean.
+
+    Args:
+        df: DataFrame containing the "BI-RADS", "Margin", "Density" and "Age"
+            columns.
+
+    Returns:
+        The same DataFrame object, with the missing values of those four
+        columns replaced; the remaining columns are left untouched.
+    """
+    columns = ["BI-RADS", "Margin", "Density", "Age"]
+    for column in columns:
+        # Assign the filled column back instead of calling
+        # fillna(inplace=True) on the df[column] selection: the chained
+        # in-place form is deprecated and stops updating df once pandas
+        # Copy-on-Write is enabled.
+        df[column] = df[column].fillna(value=df[column].mean())
+    return df
+
+
+def process_na(df, action):
+    """Handle the missing values of *df* with the selected strategy.
+
+    Args:
+        df: DataFrame that may contain missing values.
+        action: "drop" to return the result of df.dropna(), "fill" to return
+            the result of replace_values(df).
+
+    Returns:
+        The DataFrame produced by the selected strategy.
+
+    Raises:
+        SystemExit: if *action* is neither "drop" nor "fill".
+    """
+    if action == "drop":
+        return df.dropna()
+    if action == "fill":
+        return replace_values(df)
+    # Raise SystemExit with the usage text instead of print() + exit():
+    # exit() is a helper injected by the site module (not always present) and
+    # it ends the program with status 0, which hides the failure. Passing a
+    # message makes the interpreter print it to stderr and exit with status 1.
+    raise SystemExit(
+        "Unknown action selected. The choices are: \n"
+        "fill: fills the na values with the mean\n"
+        "drop: drops the na values"
+    )
+
+
+def encode_columns(df):
+    """Return a copy of *df* with "Shape" and "Severity" label-encoded.
+
+    Each of the two columns is replaced by the output of
+    LabelEncoder.fit_transform on that column; the input frame is not
+    modified.
+    """
+    encoded_df = df.copy()
+    for column in ("Shape", "Severity"):
+        # fit_transform refits the encoder on every call, so a fresh encoder
+        # per column yields the same codes as reusing a single instance.
+        encoded_df[column] = LabelEncoder().fit_transform(df[column])
+    return encoded_df
+
+
+def split_train_target(df):
+    """Separate the "Severity" column from the rest of the columns.
+
+    Returns:
+        A (train_data, target_data) tuple: the frame without "Severity" and
+        the "Severity" column itself.
+    """
+    target_column = "Severity"
+    return df.drop(columns=[target_column]), df[target_column]
+
+
+def split_k_sets(df):
+    """Return the index splits of a shuffled K-fold over *df*.
+
+    The shuffle uses a fixed random_state (42); the number of folds is left
+    at KFold's default.
+    """
+    return KFold(shuffle=True, random_state=42).split(df)
+
+
+def parse_data(source, action):
+    """Load the CSV at *source* and run the preprocessing steps on it.
+
+    Args:
+        source: path or buffer handed to read_csv; "?" entries are read as
+            missing values.
+        action: strategy forwarded to process_na.
+
+    Returns:
+        The (train_data, target_data) tuple that split_train_target builds
+        from the encoded frame.
+    """
+    raw_df = read_csv(filepath_or_buffer=source, na_values="?")
+    clean_df = process_na(df=raw_df, action=action)
+    train_data, target_data = split_train_target(df=encode_columns(df=clean_df))
+    return train_data, target_data
+#+end_src
+
+#+RESULTS:
+
+A continuación, mostraremos cada uno de los pasos que realizamos para obtener el /dataset/ final:
+
+*** Valores nulos:
+
+Nuestro /dataset/ contiene valores nulos, representados mediante un signo de interrogación (?). Optamos por evaluar 2 estrategias:
+
+1. Eliminar los valores nulos
+
+#+BEGIN_SRC python
+df = read_csv(filepath_or_buffer="../data/mamografia.csv", na_values="?")
+processed_df = process_na(df=df, action="drop")
+# Show the statistics of the raw frame first, then those of the frame
+# returned by process_na.
+print("DataFrame sin preprocesamiento: ")
+print(df.describe())
+print("DataFrame con preprocesamiento: ")
+print(processed_df.describe())
+#+END_SRC
+
+#+RESULTS:
+#+begin_example
+DataFrame sin preprocesamiento: 
+          BI-RADS         Age      Margin     Density
+count  959.000000  956.000000  913.000000  885.000000
+mean     4.296142   55.487448    2.796276    2.910734
+std      0.706291   14.480131    1.566546    0.380444
+min      0.000000   18.000000    1.000000    1.000000
+25%      4.000000   45.000000    1.000000    3.000000
+50%      4.000000   57.000000    3.000000    3.000000
+75%      5.000000   66.000000    4.000000    3.000000
+max      6.000000   96.000000    5.000000    4.000000
+DataFrame con preprocesamiento: 
+          BI-RADS         Age      Margin     Density
+count  847.000000  847.000000  847.000000  847.000000
+mean     4.322314   55.842975    2.833530    2.909091
+std      0.703762   14.603754    1.564049    0.370292
+min      0.000000   18.000000    1.000000    1.000000
+25%      4.000000   46.000000    1.000000    3.000000
+50%      4.000000   57.000000    3.000000    3.000000
+75%      5.000000   66.000000    4.000000    3.000000
+max      6.000000   96.000000    5.000000    4.000000
+#+end_example
+
+Observamos que el número de instancias disminuye considerablemente: se pierden hasta 112 instancias, en el caso de /BI-RADS/ (de 959 a 847). Aun así, los valores de la media y desviación estándar no se ven afectados de forma considerable.
+
+2. Imputar su valor con la media
+
+#+BEGIN_SRC python
+df = read_csv(filepath_or_buffer="../data/mamografia.csv", na_values="?")
+# Describe the raw frame before imputing: process_na(action="fill") fills df
+# itself, so describing it afterwards would show the imputed statistics.
+print("DataFrame sin preprocesamiento: ")
+print(df.describe())
+processed_df = process_na(df=df, action="fill")
+print("DataFrame con preprocesamiento: ")
+print(processed_df.describe())
+#+END_SRC
+
+#+RESULTS:
+#+begin_example
+DataFrame sin preprocesamiento: 
+          BI-RADS         Age      Margin     Density
+count  959.000000  956.000000  913.000000  885.000000
+mean     4.296142   55.487448    2.796276    2.910734
+std      0.706291   14.480131    1.566546    0.380444
+min      0.000000   18.000000    1.000000    1.000000
+25%      4.000000   45.000000    1.000000    3.000000
+50%      4.000000   57.000000    3.000000    3.000000
+75%      5.000000   66.000000    4.000000    3.000000
+max      6.000000   96.000000    5.000000    4.000000
+DataFrame con preprocesamiento: 
+          BI-RADS         Age      Margin     Density
+count  961.000000  961.000000  961.000000  961.000000
+mean     4.296142   55.487448    2.796276    2.910734
+std      0.705555   14.442373    1.526880    0.365074
+min      0.000000   18.000000    1.000000    1.000000
+25%      4.000000   45.000000    1.000000    3.000000
+50%      4.000000   57.000000    3.000000    3.000000
+75%      5.000000   66.000000    4.000000    3.000000
+max      6.000000   96.000000    5.000000    4.000000
+#+end_example
+
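+Esta alternativa nos permite mantener el número de instancias en todas las columnas sin alterar la media. La desviación típica, en cambio, disminuye ligeramente (por ejemplo, de 1.5665 a 1.5269 en /Margin/), ya que los valores imputados coinciden con la media.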
--- a/docs/Summary.pdf
+++ b/docs/Summary.pdf