From 0421493eff729fe30595244f0a2b81b609c9b505 Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Tue, 10 Nov 2020 20:28:59 +0100
Subject: [PATCH] Add processing module

---
 src/processing.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/src/processing.py b/src/processing.py
index 8eba2ec..0f8ca17 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -1 +1,76 @@
-from preprocessing import parse_data
+from numpy import mean
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.model_selection import cross_val_score
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import scale
+from sklearn.svm import LinearSVC
+from sklearn.tree import DecisionTreeClassifier
+
+from preprocessing import parse_data, split_k_sets
+
+
+def naive_bayes():
+    """Return an unfitted Gaussian naive Bayes classifier."""
+    return GaussianNB()
+
+
+def linear_svc():
+    """Return an unfitted linear SVC seeded with random_state=42."""
+    return LinearSVC(random_state=42)
+
+
+def k_nearest_neighbors():
+    """Return an unfitted k-nearest-neighbours classifier using 10 neighbours."""
+    return KNeighborsClassifier(n_neighbors=10)
+
+
+def decision_tree():
+    """Return an unfitted decision tree classifier seeded with random_state=42."""
+    return DecisionTreeClassifier(random_state=42)
+
+
+def choose_model(model):
+    """Return an unfitted classifier for "gnb", "svc", "knn" or "tree"."""
+    builders = {
+        "gnb": naive_bayes, "svc": linear_svc,
+        "knn": k_nearest_neighbors, "tree": decision_tree,
+    }
+    if model not in builders:  # fail loudly instead of returning None
+        raise ValueError(f"Unknown model name: {model!r}")
+    return builders[model]()
+
+
+def predict_data(data, target, model):
+    """Fit the named model on every fold; return it with per-fold predictions."""
+    classifier = choose_model(model)
+    if model == "knn":  # test the name; the built estimator never equals "knn"
+        data = type(data)(scale(data), index=data.index, columns=data.columns)
+    predictions = []
+    for train_index, test_index in split_k_sets(data):
+        classifier.fit(data.iloc[train_index], target.iloc[train_index])
+        predictions.append(classifier.predict(data.iloc[test_index]))
+    return classifier, predictions
+
+
+def evaluate_performance(predictions, model, data, target):
+    """Print CV score, mean confusion matrix and per-fold classification reports."""
+    score = cross_val_score(model, data, target, cv=10)
+    # NOTE(review): assumes split_k_sets(data) repeats predict_data's folds; confirm
+    truths = [target.iloc[test] for _, test in split_k_sets(data)]
+    matrices = [confusion_matrix(t, p) for t, p in zip(truths, predictions)]
+    reports = [classification_report(t, p) for t, p in zip(truths, predictions)]
+    print("Model:" + str(model))
+    print("Score: " + str(score))
+    print("Confusion matrix: " + str(mean(matrices, axis=0)))  # cell-wise mean
+    print("Classification report: " + "\n".join(reports))  # text can't be averaged
+
+
+def main():  # script entry point: run the "knn" model on mamografia.csv
+    features, labels = parse_data(source="../data/mamografia.csv", action="fill")
+    fitted, fold_predictions = predict_data(features, labels, "knn")
+    evaluate_performance(fold_predictions, fitted, features, labels)
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()