Plot roc_auc_curve and add plumbing for plotting

This commit is contained in:
coolneng 2020-12-08 14:08:47 +01:00
parent 3dd13a6fb5
commit c5a147e5df
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 115 additions and 36 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

View File

@ -1,12 +1,15 @@
from numpy import mean from numpy import mean, arange
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from seaborn import set_theme
from matplotlib.pyplot import *
from pandas import DataFrame
from sys import argv from sys import argv
@ -24,65 +27,141 @@ def choose_model(model):
return DecisionTreeClassifier(random_state=42) return DecisionTreeClassifier(random_state=42)
elif model == "neuralnet": elif model == "neuralnet":
return MLPClassifier(hidden_layer_sizes=10) return MLPClassifier(hidden_layer_sizes=10)
else:
print("Unknown model selected. The choices are: ")
print("gnb: Gaussian Naive Bayes")
print("svc: Linear Support Vector Classification")
print("knn: K-neighbors")
print("tree: Decision tree")
print("neuralnet: MLP Classifier")
exit()
def predict_data(data, target, model, results):
    """Evaluate one classifier fold by fold and record its averaged metrics.

    Args:
        data: Feature table; rows are selected positionally with ``.iloc``.
        target: Labels aligned with ``data``; also selected with ``.iloc``.
        model: Short model name passed to ``choose_model`` (e.g. ``"knn"``).
        results: Frame forwarded to ``populate_results`` as the base to extend.

    Returns:
        The value returned by ``populate_results``: ``results`` extended with
        one row holding the fold-averaged fpr, tpr, AUC and confusion matrix.
    """
    estimator = choose_model(model)
    # Test the short name, not the estimator: once the name is replaced by the
    # estimator object the comparison with "knn" can never be true and the
    # features would silently stay unscaled.
    if model == "knn":
        # scale() returns a bare array; rebuild the frame so the .iloc
        # selections below keep working.
        # NOTE(review): assumes data is a pandas DataFrame - confirm in parse_data.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)
    confusion_matrices, auc, fpr, tpr = [], [], [], []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        expected = target.iloc[test_index]
        prediction = estimator.predict(data.iloc[test_index])
        confusion_matrices.append(confusion_matrix(expected, prediction))
        auc.append(roc_auc_score(expected, prediction))
        fpr_item, tpr_item, _ = roc_curve(expected, prediction)
        fpr.append(fpr_item)
        tpr.append(tpr_item)
    # NOTE(review): averaging with axis=0 needs every fold to yield curves of
    # the same length - verify this holds for all folds.
    populated_results = populate_results(
        df=results,
        # populate_results derives the short name from the estimator's text form.
        model=estimator,
        fpr=mean(fpr, axis=0),
        tpr=mean(tpr, axis=0),
        auc=mean(auc),
        confusion_matrix=mean(confusion_matrices, axis=0),
    )
    return populated_results
def plot_roc_auc_curve(model, results):
    """Draw the stored ROC curve of one model on the current axes.

    Args:
        model: Index label of the row to read from ``results``.
        results: Frame indexed by model name with ``fpr``, ``tpr`` and
            ``auc`` columns.
    """
    rounded_auc = round(results.loc[model]["auc"], 3)
    false_positive_rate = results.loc[model]["fpr"]
    true_positive_rate = results.loc[model]["tpr"]
    plot(
        false_positive_rate,
        true_positive_rate,
        label=f"{model} , AUC={rounded_auc}",
    )
    # Ticks every 0.1 starting at 0.0; arange leaves out the stop value.
    tick_positions = arange(0.0, 1.0, step=0.1)
    xticks(tick_positions)
    yticks(tick_positions)
    legend(loc="lower right")
def plot_confusion_matrix(model, results):
    """Write the confusion matrix stored for one model onto the current axes.

    Each cell is written at its own (column, row) position so the values no
    longer pile up on a single point.

    Args:
        model: Index label of the row to read from ``results``.
        results: Frame indexed by model name with a ``confusion_matrix`` column.
    """
    matrix = results.loc[model]["confusion_matrix"]
    classes = ["Negative", "Positive"]
    for row_index, row in enumerate(matrix):
        for column_index, value in enumerate(row):
            text(
                x=column_index,
                y=row_index,
                s=f"{value:.2f}",
                ha="center",
                va="center",
            )
    xticks(ticks=arange(len(classes)), labels=classes)
    yticks(ticks=arange(len(classes)), labels=classes)
    # text() does not rescale the axes, so frame the grid of cells explicitly;
    # the y limits are reversed so the first matrix row ends up at the top.
    xlim(-0.5, len(classes) - 0.5)
    ylim(len(classes) - 0.5, -0.5)
    # NOTE(review): plot_individual_figure calls this once per model on the
    # same figure, so values from different models still share these cells.
def choose_plot_type(type, model, results):
    """Dispatch to the plotting routine registered for ``type``.

    Args:
        type: ``"roc"`` or ``"confusion_matrix"``; any other value is ignored
            silently.
        model: Index label of the row to plot.
        results: Frame holding the per-model metrics.
    """
    plotters = {
        "roc": plot_roc_auc_curve,
        "confusion_matrix": plot_confusion_matrix,
    }
    plotter = plotters.get(type)
    if plotter is not None:
        plotter(model, results)
def plot_individual_figure(results, type, x_axis, y_axis, fig_title):
    """Build one figure covering every model, display it, then save it as a PNG.

    Args:
        results: Frame indexed by model name; each index label is plotted.
        type: Plot kind forwarded to ``choose_plot_type``.
        x_axis: Label for the x axis.
        y_axis: Label for the y axis.
        fig_title: Figure title; lower-cased with spaces replaced by
            underscores it also becomes the file name under ``docs/assets/``.
    """
    canvas = figure(figsize=(8, 6))
    for model_name in results.index:
        choose_plot_type(type, model_name, results)
    title(fig_title)
    xlabel(x_axis)
    ylabel(y_axis)
    show()
    file_stem = fig_title.replace(" ", "_").lower()
    canvas.savefig(f"docs/assets/{file_stem}.png")
# TODO Add cross_val_score
def plot_all_figures(results):
    """Render and save every summary figure for the evaluated models.

    Args:
        results: Frame indexed by model name, forwarded unchanged to
            ``plot_individual_figure``.
    """
    set_theme()
    plot_individual_figure(
        results,
        type="roc",
        x_axis="fpr",
        y_axis="tpr",
        fig_title="ROC AUC curve",
    )
    plot_individual_figure(
        results,
        type="confusion_matrix",
        # The rate labels belong to the ROC figure only. scikit-learn's
        # confusion_matrix puts true classes on rows and predicted classes
        # on columns, hence these axis names.
        x_axis="Predicted label",
        y_axis="True label",
        fig_title="Confusion Matrix",
    )
def create_result_dataframes():
    """Create the empty results frame, indexed by ``model``.

    Returns:
        A 2-tuple whose two items are the same empty frame with the columns
        ``fpr``, ``tpr``, ``auc`` and ``confusion_matrix``.
    """
    column_names = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
    empty_results = DataFrame(columns=column_names).set_index("model")
    return empty_results, empty_results
def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
    """Return a copy of ``df`` with one extra row describing ``model``.

    Args:
        df: Frame to extend; it is not modified.
        model: Estimator (or its text form); formatted as a string and mapped
            to a short name by ``rename_model``.
        fpr: False positive rates to store in the row.
        tpr: True positive rates to store in the row.
        auc: Area-under-curve value to store in the row.
        confusion_matrix: Confusion matrix to store in the row.

    Returns:
        A new frame with a fresh 0..n-1 index containing the rows of ``df``
        followed by the new row.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    from pandas import concat

    renamed_model = rename_model(model=f"{model}")
    # A list holding one dict builds a single row and keeps array-valued
    # entries as individual cells.
    new_row = DataFrame(
        [
            {
                "model": renamed_model,
                "fpr": fpr,
                "tpr": tpr,
                "auc": auc,
                "confusion_matrix": confusion_matrix,
            }
        ]
    )
    populated_df = concat([df, new_row], ignore_index=True)
    return populated_df
def rename_model(model):
    """Map the text form of a supported estimator to its short name.

    Args:
        model: Estimator text such as ``"GaussianNB()"``.

    Returns:
        The matching short name (``"gnb"``, ``"svc"``, ``"knn"``, ``"tree"``
        or ``"neuralnet"``).

    Raises:
        KeyError: If ``model`` is not one of the known estimator texts.
    """
    short_name_by_estimator = {
        "GaussianNB()": "gnb",
        "LinearSVC(random_state=42)": "svc",
        "KNeighborsClassifier(n_neighbors=10)": "knn",
        "DecisionTreeClassifier(random_state=42)": "tree",
        "MLPClassifier(hidden_layer_sizes=10)": "neuralnet",
    }
    return short_name_by_estimator[model]
def usage():
    """Print the command-line help and stop the program.

    Raises:
        SystemExit: Always, with status 1, because this is only reached when
            the script was invoked with the wrong number of arguments.
    """
    # Keep a space between the script name and the argument placeholder.
    print(f"Usage: {argv[0]} <preprocessing action>")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    # The bare exit() helper is injected by the site module and reports
    # success; raise SystemExit directly so a bad invocation is a failure.
    raise SystemExit(1)
def main():
    """Evaluate every supported model on the dataset and plot the results.

    Expects exactly one command-line argument, the preprocessing action
    handed to ``parse_data``; otherwise the usage text is shown.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    from pandas import concat

    models = ["gnb", "svc", "knn", "tree", "neuralnet"]
    if len(argv) != 2:
        usage()
    data, target = parse_data(source="data/mamografia.csv", action=str(argv[1]))
    individual_result, _ = create_result_dataframes()
    # Each call starts from the same empty frame and returns it plus one row,
    # so the per-model frames can be collected and joined in a single step.
    per_model_results = [
        predict_data(data=data, target=target, model=model, results=individual_result)
        for model in models
    ]
    complete_results = concat(per_model_results)
    indexed_results = complete_results.set_index("model")
    plot_all_figures(results=indexed_results)
if __name__ == "__main__": if __name__ == "__main__":