In this project I use my own K-Nearest Neighbors and DataTable packages to predict whether a person with a given set of attributes will pay a loan back.
I'll walk you through model definition, fine-tuning, and evaluation. Feel free to jump straight to the section that interests you.
Before exploring this KNN work, have a look at the Logistic Regression notebook first, which covers the same data set. That makes it easy to compare the results of two different classification algorithms.
I developed my own simplified, Pandas-like package, data_reader.py, to fetch table-like data quickly. It provides the main features needed for KNN: preparing data for training/testing, splitting data, adding new features, creating combined features, plotting, and more.
In addition, I designed the knn.py package around a fixed-size (k-element) max-heap data structure, which optimizes both the time and memory cost of storing and comparing the nearest points to an unknown data point. There is also a logging feature to monitor model training and evaluate its performance.
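The bounded max-heap idea can be sketched in plain Python. This is a standalone illustration, not the actual knn.py implementation: Python's `heapq` is a min-heap, so distances are negated to keep the *farthest* of the retained points at the root, where it can be replaced in O(log k).

```python
import heapq

def k_nearest(points, query, k):
    """Keep the k points nearest to `query` using a size-k max-heap.

    heapq is a min-heap, so we store negative distances: the heap root
    is then the farthest of the k points kept so far.
    """
    heap = []  # entries: (-squared_distance, point)
    for p in points:
        dist = sum((a - b) ** 2 for a, b in zip(p, query))
        if len(heap) < k:
            heapq.heappush(heap, (-dist, p))
        elif dist < -heap[0][0]:               # closer than current farthest
            heapq.heapreplace(heap, (-dist, p))  # pop root, push new point
    return sorted(p for _, p in heap)

k_nearest([(0, 0), (1, 1), (5, 5), (2, 2)], (0, 0), 2)
# → [(0, 0), (1, 1)]
```

The payoff is that only k candidates are ever stored, instead of sorting all n distances: O(n log k) time and O(k) memory.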
I would like to demonstrate all of these features and techniques in this notebook.
For additional package usage, refer to the docstrings in the source code.
For this session, I'm going to use the /data/loan_train.csv file, which contains table-like data on various cases of people paying or not paying their loans back. It might be interesting to try to predict whether a given person should get a loan.
import classification.knn as knn
import data_reader as dr
import matplotlib.pyplot as plt
from helper_methods import *
# Use same data as it was in LR work
main_data = dr.DataTable("data/loan_train.csv")
main_data.head
['', 'Unnamed: 0', 'loan_status', 'Principal', 'terms', 'effective_date', 'due_date', 'age', 'education', 'Gender']
loan_status is going to be our target, the value we try to predict. It has two options: PAIDOFF and COLLECTION. DataTable reads the CSV file in such a way that words (string values) are converted to numbers, which can easily be restored back.
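Under the hood, that word-to-number conversion amounts to keeping a forward map and a reverse map per column. A rough sketch of the idea (a hypothetical helper, not the real data_reader.py code; the actual numbers assigned depend on the order values are first seen):

```python
def encode_column(values):
    """Map each distinct string to an integer, keeping a reverse map
    so the original words can be restored later."""
    forward = {}
    for v in values:
        if v not in forward:
            forward[v] = len(forward)   # next unused integer
    reverse = {num: word for word, num in forward.items()}
    encoded = [forward[v] for v in values]
    return encoded, forward, reverse

enc, fwd, rev = encode_column(["PAIDOFF", "PAIDOFF", "COLLECTION"])
# enc → [0, 0, 1]; rev restores the words: rev[1] → 'COLLECTION'
```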
target_name = "loan_status"
main_data.select_target(target_name)
main_data.target[target_name].data[0:5]
Target loan_status was added
[1, 1, 1, 1, 1]
main_data.class_dict[target_name] # to check which number represent which word
[{'_count': 2, 'COLLECTION': 0, 'PAIDOFF': 1}, {0: 'COLLECTION', 1: 'PAIDOFF'}]
So, COLLECTION is 0, and PAIDOFF is 1. With that, let's move further. What about features?
main_data.plot(parameter1='age', parameter2='Gender', classifier=target_name)
Well, we can see that there is no strongly marked relationship between the features and the target, so it might be hard to predict loan repayment with high accuracy. But we will try.
Let's create the same model as we had for Logistic Regression so we can compare their results.
main_data.activate_features(['Principal',
'terms',
'effective_date',
'age',
'education',
'Gender'])
main_data.add_new_feature(['education', 'age'])
main_data.add_new_feature(['Gender', 'age'])
Feature Principal was added
Feature terms was added
Feature effective_date was added
Feature age was added
Feature education was added
Feature Gender was added
New created feature education*age was added
This education*age feature is added to the list of training set
New created feature Gender*age was added
This Gender*age feature is added to the list of training set
main_data.max_scaling()
Column was scaled
Column Unnamed: 0 was scaled
Column loan_status was scaled
Column Principal was scaled
Column terms was scaled
Column effective_date was scaled
Column due_date was scaled
Column age was scaled
Column education was scaled
Column Gender was scaled
Column education*age was scaled
Column Gender*age was scaled
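For reference, max scaling simply divides each value in a column by the column's largest value, mapping everything into [0, 1] so that no feature dominates the distance computation. A minimal sketch (not the DataTable implementation):

```python
def max_scale(column):
    """Scale a numeric column into [0, 1] by dividing by its max value."""
    peak = max(column)
    if peak == 0:                 # avoid division by zero on all-zero columns
        return list(column)
    return [v / peak for v in column]

max_scale([25, 33, 50])
# → [0.5, 0.66, 1.0]
```

Scaling matters a lot for KNN specifically: distances are computed across all features, and an unscaled column like Principal (hundreds) would drown out a column like Gender (0/1).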
main_data.split_data(0.6, shuffle=True)
Shuffle was done
Data was split as follows: 0.6 training set and 0.4 testing set
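The shuffle-then-split step can be sketched as follows (a hypothetical standalone helper, not the DataTable method). The important detail is that features and targets are shuffled *jointly*, so each row stays paired with its label:

```python
import random

def shuffle_split(features, targets, train_ratio, seed=0):
    """Jointly shuffle rows, then cut them into train/test portions."""
    rows = list(zip(features, targets))
    random.Random(seed).shuffle(rows)      # seeded for reproducibility
    cut = int(len(rows) * train_ratio)
    return rows[:cut], rows[cut:]          # (train rows, test rows)

train, test = shuffle_split([[1], [2], [3], [4], [5]], [0, 1, 0, 1, 0], 0.6)
# 60% of the rows land in train, the rest in test
```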
training_features_data, training_target_data = main_data.get_training_data() # returns (features data, target data)
# cv_data = main_data.get_cv_data()
testing_data = main_data.get_testing_data()
# check how 0 and 1 were split after shuffling
axis_x = [x for x in range(len(training_target_data))]
plt.scatter(axis_x, training_target_data)
plt.show()
model_knn = knn.KNN()
model_knn.set_data([training_features_data, training_target_data])
model_knn.set_labels = main_data.get_labels()
model_knn.set_labels # to check what we have in our model
(['Principal', 'terms', 'effective_date', 'age', 'education', 'Gender', 'education*age', 'Gender*age'], 'loan_status')
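Before tuning k, it's worth recalling what a KNN prediction actually does: find the k training points closest to the query and take a majority vote over their labels. A minimal sketch of that idea (hypothetical code, not the knn.py API, and without the max-heap optimization shown earlier):

```python
from collections import Counter

def knn_predict(train_X, train_y, query, k):
    """Classify `query` by majority vote among its k nearest neighbors."""
    dists = sorted(
        (sum((a - b) ** 2 for a, b in zip(x, query)), y)
        for x, y in zip(train_X, train_y)
    )
    votes = Counter(label for _, label in dists[:k])
    return votes.most_common(1)[0][0]

knn_predict([[0], [1], [10], [11]], [0, 0, 1, 1], [2], 3)
# → 0 (two of the three nearest neighbors carry label 0)
```

Note that odd values of k avoid ties in a two-class vote, which is why the search below sticks to 1, 3, 5, 7, 9, 11.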
In this section I would like to explore which value of k might be the best for our model's predictions.
def get_accuracy_from_cm(cm):
accuracy = (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
return round(accuracy, 2)
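Precision and recall come out of the same confusion matrix. Assuming the same `cm[actual][predicted]` layout as `get_accuracy_from_cm` above, with class 1 (PAIDOFF) as the positive class, a companion helper might look like this (an illustrative sketch; the real values below come from `model_knn.evaluation`):

```python
def precision_recall(cm):
    """Precision and recall for the positive class (index 1),
    assuming a cm[actual][predicted] layout."""
    tp = cm[1][1]   # positives correctly predicted positive
    fn = cm[1][0]   # positives missed
    fp = cm[0][1]   # negatives wrongly predicted positive
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return round(precision, 2), round(recall, 2)

precision_recall([[20, 20], [6, 94]])
# → (0.82, 0.94)
```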
k_array = [1, 3, 5, 7, 9, 11]
accuracy_array = []
precision_array = []
recall_array = []
f1_score_array = []
for k in k_array:
model_knn.k_nn = k
confusion_matrix, precision, recall = model_knn.evaluation(testing_data)
precision_array.append(precision)
recall_array.append(recall)
accuracy_array.append(get_accuracy_from_cm(confusion_matrix))
f1_score_array.append(2 * precision * recall / (precision + recall))
accuracy_array
[0.71, 0.71, 0.72, 0.72, 0.7, 0.71]
f1_score_array
[0.8198198198198199, 0.824561403508772, 0.8266666666666668, 0.8296943231441049, 0.8141592920353982, 0.8275862068965517]
recall_array
[0.900990099009901, 0.9306930693069307, 0.9207920792079208, 0.9405940594059405, 0.9108910891089109, 0.9504950495049505]
precision_array
[0.7520661157024794, 0.7401574803149606, 0.75, 0.7421875, 0.736, 0.732824427480916]
Per the short exploration above, k = 7 is the best choice for our model: roughly 72% accuracy, 74% precision, and 94% recall, with the highest F1 score of the values tried.