In this project I use my own KMean and DataTable packages to cluster our customers based on their attributes.
I'll walk you through the process of model definition, fine-tuning, and evaluation. Feel free to jump straight to the section that interests you.
I developed my own simplified Pandas-like package, data_reader.py, to fetch different kinds of data faster than Pandas can. It also contains the main features needed for K-Means, such as preparing data for training/testing, splitting data, adding new features, creating combined ones, plotting, and more.
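As a rough illustration of the idea (hypothetical code, not the actual data_reader.py implementation), the core of such a table class is just a CSV file parsed into named columns:

import csv

class MiniTable:
    # a toy stand-in for data_reader.DataTable (hypothetical, for illustration)
    def __init__(self, path):
        with open(path, newline="") as f:
            rows = list(csv.reader(f))
        self.head = rows[0]  # column names, as returned by .head below
        # one list of raw values per column
        self.columns = {name: [row[i] for row in rows[1:]]
                        for i, name in enumerate(self.head)}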
In addition, I designed the kmean.py package for clustering. It is pre-configured to initialize the centroids' starting positions 20 times, run 5 training iterations for each initialization, and return the centroids that achieved the minimum cost function across all runs.
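To make that strategy concrete, here is a minimal sketch of best-of-N-initializations K-Means (illustrative code only, assuming Euclidean distance; it is not the actual kmean.py implementation):

import math
import random

def kmeans_best_of(points, k, n_init=20, epochs=5):
    # a minimal sketch of best-of-N-initializations K-Means
    # (illustrative only, not the actual kmean.py code)
    best_cost, best_centroids = float("inf"), None
    for _ in range(n_init):
        centroids = random.sample(points, k)  # random data points as initial centroids
        for _ in range(epochs):
            # assignment step: group points by nearest centroid
            clusters = [[] for _ in range(k)]
            for p in points:
                nearest = min(range(k), key=lambda i: math.dist(p, centroids[i]))
                clusters[nearest].append(p)
            # update step: move each centroid to the mean of its cluster
            for i, cluster in enumerate(clusters):
                if cluster:
                    centroids[i] = [sum(vals) / len(cluster) for vals in zip(*cluster)]
        # cost: sum of squared distances from each point to its nearest centroid
        cost = sum(min(math.dist(p, c) ** 2 for c in centroids) for p in points)
        if cost < best_cost:
            best_cost, best_centroids = cost, centroids
    return best_centroids, best_cost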
I'd like to demonstrate all of these features and techniques in this notebook.
For additional package usage, refer to the docstrings in the source code.
For this session, I'm going to use the data/Cust_Segmentation.csv file, which contains tabular data about our customers.
# import packages
import data_reader as dr
import clusterization.kmean as km
import matplotlib.pyplot as plt
import random
# seed the RNG for reproducible results
random.seed(301)
# data loading
main_table = dr.DataTable("data/Cust_Segmentation.csv")
# returns all column names (features) from our data
main_table.head
['Customer Id', 'Age', 'Edu', 'Years Employed', 'Income', 'Card Debt', 'Other Debt', 'Defaulted', 'Address', 'DebtIncomeRatio']
Let's pick a few features for further analysis and K-Means testing. I'm interested in the relationship between Income and Years Employed.
plt.title("Income vs Years Employed")
main_table.plot("Income", "Years Employed")
Cool, let's apply K-Means with 3 clusters. But first, let's scale our data into the -1...+1 range: K-Means computes distances between points, so all features should be on the same scale.
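Assuming max_scaling() does what its name suggests (my reading, since the formula isn't shown here), each column is divided by its maximum absolute value, which maps values into -1...+1:

def max_scale(column):
    # divide by the largest absolute value so every value lands in [-1, 1]
    # (a sketch of typical max-abs scaling; the package internals may differ)
    m = max(abs(v) for v in column)
    return [v / m for v in column] if m else list(column)

max_scale([10, 50, 200])  # -> [0.05, 0.25, 1.0]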
main_table.max_scaling()
Column Customer Id was scaled
Column Age was scaled
Column Edu was scaled
Column Years Employed was scaled
Column Income was scaled
Column Card Debt was scaled
Column Other Debt was scaled
Column Defaulted was scaled
Column Address was scaled
Column DebtIncomeRatio was scaled
# new plot
plt.title("Scaled Income vs Years Employed")
main_table.plot("Income", "Years Employed")
main_table.activate_features(["Income", "Years Employed"])
main_table.select_target("Customer Id")
Feature Income was added
Feature Years Employed was added
Target Customer Id was added
# split data
main_table.split_data(0.7, shuffle=True)
Shuffle was done
Data was split as follows: 0.7 training set and 0.30000000000000004 testing set
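By the way, the 0.30000000000000004 in the log is not a bug in split_data(): it is ordinary binary floating-point arithmetic, where 1 - 0.7 has no exact representation:

print(1 - 0.7)            # 0.30000000000000004
print(round(1 - 0.7, 2))  # 0.3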
# generate training/testing data from our data set
min_max_info = main_table.get_min_max_features()
training_data = main_table.get_training_data()
test_data = main_table.get_testing_data()
training_data = training_data[0]
test_data = test_data[0]
# set labels for further data visualization
labels = main_table.get_labels()[0]
# model initialization
model_kmean = km.KMean()
# run the K-Means algorithm only 3 times, with different randomly initialized centroids each run
model_kmean.NUMBER_OF_CENTROIDS_INITIALIZATION = 3
model_kmean.set_labels(labels)
# the next command enables visualization of every learning iteration
# during the .fit() method
model_kmean.set_monitor(True)
# set 5 learning iterations
model_kmean.epoch = 5
# set the number of centroids to 3
model_kmean.number_of_centroids = 3
# set the data to be used for learning
model_kmean.set_training_data(training_data)
# optional: used when centroids are randomly initialized within min-max ranges
# (not in use right now)
model_kmean.set_min_max_ranges(min_max_info)
# we want to pick random points from our data set
# and use them as our initial centroids;
# to do that, set centroid_mode to None
model_kmean.centroid_mode = None
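Conceptually, this initialization mode boils down to sampling k distinct points from the data (an illustration, not the package internals):

import random
# conceptually what centroid_mode = None asks for:
# k distinct data points drawn at random as starting centroids
example_initial_centroids = random.sample(training_data, 3)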
Let's find the best three centroids for our data. We are going to run 3 random centroid initializations, with 5 learning iterations each.
model_kmean.fit()
Logs are saved during the learning process. The model keeps the logs of the best run across the whole training session, judged by the cost function: every new centroid initialization yields a different final cost, so the model stores the training history and centroids of the run that achieved the lowest cost. Let's analyse them.
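To make that bookkeeping concrete, here is a runnable sketch with a stubbed-out training run (run_one_training is a hypothetical helper; the real package tracks this internally):

import random

def run_one_training(epochs=5):
    # stand-in for one K-Means run: returns the cost after each iteration
    # (hypothetical helper for illustration only)
    cost = random.uniform(5.0, 10.0)
    history = []
    for _ in range(epochs):
        cost *= random.uniform(0.5, 0.9)  # cost shrinks as centroids settle
        history.append(cost)
    return history

best_final_cost, best_history = float("inf"), None
for _ in range(3):  # 3 centroid initializations
    history = run_one_training()
    if history[-1] < best_final_cost:  # keep the run with the lowest final cost
        best_final_cost, best_history = history[-1], history
# best_history is, presumably, what model_kmean.cost_functions exposes after .fit()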
cost_functions = model_kmean.cost_functions
axis_x = [x for x in range(1, model_kmean.epoch+1)]
plt.plot(axis_x, cost_functions)
plt.title("Cost function vs learning iterations")
plt.xlabel("Iterations")
plt.ylabel("Cost function")
Text(0, 0.5, 'Cost function')
# Best centroids:
model_kmean.centroids
[[0.05828239984056782, 0.06169536066651156], [0.09649010587881882, 0.25685919395944873], [0.1919339918674164, 0.5648802402971185]]
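To see where these centroids sit relative to the customers, one can overlay them on the scaled scatter plot (a quick sketch; it assumes training_data is a list of [Income, Years Employed] pairs, as used above):

# scatter of training points with the learned centroids marked on top
xs = [p[0] for p in training_data]
ys = [p[1] for p in training_data]
plt.scatter(xs, ys, s=10, label="customers")
cx = [c[0] for c in model_kmean.centroids]
cy = [c[1] for c in model_kmean.centroids]
plt.scatter(cx, cy, marker="x", s=100, label="centroids")
plt.xlabel("Income (scaled)")
plt.ylabel("Years Employed (scaled)")
plt.legend()
plt.show()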
In this section I want to find out which K best minimizes the model's cost function for these 2 features: Income and Years Employed.
def store_model(k_mean_model):
    return [k_mean_model.centroids, k_mean_model.cost_functions]
model_kmean.NUMBER_OF_CENTROIDS_INITIALIZATION = 5 # increase the number of random initialization tries
min_k = 2
max_k = 10 # the largest K we want to explore
results_by_k = [] # to store results
for k in range(min_k, max_k+1):
    model_kmean.number_of_centroids = k
    model_kmean.fit()
    results_by_k.append(store_model(model_kmean))
    model_kmean.cost_functions = None # reset best model results to store a new one
# model_kmean.set_monitor(False) # to disable visualization each step
Centroid 3 doesn't have any point
There is no point assigned to this 3 centroid
Centroid 5 doesn't have any point
There is no point assigned to this 5 centroid
Centroid 6 doesn't have any point
There is no point assigned to this 6 centroid
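These messages mean that, at larger K, some randomly initialized centroids attracted no points at all during an assignment step: every point was closer to some other centroid. That is a normal K-Means failure mode. One common remedy (an option, not necessarily what kmean.py does) is to re-seed an empty centroid with a random data point:

import random

def reseed_empty_centroids(centroids, clusters, points):
    # replace any centroid whose cluster came out empty with a random data point
    # (one common strategy; shown for illustration only)
    for i, cluster in enumerate(clusters):
        if not cluster:
            centroids[i] = random.choice(points)
    return centroids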
Let's find out what we just trained!
axis_x = [x+min_k for x in range(max_k - min_k + 1)]
centroids_by_k = []
cost_functions_by_k = []
for k_results in results_by_k:
    centroids_by_k.append(k_results[0])
    # take the cost function of the last learning iteration for this particular k
    cost_functions_by_k.append(k_results[1][-1])
plt.plot(axis_x, cost_functions_by_k)
plt.title("Cost function vs K")
plt.xlabel("K")
plt.ylabel("Cost function")
Text(0, 0.5, 'Cost function')
The cost function for each K was taken from the last learning iteration (in our case, the 5th) of the best trained model for that number of centroids (K). Based on the plot above, K = [2, 3, 5] look good. The best K, based on the lowest cost function, is:
min_cost_function = float("inf")
k_best = None
for idx in range(len(cost_functions_by_k)):
    k = idx + min_k
    if min_cost_function > cost_functions_by_k[idx]:
        min_cost_function = cost_functions_by_k[idx]
        k_best = k
print("Best K is", k_best)
Best K is 5
Let's set the K = 5 centroids in our model and try to predict which cluster a new customer belongs to.
customer_data = test_data[20]
customer_data # scaled income, years employed
[0.06, 0.15]
best_centroids = centroids_by_k[k_best - min_k] # the best centroids found for K = 5
best_centroids
[[0.19454494774105843, 0.5726270955486146], [0.09118599771283756, 0.28134241729899934], [0.053608498534171364, 0.08063670915819787], [0.12748539892757962, 0.15752650013355549], [0.06080041797283175, 0.004928941724051094]]
# set best learned centroids to our model
model_kmean.set_best_centroids(best_centroids)
model_kmean.predict(customer_data)
3
The customer with scaled data [0.06 Income, 0.15 Years Employed] belongs to cluster 4 (index 3 + 1, since cluster indices start at 0).
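For reference, K-Means prediction is just a nearest-centroid lookup. A minimal sketch of what .predict() presumably does (assuming Euclidean distance):

import math

def predict(point, centroids):
    # return the index of the centroid closest to the point
    return min(range(len(centroids)), key=lambda i: math.dist(point, centroids[i]))

predict([0.06, 0.15], best_centroids)  # -> 3, matching the model's answer above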