In [1]:
from sklearn.datasets import load_breast_cancer
from random import shuffle
import math

# Load the dataset.
# return_X_y must be passed by keyword: scikit-learn >= 1.0 raises a TypeError
# when it is given positionally.
X, Y = load_breast_cancer(return_X_y=True)
# X is an array of n rows, each of which holds 30 measurements related to the tumour
# Y is an array of n entries, each of which is a number indicating whether
# the corresponding tumour is malignant or benign

# scikit-learn encodes this dataset's classes as target_names = ['malignant', 'benign'],
# so malignant tumours are represented by 0.
MALIGNANT = 0
# Benign tumours are represented by a 1.
BENIGN = 1

In [2]:
# Seed the KNN with the first 100 entries of the dataset.
# Every element of _data_ is a pair made of
# 1. the measurements of one tumour (X[i])
# 2. its diagnosis (Y[i], either MALIGNANT or BENIGN)
data = [(X[i], Y[i]) for i in range(100)]

In [3]:
# Indices of the measurements that distance() compares; every other
# measurement of a point is ignored when looking for neighbours.
FEATURES = [0, 1]

def classify(p, k):
    """Predict the label of point ``p`` from its ``k`` nearest stored points.

    The ``k`` entries of ``data`` closest to ``p`` are looked up and the label
    that occurs most often among them is returned.
    """
    return find_label_with_highest_count(find_k_closest_points(p, k))

def distance(p, q):
    """Return the Euclidean distance between ``p`` and ``q``.

    Only the positions listed in ``FEATURES`` contribute to the distance.
    """
    squared_gaps = ((p[axis] - q[axis]) ** 2 for axis in FEATURES)
    return math.sqrt(sum(squared_gaps))

def find_k_closest_points(p, k):
    """Return the ``k`` entries of ``data`` whose measurements lie closest to ``p``.

    Each returned entry is the original (measurements, diagnosis) pair, ordered
    from nearest to farthest.
    """
    def distance_to_p(entry):
        # entry[0] holds the measurements, entry[1] the diagnosis.
        return distance(p, entry[0])

    # Sort a copy so the shared ``data`` list keeps its original order.
    ranked = list(data)
    ranked.sort(key=distance_to_p)
    return ranked[:k]

def find_label_with_highest_count(closest):
    """Return the majority label among ``closest``.

    ``closest`` is an iterable of (measurements, diagnosis) pairs. BENIGN is
    returned only when benign votes strictly outnumber malignant ones; a tie
    (including an empty ``closest``) yields MALIGNANT.
    """
    labels = [entry[1] for entry in closest]
    malignant_votes = sum(1 for label in labels if label == MALIGNANT)
    benign_votes = sum(1 for label in labels if label == BENIGN)
    return BENIGN if benign_votes > malignant_votes else MALIGNANT

In [4]:
# Sample 121 lies outside the 100 entries stored in _data_, so it is unseen.
n = 121
# Show (predicted label using the 3 nearest neighbours, true label).
(classify(X[n], 3), Y[n])

(0, 0)