Реализация алгоритма FRIS-STOLP (python 3)

Теория по данному алгоритму приведена здесь, а также в курсе Константина Воронцова.

В данной статье мы рассмотрим только реализацию на python 3.

Код можно копировать в том же порядке «как есть».

Подключаем необходимые пакеты

import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from math import floor, ceil # for rounding up and down
from scipy.stats import pearsonr
from sklearn import datasets
import pandas as pd
import math
from typing import Dict, Tuple, List
import copy
from random import randint
from scipy.stats import pearsonr
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
import matplotlib.cm as cm
from scipy.stats import pearsonr
import matplotlib.colors as matplotcolor
from sklearn.decomposition import PCA
from random import randint
import matplotlib.cm as cm
def Get_Class_Label_Of_Object(x):
    return x[len(x)-1]

Метод вычисления расстояния между двумя объектами

def Distance(x, y):
    if len(x) != len(y):
        return -1
    
    sum = 0
    for i in range(len(x)):
        sum += (x[i]-y[i])*(x[i]-y[i])
        
    return sum

Чтение данных из файла:

def PrepareData():
    all_data = np.loadtxt(open("./wine_data.csv","r"), delimiter = ",", skiprows = 0, dtype = np.float64)
    
    # load class labels from column 1
    y_wine = all_data[:,0]
    #print (y_wine)
    # conversion of the class labels to integer-type array
    y_wine = y_wine.astype(np.int64, copy = False)

    # load the 14 features
    X_wine = all_data[:,1:]
    
    data = all_data[:, 6:8]   #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 6:9    
    
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5

    plt.figure(2, figsize = (16, 12))
    plt.clf()

    # Plot the training points
    plt.scatter(data[:, 0], data[:, 1], c = y_wine, cmap = plt.cm.Set1, edgecolor = 'k')
    plt.xlabel('Flavanoids')
    plt.ylabel('Nonflavanoid phenols')

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    plt.show()

    uniq = [1 for x in range(len(data))]
    
    cnt_of_uniq = 0
    for i in range(len(data)):
        for j in range(i+1,len(data)):
            if data[i, 0] == data[j, 0] and data[i, 1] == data[j, 1]:
                uniq[j] = 0
   
    for i in range(len(data)):
        if uniq[i] == 1:
            cnt_of_uniq += 1
            
    w, h = 3, cnt_of_uniq
    m = [[0 for x in range(w)] for y in range(h)] 
          
    cur_cnt = 0
    for i in range(len(data)):
        if uniq[i] == 1:
            for j in range(2):
                m[cur_cnt][j] = data[i, j]
            m[cur_cnt][2] = y_wine[i]-1
            cur_cnt += 1
    
    data = m
    return data

data = PrepareData()

Нормализация данных, отображение данных на графике, убираем одинаковые объекты в выборке:

def Prepare_Iris_Data():
    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, 2:4]  # we only take the first two features.
    y = iris.target

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

    plt.figure(2, figsize = (16, 12))
    plt.clf()

    # Plot the training points
    plt.scatter(X[:, 0], X[:, 1], c = y, cmap = plt.cm.Set1, edgecolor = 'k')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    plt.show()
    
    uniq = [1 for x in range(len(X))]
    
    cnt_of_uniq = 0
    for i in range(len(X)):
        for j in range(i+1,len(X)):
            if X[i, 0] == X[j, 0] and X[i, 1] == X[j, 1]:
                uniq[j] = 0
   
    for i in range(len(X)):
        if uniq[i] == 1:
            cnt_of_uniq += 1
            
    w, h = 3, cnt_of_uniq
    m = [[0 for x in range(w)] for y in range(h)] 
          
    cur_cnt = 0
    for i in range(150):
        if uniq[i] == 1:
            for j in range(2):
                m[cur_cnt][j] = X[i, j]
            m[cur_cnt][2] = y[i]
            cur_cnt += 1
    
    data = m
    return data
    
iris_data = Prepare_Iris_Data()

Выборка данных

Выборка данных «Ирисы Фишера»

Необходимые столпы (эталоны) для алгоритма

Необходимые столпы (эталоны) для алгоритма

Класс FRIS-STOLP:

class FRiS_STOLP:
    EPS = 1E-7

    # Найти ближайшего соседа
    def Get_Nearest_Neighbour(self, x, Omega):
        dist = [0]*len(Omega)

        for i in range(len(Omega)):
            dist[i] = Distance(x[0:len(x)-1], Omega[i][0:len(x)-1])

        min_dist = min(dist)

        for i in range(len(Omega)):
            if dist[i] + self.EPS > min_dist and dist[i] - self.EPS < min_dist:
                return Omega[i]

    # Найти эталоны
    def FindEtalon(self, X, Omega, Xl, defence_to_tolerance_ratio):
        label = Get_Class_Label_Of_Object(X[0])
        Efficiency = [0]*len(X)
        
        Tolerance_denom = 0
        for item in Xl: 
            if Get_Class_Label_Of_Object(item) != label:
                Tolerance_denom += 1
                
        if Tolerance_denom == 0:
            Tolerance_denom = 1
               
        
        NN_enemy_etalon_For_Defence = [list() for t in range(len(X))]
        
        for p in range(len(X)):
            item = X[p]
            if Get_Class_Label_Of_Object(item) == label:
                NN_enemy_etalon_For_Defence[p] = self.Get_Nearest_Neighbour(item, Omega)
        
        
        NN_enemy_etalon_For_Tolerance = [list() for t in range(len(Xl))]
        
        for p in range(len(Xl)):
            item = Xl[p]
            if Get_Class_Label_Of_Object(item) != label:
                NN_enemy_etalon_For_Tolerance[p] = self.Get_Nearest_Neighbour(item, Omega)
        
        for i in range(len(X)):
            x = X[i]
            sum_of_defence = 0
            
            for j in range(len(X)):
                item = X[j]
                if item != x and Get_Class_Label_Of_Object(item) == label:
                    S = self.FRiS(NN_enemy_etalon_For_Defence[j], item, x)
                    sum_of_defence += S

            Defence = sum_of_defence / (len(X)-1)

            Tolerance_sum = 0
            
            NN_enemy_etalon = []
            NN_enemy_etalon = [list() for t in range(len(Xl))]
          
            for j in range(len(Xl)): 
                item = Xl[j]
                if Get_Class_Label_Of_Object(item) != label:
                    S = self.FRiS(x, item, NN_enemy_etalon_For_Tolerance[j])
                    Tolerance_sum += S

            Tolerance = Tolerance_sum / Tolerance_denom

            Efficiency[i] = defence_to_tolerance_ratio * Defence + (1.0 - defence_to_tolerance_ratio) * Tolerance

        max_Efficiency = max(Efficiency)

        for i in range(len(X)):
            if Efficiency[i] + self.EPS > max_Efficiency and Efficiency[i] - self.EPS < max_Efficiency:
                return X[i]


    # Получить объекты i-го класса
    def Get_Objects_From_Ith_Class(self, X, class_label):
        cnt = 0

        for i in range(len(X)):
            obj = X[i]

            if Get_Class_Label_Of_Object(obj) == class_label:
                cnt += 1

        w, h = 4, cnt
        res = [[0 for x in range(w)] for y in range(h)] 

        cnt = 0

        for i in range(len(X)):
            obj = X[i]
            if Get_Class_Label_Of_Object(obj) == class_label:
                res[cnt] = obj
                cnt += 1

        return res

    # --------------------------------------------------------------

    # Получить "врагов" i-го класса
    def Get_Enemies_Of_Ith_Class(self, X, class_label):
        cnt = 0

        for i in range(len(X)):
            obj = X[i]
            if Get_Class_Label_Of_Object(obj) != class_label:
                cnt += 1

        w, h = 4, cnt
        res = [[0 for x in range(w)] for y in range(h)] 

        cnt = 0

        for i in range(len(X)):
            obj = X[i]
            if Get_Class_Label_Of_Object(obj) != class_label:
                res[cnt] = obj
                cnt += 1

        return res

    # --------------------------------------------------------------

    def Truncate_Set(self, X, Y):
        res = []
        for i in range(len(X)):
            is_exist = False

            for j in range(len(Y)):
                if X[i] == Y[j]:
                    is_exist = True
                    break

            if is_exist == False:
                res[len(res):]  = [X[i]]

        return res

    # --------------------------------------------------------------

    # FRIS-функция
    def FRiS(self, a, x, c):
        S_nom = math.sqrt(Distance(a[0:len(x)-1], x[0:len(x)-1])) - math.sqrt(Distance(x[0:len(x)-1], c[0:len(x)-1]))
        S_denom = math.sqrt(Distance(a[0:len(x)-1], x[0:len(x)-1])) + math.sqrt(Distance(x[0:len(x)-1], c[0:len(x)-1]))
        S = S_nom / S_denom
        return S    

    # Метод "получить множество правильно классифицированных объектов"
    def Construct_A_Set_Of_Correctly_Classified_Objects(self, Xl, Etalons):
        res = []
        
        for i in range(len(Xl)):
            x = Xl[i]
            label_of_cur_class = Get_Class_Label_Of_Object(x)

            Enemies_Etalons = copy.deepcopy(Etalons)
            del Enemies_Etalons[label_of_cur_class]
            Enemies_Etalons = self.Merge_Elements(Enemies_Etalons)
            
            nearest_neighbour_friend = self.Get_Nearest_Neighbour(x, Etalons[label_of_cur_class])
            nearest_neighbour_enemy = self.Get_Nearest_Neighbour(x, Enemies_Etalons)
            
            S = self.FRiS(nearest_neighbour_enemy, x, nearest_neighbour_friend)
            
            if S > 0:
                res[len(res):] = [x]
                
        return res
    # --------------------------------------------------------------  

    # Получить количество классов в выборке
    def Get_Number_Of_Classes_In_Dataset(self, Xl):
        len_of_obj = len(Xl[0])
        
        labels = set()
        for obj in Xl:
            cur_label = obj[len_of_obj-1]
            labels.add(cur_label)
    
        return len(labels)

    # Объекдинить элементы
    def Merge_Elements(self, lst):
        res = []
        for i in range(len(lst)):
            res += lst[i]
            
        return res

    # Классификация по одному ближайшему соседу
    def Classify_By_1NN(self, u, Etalons):
        number_of_classes = self.Get_Number_Of_Classes_In_Dataset(Etalons)
        nn = self.Get_Nearest_Neighbour(u, Etalons)
        return Get_Class_Label_Of_Object(nn)

    # Получить эталоны
    def Get_Etalons(self, Xl):
        number_of_classes = self.Get_Number_Of_Classes_In_Dataset(Xl)
        
        Etalons = [list() for i in range(number_of_classes)]

        for i in range(3):    
            Xy = self.Get_Objects_From_Ith_Class(Xl, i)
            X_enemies = self.Get_Enemies_Of_Ith_Class(Xl, i)
            
            Etalons[i][len(Etalons[i]):] = [self.FindEtalon(Xy, X_enemies, Xl, 0.5)]
            
     #   print (Etalons)
            
        for i in range(3):
            Xy = self.Get_Objects_From_Ith_Class(Xl, i)

            Enemies_Etalons = copy.deepcopy(Etalons)
          
            del Enemies_Etalons[i]
            Enemies_Etalons = self.Merge_Elements(Enemies_Etalons)
            
            Etalons[i] = []
            Etalons[i][len(Etalons[i]):] = [self.FindEtalon(Xy, Enemies_Etalons, Xl, 0.5)]
            
     
        while len(Xl) > 0:
            U = self.Construct_A_Set_Of_Correctly_Classified_Objects(Xl, Etalons)
         
            Xl = self.Truncate_Set(Xl, U)

            for i in range(3):
                Xy = self.Get_Objects_From_Ith_Class(Xl, i)

                if len(Xy) == 0: 
                    continue

                Enemies_Etalons = copy.deepcopy(Etalons)
                del Enemies_Etalons[i]
                Enemies_Etalons = self.Merge_Elements(Enemies_Etalons)
                
                if len(Xy) == 1:
                    obj = Xy[0]
                    Etalons[i][len(Etalons[i]):] = [obj]
                else:
                    Etalons[i][len(Etalons[i]):] = [self.FindEtalon(Xy, Enemies_Etalons, Xl, 0.5)]

        return Etalons

    def FRiS_Support_Classify(self, u, xl, k, draw_plot_if_two_dimentional, draw_lines_between_u_nn_and_supports):
        number_of_classes = self.Get_Number_Of_Classes_In_Dataset(xl)

        nn = [list() for t in range(number_of_classes)]

        for i in range(number_of_classes):
            Xy = self.Get_Objects_From_Ith_Class(xl, i)
            nn[i] = self.Get_Nearest_Neighbour(u, Xy)

            
        candidates_for_supporting = [list() for t in range(number_of_classes)]
        for cur_class in range(number_of_classes):
            candidates_for_supporting[cur_class] = self.Get_Objects_From_Ith_Class(xl, cur_class)
            candidates_for_supporting[cur_class].remove(nn[cur_class])
        
        supporting_list = [list() for t in range(number_of_classes)]
        
        for cur_class in range(number_of_classes):
            for it in range(k-1):
                supporting_element = fris.Get_Nearest_Neighbour(nn[cur_class], candidates_for_supporting[cur_class])
                candidates_for_supporting[cur_class].remove(supporting_element)
                supporting_list[cur_class].append(supporting_element)
        
        
        sum_of_fris = [0 for t in range(number_of_classes)]
    
        for cur_class in range(number_of_classes):
            for i in range(len(supporting_list[cur_class])):
                sum_of_fris[cur_class] += self.FRiS(supporting_list[cur_class][i], nn[cur_class], u)
                
                if draw_lines_between_u_nn_and_supports == True:
                    plt.plot([supporting_list[cur_class][i][0], nn[cur_class][0]], [supporting_list[cur_class][i][1], nn[cur_class][1]], color = 'yellow')
                    plt.plot([nn[cur_class][0], u[0]], [nn[cur_class][1], u[1]], color = 'yellow')
            
            sum_of_fris[cur_class] /= len(supporting_list[cur_class])
        
        if draw_lines_between_u_nn_and_supports == True:
            plt.scatter([u[0]], [u[1]], c = 'green', s = 25)
        
        
        label = sum_of_fris.index(max(sum_of_fris))
        
        cmap_light = matplotlib.colors.ListedColormap(['red', 'green', '#AAAAFF'])

        #---------------------------------------------------------
        if len(u)-1 == 2 and draw_plot_if_two_dimentional == True:           
            cur_col = 'green'
            
            if label == 0:
                cur_col = 'red'
            if label == 1:
                cur_col = 'blue'
            
            plt.scatter([u[0]], [u[1]], c = cur_col, s = 25)
            
            x1_coord = []
            x2_coord = []
            y = []
            for i in range(len(xl)):
                x1_coord.append(xl[i][0])
                x2_coord.append(xl[i][1])
                y.append(xl[i][2])

            plt.scatter(x1_coord, x2_coord, c = y, cmap = cmap_light, s = 25)
            
        #---------------------------------------------------------
        
        return label

Основная программа:

# Инициализация выборки
xl_0 = [[-1, 1, 0], [-1, 3, 0], [-1, 5, 0], [1, 1, 0], [1, 3, 0], [1, 5, 0], [3, 1, 0], [3, 3, 0], [3, 5, 0], [5, 1, 0], [5, 3, 0], [5, 5, 0]]
xl_1 = [[8, 3, 1], [7.8, 3, 1], [8.2, 3, 1], [8, 3.2, 1], [8, 2.8, 1], [7.9, 2.9, 1], [7.9, 3.1, 1], [8.1, 2.9, 1], [8.1, 3.1, 1]]

xl = xl_0 + xl_1

fris = FRiS_STOLP()

x_min = 5
y_min = 1

x_max = 9
y_max = 5

N = 30
M = 30

plt.figure(2, figsize = (16, 12))

dx = (x_max-x_min)/N
dy = (y_max-y_min)/M

classification_map = []

for i in range(N+1):
    x = x_min + i * dx
    for j in range(M+1):
        y = y_min + j * dy
        u = [x, y, -1]
        classification_map.append(fris.FRiS_Support_Classify(u = u, xl = xl, k = 5, draw_plot_if_two_dimentional = True, draw_lines_between_u_nn_and_supports = False))

        
#----------------------------------------------------------
u = [6.0, 3.0, -1]
fris.FRiS_Support_Classify(u = u, xl = xl, k = 5, draw_plot_if_two_dimentional = False, draw_lines_between_u_nn_and_supports = True)
plt.show()
#----------------------------------------------------------

plt.figure(2, figsize = (16, 12))

N = 30
M = 30

dx = (x_max - x_min) / (N+1)
dy = (y_max - y_min) / (M+1)
xx, yy = np.arange(x_min, x_max, dx), np.arange(y_min, y_max, dy)

xv, yv = np.meshgrid(xx, yy)

cmap_light = matplotlib.colors.ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])

c = np.empty(xv.shape, dtype = int)
for x_it in range(xv.shape[0]):
        for y_it in range(yv.shape[0]):
            c[y_it][x_it] = classification_map[x_it*yv.shape[0]+y_it]

plt.pcolormesh(xv, yv, c, cmap = cmap_light)

x1_coord = []
x2_coord = []
y = []

for i in range(len(xl)):
    x1_coord.append(xl[i][0])
    x2_coord.append(xl[i][1]) 
    y.append(xl[i][2])

plt.scatter(x1_coord, x2_coord, c = y, cmap = plt.cm.Set1, s = 25)

plt.show()

Пример работы алгоритма FRIS-STOLP

Пример работы алгоритма FRIS-STOLP

Classification map для алгоритма FRIS-STOLP

Classification map для алгоритма FRIS-STOLP

# Создание объекта FRIS-STOLP
fris = FRiS_STOLP()
# Получили эталоны для выборки по всем классам
Etalons = fris.Get_Etalons(data)

all_data = np.loadtxt(open("./wine_data.csv","r"), delimiter = ",", skiprows = 0, dtype = np.float64)

# load class labels from column 1
y_wine = all_data[:,0]
#print (y_wine)
# conversion of the class labels to integer-type array
y_wine = y_wine.astype(np.int64, copy = False)

# load the 14 features
X_wine = all_data[:,1:]

data = all_data[:, 6:8]   #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 6:9    

x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5

plt.figure(2, figsize = (8, 6))
plt.clf()

# Plot the training points
plt.scatter(data[:, 0], data[:, 1], c = y_wine, cmap = plt.cm.Set1, edgecolor = 'k')
plt.xlabel('Flavanoids')
plt.ylabel('Nonflavanoid phenols')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

x_coord = []
y_coord = []
labels = []

Etalons = fris.Merge_Elements(Etalons)

for i in range(len(Etalons)):
    x_coord.append(Etalons[i][0])
    
for i in range(len(Etalons)):
    y_coord.append(Etalons[i][1])

#plt.plot(iris_data[119][0], iris_data[119][1], 'ro', color = 'g')
plt.plot(x_coord, y_coord, 'ro', color = 'b')

plt.show()

dcda1407908d52135980e514bcd3cbb7.png

Создание classification map:

def Create_Classification_Map(x_min, x_max, N, y_min, y_max, M, Etalons):
    dx = (x_max - x_min)/N
    dy = (y_max - y_min)/M
    
    obj_list = [list() for t in range(N*M)]
    
    for i in range(N):
        x = x_min + i * dx
        for j in range(M):
            y = y_min + j * dy
            
            u = [x, y, -1]
            label = fris_for_iris.Classify_By_1NN(u, Etalons)
              
            obj = [x, y, label]    
            obj_list[i*M+j] = obj
            
    return obj_list


fris_for_iris = FRiS_STOLP()
Etalons_iris = fris_for_iris.Get_Etalons(iris_data)

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, 2:4]  # we only take the first two features.
y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize = (10, 8))
plt.clf()



# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c = y, edgecolor = 'k', alpha = 0.5)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

plt.axis

x_coord = []
y_coord = []
labels = []

Etalons_iris = fris_for_iris.Merge_Elements(Etalons_iris)

classification_map = Create_Classification_Map(x_min, x_max, 100, y_min, y_max, 100, Etalons_iris)

x_coords_for_classification_map = []
y_coords_for_classification_map = []
c_for_classification_map = []

for i in range(len(classification_map)):
    obj = classification_map[i]
    label = Get_Class_Label_Of_Object(obj)
        
    cur_col = "white"
    if label == 0:
        cur_col = 'red'
    if label == 1:
        cur_col = 'green'
    if label == 2:
        cur_col = 'yellow'
    
    x_coords_for_classification_map.append(classification_map[i][0])
    y_coords_for_classification_map.append(classification_map[i][1])
    c_for_classification_map.append(label)
    
    

N = 100
M = 100


dx = (x_max - x_min) / N
dy = (y_max - y_min) / M
xx, yy = np.arange(x_min, x_max, dx), np.arange(y_min, y_max, dy)

xv, yv = np.meshgrid(xx, yy)


cmap_light = matplotlib.colors.ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])

c = np.empty(xv.shape, dtype = int)
for x_it in range(xv.shape[0]):
        for y_it in range(yv.shape[0]):
            c[x_it][y_it] = c_for_classification_map[y_it*xv.shape[0]+x_it]

plt.pcolormesh(xv, yv, c, cmap = cmap_light)


for i in range(len(X)):
    obj = X[i]
    label = y[i]
        
    cur_col = "white"
    if label == 0:
        cur_col = 'red'
    if label == 1:
        cur_col = 'green'
    if label == 2:
        cur_col = 'yellow'
    
    plt.plot([obj[0]], [obj[1]], 'ro', color = cur_col, alpha = 1.0)    


for i in range(len(Etalons_iris)):
    x_coord.append(Etalons_iris[i][0])
    
for i in range(len(Etalons_iris)):
    y_coord.append(Etalons_iris[i][1])

    
for i in range(len(Etalons_iris)):
    shape = 's'
    obj = Etalons_iris[i]
    label = Get_Class_Label_Of_Object(obj)
    
    cur_col = "white"
    
    if label == 0:
        cur_col = "red"
    if label == 1:
        cur_col = "green"
    if label == 2:
        cur_col = "yellow"
    
    plt.plot([x_coord[i]], [y_coord[i]], "ro", color = cur_col, markersize = 10, linestyle = '-', linewidth = 5, markeredgecolor = 'black')

plt.show()

Результат работы алгоритма FRIS-STOLP для выборки

Результат работы алгоритма FRIS-STOLP для выборки «Ирисы Фишера» (classification map)

© Habrahabr.ru