Brute force search clustering

Brute force search clustering#

Mahmood Amintoosi, Fall 2024

Computer Science Dept, Ferdowsi University of Mashhad

import numpy as np
import math

جدول درستی رو قبلا دیدیم:

def SAT_Table(x, i):
    if i == x.shape[0]:
        print(x)
    else:
        for x[i] in range(2):
            SAT_Table(x, i + 1)

N = 3
x = np.zeros(N)
SAT_Table(x, i=0)

[0. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 1.]
[1. 0. 0.]
[1. 0. 1.]
[1. 1. 0.]
[1. 1. 1.]

همون رو اصلاح می کنیم

def all_clustering(x, i):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        print(s)
    else:
        for x[i] in range(2):
            all_clustering(x, i + 1)

یک تابع می‌نویسیم که هر سطر جدول درستی رو به یک روش خوشه‌بندی تبدیل کنه

def convert_bin_to_cluster(x):
    zeros = []
    ones = []
    for index, value in enumerate(x):
        if value == 0:
            zeros.append(index)
        else:
            ones.append(index)
    s = [zeros, ones]
    return s

N = 3
x = np.zeros(N)
all_clustering(x, i=0)

[[0, 1, 2], []]
[[0, 1], [2]]
[[0, 2], [1]]
[[0], [1, 2]]
[[1, 2], [0]]
[[1], [0, 2]]
[[2], [0, 1]]
[[], [0, 1, 2]]

روشهای دیگر نوشتن تابع

convert_bin_to_cluster(x)

x = np.array([0, 1, 0, 1, 0, 1, 0, 0, 1])

zeros = np.nonzero(x)[0]
ones = np.nonzero(np.bitwise_xor(x, 1))[0]

print("Zero indices:", zeros)
print("One indices:", ones)

Zero indices: [1 3 5 8]
One indices: [0 2 4 6 7]

x = [0, 1, 0, 1, 0, 1, 0, 0, 1]

zeros = [i for i, item in enumerate(x) if item == 0]
ones = [i for i, item in enumerate(x) if item == 1]

print("Zero indices:", zeros)
print("One indices:", ones)

Zero indices: [0, 2, 4, 6, 7]
One indices: [1, 3, 5, 8]

اگه داده‌ها مثلا به صورت زیر باشه چکار کنیم؟

D = [10, 20, 23]

def all_clustering(x, i, D):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        clust_0 = [D[i] for i in s[0]]
        clust_1 = [D[i] for i in s[1]]
        print(clust_0, clust_1)
    else:
        for x[i] in range(2):
            all_clustering(x, i + 1, D)

N = 3
x = np.zeros(N)
D = [10, 20, 23]
all_clustering(x, i=0, D=D)

[10, 20, 23] []
[10, 20] [23]
[10, 23] [20]
[10] [20, 23]
[20, 23] [10]
[20] [10, 23]
[23] [10, 20]
[] [10, 20, 23]

آیا روی داده‌هایی که چند مولفه هم دارند کار می‌کند؟

N = 3
x = np.zeros(N)
D = [[10, 11], [20, 21], 23]
all_clustering(x, i=0, D=D)

[[10, 11], [20, 21], 23] []
[[10, 11], [20, 21]] [23]
[[10, 11], 23] [[20, 21]]
[[10, 11]] [[20, 21], 23]
[[20, 21], 23] [[10, 11]]
[[20, 21]] [[10, 11], 23]
[23] [[10, 11], [20, 21]]
[] [[10, 11], [20, 21], 23]

روی ماتریس چطور؟

N = 3
x = np.zeros(N)
D = np.array([[10, 11], [20, 21], [23, 24]])
print(D)
all_clustering(x, i=0, D=D)

[[10 11]
 [20 21]
 [23 24]]
[array([10, 11]), array([20, 21]), array([23, 24])] []
[array([10, 11]), array([20, 21])] [array([23, 24])]
[array([10, 11]), array([23, 24])] [array([20, 21])]
[array([10, 11])] [array([20, 21]), array([23, 24])]
[array([20, 21]), array([23, 24])] [array([10, 11])]
[array([20, 21])] [array([10, 11]), array([23, 24])]
[array([23, 24])] [array([10, 11]), array([20, 21])]
[] [array([10, 11]), array([20, 21]), array([23, 24])]

اما نتیجه‌ها فقط چاپ شده‌اند و نداریمشون!

داده‌ها رو هم ارسال کرده‌ایم!

def all_clustering(x, i):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        yield s
    else:
        for x[i] in range(2):
            yield from all_clustering(x, i + 1)

N = 3
x = np.zeros(N)
D = [10, 20 ,23]
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    print(clust_0, clust_1)

[10, 20, 23] []
[10, 20] [23]
[10, 23] [20]
[10] [20, 23]
[20, 23] [10]
[20] [10, 23]
[23] [10, 20]
[] [10, 20, 23]

چگونه فاصله بین هر دو زوج از عناصر یک دسته را پیدا کنیم؟

def compute_distances(D):
    distances = []
    n = len(D)
    for i in range(n):
        for j in range(i + 1, n):
            distance = math.sqrt((D[i] - D[j]) ** 2)
            distances.append(distance)
    distances = np.array(distances)
    return distances

D = [10, 11, 20]
distances = compute_distances(D)
print(distances, type(distances))

[ 1. 10.  9.] <class 'numpy.ndarray'>

یک تابع دیگه بنویسیم که مجموع مربعات فاصله‌ها رو برگردونه

def SSE_D(D):
    '''
    Sum of Squared Euclidean Distances
    '''
    distances = compute_distances(D)
    SSE = sum(distances**2)
    return SSE

D = [10, 11, 20]
SSE = SSE_D(D)
print(SSE)

182.0

برای انواع دسته‌بندی فاصله‌ها رو پیدا و چاپ کنیم

N = 3
x = np.zeros(N)
D = [10, 20 ,23]
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    SSE_0 = SSE_D(clust_0)
    SSE_1 = SSE_D(clust_1)
    print(clust_0, SSE_0, clust_1, SSE_1)

[10, 20, 23] 278.0 [] 0
[10, 20] 100.0 [23] 0
[10, 23] 169.0 [20] 0
[10] 0 [20, 23] 9.0
[20, 23] 9.0 [10] 0
[20] 0 [10, 23] 169.0
[23] 0 [10, 20] 100.0
[] 0 [10, 20, 23] 278.0

حالا میتونیم خوشه‌بندی بهینه رو پیدا کنیم :)

N = 3
x = np.zeros(N)
D = [10, 20, 23]
min_SSE = np.Inf
best_clustering = []
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    SSE_0 = SSE_D(clust_0)
    SSE_1 = SSE_D(clust_1)
    # Sum of Within Cluster Distances, 14.28 of ESL
    SSE = SSE_0 + SSE_1 
    if SSE < min_SSE:
        min_SSE = SSE
        best_clustering = [clust_0, clust_1]
print(best_clustering, min_SSE)

[[10], [20, 23]] 9.0

هدف در خوشه‌بندی می‌تواند کمینه کردن مجموع فواصل درون خوشه‌ای باشد:

Sum of Squared Euclidean Distances
Within-Cluster Sum of Squares (WCSS), i.e. variance: Wiki, Medium
Sum of Within Cluster Distances, Sec. 14.3.5 of ESL
Sum of Squares Within (SSW)
Sum of Squares Error (SSE)

تبدیل به تابعش کنیم:

def BF_clustering(D):
    N = len(D)
    x = np.zeros(N)
    min_SSE = np.Inf
    best_clustering = []
    for s in all_clustering(x, i=0):
        clust_0 = [D[i] for i in s[0]]
        clust_1 = [D[i] for i in s[1]]
        SSE_0 = SSE_D(clust_0)
        SSE_1 = SSE_D(clust_1)
        SSE = SSE_0 + SSE_1
        if SSE < min_SSE:
            min_SSE = SSE
            best_clustering = [clust_0, clust_1]
    return best_clustering, min_SSE

D = [10, 20, 23]
best_clustering, min_SSE = BF_clustering(D)
for sub_set in best_clustering:
    print(set(sub_set))

{10}
{20, 23}

روشهای دیگر برای محاسبه فواصل همه زوج عناصر یک دسته

import itertools
D = [10, 20, 23]
n_clusters = 2
combinations = itertools.combinations(D, n_clusters)
for x in combinations:
    print(x)

(10, 20)
(10, 23)
(20, 23)

D = [10, 20, 23]
distances = [math.sqrt((a - b) ** 2) for a, b in itertools.combinations(D, 2)]
print(distances)

[10.0, 13.0, 3.0]

Using SciPy#

from scipy.spatial import distance_matrix

D = [10, 20, 23]
D = np.array(D)
print(D, type(D), D.shape)

D = D[:, np.newaxis]
print(D, type(D), D.shape)

# Compute pairwise distances
distances = distance_matrix(D, D)
print(distances)

[10 20 23] <class 'numpy.ndarray'> (3,)
[[10]
 [20]
 [23]] <class 'numpy.ndarray'> (3, 1)
[[ 0. 10. 13.]
 [10.  0.  3.]
 [13.  3.  0.]]

آیا تابع قبلی ما برای ماتریس‌ها هم کار خواهد کرد؟

D = np.array([[10, 11], [20, 21], [23, 24]])

best_clustering, min_SSE = BF_clustering(D)
for sub_set in best_clustering:
    print(set(sub_set))

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line <cell line: 3>()
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=0'>1</a> D = np.array([[10, 11], [20, 21], [23, 24]])
----> <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=2'>3</a> best_clustering, min_SSE = BF_clustering(D)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=3'>4</a> for sub_set in best_clustering:
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=4'>5</a>     print(set(sub_set))

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line BF_clustering(D)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=6'>7</a> clust_0 = [D[i] for i in s[0]]
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=7'>8</a> clust_1 = [D[i] for i in s[1]]
----> <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=8'>9</a> SSE_0 = SSE_D(clust_0)
     <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=9'>10</a> SSE_1 = SSE_D(clust_1)
     <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=10'>11</a> SSE = SSE_0 + SSE_1

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line SSE_D(D)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=0'>1</a> def SSE_D(D):
----> <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=1'>2</a>     distances = compute_distances(D)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=2'>3</a>     SSE = sum(distances**2)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=3'>4</a>     return SSE

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line compute_distances(D)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=3'>4</a> for i in range(n):
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=4'>5</a>     for j in range(i + 1, n):
----> <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=5'>6</a>         distance = math.sqrt((D[i] - D[j]) ** 2)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=6'>7</a>         distances.append(distance)
      <a href='vscode-notebook-cell:/c%3A/temp/git/fum-cs/fds/code/BF-clustering.ipynb#X60sZmlsZQ%3D%3D?line=7'>8</a> distances = np.array(distances)

TypeError: only length-1 arrays can be converted to Python scalars

math.sqrt((D[0] - D[1]) ** 2)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
c:\Users\hp\OneDrive\FUM\Teaching\Alg-for-DS\code\clustering\BF-clustering.ipynb Cell 44 line <cell line: 1>()
----> <a href='vscode-notebook-cell:/c%3A/Users/hp/OneDrive/FUM/Teaching/Alg-for-DS/code/clustering/BF-clustering.ipynb#X63sZmlsZQ%3D%3D?line=0'>1</a> math.sqrt((D[0] - D[1]) ** 2)

TypeError: only length-1 arrays can be converted to Python scalars

print(D[0], D[1], D[0]-D[1])
print((D[0]-D[1])**2, np.sum((D[0]-D[1])**2))
print(np.sum((D[0]-D[1])**2)**0.5)

[10 11] [20 21] [-10 -10]
[100 100] 200
14.142135623730951

np.linalg.norm(D[0] - D[1])

14.142135623730951

Brute force search clustering

Contents

Brute force search clustering#

Using SciPy#