Brute force search clustering#
Mahmood Amintoosi, Fall 2024
Computer Science Dept, Ferdowsi University of Mashhad
import numpy as np
import math
We have already seen the truth table:
def SAT_Table(x, i):
    if i == x.shape[0]:
        print(x)
    else:
        for x[i] in range(2):
            SAT_Table(x, i + 1)
N = 3
x = np.zeros(N)
SAT_Table(x, i=0)
[0. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 1.]
[1. 0. 0.]
[1. 0. 1.]
[1. 1. 0.]
[1. 1. 1.]
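As an aside, the same table can be produced without recursion; a minimal sketch using itertools.product (not part of the original notebook):

import itertools

# every length-3 tuple over {0, 1}, in the same order as the recursive version
for row in itertools.product(range(2), repeat=3):
    print(row)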
We adapt that same recursive function:
def all_clustering(x, i):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        print(s)
    else:
        for x[i] in range(2):
            all_clustering(x, i + 1)
We write a function that converts each row of the truth table into a clustering:
def convert_bin_to_cluster(x):
    zeros = []
    ones = []
    for index, value in enumerate(x):
        if value == 0:
            zeros.append(index)
        else:
            ones.append(index)
    s = [zeros, ones]
    return s
N = 3
x = np.zeros(N)
all_clustering(x, i=0)
[[0, 1, 2], []]
[[0, 1], [2]]
[[0, 2], [1]]
[[0], [1, 2]]
[[1, 2], [0]]
[[1], [0, 2]]
[[2], [0, 1]]
[[], [0, 1, 2]]
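Note that every 2-way split appears twice in this output, once per labeling of the two clusters (e.g. [[0, 1], [2]] and [[2], [0, 1]] are the same split). If the labels are treated as interchangeable, a small sketch (not in the original notebook) that enumerates each split only once is to pin element 0 to the first cluster:

def all_clustering_unique(x, i):
    # hypothetical variant: element 0 always stays in cluster 0
    if i == x.shape[0]:
        print(convert_bin_to_cluster(x))
    else:
        for x[i] in ([0] if i == 0 else range(2)):
            all_clustering_unique(x, i + 1)

x = np.zeros(3)
all_clustering_unique(x, i=0)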
Other ways of writing the convert_bin_to_cluster(x) function
x = np.array([0, 1, 0, 1, 0, 1, 0, 0, 1])
ones = np.nonzero(x)[0]                      # indices of the nonzero (one) entries
zeros = np.nonzero(np.bitwise_xor(x, 1))[0]  # flipping the bits gives the zero entries
print("Zero indices:", zeros)
print("One indices:", ones)
Zero indices: [0 2 4 6 7]
One indices: [1 3 5 8]
x = [0, 1, 0, 1, 0, 1, 0, 0, 1]
zeros = [i for i, item in enumerate(x) if item == 0]
ones = [i for i, item in enumerate(x) if item == 1]
print("Zero indices:", zeros)
print("One indices:", ones)
Zero indices: [0, 2, 4, 6, 7]
One indices: [1, 3, 5, 8]
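A third variant (a sketch, not in the original notebook) uses np.where, which returns the same index arrays:

x = np.array([0, 1, 0, 1, 0, 1, 0, 0, 1])
zeros = np.where(x == 0)[0]   # indices of the zero entries
ones = np.where(x == 1)[0]    # indices of the one entries
print("Zero indices:", zeros)
print("One indices:", ones)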
What should we do if, for example, the data looks like this?
D = [10, 20, 23]
def all_clustering(x, i, D):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        clust_0 = [D[i] for i in s[0]]
        clust_1 = [D[i] for i in s[1]]
        print(clust_0, clust_1)
    else:
        for x[i] in range(2):
            all_clustering(x, i + 1, D)
N = 3
x = np.zeros(N)
D = [10, 20, 23]
all_clustering(x, i=0, D=D)
[10, 20, 23] []
[10, 20] [23]
[10, 23] [20]
[10] [20, 23]
[20, 23] [10]
[20] [10, 23]
[23] [10, 20]
[] [10, 20, 23]
Does it also work on data whose points have several components?
N = 3
x = np.zeros(N)
D = [[10, 11], [20, 21], 23]
all_clustering(x, i=0, D=D)
[[10, 11], [20, 21], 23] []
[[10, 11], [20, 21]] [23]
[[10, 11], 23] [[20, 21]]
[[10, 11]] [[20, 21], 23]
[[20, 21], 23] [[10, 11]]
[[20, 21]] [[10, 11], 23]
[23] [[10, 11], [20, 21]]
[] [[10, 11], [20, 21], 23]
What about a matrix?
N = 3
x = np.zeros(N)
D = np.array([[10, 11], [20, 21], [23, 24]])
print(D)
all_clustering(x, i=0, D=D)
[[10 11]
[20 21]
[23 24]]
[array([10, 11]), array([20, 21]), array([23, 24])] []
[array([10, 11]), array([20, 21])] [array([23, 24])]
[array([10, 11]), array([23, 24])] [array([20, 21])]
[array([10, 11])] [array([20, 21]), array([23, 24])]
[array([20, 21]), array([23, 24])] [array([10, 11])]
[array([20, 21])] [array([10, 11]), array([23, 24])]
[array([23, 24])] [array([10, 11]), array([20, 21])]
[] [array([10, 11]), array([20, 21]), array([23, 24])]
But the results have only been printed, so we do not actually keep them! We have also been passing the data into the enumeration function. Turning all_clustering into a generator fixes both issues:
def all_clustering(x, i):
    if i == x.shape[0]:
        s = convert_bin_to_cluster(x)
        yield s
    else:
        for x[i] in range(2):
            yield from all_clustering(x, i + 1)
N = 3
x = np.zeros(N)
D = [10, 20, 23]
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    print(clust_0, clust_1)
[10, 20, 23] []
[10, 20] [23]
[10, 23] [20]
[10] [20, 23]
[20, 23] [10]
[20] [10, 23]
[23] [10, 20]
[] [10, 20, 23]
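Because all_clustering is now a generator, the clusterings no longer have to be consumed inside a loop; they can also be collected into a list (a small usage sketch):

x = np.zeros(3)
clusterings = list(all_clustering(x, i=0))
print(len(clusterings))   # 8 labelings for N = 3
print(clusterings[1])     # [[0, 1], [2]]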
How do we find the distance between every pair of elements in a cluster?
def compute_distances(D):
    distances = []
    n = len(D)
    for i in range(n):
        for j in range(i + 1, n):
            distance = math.sqrt((D[i] - D[j]) ** 2)
            distances.append(distance)
    distances = np.array(distances)
    return distances
D = [10, 11, 20]
distances = compute_distances(D)
print(distances, type(distances))
[ 1. 10. 9.] <class 'numpy.ndarray'>
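For scalar points, sqrt((a - b)**2) is simply the absolute difference, so the same distances can be written more directly (a sketch):

D = [10, 11, 20]
distances = [abs(D[i] - D[j]) for i in range(len(D)) for j in range(i + 1, len(D))]
print(distances)   # [1, 10, 9]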
Let's write another function that returns the sum of the squared distances:
def SSE_D(D):
    '''
    Sum of Squared Euclidean Distances
    '''
    distances = compute_distances(D)
    SSE = sum(distances**2)
    return SSE
D = [10, 11, 20]
SSE = SSE_D(D)
print(SSE)
182.0
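As a quick check, the pairwise distances of [10, 11, 20] are 1, 10 and 9, so SSE = 1^2 + 10^2 + 9^2 = 1 + 100 + 81 = 182.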
Let's compute and print the distances for every possible clustering:
N = 3
x = np.zeros(N)
D = [10, 20, 23]
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    SSE_0 = SSE_D(clust_0)
    SSE_1 = SSE_D(clust_1)
    print(clust_0, SSE_0, clust_1, SSE_1)
[10, 20, 23] 278.0 [] 0
[10, 20] 100.0 [23] 0
[10, 23] 169.0 [20] 0
[10] 0 [20, 23] 9.0
[20, 23] 9.0 [10] 0
[20] 0 [10, 23] 169.0
[23] 0 [10, 20] 100.0
[] 0 [10, 20, 23] 278.0
Now we can find the optimal clustering :)
N = 3
x = np.zeros(N)
D = [10, 20, 23]
min_SSE = np.inf
best_clustering = []
for s in all_clustering(x, i=0):
    clust_0 = [D[i] for i in s[0]]
    clust_1 = [D[i] for i in s[1]]
    SSE_0 = SSE_D(clust_0)
    SSE_1 = SSE_D(clust_1)
    # Sum of Within Cluster Distances, 14.28 of ESL
    SSE = SSE_0 + SSE_1
    if SSE < min_SSE:
        min_SSE = SSE
        best_clustering = [clust_0, clust_1]
print(best_clustering, min_SSE)
[[10], [20, 23]] 9.0
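This exhaustive search evaluates all 2^N binary labelings (2^3 = 8 of them here), which is why brute force is only practical for very small N.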
The objective in clustering can be to minimize the sum of within-cluster distances:

- Sum of Squared Euclidean Distances
- Within-Cluster Sum of Squares (WCSS), i.e. variance: Wiki, Medium
- Sum of Within Cluster Distances, Sec. 14.3.5 of ESL
Let's turn it into a function:
def BF_clustering(D):
    N = len(D)
    x = np.zeros(N)
    min_SSE = np.inf
    best_clustering = []
    for s in all_clustering(x, i=0):
        clust_0 = [D[i] for i in s[0]]
        clust_1 = [D[i] for i in s[1]]
        SSE_0 = SSE_D(clust_0)
        SSE_1 = SSE_D(clust_1)
        SSE = SSE_0 + SSE_1
        if SSE < min_SSE:
            min_SSE = SSE
            best_clustering = [clust_0, clust_1]
    return best_clustering, min_SSE
D = [10, 20, 23]
best_clustering, min_SSE = BF_clustering(D)
for sub_set in best_clustering:
    print(set(sub_set))
{10}
{20, 23}
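The same brute-force idea could, in principle, be extended beyond two clusters by letting each x[i] range over k labels instead of 2. A hedged sketch (the name all_k_clustering and its interface are not from the original notebook):

def all_k_clustering(x, i, k):
    # enumerate every assignment of the N points to k labeled clusters
    if i == x.shape[0]:
        clusters = [[] for _ in range(k)]
        for index, label in enumerate(x):
            clusters[int(label)].append(index)
        yield clusters
    else:
        for x[i] in range(k):
            yield from all_k_clustering(x, i + 1, k)

x = np.zeros(3)
print(sum(1 for _ in all_k_clustering(x, i=0, k=3)))   # 3**3 = 27 labelings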
Other ways to compute the distances between all pairs of elements in a cluster
import itertools
D = [10, 20, 23]
n_clusters = 2
combinations = itertools.combinations(D, n_clusters)
for x in combinations:
    print(x)
(10, 20)
(10, 23)
(20, 23)
D = [10, 20, 23]
distances = [math.sqrt((a - b) ** 2) for a, b in itertools.combinations(D, 2)]
print(distances)
[10.0, 13.0, 3.0]
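The same idea gives a one-line alternative to SSE_D (a sketch):

D = [10, 20, 23]
SSE = sum((a - b) ** 2 for a, b in itertools.combinations(D, 2))
print(SSE)   # 100 + 169 + 9 = 278, matching the 278.0 found above for the whole set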
Using SciPy#
from scipy.spatial import distance_matrix
D = [10, 20, 23]
D = np.array(D)
print(D, type(D), D.shape)
D = D[:, np.newaxis]
print(D, type(D), D.shape)
# Compute pairwise distances
distances = distance_matrix(D, D)
print(distances)
[10 20 23] <class 'numpy.ndarray'> (3,)
[[10]
[20]
[23]] <class 'numpy.ndarray'> (3, 1)
[[ 0. 10. 13.]
[10. 0. 3.]
[13. 3. 0.]]
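SciPy also offers a condensed form: scipy.spatial.distance.pdist returns only the upper-triangle distances, which is exactly what compute_distances produces (a short sketch):

from scipy.spatial.distance import pdist

D = np.array([10, 20, 23])[:, np.newaxis]   # pdist expects a 2-D array of points
print(pdist(D))   # [10. 13.  3.]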
Will our previous function also work for matrices?
D = np.array([[10, 11], [20, 21], [23, 24]])
best_clustering, min_SSE = BF_clustering(D)
for sub_set in best_clustering:
    print(set(sub_set))
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line <cell line: 3>()
      1 D = np.array([[10, 11], [20, 21], [23, 24]])
----> 3 best_clustering, min_SSE = BF_clustering(D)
      4 for sub_set in best_clustering:
      5     print(set(sub_set))

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line BF_clustering(D)
      7 clust_0 = [D[i] for i in s[0]]
      8 clust_1 = [D[i] for i in s[1]]
----> 9 SSE_0 = SSE_D(clust_0)
     10 SSE_1 = SSE_D(clust_1)
     11 SSE = SSE_0 + SSE_1

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line SSE_D(D)
      1 def SSE_D(D):
----> 2     distances = compute_distances(D)
      3     SSE = sum(distances**2)
      4     return SSE

c:\temp\git\fum-cs\fds\code\BF-clustering.ipynb Cell 43 line compute_distances(D)
      4 for i in range(n):
      5     for j in range(i + 1, n):
----> 6         distance = math.sqrt((D[i] - D[j]) ** 2)
      7         distances.append(distance)
      8 distances = np.array(distances)

TypeError: only length-1 arrays can be converted to Python scalars
math.sqrt((D[0] - D[1]) ** 2)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
c:\Users\hp\OneDrive\FUM\Teaching\Alg-for-DS\code\clustering\BF-clustering.ipynb Cell 44 line <cell line: 1>()
----> 1 math.sqrt((D[0] - D[1]) ** 2)

TypeError: only length-1 arrays can be converted to Python scalars
print(D[0], D[1], D[0]-D[1])
print((D[0]-D[1])**2, np.sum((D[0]-D[1])**2))
print(np.sum((D[0]-D[1])**2)**0.5)
[10 11] [20 21] [-10 -10]
[100 100] 200
14.142135623730951
np.linalg.norm(D[0] - D[1])
14.142135623730951
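Based on this, one possible fix (a sketch, not the notebook's final version) is to replace the scalar formula in compute_distances with np.linalg.norm, so that it works for points with any number of components:

def compute_distances_nd(D):
    # hypothetical multi-dimensional variant of compute_distances
    distances = []
    n = len(D)
    for i in range(n):
        for j in range(i + 1, n):
            # the Euclidean norm handles both scalars and vectors
            distances.append(np.linalg.norm(np.asarray(D[i]) - np.asarray(D[j])))
    return np.array(distances)

D = np.array([[10, 11], [20, 21], [23, 24]])
print(compute_distances_nd(D))   # approximately [14.14 18.38  4.24]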