{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![](img/banner.png)\n", "%%HTML\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Brute force search clustering\n", "\n", "**Mahmood Amintoosi, Fall 2024**\n", "\n", "Computer Science Dept, Ferdowsi University of Mashhad" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import math" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "جدول درستی رو قبلا دیدیم:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def SAT_Table(x, i):\n", " if i == x.shape[0]:\n", " print(x)\n", " else:\n", " for x[i] in range(2):\n", " SAT_Table(x, i + 1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0. 0. 0.]\n", "[0. 0. 1.]\n", "[0. 1. 0.]\n", "[0. 1. 1.]\n", "[1. 0. 0.]\n", "[1. 0. 1.]\n", "[1. 1. 0.]\n", "[1. 1. 1.]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "SAT_Table(x, i=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "همون رو اصلاح می کنیم" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def all_clustering(x, i):\n", " if i == x.shape[0]:\n", " s = convert_bin_to_cluster(x)\n", " print(s)\n", " else:\n", " for x[i] in range(2):\n", " all_clustering(x, i + 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "یک تابع می‌نویسیم که هر سطر جدول درستی رو\n", " به یک روش خوشه‌بندی تبدیل کنه" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def convert_bin_to_cluster(x):\n", " zeros = []\n", " ones = []\n", " for index, value in enumerate(x):\n", " if value == 0:\n", " zeros.append(index)\n", " else:\n", " ones.append(index)\n", " s = [zeros, ones]\n", " return s" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0, 1, 2], []]\n", "[[0, 1], [2]]\n", "[[0, 2], [1]]\n", "[[0], [1, 2]]\n", "[[1, 2], [0]]\n", "[[1], [0, 2]]\n", "[[2], [0, 1]]\n", "[[], [0, 1, 2]]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "all_clustering(x, i=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "روشهای دیگر نوشتن تابع\n", "\n", " convert_bin_to_cluster(x)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Zero indices: [1 3 5 8]\n", "One indices: [0 2 4 6 7]\n" ] } ], "source": [ "x = np.array([0, 1, 0, 1, 0, 1, 0, 0, 1])\n", "\n", "zeros = np.nonzero(x)[0]\n", "ones = np.nonzero(np.bitwise_xor(x, 1))[0]\n", "\n", "print(\"Zero indices:\", zeros)\n", "print(\"One indices:\", ones)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Zero indices: [0, 2, 4, 6, 7]\n", "One indices: [1, 3, 5, 8]\n" ] } ], "source": [ "x = [0, 1, 0, 1, 0, 1, 0, 0, 1]\n", "\n", "zeros = [i for i, item in enumerate(x) if item == 0]\n", "ones = [i for i, item in enumerate(x) if item == 1]\n", "\n", "print(\"Zero indices:\", zeros)\n", "print(\"One indices:\", ones)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "اگه داده‌ها مثلا به صورت زیر باشه چکار کنیم؟\n", "\n", "D = [10, 20, 23]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def all_clustering(x, i, D):\n", " if i == x.shape[0]:\n", " s = convert_bin_to_cluster(x)\n", " clust_0 = [D[i] for i in s[0]]\n", " clust_1 = [D[i] for i in s[1]]\n", " print(clust_0, clust_1)\n", " else:\n", " for x[i] in range(2):\n", " all_clustering(x, i + 1, D)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10, 20, 23] []\n", "[10, 20] [23]\n", "[10, 23] [20]\n", "[10] [20, 23]\n", "[20, 23] [10]\n", "[20] [10, 23]\n", "[23] [10, 20]\n", "[] [10, 20, 23]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = [10, 20, 23]\n", "all_clustering(x, i=0, D=D)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "آیا روی داده‌هایی که چند مولفه هم دارند کار می‌کند؟" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[10, 11], [20, 21], 23] []\n", "[[10, 11], [20, 21]] [23]\n", "[[10, 11], 23] [[20, 21]]\n", "[[10, 11]] [[20, 21], 23]\n", "[[20, 21], 23] [[10, 11]]\n", "[[20, 21]] [[10, 11], 23]\n", "[23] [[10, 11], [20, 21]]\n", "[] [[10, 11], [20, 21], 23]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = [[10, 11], [20, 21], 23]\n", "all_clustering(x, i=0, D=D)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "روی ماتریس چطور؟" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[10 11]\n", " [20 21]\n", " [23 24]]\n", "[array([10, 11]), array([20, 21]), array([23, 24])] []\n", "[array([10, 11]), array([20, 21])] [array([23, 24])]\n", "[array([10, 11]), array([23, 24])] [array([20, 21])]\n", "[array([10, 11])] [array([20, 21]), array([23, 24])]\n", "[array([20, 21]), array([23, 24])] [array([10, 11])]\n", "[array([20, 21])] [array([10, 11]), array([23, 24])]\n", "[array([23, 24])] [array([10, 11]), array([20, 21])]\n", "[] [array([10, 11]), array([20, 21]), array([23, 24])]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = np.array([[10, 11], [20, 21], [23, 24]])\n", "print(D)\n", "all_clustering(x, i=0, D=D)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "اما نتیجه‌ها فقط چاپ شده‌اند و نداریمشون!\n", "\n", "داده‌ها رو هم ارسال کرده‌ایم!" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def all_clustering(x, i):\n", " if i == x.shape[0]:\n", " s = convert_bin_to_cluster(x)\n", " yield s\n", " else:\n", " for x[i] in range(2):\n", " yield from all_clustering(x, i + 1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10, 20, 23] []\n", "[10, 20] [23]\n", "[10, 23] [20]\n", "[10] [20, 23]\n", "[20, 23] [10]\n", "[20] [10, 23]\n", "[23] [10, 20]\n", "[] [10, 20, 23]\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = [10, 20 ,23]\n", "for s in all_clustering(x, i=0):\n", " clust_0 = [D[i] for i in s[0]]\n", " clust_1 = [D[i] for i in s[1]]\n", " print(clust_0, clust_1)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "چگونه فاصله بین هر دو زوج از عناصر یک دسته را پیدا کنیم؟" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def compute_distances(D):\n", " distances = []\n", " n = len(D)\n", " for i in range(n):\n", " for j in range(i + 1, n):\n", " distance = math.sqrt((D[i] - D[j]) ** 2)\n", " distances.append(distance)\n", " distances = np.array(distances)\n", " return distances" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 1. 10. 9.] \n" ] } ], "source": [ "D = [10, 11, 20]\n", "distances = compute_distances(D)\n", "print(distances, type(distances))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "یک تابع دیگه بنویسیم که مجموع مربعات فاصله‌ها رو برگردونه" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def SSE_D(D):\n", " '''\n", " Sum of Squared Euclidean Distances\n", " '''\n", " distances = compute_distances(D)\n", " SSE = sum(distances**2)\n", " return SSE" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "182.0\n" ] } ], "source": [ "D = [10, 11, 20]\n", "SSE = SSE_D(D)\n", "print(SSE)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "برای انواع دسته‌بندی فاصله‌ها رو پیدا و چاپ کنیم" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10, 20, 23] 278.0 [] 0\n", "[10, 20] 100.0 [23] 0\n", "[10, 23] 169.0 [20] 0\n", "[10] 0 [20, 23] 9.0\n", "[20, 23] 9.0 [10] 0\n", "[20] 0 [10, 23] 169.0\n", "[23] 0 [10, 20] 100.0\n", "[] 0 [10, 20, 23] 278.0\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = [10, 20 ,23]\n", "for s in all_clustering(x, i=0):\n", " clust_0 = [D[i] for i in s[0]]\n", " clust_1 = [D[i] for i in s[1]]\n", " SSE_0 = SSE_D(clust_0)\n", " SSE_1 = SSE_D(clust_1)\n", " print(clust_0, SSE_0, clust_1, SSE_1)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "حالا میتونیم خوشه‌بندی بهینه رو پیدا کنیم :)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[10], [20, 23]] 9.0\n" ] } ], "source": [ "N = 3\n", "x = np.zeros(N)\n", "D = [10, 20, 23]\n", "min_SSE = np.Inf\n", "best_clustering = []\n", "for s in all_clustering(x, i=0):\n", " clust_0 = [D[i] for i in s[0]]\n", " clust_1 = [D[i] for i in s[1]]\n", " SSE_0 = SSE_D(clust_0)\n", " SSE_1 = SSE_D(clust_1)\n", " # Sum of Within Cluster Distances, 14.28 of ESL\n", " SSE = SSE_0 + SSE_1 \n", " if SSE < min_SSE:\n", " min_SSE = SSE\n", " best_clustering = [clust_0, clust_1]\n", "print(best_clustering, min_SSE)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "هدف در خوشه‌بندی می‌تواند کمینه کردن مجموع فواصل درون خوشه‌ای باشد:\n", "\n", "* Sum of [Squared Euclidean Distances](https://en.wikipedia.org/wiki/K-means_clustering)\n", "* Within-Cluster Sum of Squares (WCSS), i.e. variance: [Wiki](https://en.wikipedia.org/wiki/K-means_clustering), [Medium](https://odsc.medium.com/unsupervised-learning-evaluating-clusters-bd47eed175ce)\n", "* Sum of Within Cluster Distances, Sec. 14.3.5 of ESL\n", "* [Sum of Squares Within (SSW)](https://towardsdatascience.com/explain-ml-in-a-simple-way-k-means-clustering-e925d019743b)\n", "* [Sum of Squares Error (SSE)](https://towardsdatascience.com/explain-ml-in-a-simple-way-k-means-clustering-e925d019743b)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "تبدیل به تابعش کنیم:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def BF_clustering(D):\n", " N = len(D)\n", " x = np.zeros(N)\n", " min_SSE = np.Inf\n", " best_clustering = []\n", " for s in all_clustering(x, i=0):\n", " clust_0 = [D[i] for i in s[0]]\n", " clust_1 = [D[i] for i in s[1]]\n", " SSE_0 = SSE_D(clust_0)\n", " SSE_1 = SSE_D(clust_1)\n", " SSE = SSE_0 + SSE_1\n", " if SSE < min_SSE:\n", " min_SSE = SSE\n", " best_clustering = [clust_0, clust_1]\n", " return best_clustering, min_SSE" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{10}\n", "{20, 23}\n" ] } ], "source": [ "D = [10, 20, 23]\n", "best_clustering, min_SSE = BF_clustering(D)\n", "for sub_set in best_clustering:\n", " print(set(sub_set))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "روشهای دیگر برای محاسبه فواصل همه زوج عناصر یک دسته" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10, 20)\n", "(10, 23)\n", "(20, 23)\n" ] } ], "source": [ "import itertools\n", "D = [10, 20, 23]\n", "n_clusters = 2\n", "combinations = itertools.combinations(D, n_clusters)\n", "for x in combinations:\n", " print(x)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10.0, 13.0, 3.0]\n" ] } ], "source": [ "D = [10, 20, 23]\n", "distances = [math.sqrt((a - b) ** 2) for a, b in itertools.combinations(D, 2)]\n", "print(distances)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using SciPy" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10 20 23] (3,)\n", "[[10]\n", " [20]\n", " [23]] (3, 1)\n", "[[ 0. 10. 13.]\n", " [10. 0. 3.]\n", " [13. 3. 0.]]\n" ] } ], "source": [ "from scipy.spatial import distance_matrix\n", "\n", "D = [10, 20, 23]\n", "D = np.array(D)\n", "print(D, type(D), D.shape)\n", "\n", "D = D[:, np.newaxis]\n", "print(D, type(D), D.shape)\n", "\n", "# Compute pairwise distances\n", "distances = distance_matrix(D, D)\n", "print(distances)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "آیا تابع قبلی ما برای ماتریس‌ها هم کار خواهد کرد؟" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "only length-1 arrays can be converted to Python scalars", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\temp\\git\\fum-cs\\fds\\code\\BF-clustering.ipynb Cell 43\u001b[0m line \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m D \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39marray([[\u001b[39m10\u001b[39m, \u001b[39m11\u001b[39m], [\u001b[39m20\u001b[39m, \u001b[39m21\u001b[39m], [\u001b[39m23\u001b[39m, \u001b[39m24\u001b[39m]])\n\u001b[1;32m----> 3\u001b[0m best_clustering, min_SSE \u001b[39m=\u001b[39m BF_clustering(D)\n\u001b[0;32m 4\u001b[0m \u001b[39mfor\u001b[39;00m sub_set \u001b[39min\u001b[39;00m best_clustering:\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mset\u001b[39m(sub_set))\n", "\u001b[1;32mc:\\temp\\git\\fum-cs\\fds\\code\\BF-clustering.ipynb Cell 43\u001b[0m line \u001b[0;36mBF_clustering\u001b[1;34m(D)\u001b[0m\n\u001b[0;32m 7\u001b[0m clust_0 \u001b[39m=\u001b[39m [D[i] \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m s[\u001b[39m0\u001b[39m]]\n\u001b[0;32m 8\u001b[0m clust_1 \u001b[39m=\u001b[39m [D[i] \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m s[\u001b[39m1\u001b[39m]]\n\u001b[1;32m----> 9\u001b[0m SSE_0 \u001b[39m=\u001b[39m SSE_D(clust_0)\n\u001b[0;32m 10\u001b[0m SSE_1 \u001b[39m=\u001b[39m SSE_D(clust_1)\n\u001b[0;32m 11\u001b[0m SSE \u001b[39m=\u001b[39m SSE_0 \u001b[39m+\u001b[39m SSE_1\n", "\u001b[1;32mc:\\temp\\git\\fum-cs\\fds\\code\\BF-clustering.ipynb Cell 43\u001b[0m line \u001b[0;36mSSE_D\u001b[1;34m(D)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mSSE_D\u001b[39m(D):\n\u001b[1;32m----> 2\u001b[0m distances \u001b[39m=\u001b[39m compute_distances(D)\n\u001b[0;32m 3\u001b[0m SSE \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(distances\u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39m2\u001b[39m)\n\u001b[0;32m 4\u001b[0m \u001b[39mreturn\u001b[39;00m SSE\n", "\u001b[1;32mc:\\temp\\git\\fum-cs\\fds\\code\\BF-clustering.ipynb Cell 43\u001b[0m line \u001b[0;36mcompute_distances\u001b[1;34m(D)\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(n):\n\u001b[0;32m 5\u001b[0m \u001b[39mfor\u001b[39;00m j \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(i \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m, n):\n\u001b[1;32m----> 6\u001b[0m distance \u001b[39m=\u001b[39m math\u001b[39m.\u001b[39;49msqrt((D[i] \u001b[39m-\u001b[39;49m D[j]) \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m \u001b[39m2\u001b[39;49m)\n\u001b[0;32m 7\u001b[0m distances\u001b[39m.\u001b[39mappend(distance)\n\u001b[0;32m 8\u001b[0m distances \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39marray(distances)\n", "\u001b[1;31mTypeError\u001b[0m: only length-1 arrays can be converted to Python scalars" ] } ], "source": [ "D = np.array([[10, 11], [20, 21], [23, 24]])\n", "\n", "best_clustering, min_SSE = BF_clustering(D)\n", "for sub_set in best_clustering:\n", " print(set(sub_set))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "only length-1 arrays can be converted to Python scalars", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\hp\\OneDrive\\FUM\\Teaching\\Alg-for-DS\\code\\clustering\\BF-clustering.ipynb Cell 44\u001b[0m line \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m math\u001b[39m.\u001b[39;49msqrt((D[\u001b[39m0\u001b[39;49m] \u001b[39m-\u001b[39;49m D[\u001b[39m1\u001b[39;49m]) \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m \u001b[39m2\u001b[39;49m)\n", "\u001b[1;31mTypeError\u001b[0m: only length-1 arrays can be converted to Python scalars" ] } ], "source": [ "math.sqrt((D[0] - D[1]) ** 2)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10 11] [20 21] [-10 -10]\n", "[100 100] 200\n", "14.142135623730951\n" ] } ], "source": [ "print(D[0], D[1], D[0]-D[1])\n", "print((D[0]-D[1])**2, np.sum((D[0]-D[1])**2))\n", "print(np.sum((D[0]-D[1])**2)**0.5)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14.142135623730951" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.linalg.norm(D[0] - D[1])" ] } ], "metadata": { "kernelspec": { "display_name": "p310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 2 }