{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download seeds.csv (-nc skips the download when the file already exists,\n",
    "# so re-running the notebook does not pile up seeds.csv.1, seeds.csv.2, ...)\n",
    "! wget -nc \"https://raw.githubusercontent.com/TA-aiacademy/course_3.0/main/02_ML/part7/Chapter9/excercise/datasets/seeds.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.metrics import silhouette_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "# dataset: https://archive.ics.uci.edu/ml/datasets/seeds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('seeds.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 利用以上的seeds dataset,請利用Kmeans 進行clustering, 並設法找到最適合的number of clusters\n",
    "請比照教材中的繪圖,將inertia 對 number of clusters的折線圖畫出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit K-means for k = 1..5 and record each model's inertia for the elbow plot.\n",
    "# random_state and n_init are pinned so the curve is identical on every run\n",
    "# and across scikit-learn versions (the n_init default changed in 1.4).\n",
    "# NOTE(review): the UCI seeds data carries a class label in its last column;\n",
    "# if this CSV does too, drop it before clustering - confirm.\n",
    "ks = range(1, 6)\n",
    "samples = df.values\n",
    "inertias = [\n",
    "    KMeans(n_clusters=k, n_init=10, random_state=42).fit(samples).inertia_\n",
    "    for k in ks\n",
    "]\n",
    "\n",
    "# Plot ks vs inertias\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(ks, inertias, '-o')\n",
    "ax.set_xlabel('number of clusters, k')\n",
    "ax.set_ylabel('inertia')\n",
    "ax.set_xticks(list(ks))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify the centroid locations and the standard deviation of each cluster\n",
    "blob_centers = np.array([[-0.2, 2.3], [1.5, 2.3], [2.8, 2.8], [2.8, 1.3],\n",
    "                         [-1, -2], [-2, 2]])\n",
    "blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1, 0.2])\n",
    "X, y = make_blobs(n_samples=300,\n",
    "                  centers=blob_centers,\n",
    "                  cluster_std=blob_std,\n",
    "                  random_state=7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 利用以上的dataset,請利用Kmeans 進行clustering, 並設法找到最適合的number of clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit one K-means model for each k = 1..12 and keep the fitted models;\n",
    "# their labels_ are what the silhouette scores are computed from.\n",
    "kmeans_per_k = [\n",
    "    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)\n",
    "    for k in range(1, 13)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The silhouette score needs at least two clusters, so the k = 1 model is skipped.\n",
    "silhouette_scores = [\n",
    "    silhouette_score(X, model.labels_) for model in kmeans_per_k[1:]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "silhouette_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take the x-axis from the fitted models so it always lines up with the scores.\n",
    "silhouette_ks = [model.n_clusters for model in kmeans_per_k[1:]]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 3))\n",
    "ax.plot(silhouette_ks, silhouette_scores, \"bo-\")\n",
    "ax.set_xlabel(\"$k$\", fontsize=14)\n",
    "ax.set_ylabel(\"Silhouette score\", fontsize=14)\n",
    "\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}