{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download seeds.csv; -nc (no-clobber) skips the download when the file\n",
    "# already exists, so re-running this cell does not create seeds.csv.1\n",
    "! wget -nc \"https://raw.githubusercontent.com/TA-aiacademy/course_3.0/main/02_ML/part7/Chapter9/excercise/datasets/seeds.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports used anywhere in the notebook live here, so every later cell\n",
    "# works after Restart Kernel -> Run All regardless of the exercise cells\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import pearsonr\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The file is read with header=None, so columns are addressed by integer position\n",
    "df = pd.read_csv('seeds.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**這個 dataset 是穀物的資料，其中 column 3 與 column 4 分別是穀物的長與寬**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**練習1: 請取出這兩個 feature 並計算其 Pearson correlation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Width (column 4)\n",
    "width = df.iloc[:, 4].values\n",
    "\n",
    "# Length (column 3)\n",
    "length = df.iloc[:, 3].values\n",
    "\n",
    "# Scatter plot of width vs length\n",
    "plt.scatter(width, length)\n",
    "plt.xlabel('width')\n",
    "plt.ylabel('length')\n",
    "plt.axis('equal')\n",
    "plt.show()\n",
    "\n",
    "# Compute the Pearson correlation of width and length\n",
    "correlation, pvalue = pearsonr(width, length)\n",
    "\n",
    "# Display the correlation\n",
    "print(correlation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**練習2: 請利用 PCA 將上述長與寬兩個 feature 進行轉換以去除其 correlation, 並顯示其轉換後的 correlation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create PCA instance: model\n",
    "model = PCA()\n",
    "\n",
    "# Length (column 3) and width (column 4) of the grains\n",
    "grains = df.iloc[:, [3, 4]]\n",
    "\n",
    "# Apply the fit_transform method of model to grains: pca_features\n",
    "pca_features = # [your code here] #\n",
    "\n",
    "# Assign 0th column of pca_features: xs\n",
    "xs = # [your code here] #\n",
    "\n",
    "# Assign 1st column of pca_features: ys\n",
    "ys = # [your code here] #\n",
    "\n",
    "# Scatter plot xs vs ys\n",
    "plt.scatter(xs, ys)\n",
    "plt.xlabel('PCA feature 0')\n",
    "plt.ylabel('PCA feature 1')\n",
    "plt.axis('equal')\n",
    "plt.show()\n",
    "\n",
    "# Calculate the Pearson correlation of xs and ys\n",
    "correlation, pvalue = pearsonr(xs, ys)\n",
    "\n",
    "# Display the correlation\n",
    "print(correlation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**練習3: 檢視 explained variance**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): df.values passes every column of seeds.csv to PCA; if one of\n",
    "# them is a class label rather than a measurement it should be dropped first - confirm\n",
    "samples = df.values\n",
    "\n",
    "# Standardize the data first: create a StandardScaler named scaler\n",
    "scaler = # [your code here] #\n",
    "\n",
    "# Create a PCA instance named pca\n",
    "pca = # [your code here] #\n",
    "\n",
    "# Chain the two steps above, in order, with make_pipeline\n",
    "pipeline = ## [your code here] ##\n",
    "\n",
    "# Fit the pipeline to samples\n",
    "# [your code here] #\n",
    "\n",
    "# Plot the explained_variance_ of each PCA feature\n",
    "features = range(pca.n_components_)\n",
    "plt.bar(features, pca.explained_variance_)\n",
    "plt.xlabel('PCA feature')\n",
    "plt.ylabel('variance')\n",
    "plt.xticks(features)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**補充: 將 width 與 length 的第一主成分 (first principal component) 畫在散佈圖上**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild grains as a 2-column NumPy array ordered (width, length).\n",
    "# width and length are the arrays extracted in exercise 1; this replaces the\n",
    "# DataFrame of the same name used in exercise 2.\n",
    "grains = np.column_stack((width, length))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot of width vs length\n",
    "plt.scatter(grains[:, 0], grains[:, 1])\n",
    "plt.xlabel('width')\n",
    "plt.ylabel('length')\n",
    "\n",
    "# Create a PCA instance: model\n",
    "model = PCA()\n",
    "\n",
    "# Fit model to points\n",
    "model.fit(grains)\n",
    "\n",
    "# Get the mean of the grain samples: mean\n",
    "mean = model.mean_\n",
    "\n",
    "# Get the first principal component: first_pc\n",
    "first_pc = model.components_[0, :]\n",
    "\n",
    "# Plot first_pc as an arrow, starting at mean\n",
    "plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)\n",
    "\n",
    "# Keep axes on same scale\n",
    "plt.axis('equal')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}