{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading Dataset & Quick Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.pylab import rcParams\n",
    "\n",
    "%matplotlib inline\n",
    "rcParams['figure.figsize'] = 15, 15\n",
    "\n",
    "# `sklearn.datasets.load_boston` was removed in scikit-learn 1.2 (importing it\n",
    "# raises ImportError there), so the data is read from the original source.\n",
    "data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n",
    "\n",
    "# The first 22 lines of the file are skipped; after that every record is\n",
    "# wrapped over two consecutive rows. A raw string is used for the regex so\n",
    "# that `\\s` is not treated as an (invalid) string escape.\n",
    "raw_df = pd.read_csv(data_url, sep=r\"\\s+\", skiprows=22, header=None)\n",
    "\n",
    "# Even rows carry the leading feature columns; odd rows carry the last two\n",
    "# features in columns 0-1 and the target (house price) in column 2.\n",
    "data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n",
    "target = raw_df.values[1::2, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data.shape)    # feature matrix: (n_samples, n_features)\n",
    "print(target.shape)  # label vector: (n_samples,)\n",
    "\n",
    "# Sanity check on the two-rows-per-record parsing above.\n",
    "assert data.shape[0] == target.shape[0], \"features and labels are misaligned\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Plot Features vs. Y\n",
    "Can you explain the relationship between each feature and the house prices?"
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "# Feature names, in the column order of `data`:\n",
    "# - CRIM     per capita crime rate by town\n",
    "# - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n",
    "# - INDUS    proportion of non-retail business acres per town\n",
    "# - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n",
    "# - NOX      nitric oxides concentration (parts per 10 million)\n",
    "# - RM       average number of rooms per dwelling\n",
    "# - AGE      proportion of owner-occupied units built prior to 1940\n",
    "# - DIS      weighted distances to five Boston employment centres\n",
    "# - RAD      index of accessibility to radial highways\n",
    "# - TAX      full-value property-tax rate per $10,000\n",
    "# - PTRATIO  pupil-teacher ratio by town\n",
    "# - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n",
    "# - LSTAT    % lower status of the population\n",
    "features = [\n",
    "    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',\n",
    "    'PTRATIO', 'B', 'LSTAT'\n",
    "]\n",
    "\n",
    "N_COLS = 4\n",
    "n_rows = math.ceil(len(features) / N_COLS)\n",
    "\n",
    "# squeeze=False keeps `axes` two-dimensional whatever the grid size is.\n",
    "fig, axes = plt.subplots(n_rows, N_COLS, squeeze=False)\n",
    "panels = axes.ravel()  # row-major: panels read left-to-right, top-to-bottom\n",
    "\n",
    "# One scatter plot per feature column against the target.\n",
    "for ax, name, column in zip(panels, features, data.T):\n",
    "    ax.scatter(column, target)\n",
    "    ax.set_xlabel(name)\n",
    "    ax.set_ylabel('prices')\n",
    "\n",
    "# Hide the grid slots that have no feature to show.\n",
    "for ax in panels[len(features):]:\n",
    "    ax.set_visible(False)\n",
    "\n",
    "fig.tight_layout()  # keep axis labels from overlapping neighbouring panels\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predicting Home Prices: SVR\n",
    "在沒有做任何new feature生成的情況下,是否可以讓SVR的performance接近linear regression?"
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Conventional scikit-learn names for the feature matrix and the label vector.\n",
    "X, y = data, target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import svm\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 25% of the samples for testing; the fixed seed keeps the split\n",
    "# identical from run to run.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.25, random_state=42, shuffle=True)\n",
    "\n",
    "# Support-vector regressor with a degree-2 polynomial kernel.\n",
    "svr_params = dict(kernel='poly', degree=2, gamma=1e-2, C=0.1)\n",
    "model = svm.SVR(**svr_params)\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict prices for the held-out test samples.\n",
    "y_pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "%matplotlib inline\n",
    "# Predicted vs. true prices; the dashed diagonal marks perfect predictions.\n",
    "eval_fig, eval_ax = plt.subplots()\n",
    "eval_ax.scatter(y_test, y_pred)\n",
    "eval_ax.plot([0, 50], [0, 50], '--k')\n",
    "eval_ax.axis('tight')\n",
    "eval_ax.set_xlabel('True price ($1000s)')\n",
    "eval_ax.set_ylabel('Predicted price ($1000s)')\n",
    "\n",
    "# Root-mean-square error and coefficient of determination on the test set.\n",
    "residuals = y_pred - y_test\n",
    "print(\"RMS:\", np.sqrt(np.mean(residuals**2)))\n",
    "print('R2:%.2f' % metrics.r2_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
"sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }