{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build prediction models and compare each feature set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import pickle\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "from gensim.models import Doc2Vec, doc2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download and extract the course data (safe to re-run)\n",
    "# -nc: skip the download when the zip is already present\n",
    "# -o : overwrite previously extracted files without prompting\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_6.zip\n",
    "!unzip -q -o NLP_part2_6.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## read the preprocessed article DataFrame\n",
    "df = pd.read_csv('Data/article_preprocessed.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## drop data: keep only articles whose |push - boo| exceeds the threshold\n",
    "diff_threshold = 20\n",
    "df = df[abs(df['push']-df['boo']) > diff_threshold].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## define y: 1 when push > boo, 0 otherwise\n",
    "## (every remaining row has |push - boo| > diff_threshold, so clipping to [0, 1] gives a binary label)\n",
    "df['type'] = np.clip(df['push']-df['boo'], 0, 1)\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## create a numpy format data\n",
    "## layout: column 0 = article idx, column 1 = label, remaining columns = features\n",
    "n_meta_cols = 2\n",
    "n_features = 256\n",
    "basic_data = np.zeros((df.shape[0], n_meta_cols + n_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "basic_data[:, 0] = df['idx']\n",
    "basic_data[:, 1] = df['type']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bag of words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## load bag of words result\n",
    "## NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source\n",
    "with open('Data/article_count', 'rb') as file:\n",
    "    _, count = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## select the top n_features words by document frequency\n",
    "## (number of documents in which the word appears at least once)\n",
    "most_count_id = np.array((count > 0).sum(axis=0))[0].argsort()[::-1][:n_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## subset data: keep only the selected word columns\n",
    "count = count[:, most_count_id]\n",
    "assert count.shape[1] == n_features, 'vocabulary has fewer than n_features words'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## integer copy of the base array, so that column 0 can be used as a row index below\n",
    "count_data = basic_data.copy().astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## subset bag of words matrix: fill the feature columns with the rows of the selected articles\n",
    "## NOTE(review): assumes df['idx'] is the row position of each article in `count` -- confirm\n",
    "count_data[:, n_meta_cols:] = count[count_data[:, 0]].toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: generate tf-idf, average word2vec, and doc2vec features, build a prediction model for each (xgboost, SVM, linear regression, ...), and compare the results (AUC) of the 4 feature sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## code"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}