{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9T3doV20AG30"
      },
      "source": [
        "# Build prediction models and compare feature sets\n",
        "\n",
        "This notebook trains one XGBoost classifier per article representation (bag of words, TF-IDF, averaged word2vec and doc2vec) and compares their AUC on a held-out test split.\n",
        "\n",
        "The label `type` is 1 when an article has more pushes than boos and 0 otherwise; only articles with `|push - boo| > DIFF_THRESHOLD` are kept."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "clkVOQ0OAG4F"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import pickle\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import xgboost as xgb\n",
        "from gensim.models import Doc2Vec, doc2vec\n",
        "from sklearn.model_selection import train_test_split"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# configuration\n",
        "DIFF_THRESHOLD = 20  # keep only articles where |push - boo| exceeds this\n",
        "N_FEATURES = 256     # feature columns per representation; the word2vec/doc2vec\n",
        "                     # vectors are assumed to have this width as well\n",
        "SEED = 42            # fixes the train/test split and the xgboost models"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ub2JM2pFAG4K"
      },
      "outputs": [],
      "source": [
        "# download and unpack the data\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_6.zip\n",
        "!unzip -q NLP_part2_6.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "P9AeqAH_AG4O"
      },
      "outputs": [],
      "source": [
        "# read the preprocessed article table\n",
        "articles = pd.read_csv('Data/article_preprocessed.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "r-sw-X9GAG4Q"
      },
      "outputs": [],
      "source": [
        "# drop articles whose push/boo difference is too small to label reliably\n",
        "push_minus_boo = articles['push'] - articles['boo']\n",
        "df = articles[push_minus_boo.abs() > DIFF_THRESHOLD].copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uZ8Jwu7HAG4S"
      },
      "outputs": [],
      "source": [
        "# define y: 1 when pushes outnumber boos, 0 otherwise\n",
        "df['type'] = np.clip(df['push'] - df['boo'], 0, 1)\n",
        "# a 0..n-1 index lets row labels double as positions in the numpy arrays below\n",
        "df = df.reset_index(drop=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9sjj_b20AG4U"
      },
      "outputs": [],
      "source": [
        "df['type'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2rYJnUtXAG4X"
      },
      "outputs": [],
      "source": [
        "# template array shared by every feature set:\n",
        "# column 0 = article idx, column 1 = label, columns 2: = features\n",
        "basic_data = np.zeros((df.shape[0], 2 + N_FEATURES))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9-u8wW5aAG4Z"
      },
      "outputs": [],
      "source": [
        "basic_data[:, 0] = df['idx']\n",
        "basic_data[:, 1] = df['type']\n",
        "\n",
        "# integer article ids, used to pick rows out of the sparse matrices\n",
        "row_idx = basic_data[:, 0].astype(int)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fdg7qJcWAG4b"
      },
      "source": [
        "## bag of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gwIUQ0gcAG4c"
      },
      "outputs": [],
      "source": [
        "# load bag of words result\n",
        "# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source\n",
        "with open('Data/article_count', 'rb') as file:\n",
        "    _, count = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MZz3YBdnAG4e"
      },
      "outputs": [],
      "source": [
        "# select the N_FEATURES words that occur in the most documents\n",
        "most_count_id = np.array((count > 0).sum(axis=0))[0].argsort()[::-1][:N_FEATURES]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KOKmqPEoAG4f"
      },
      "outputs": [],
      "source": [
        "# subset columns into a new name so this cell can be re-run safely\n",
        "count_top = count[:, most_count_id]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "AJT_hTXkAG4g"
      },
      "outputs": [],
      "source": [
        "# word counts are integers, so an int array is used here\n",
        "count_data = basic_data.astype('int')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K7wRViPOAG4i"
      },
      "outputs": [],
      "source": [
        "# subset bag of words matrix to the selected articles\n",
        "count_data[:, 2:] = count_top[row_idx].toarray()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8gN2gDufAG4j"
      },
      "source": [
        "## TF-IDF"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nARDCwHaAG4k"
      },
      "outputs": [],
      "source": [
        "# load tf-idf result\n",
        "# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source\n",
        "with open('Data/article_tfidf', 'rb') as file:\n",
        "    _, tfidf = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mx74pYbQAG4l"
      },
      "outputs": [],
      "source": [
        "# select the N_FEATURES words that occur in the most documents\n",
        "most_tfidf_id = np.array((tfidf > 0).sum(axis=0))[0].argsort()[::-1][:N_FEATURES]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fcqkGOGfAG4m"
      },
      "outputs": [],
      "source": [
        "# subset columns into a new name so this cell can be re-run safely\n",
        "tfidf_top = tfidf[:, most_tfidf_id]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NwPLB-2vAG4m"
      },
      "outputs": [],
      "source": [
        "# keep the float dtype: assigning fractional tf-idf weights into an int array\n",
        "# would silently truncate them (typically to 0)\n",
        "tfidf_data = basic_data.copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9RBDqESBAG4n"
      },
      "outputs": [],
      "source": [
        "# subset tf-idf matrix to the selected articles\n",
        "tfidf_data[:, 2:] = tfidf_top[row_idx].toarray()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lH5KmIaCAG4n"
      },
      "source": [
        "## average word2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BdIJ29AvAG4o"
      },
      "outputs": [],
      "source": [
        "# load average word2vec result\n",
        "# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source\n",
        "with open('Data/avg_article_vector', 'rb') as file:\n",
        "    avg_vector = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SgxD_8YpAG4p"
      },
      "outputs": [],
      "source": [
        "avg_data = basic_data.copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7LyCK7qOAG4p"
      },
      "outputs": [],
      "source": [
        "# copy each article's averaged word vector into its row\n",
        "for i, idx in enumerate(df['idx']):\n",
        "    avg_data[i, 2:] = avg_vector[idx]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OSaIM7KtAG4q"
      },
      "source": [
        "## doc2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Sx-7qmUnAG4q"
      },
      "outputs": [],
      "source": [
        "# load doc2vec model (named separately from the classifiers trained below)\n",
        "doc2vec_model = Doc2Vec.load('word2vec_model/doc2vec')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zBZWY26tAG4q"
      },
      "outputs": [],
      "source": [
        "doc2vec_data = basic_data.copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "VtOFWmsmAG4r"
      },
      "outputs": [],
      "source": [
        "# document vectors are looked up by the stringified article idx\n",
        "for i, idx in enumerate(df['idx']):\n",
        "    doc2vec_data[i, 2:] = doc2vec_model.docvecs[str(idx)]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HVexRhi2AG4s"
      },
      "source": [
        "# prediction model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NKbl6KvzAG4t"
      },
      "outputs": [],
      "source": [
        "# split data into training and testing rows (stratified on the label);\n",
        "# random_state makes the split, and therefore the comparison, reproducible\n",
        "train, test = train_test_split(\n",
        "    df, test_size=0.2, stratify=df['type'], random_state=SEED\n",
        ")\n",
        "train_idx = np.array(train.index)\n",
        "test_idx = np.array(test.index)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RzbtLrEAAG4t"
      },
      "outputs": [],
      "source": [
        "# define a dictionary to collect each model's test AUC\n",
        "result = {}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-aDpdPyyAG4u"
      },
      "source": [
        "## train models with xgboost"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def evaluate_features(data, train_idx, test_idx, seed=SEED):\n",
        "    \"\"\"Train an XGBoost classifier on one feature matrix and return its test AUC.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    data : np.ndarray\n",
        "        Array laid out like `basic_data`: column 1 holds the label and\n",
        "        columns 2: hold the features.\n",
        "    train_idx, test_idx : np.ndarray\n",
        "        Row positions of the training and testing samples.\n",
        "    seed : int\n",
        "        Random seed passed to the classifier.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    float\n",
        "        AUC on the test rows after the last boosting round.\n",
        "    \"\"\"\n",
        "    # eval_metric is set on the estimator: passing it to fit() is deprecated\n",
        "    # since xgboost 1.6 and rejected by xgboost 2.x\n",
        "    clf = xgb.XGBClassifier(eval_metric='auc', random_state=seed)\n",
        "    # labels are stored as floats in the float feature arrays; cast them back to int\n",
        "    y_train = data[train_idx, 1].astype(int)\n",
        "    y_test = data[test_idx, 1].astype(int)\n",
        "    clf.fit(\n",
        "        data[train_idx, 2:], y_train,\n",
        "        eval_set=[(data[test_idx, 2:], y_test)],\n",
        "    )\n",
        "    return clf.evals_result()['validation_0']['auc'][-1]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "TiQc2g9jAG4u"
      },
      "outputs": [],
      "source": [
        "# bag of words: testing auc\n",
        "result['bag_of_words'] = evaluate_features(count_data, train_idx, test_idx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "N6IHir6NAG4v"
      },
      "outputs": [],
      "source": [
        "# tf-idf: testing auc\n",
        "result['tf-idf'] = evaluate_features(tfidf_data, train_idx, test_idx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "Yrk26aBUAG4v"
      },
      "outputs": [],
      "source": [
        "# average word2vec: testing auc\n",
        "result['avg_word2vec'] = evaluate_features(avg_data, train_idx, test_idx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "LworZW9AAG4w"
      },
      "outputs": [],
      "source": [
        "# doc2vec: testing auc\n",
        "result['doc2vec'] = evaluate_features(doc2vec_data, train_idx, test_idx)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q69o6KD_AG4x"
      },
      "source": [
        "## plot result"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XGpWGEtbAG41"
      },
      "outputs": [],
      "source": [
        "# one bar per feature set, in the order the models were trained\n",
        "positions = np.arange(len(result))\n",
        "\n",
        "fig, ax = plt.subplots()\n",
        "ax.bar(positions, list(result.values()))\n",
        "ax.set_xticks(positions)\n",
        "ax.set_xticklabels(list(result.keys()))\n",
        "ax.set(title='Test AUC by feature set', xlabel='feature set', ylabel='AUC')\n",
        "plt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}