{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part1/02-2_feature_engineering_ans.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ke_wxqkbcXXN"
      },
      "source": [
        "# text's feature engineering: turn unstructure to  structure"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "x15qRo7vcXXf"
      },
      "outputs": [],
      "source": [
        "import re\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import pickle\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "junGiP4XcXXi"
      },
      "outputs": [],
      "source": [
        "# 上傳資料\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_2.zip\n",
        "!unzip -q NLP_part1_2.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vBnffTxwcXXl"
      },
      "outputs": [],
      "source": [
        "df = pd.read_csv('Data/article_preprocessed.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iRXlwYmccXXm"
      },
      "outputs": [],
      "source": [
        "# load 'article_cutted'\n",
        "with open(\"Data/article_cutted\", \"rb\") as file:\n",
        "    sentences = pickle.load(file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Evty9Vz7cXXo"
      },
      "source": [
        "## define y (push > boo)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "J_Huh8VxcXXr"
      },
      "outputs": [],
      "source": [
        "# drop data\n",
        "diff_threshold = 20\n",
        "df = df[abs(df['push']-df['boo']) > diff_threshold].copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HM-XOKyVcXXt"
      },
      "outputs": [],
      "source": [
        "# define y\n",
        "df['type'] = np.clip(df['push']-df['boo'], 0, 1)\n",
        "df = df.reset_index(drop=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WxmXGpYjcXXu"
      },
      "outputs": [],
      "source": [
        "df['type'].value_counts()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AT0sGExFcXXw"
      },
      "source": [
        "## simple feature"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "s-doNr3ZcXXx"
      },
      "outputs": [],
      "source": [
        "# word count\n",
        "# http://blog.csdn.net/gatieme/article/details/43235791 (中文正則表達式)\n",
        "df['word_count'] = df['content'].str.count('[a-zA-Z0-9]+') + df['content'].str.count('[\\u4e00-\\u9fff]')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6WNwlorScXXy"
      },
      "outputs": [],
      "source": [
        "# punctuation count\n",
        "df['punctuation'] = df['content'].str.replace('[\\w\\s]', '')\n",
        "df['punctuation_count'] = df['punctuation'].str.len()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lFAePuA4cXXy"
      },
      "outputs": [],
      "source": [
        "# question mark count\n",
        "df['question_count'] = df['punctuation'].str.count('[?？]')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CDxUs76KcXXz"
      },
      "outputs": [],
      "source": [
        "# drop punctuation column\n",
        "df = df.drop(['punctuation'],axis=1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VmipIGyjcXX0"
      },
      "outputs": [],
      "source": [
        "df.iloc[:5, -4:]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "deVBIMOMcXX0"
      },
      "outputs": [],
      "source": [
        "# compute correlation\n",
        "df.iloc[:, -4:].corr()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i55mOeiocXX1"
      },
      "source": [
        "## bag of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "crYvb51xcXX1"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import CountVectorizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "A44huksacXX2"
      },
      "outputs": [],
      "source": [
        "# define transformer (轉換器)\n",
        "vectorizer = CountVectorizer()\n",
        "count = vectorizer.fit_transform([' '.join(x) for x in sentences])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CT9-9FR1cXX3"
      },
      "outputs": [],
      "source": [
        "count"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u-32UyDccXX3"
      },
      "outputs": [],
      "source": [
        "# save data as pickle format\n",
        "with open(\"Data/article_count\", \"wb\") as file:\n",
        "    pickle.dump([vectorizer, count], file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YSvl4dNDcXX4"
      },
      "source": [
        "### select top 10 frequency of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vxKDgrRJcXX4"
      },
      "outputs": [],
      "source": [
        "# create a dictionary: id as key ; word as values\n",
        "id2word = {v:k for k, v in vectorizer.vocabulary_.items()}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "98Q12DXGcXX5"
      },
      "outputs": [],
      "source": [
        "# columnwise sum: words frequency\n",
        "sum_ = np.array(count.sum(axis=0))[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mVbp_g1UcXX5"
      },
      "outputs": [],
      "source": [
        "# top 10 frequency's wordID\n",
        "most_sum_id = sum_.argsort()[::-1][:10].tolist()\n",
        "most_sum_id"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EYnmlYSQcXX6"
      },
      "outputs": [],
      "source": [
        "# print top 10 frequency's words\n",
        "features = [id2word[i] for i in most_sum_id]\n",
        "features"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fok13vWgcXX8"
      },
      "outputs": [],
      "source": [
        "# print the data\n",
        "data = pd.DataFrame(count[df.idx,:][:,most_sum_id].toarray(), columns=features)\n",
        "data[:5]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7uVNkRn_cXX_"
      },
      "outputs": [],
      "source": [
        "df.iloc[0].content"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-Nf_b_ZScXYA"
      },
      "outputs": [],
      "source": [
        "# compute correlation\n",
        "data = pd.concat([df.type, data], axis=1)\n",
        "data.corr()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1zaVePrYcXYA"
      },
      "source": [
        "# TF-IDF"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "do3KrYeocXYA"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import TfidfVectorizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N1uzP6skcXYB"
      },
      "outputs": [],
      "source": [
        "# define transformer (轉換器)\n",
        "vectorizer = TfidfVectorizer(norm=None) ## do not do normalize\n",
        "tfidf = vectorizer.fit_transform([' '.join(x) for x in sentences])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ey_8aXc-cXYC"
      },
      "outputs": [],
      "source": [
        "# save data as pickle format\n",
        "with open(\"Data/article_tfidf\", \"wb\") as file:\n",
        "    pickle.dump([vectorizer, tfidf], file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IfXaHzAmcXYC"
      },
      "source": [
        "### select top 10 average tf-idf of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vFDSkWWBcXYC"
      },
      "outputs": [],
      "source": [
        "# create a dictionary: id as key ; word as values\n",
        "id2word = {v:k for k, v in vectorizer.vocabulary_.items()}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "j3lTr_0LcXYD"
      },
      "outputs": [],
      "source": [
        "# columnwise average: words tf-idf\n",
        "avg = tfidf.sum(axis=0) / (tfidf!=0).sum(axis=0)\n",
        "\n",
        "# set df < 20 as 0\n",
        "avg[(tfidf!=0).sum(axis=0)<20] = 0"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KUITGa-TcXYD"
      },
      "outputs": [],
      "source": [
        "avg = np.array(avg)[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Fwa4jRjEcXYD"
      },
      "outputs": [],
      "source": [
        "# top 10 tfidf's wordID\n",
        "most_avg_id = avg.argsort()[::-1][:10].tolist()\n",
        "most_avg_id"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gGvkW4evcXYE"
      },
      "outputs": [],
      "source": [
        "# print top 10 tf-idf's words\n",
        "features = [id2word[i] for i in most_avg_id]\n",
        "features"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fjE1l-d0cXYc"
      },
      "outputs": [],
      "source": [
        "# print the data\n",
        "data = pd.DataFrame(tfidf[df.idx,:][:,most_avg_id].toarray(), columns=features)\n",
        "data[:5]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PcUCS3_PcXYd"
      },
      "outputs": [],
      "source": [
        "# compute correlation\n",
        "data = pd.concat([df.type, data], axis=1)\n",
        "data.corr()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3gQUb0wwcXYd"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}