{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J9qjwDRGcWzo"
      },
      "source": [
        "# text's feature engineering: turn unstructure to  structure"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Oywp_-4UcWzz"
      },
      "outputs": [],
      "source": [
        "import re\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import pickle\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "c36uTsHacWz9"
      },
      "outputs": [],
      "source": [
        "# 上傳資料\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_2.zip\n",
        "!unzip -q NLP_part1_2.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2xz5vyXscW0D"
      },
      "outputs": [],
      "source": [
        "df = pd.read_csv('Data/article_preprocessed.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sCl5GCP2cW0G"
      },
      "outputs": [],
      "source": [
        "# load 'article_cutted'\n",
        "with open(\"Data/article_cutted\", \"rb\") as file:\n",
        "    sentences = pickle.load(file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kopjni38cW0H"
      },
      "source": [
        "## define y (push > boo)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OAS56d1fcW0I"
      },
      "outputs": [],
      "source": [
        "# drop data\n",
        "diff_threshold = 20\n",
        "df = df[abs(df['push']-df['boo']) > diff_threshold].copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_mTXst7ucW0K"
      },
      "outputs": [],
      "source": [
        "# define y\n",
        "df['type'] = np.clip(df['push']-df['boo'], 0, 1)\n",
        "df = df.reset_index(drop=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "L4EhnEo_cW0L"
      },
      "outputs": [],
      "source": [
        "df['type'].value_counts()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oVLVCnFCcW0M"
      },
      "source": [
        "## simple feature"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "c9vW5ze4cW0N"
      },
      "outputs": [],
      "source": [
        "# word count\n",
        "# http://blog.csdn.net/gatieme/article/details/43235791 (中文正則表達式)\n",
        "df['word_count'] = df['content'].str.count('[a-zA-Z0-9]+') + df['content'].str.count('[\\u4e00-\\u9fff]')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-khlaLHycW0O"
      },
      "outputs": [],
      "source": [
        "# punctuation count\n",
        "# 請產生 \"標點符號數\" 欄位"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xY0LW5lScW0R"
      },
      "outputs": [],
      "source": [
        "# question mark count\n",
        "# 請產生 \"問號數\" 欄位"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Kuo7VReNcW0T"
      },
      "outputs": [],
      "source": [
        "# 可以自由發揮想像還有什麼 features , etc. 比例"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "J5WIlIodcW0T"
      },
      "outputs": [],
      "source": [
        "# drop punctuation column\n",
        "df = df.drop(['punctuation'],axis=1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b8Tb4FeccW0U"
      },
      "outputs": [],
      "source": [
        "df.iloc[:5, -4:]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yj-eUs7RcW0U"
      },
      "outputs": [],
      "source": [
        "# compute correlation\n",
        "# 請計算前面建立好的 features 與 type 的 correlation"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DaysfaAkcW0V"
      },
      "source": [
        "## bag of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "azZyrBJpcW0V"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import CountVectorizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GqY-FGNLcW0W"
      },
      "outputs": [],
      "source": [
        "# define transformer (轉換器)\n",
        "vectorizer = CountVectorizer()\n",
        "count = vectorizer.fit_transform([' '.join(x) for x in sentences])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sWJAGCGlcW0X"
      },
      "outputs": [],
      "source": [
        "count"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Tx6OBEKXcW0X"
      },
      "outputs": [],
      "source": [
        "# save data as pickle format\n",
        "with open(\"Data/article_count\", \"wb\") as file:\n",
        "    pickle.dump([vectorizer, count], file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TlQxSqoBcW0Y"
      },
      "source": [
        "### select top 10 frequency of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mtcF-sfmcW0Z"
      },
      "outputs": [],
      "source": [
        "# 用前面轉換好的 count matrix, 嘗試抽出 top 10 count 最多的關鍵字\n",
        "# 建立 top 10 count 關鍵字的 bag of words features, 計算其與 type 的 correlation"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "I27pNv5GcW0Z"
      },
      "source": [
        "# TF-IDF"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ocnLYq1gcW0b"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import TfidfVectorizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Oe7ZDsdycW0c"
      },
      "outputs": [],
      "source": [
        "# define transformer (轉換器)\n",
        "vectorizer = TfidfVectorizer(norm=None) ## do not do normalize\n",
        "tfidf = vectorizer.fit_transform([' '.join(x) for x in sentences])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OI9h0gEecW0f"
      },
      "outputs": [],
      "source": [
        "# save data as pickle format\n",
        "with open(\"Data/article_tfidf\", \"wb\") as file:\n",
        "    pickle.dump([vectorizer, tfidf], file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9lUohGYhcW0f"
      },
      "source": [
        "### select top 10 average tf-idf of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-jEBjnXAcW0g"
      },
      "outputs": [],
      "source": [
        "# 用前面轉換好的 tfidf matrix, 嘗試抽出 top 10 平均 tfidf 最高的關鍵字\n",
        "# 建立 top 10 平均 tfidf 關鍵字的 tf-idf features, 計算其與 type 的 correlation"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}