{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part1/01-1_preprocess.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "etKXWxGtcVch"
      },
      "source": [
        "# Data preprocessing and word segmentation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "G_iQhOp8cVcr"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import re\n",
        "import os\n",
        "import jieba\n",
        "import jieba.posseg as pseg\n",
        "import pickle\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n45MJp2-cVcu"
      },
      "source": [
        "## data overview"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MJ-_9lblcVcy"
      },
      "outputs": [],
      "source": [
        "# Fetch the course data archive and unpack it into the working directory.\n",
        "# -nc skips the download when the zip is already present (a plain re-run would\n",
        "# save a duplicate NLP_part1_1.zip.1); -n keeps files that were already\n",
        "# extracted, so unzip never stops to ask about overwriting on a re-run.\n",
        "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_1.zip\n",
        "!unzip -q -n NLP_part1_1.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vqZ9ioVhcVcz"
      },
      "outputs": [],
      "source": [
        "# Load the raw article table; the cells below clean its 'content' column.\n",
        "article = pd.read_csv('Data/article_practice.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sLo7xiDHcVc0"
      },
      "outputs": [],
      "source": [
        "# Preview the first rows to check that the table loaded as expected.\n",
        "article.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3k2zgPj6cVc1"
      },
      "outputs": [],
      "source": [
        "# filter rules\n",
        "# Strip URLs from the text. regex=True is required: since pandas 2.0\n",
        "# Series.str.replace treats the pattern as a literal string by default, which\n",
        "# would leave every URL in place. The raw string keeps \\S from being read as\n",
        "# an (invalid) Python string escape.\n",
        "article['content'] = article['content'].str.replace(r'https?://\\S*', '', regex=True)\n",
        "# Rows whose content is now empty become NaN so that dropna() removes them.\n",
        "article['content'] = article['content'].replace('', np.nan)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yzmx6cTKcVc2"
      },
      "outputs": [],
      "source": [
        "# remove data: drop every row that has a missing value and renumber the\n",
        "# remaining rows from 0 in a single chained step.\n",
        "article = article.dropna().reset_index(drop=True)\n",
        "\n",
        "# Keep the new row position as an explicit column.\n",
        "article['idx'] = article.index"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FiRg9w6UcVc4"
      },
      "outputs": [],
      "source": [
        "# Save the cleaned table. index=False keeps the DataFrame index out of the\n",
        "# file; the row positions are already stored in the 'idx' column.\n",
        "article.to_csv('Data/article_preprocessed.csv', index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DRqflkHNcVc5"
      },
      "source": [
        "# jieba"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OCL0l6BecVc6"
      },
      "source": [
        "## Word segmentation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PmqdzeuGcVc7"
      },
      "outputs": [],
      "source": [
        "# set dictionary (can define yourself)\n",
        "jieba.set_dictionary('jieba/dict.txt.big')\n",
        "\n",
        "# Read one stop word per line. The context manager closes the file handle, and\n",
        "# the explicit encoding stops the result from depending on the platform default\n",
        "# (assumes the file is UTF-8 -- confirm if it was produced elsewhere).\n",
        "with open('jieba/stop_words.txt', encoding='utf-8') as stop_file:\n",
        "    stop_words = stop_file.read().splitlines()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "he5roFaOcVc8"
      },
      "outputs": [],
      "source": [
        "# Sanity check: show the first five stop words that were loaded.\n",
        "print(stop_words[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "puc6sWd6cVc9"
      },
      "outputs": [],
      "source": [
        "# Reload only the text column, as plain strings. keep_default_na=False stops\n",
        "# read_csv from turning literal texts such as 'NA', 'null' or 'nan' back into\n",
        "# NaN after the earlier dropna(); jieba.cut() cannot process a float.\n",
        "content_frame = pd.read_csv(\n",
        "    'Data/article_preprocessed.csv',\n",
        "    usecols=['content'],\n",
        "    dtype=str,\n",
        "    keep_default_na=False,\n",
        ")\n",
        "# `data` is the list of article texts consumed by the segmentation loop.\n",
        "data = content_frame['content'].tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uJW-RFfJcVc9"
      },
      "outputs": [],
      "source": [
        "# Peek at the first five article texts.\n",
        "print(data[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0nU9GSfVcVc-"
      },
      "outputs": [],
      "source": [
        "# Token filters, built once outside the loop (can define your own rules):\n",
        "# - a set gives O(1) stop-word lookups instead of scanning the list per token;\n",
        "# - re.match anchors at the start of the token, so this pattern drops every\n",
        "#   token that begins with an ASCII digit ('[0-9]' and '[0-9]+' accept exactly\n",
        "#   the same tokens under re.match).\n",
        "stop_word_set = set(stop_words)\n",
        "leading_digit = re.compile('[0-9]')\n",
        "\n",
        "sentences = []\n",
        "\n",
        "for i, text in enumerate(data):\n",
        "    # Segment with cut_all=False and keep the tokens that pass both filters.\n",
        "    line = [\n",
        "        w for w in jieba.cut(text, cut_all=False)\n",
        "        if w not in stop_word_set and not leading_digit.match(w)\n",
        "    ]\n",
        "    sentences.append(line)\n",
        "\n",
        "    # Progress report every 10,000 articles.\n",
        "    if i % 10000 == 0:\n",
        "        print(i, '/', len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WB-csKnhcVc-"
      },
      "outputs": [],
      "source": [
        "# Show the filtered token lists of the first five articles.\n",
        "print(sentences[0:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "eTJfuPzncVc_"
      },
      "outputs": [],
      "source": [
        "# Persist the token lists to disk in pickle format (the file must be opened\n",
        "# in binary mode for pickle.dump).\n",
        "output_path = \"Data/article_cutted\"\n",
        "with open(output_path, \"wb\") as handle:\n",
        "    pickle.dump(sentences, handle)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q2jZg5bUcVdA"
      },
      "source": [
        "## posseg (詞性, part-of-speech tagging)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8uvtpHt_cVdC"
      },
      "outputs": [],
      "source": [
        "# Exercise: while segmenting, use jieba to also extract each word's part-of-speech tag (one record is enough)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "of-wxdDFcVdD"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}