{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part1/01-2_preprocess_ans.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZK7YdhYydKxk"
      },
      "source": [
        "# Data preprocessing and word segmentation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UWI-L0A0dKxt"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import re\n",
        "import os\n",
        "import jieba\n",
        "import jieba.posseg as pseg\n",
        "import pickle\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zjuO6wJ0dKxw"
      },
      "source": [
        "## data overview"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NuHa0j0PdKxy"
      },
      "outputs": [],
      "source": [
        "# download the data archive and unzip it\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_1.zip\n",
        "!unzip -q NLP_part1_1.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ppr55NMJdKxz"
      },
      "outputs": [],
      "source": [
        "article = pd.read_csv('Data/article_practice.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ng2kaQDcdKx0"
      },
      "outputs": [],
      "source": [
        "article.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pWvx48LvdKx1"
      },
      "outputs": [],
      "source": [
        "# filter rules: strip http/https URLs from every post\n",
        "# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string by default\n",
        "article['content'] = article['content'].str.replace(r'https?://\\S*', '', regex=True)\n",
        "# posts whose content is now an empty string are marked as missing (NaN)\n",
        "article['content'] = article['content'].replace('', np.nan)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ojSczORXdKx3"
      },
      "outputs": [],
      "source": [
        "# drop rows with missing values, renumber the index from 0,\n",
        "# and store the new row number in an 'idx' column\n",
        "article = (\n",
        "    article\n",
        "    .dropna()\n",
        "    .reset_index(drop=True)\n",
        "    .assign(idx=lambda frame: frame.index)\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ewNCCiN3dKx4"
      },
      "outputs": [],
      "source": [
        "article.to_csv('Data/article_preprocessed.csv', index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Og3HOz6TdKx5"
      },
      "source": [
        "# jieba"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ndHxP7LrdKx6"
      },
      "source": [
        "## cut word"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nLVHsbL0dKx6"
      },
      "outputs": [],
      "source": [
        "# set dictionary (can define yourself)\n",
        "jieba.set_dictionary('jieba/dict.txt.big')\n",
        "\n",
        "# read the stop-word list, one entry per line; the with-block closes the file\n",
        "# and the explicit encoding avoids depending on the platform default\n",
        "# NOTE(review): assumes stop_words.txt is UTF-8 encoded -- confirm\n",
        "with open('jieba/stop_words.txt', encoding='utf-8') as stop_words_file:\n",
        "    stop_words = stop_words_file.read().splitlines()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZKxHTMMGdKx7"
      },
      "outputs": [],
      "source": [
        "print(stop_words[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TxYrIDQ3dKx7"
      },
      "outputs": [],
      "source": [
        "# reload the preprocessed articles and keep the 'content' column as a plain list\n",
        "data = pd.read_csv('Data/article_preprocessed.csv')['content'].tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sOvXXw6BdKx7"
      },
      "outputs": [],
      "source": [
        "print(data[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "B_C7e_WxdKx8"
      },
      "outputs": [],
      "source": [
        "# build the lookup helpers once, outside the loop:\n",
        "# a set gives constant-time stop-word checks (the list was scanned for every token)\n",
        "stop_word_set = set(stop_words)\n",
        "leading_digits = re.compile('[0-9]+')\n",
        "\n",
        "sentences = []\n",
        "\n",
        "for i, text in enumerate(data):\n",
        "    # keep a token unless it is a stop word or starts with a digit\n",
        "    # (match anchors at the start of the token); can define your own rules\n",
        "    line = [\n",
        "        w for w in jieba.cut(text, cut_all=False)\n",
        "        if w not in stop_word_set and not leading_digits.match(w)\n",
        "    ]\n",
        "    sentences.append(line)\n",
        "\n",
        "    # progress report every 10000 articles\n",
        "    if i % 10000 == 0:\n",
        "        print(i, '/', len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pgs7wXB9dKx8"
      },
      "outputs": [],
      "source": [
        "print(sentences[0:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "v5VQSuAKdKx8"
      },
      "outputs": [],
      "source": [
        "# serialize the tokenized articles to disk with pickle\n",
        "with open(\"Data/article_cutted\", \"wb\") as fout:\n",
        "    pickle.dump(sentences, fout)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NWxj3wqTdKx8"
      },
      "source": [
        "## posseg (詞性, part-of-speech tagging)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0GyK7smxdKx9"
      },
      "outputs": [],
      "source": [
        "import jieba.posseg as pseg"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "KQ5U9iozdKx9"
      },
      "outputs": [],
      "source": [
        "# print each token of the first article next to the flag pseg assigns to it\n",
        "for word, flag in pseg.cut(data[0]):\n",
        "    print(word, ' ', flag)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "frS0DaE6dKx-"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wjPERqrHdKx-"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}