{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "id": "view-in-github", "colab_type": "text" },
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "ZK7YdhYydKxk" },
   "source": [
    "# data preprocess and words cut"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "UWI-L0A0dKxt" },
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import re\n",
    "\n",
    "import jieba\n",
    "import jieba.posseg as pseg\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "zjuO6wJ0dKxw" },
   "source": [
    "## data overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "NuHa0j0PdKxy" },
   "outputs": [],
   "source": [
    "# download and unpack the course data\n",
    "# -nc skips the download when the zip already exists; -o overwrites without prompting,\n",
    "# so this cell can be re-run safely\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_1.zip\n",
    "!unzip -q -o NLP_part1_1.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "Ppr55NMJdKxz" },
   "outputs": [],
   "source": [
    "article = pd.read_csv('Data/article_practice.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "Ng2kaQDcdKx0" },
   "outputs": [],
   "source": [
    "article.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "pWvx48LvdKx1" },
   "outputs": [],
   "source": [
    "# filter rules: strip URLs, then turn empty content into NaN so it can be dropped\n",
    "# regex=True must be explicit: since pandas 2.0 str.replace matches the pattern literally by default\n",
    "article['content'] = article['content'].str.replace(r'https?://\\S*', '', regex=True)\n",
    "article['content'] = article['content'].replace('', np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "ojSczORXdKx3" },
   "outputs": [],
   "source": [
    "# remove rows that contain a missing value in any column, then renumber the rows\n",
    "article = article.dropna().reset_index(drop=True)\n",
    "# keep the new row position as an explicit column\n",
    "article['idx'] = article.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "ewNCCiN3dKx4" },
   "outputs": [],
   "source": [
    "article.to_csv('Data/article_preprocessed.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "Og3HOz6TdKx5" },
   "source": [
    "# jieba"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "ndHxP7LrdKx6" },
   "source": [
    "## cut word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "nLVHsbL0dKx6" },
   "outputs": [],
   "source": [
    "# set dictionary (can define yourself)\n",
    "jieba.set_dictionary('jieba/dict.txt.big')\n",
    "\n",
    "# read the stop-word list, one entry per line; 'with' closes the file handle\n",
    "# NOTE(review): assumes the file is UTF-8 encoded - confirm\n",
    "with open('jieba/stop_words.txt', encoding='utf-8') as f:\n",
    "    stop_words = f.read().splitlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "ZKxHTMMGdKx7" },
   "outputs": [],
   "source": [
    "print(stop_words[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "TxYrIDQ3dKx7" },
   "outputs": [],
   "source": [
    "# list of article texts, in file order\n",
    "data = pd.read_csv('Data/article_preprocessed.csv')['content'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "sOvXXw6BdKx7" },
   "outputs": [],
   "source": [
    "print(data[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "B_C7e_WxdKx8" },
   "outputs": [],
   "source": [
    "# built once, outside the loop: set membership is O(1) versus a scan of the list per token\n",
    "stop_word_set = set(stop_words)\n",
    "leading_digits = re.compile('[0-9]+')\n",
    "\n",
    "sentences = []\n",
    "\n",
    "for i, text in enumerate(data):\n",
    "    # remove stopwords and tokens that start with a digit\n",
    "    # can define your own rules\n",
    "    line = [\n",
    "        w for w in jieba.cut(text, cut_all=False)\n",
    "        if w not in stop_word_set and not leading_digits.match(w)\n",
    "    ]\n",
    "    sentences.append(line)\n",
    "\n",
    "    # progress report every 10000 articles\n",
    "    if i % 10000 == 0:\n",
    "        print(i, '/', len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "pgs7wXB9dKx8" },
   "outputs": [],
   "source": [
    "print(sentences[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "scrolled": true, "id": "v5VQSuAKdKx8" },
   "outputs": [],
   "source": [
    "# save data as pickle format\n",
    "with open(\"Data/article_cutted\", \"wb\") as file:\n",
    "    pickle.dump(sentences, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "NWxj3wqTdKx8" },
   "source": [
    "## posseg (詞性)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "scrolled": true, "id": "KQ5U9iozdKx9" },
   "outputs": [],
   "source": [
    "# print every token of the first article next to the flag posseg assigns to it\n",
    "for w, f in pseg.cut(data[0]):\n",
    "    print(w, ' ', f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "colab": { "provenance": [], "gpuType": "T4", "include_colab_link": true },
  "accelerator": "GPU"
 },
 "nbformat": 4,
 "nbformat_minor": 0
}