{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "etKXWxGtcVch"
      },
      "source": [
        "# data preprocessing and word segmentation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "G_iQhOp8cVcr"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import re\n",
        "import os\n",
        "import jieba\n",
        "import jieba.posseg as pseg\n",
        "import pickle\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n45MJp2-cVcu"
      },
      "source": [
        "## data overview"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MJ-_9lblcVcy"
      },
      "outputs": [],
      "source": [
        "# Download and unpack the course data\n",
        "# (presumably provides the Data/ and jieba/ folders read by the cells below)\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_1.zip\n",
        "!unzip -q NLP_part1_1.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vqZ9ioVhcVcz"
      },
      "outputs": [],
      "source": [
        "article = pd.read_csv('Data/article_practice.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sLo7xiDHcVc0"
      },
      "outputs": [],
      "source": [
        "article.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3k2zgPj6cVc1"
      },
      "outputs": [],
      "source": [
        "# filter rules\n",
        "# Strip URLs from the text. regex=True is required: since pandas 2.0,\n",
        "# Series.str.replace treats the pattern as a literal string by default,\n",
        "# which would silently leave every URL in place.\n",
        "article['content'] = article['content'].str.replace(r'https?:\\/\\/\\S*', '', regex=True)\n",
        "# Mark content that is now an empty string as missing so the next cell drops it.\n",
        "article['content'] = article['content'].replace('', np.nan)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yzmx6cTKcVc2"
      },
      "outputs": [],
      "source": [
        "# remove data: drop rows with a missing value in any column, then renumber\n",
        "article = article.dropna()\n",
        "article = article.reset_index(drop=True)\n",
        "article['idx'] = article.index"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FiRg9w6UcVc4"
      },
      "outputs": [],
      "source": [
        "article.to_csv('Data/article_preprocessed.csv', index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DRqflkHNcVc5"
      },
      "source": [
        "# jieba"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OCL0l6BecVc6"
      },
      "source": [
        "## word segmentation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PmqdzeuGcVc7"
      },
      "outputs": [],
      "source": [
        "# set dictionary (can define yourself)\n",
        "jieba.set_dictionary('jieba/dict.txt.big')\n",
        "# Read the stop words with an explicit encoding (the default is platform\n",
        "# dependent) and close the file as soon as it has been read.\n",
        "with open('jieba/stop_words.txt', encoding='utf-8') as stop_words_file:\n",
        "    stop_words = stop_words_file.read().splitlines()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "he5roFaOcVc8"
      },
      "outputs": [],
      "source": [
        "print(stop_words[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "puc6sWd6cVc9"
      },
      "outputs": [],
      "source": [
        "data = pd.read_csv('Data/article_preprocessed.csv')\n",
        "data = data['content'].tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uJW-RFfJcVc9"
      },
      "outputs": [],
      "source": [
        "print(data[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0nU9GSfVcVc-"
      },
      "outputs": [],
      "source": [
        "# A set gives constant-time membership tests; the stop-word list would be\n",
        "# rescanned from the start for every single token.\n",
        "stop_word_set = set(stop_words)\n",
        "# Compiled once; match() anchors at the start, so this flags tokens that begin with a digit.\n",
        "leading_digits = re.compile('[0-9]+')\n",
        "\n",
        "sentences = []\n",
        "\n",
        "for i, text in enumerate(data):\n",
        "    # remove stopwords and digits\n",
        "    # can define your own rules\n",
        "    line = [\n",
        "        w for w in jieba.cut(text, cut_all=False)\n",
        "        if w not in stop_word_set and not leading_digits.match(w)\n",
        "    ]\n",
        "    sentences.append(line)\n",
        "\n",
        "    # progress report every 10000 documents\n",
        "    if i % 10000 == 0:\n",
        "        print(i, '/', len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WB-csKnhcVc-"
      },
      "outputs": [],
      "source": [
        "print(sentences[0:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "eTJfuPzncVc_"
      },
      "outputs": [],
      "source": [
        "# save data as pickle format\n",
        "with open(\"Data/article_cutted\", \"wb\") as file:\n",
        "    pickle.dump(sentences, file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q2jZg5bUcVdA"
      },
      "source": [
        "## posseg (詞性)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8uvtpHt_cVdC"
      },
      "outputs": [],
      "source": [
        "# Exercise: while segmenting, use jieba to extract the part-of-speech tags (one record is enough)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "of-wxdDFcVdD"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}