{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "J9qjwDRGcWzo" }, "source": [ "# text's feature engineering: turn unstructure to structure" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Oywp_-4UcWzz" }, "outputs": [], "source": [ "import re\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c36uTsHacWz9" }, "outputs": [], "source": [ "# 上傳資料\n", "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_2.zip\n", "!unzip -q NLP_part1_2.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2xz5vyXscW0D" }, "outputs": [], "source": [ "df = pd.read_csv('Data/article_preprocessed.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sCl5GCP2cW0G" }, "outputs": [], "source": [ "# load 'article_cutted'\n", "with open(\"Data/article_cutted\", \"rb\") as file:\n", " sentences = pickle.load(file)" ] }, { "cell_type": "markdown", "metadata": { "id": "kopjni38cW0H" }, "source": [ "## define y (push > boo)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OAS56d1fcW0I" }, "outputs": [], "source": [ "# drop data\n", "diff_threshold = 20\n", "df = df[abs(df['push']-df['boo']) > diff_threshold].copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_mTXst7ucW0K" }, "outputs": [], "source": [ "# define y\n", "df['type'] = np.clip(df['push']-df['boo'], 0, 1)\n", "df = df.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "L4EhnEo_cW0L" }, "outputs": [], "source": [ "df['type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": { "id": "oVLVCnFCcW0M" }, "source": [ "## simple feature" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c9vW5ze4cW0N" }, "outputs": [], "source": [ "# word count\n", "# http://blog.csdn.net/gatieme/article/details/43235791 (中文正則表達式)\n", "df['word_count'] = df['content'].str.count('[a-zA-Z0-9]+') + df['content'].str.count('[\\u4e00-\\u9fff]')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-khlaLHycW0O" }, "outputs": [], "source": [ "# punctuation count\n", "# 請產生 \"標點符號數\" 欄位" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xY0LW5lScW0R" }, "outputs": [], "source": [ "# question mark count\n", "# 請產生 \"問號數\" 欄位" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Kuo7VReNcW0T" }, "outputs": [], "source": [ "# 可以自由發揮想像還有什麼 features , etc. 比例" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "J5WIlIodcW0T" }, "outputs": [], "source": [ "# drop punctuation column\n", "df = df.drop(['punctuation'],axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b8Tb4FeccW0U" }, "outputs": [], "source": [ "df.iloc[:5, -4:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yj-eUs7RcW0U" }, "outputs": [], "source": [ "# compute correlation\n", "# 請計算前面建立好的 features 與 type 的 correlation" ] }, { "cell_type": "markdown", "metadata": { "id": "DaysfaAkcW0V" }, "source": [ "## bag of words" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "azZyrBJpcW0V" }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GqY-FGNLcW0W" }, "outputs": [], "source": [ "# define transformer (轉換器)\n", "vectorizer = CountVectorizer()\n", "count = vectorizer.fit_transform([' '.join(x) for x in sentences])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sWJAGCGlcW0X" }, "outputs": [], "source": [ "count" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Tx6OBEKXcW0X" }, "outputs": [], "source": [ "# save data as pickle format\n", "with open(\"Data/article_count\", \"wb\") as file:\n", " pickle.dump([vectorizer, count], file)" ] }, { "cell_type": "markdown", "metadata": { "id": "TlQxSqoBcW0Y" }, "source": [ "### select top 10 frequency of words" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mtcF-sfmcW0Z" }, "outputs": [], "source": [ "# 用前面轉換好的 count matrix, 嘗試抽出 top 10 count 最多的關鍵字\n", "# 建立 top 10 count 關鍵字的 bag of words features, 計算其與 type 的 correlation" ] }, { "cell_type": "markdown", "metadata": { "id": "I27pNv5GcW0Z" }, "source": [ "# TF-IDF" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ocnLYq1gcW0b" }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Oe7ZDsdycW0c" }, "outputs": [], "source": [ "# define transformer (轉換器)\n", "vectorizer = TfidfVectorizer(norm=None) ## do not do normalize\n", "tfidf = vectorizer.fit_transform([' '.join(x) for x in sentences])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OI9h0gEecW0f" }, "outputs": [], "source": [ "# save data as pickle format\n", "with open(\"Data/article_tfidf\", \"wb\") as file:\n", " pickle.dump([vectorizer, tfidf], file)" ] }, { "cell_type": "markdown", "metadata": { "id": "9lUohGYhcW0f" }, "source": [ "### select top 10 average tf-idf of words" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-jEBjnXAcW0g" }, "outputs": [], "source": [ "# 用前面轉換好的 tfidf matrix, 嘗試抽出 top 10 平均 tfidf 最高的關鍵字\n", "# 建立 top 10 平均 tfidf 關鍵字的 tf-idf features, 計算其與 type 的 correlation" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "colab": { "provenance": [], "gpuType": "T4" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }