{ "cells": [
 { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" },
   "source": [ "\"Open" ] },
 { "cell_type": "markdown", "metadata": { "id": "Ke_wxqkbcXXN" },
   "source": [
    "# Text feature engineering: turning unstructured text into structured features\n",
    "\n",
    "This notebook labels each article by whether pushes outnumber boos, then builds three kinds of features from the text: simple counts (words, punctuation, question marks), bag-of-words counts, and TF-IDF weights."
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "x15qRo7vcXXf" }, "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "junGiP4XcXXi" }, "outputs": [],
   "source": [
    "# Download and unpack the course data.\n",
    "# -nc skips the download when the archive is already present and -o overwrites\n",
    "# extracted files without prompting, so re-running this cell never blocks on a\n",
    "# prompt or leaves NLP_part1_2.zip.1, .2, ... copies behind.\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_2.zip\n",
    "!unzip -q -o NLP_part1_2.zip"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "vBnffTxwcXXl" }, "outputs": [],
   "source": [
    "# Article table; later cells read its 'push', 'boo', 'content' and 'idx' columns.\n",
    "df = pd.read_csv('Data/article_preprocessed.csv')"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "iRXlwYmccXXm" }, "outputs": [],
   "source": [
    "# Load 'article_cutted' (presumably one token list per article; each entry is\n",
    "# joined with spaces before vectorizing).\n",
    "# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source.\n",
    "with open(\"Data/article_cutted\", \"rb\") as file:\n",
    "    sentences = pickle.load(file)"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "Evty9Vz7cXXo" },
   "source": [ "## Define y (push > boo)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "J_Huh8VxcXXr" }, "outputs": [],
   "source": [
    "# Drop ambiguous articles: keep only those whose push/boo gap exceeds the threshold.\n",
    "diff_threshold = 20\n",
    "df = df[(df['push'] - df['boo']).abs() > diff_threshold].copy()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "HM-XOKyVcXXt" }, "outputs": [],
   "source": [
    "# Define y: clip (push - boo) into [0, 1]. After the filter above the gap is\n",
    "# either > 20 or < -20, so this yields 1 when pushes win and 0 when boos win.\n",
    "df['type'] = np.clip(df['push'] - df['boo'], 0, 1)\n",
    "# Renumber rows 0..n-1 so later frames can be concatenated column-wise with df.\n",
    "df = df.reset_index(drop=True)"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "WxmXGpYjcXXu" }, "outputs": [],
   "source": [
    "# Class balance of the label.\n",
    "df['type'].value_counts()"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "AT0sGExFcXXw" },
   "source": [ "## Simple features" ] },
 { "cell_type": "code", "execution_count": null,
"metadata": { "id": "s-doNr3ZcXXx" }, "outputs": [],
   "source": [
    "# Word count: every run of ASCII letters/digits counts as one word and every\n",
    "# CJK character (U+4E00-U+9FFF) counts as one word.\n",
    "# http://blog.csdn.net/gatieme/article/details/43235791 (regular expressions for Chinese)\n",
    "df['word_count'] = df['content'].str.count('[a-zA-Z0-9]+') + df['content'].str.count('[\\u4e00-\\u9fff]')"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "6WNwlorScXXy" }, "outputs": [],
   "source": [
    "# Punctuation count: strip every word character and whitespace; what is left is punctuation.\n",
    "# regex=True is required: since pandas 2.0 str.replace treats the pattern as a\n",
    "# literal string by default, which would leave the text unchanged.\n",
    "df['punctuation'] = df['content'].str.replace(r'[\\w\\s]', '', regex=True)\n",
    "df['punctuation_count'] = df['punctuation'].str.len()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "lFAePuA4cXXy" }, "outputs": [],
   "source": [
    "# Question mark count: both the ASCII '?' and the full-width '\\uff1f' used in Chinese text.\n",
    "df['question_count'] = df['punctuation'].str.count('[?\\uff1f]')"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "CDxUs76KcXXz" }, "outputs": [],
   "source": [
    "# Drop the helper column; only the derived counts are kept as features.\n",
    "df = df.drop(columns=['punctuation'])"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "VmipIGyjcXX0" }, "outputs": [],
   "source": [
    "# Preview the label next to the simple features (selected by name, not by position).\n",
    "simple_cols = ['type', 'word_count', 'punctuation_count', 'question_count']\n",
    "df[simple_cols].head()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "deVBIMOMcXX0" }, "outputs": [],
   "source": [
    "# Correlation between the label and each simple feature.\n",
    "df[simple_cols].corr()"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "i55mOeiocXX1" },
   "source": [ "## Bag of words" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "crYvb51xcXX1" }, "outputs": [],
   "source": [ "from sklearn.feature_extraction.text import CountVectorizer" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "A44huksacXX2" }, "outputs": [],
   "source": [
    "# Define the transformer and fit it on the space-joined tokens of every article.\n",
    "# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or more\n",
    "# word characters, so single-character words are not counted.\n",
    "vectorizer = CountVectorizer()\n",
    "count = vectorizer.fit_transform([' '.join(x) for x in sentences])"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "CT9-9FR1cXX3" }, "outputs": [],
   "source": [ "count" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "u-32UyDccXX3" }, "outputs": [],
"source": [
    "# Save the fitted vectorizer and the count matrix in pickle format.\n",
    "with open(\"Data/article_count\", \"wb\") as file:\n",
    "    pickle.dump([vectorizer, count], file)"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "YSvl4dNDcXX4" },
   "source": [ "### Select the 10 most frequent words" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "vxKDgrRJcXX4" }, "outputs": [],
   "source": [
    "# Invert the vocabulary: column id as key, word as value.\n",
    "id2word = {v: k for k, v in vectorizer.vocabulary_.items()}"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "98Q12DXGcXX5" }, "outputs": [],
   "source": [
    "# Column-wise sum: total frequency of every word over all articles (flattened to 1-D).\n",
    "sum_ = np.asarray(count.sum(axis=0)).ravel()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "mVbp_g1UcXX5" }, "outputs": [],
   "source": [
    "# Column ids of the 10 most frequent words, most frequent first.\n",
    "most_sum_id = sum_.argsort()[::-1][:10].tolist()\n",
    "most_sum_id"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "EYnmlYSQcXX6" }, "outputs": [],
   "source": [
    "# The 10 most frequent words themselves.\n",
    "features = [id2word[i] for i in most_sum_id]\n",
    "features"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "fok13vWgcXX8" }, "outputs": [],
   "source": [
    "# Counts of the top words for the articles kept in df.\n",
    "# Rows of `count` are picked through df['idx'] (presumably each article's row\n",
    "# position in `sentences` -- confirm against the preprocessing step).\n",
    "data = pd.DataFrame(count[df['idx'].to_numpy(), :][:, most_sum_id].toarray(), columns=features)\n",
    "data.head()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "7uVNkRn_cXX_" }, "outputs": [],
   "source": [
    "# Text of the first article, to compare against the first row above.\n",
    "df.loc[0, 'content']"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "-Nf_b_ZScXYA" }, "outputs": [],
   "source": [
    "# Correlation between the label and each top word.\n",
    "# Bound to a new name so that re-running this cell does not keep prepending\n",
    "# another 'type' column to `data`.\n",
    "count_with_label = pd.concat([df['type'], data], axis=1)\n",
    "count_with_label.corr()"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "1zaVePrYcXYA" },
   "source": [ "## TF-IDF" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "do3KrYeocXYA" }, "outputs": [],
   "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer" ] },
 {
"cell_type": "code", "execution_count": null, "metadata": { "id": "N1uzP6skcXYB" }, "outputs": [],
   "source": [
    "# Define the transformer and fit it on the space-joined tokens of every article.\n",
    "# norm=None keeps the raw tf-idf weights (no per-document normalization).\n",
    "vectorizer = TfidfVectorizer(norm=None)\n",
    "tfidf = vectorizer.fit_transform([' '.join(x) for x in sentences])"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "Ey_8aXc-cXYC" }, "outputs": [],
   "source": [
    "# Save the fitted vectorizer and the tf-idf matrix in pickle format.\n",
    "with open(\"Data/article_tfidf\", \"wb\") as file:\n",
    "    pickle.dump([vectorizer, tfidf], file)"
   ] },
 { "cell_type": "markdown", "metadata": { "id": "IfXaHzAmcXYC" },
   "source": [ "### Select the 10 words with the highest average TF-IDF" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "vFDSkWWBcXYC" }, "outputs": [],
   "source": [
    "# Invert the vocabulary: column id as key, word as value.\n",
    "id2word = {v: k for k, v in vectorizer.vocabulary_.items()}"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "j3lTr_0LcXYD" }, "outputs": [],
   "source": [
    "# Document frequency: number of articles in which each word has a non-zero weight.\n",
    "doc_freq = (tfidf != 0).sum(axis=0)\n",
    "\n",
    "# Column-wise average tf-idf, taken over the articles that contain the word.\n",
    "avg = tfidf.sum(axis=0) / doc_freq\n",
    "\n",
    "# Zero out rare words (document frequency below the minimum) so they cannot rank.\n",
    "min_doc_freq = 20\n",
    "avg[doc_freq < min_doc_freq] = 0"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "KUITGa-TcXYD" }, "outputs": [],
   "source": [
    "# Flatten the 1 x n_words matrix to a 1-D array (ravel is safe to re-run).\n",
    "avg = np.asarray(avg).ravel()"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "Fwa4jRjEcXYD" }, "outputs": [],
   "source": [
    "# Column ids of the 10 words with the highest average tf-idf, highest first.\n",
    "most_avg_id = avg.argsort()[::-1][:10].tolist()\n",
    "most_avg_id"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "gGvkW4evcXYE" }, "outputs": [],
   "source": [
    "# The 10 words themselves.\n",
    "features = [id2word[i] for i in most_avg_id]\n",
    "features"
   ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "fjE1l-d0cXYc" }, "outputs": [],
   "source": [
    "# tf-idf weights of the top words for the articles kept in df.\n",
    "# Rows of `tfidf` are picked through df['idx'] (presumably each article's row\n",
    "# position in `sentences` -- confirm against the preprocessing step).\n",
    "data = pd.DataFrame(tfidf[df['idx'].to_numpy(), :][:, most_avg_id].toarray(), columns=features)\n",
    "data.head()"
   ] },
 { "cell_type":
"code", "execution_count": null, "metadata": { "id": "PcUCS3_PcXYd" }, "outputs": [],
   "source": [
    "# Correlation between the label and each top tf-idf word.\n",
    "# Bound to a new name so that re-running this cell does not keep prepending\n",
    "# another 'type' column to `data`.\n",
    "tfidf_with_label = pd.concat([df['type'], data], axis=1)\n",
    "tfidf_with_label.corr()"
   ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "colab": { "provenance": [], "gpuType": "T4", "include_colab_link": true },
  "accelerator": "GPU"
 },
 "nbformat": 4,
 "nbformat_minor": 0 }