{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
   },
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "iFxoavs6v6Gw"
   },
   "source": [
    "# the application of word2vec\n",
    "\n",
    "Load a saved word2vec model (`word2vec_model/CBOW`), query it for similar words and word relationships, then group the most frequent words with K-means clustering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NoC3oHV8v6G4"
   },
   "outputs": [],
   "source": [
    "from gensim.models import word2vec\n",
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6OEo-1a4v6G6"
   },
   "outputs": [],
   "source": [
    "# Download and unpack the data.\n",
    "# -nc / -n skip files that already exist, so re-running this cell is harmless.\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_4.zip\n",
    "!unzip -q -n NLP_part2_4.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "XvLfZDvHv6G8"
   },
   "outputs": [],
   "source": [
    "# Load the saved word2vec model from the unpacked archive.\n",
    "model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "JFs4ADhHv6G8"
   },
   "source": [
    "## similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9Mg-VH-cv6G9"
   },
   "outputs": [],
   "source": [
    "# Get the words most similar to a given word.\n",
    "# The keyword can be replaced with another word.\n",
    "model.wv.most_similar('KMT')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5i99ojIGv6G-"
   },
   "outputs": [],
   "source": [
    "# Get the words most similar to a relationship between the given words\n",
    "# (vectors of `positive` words are added, `negative` ones subtracted).\n",
    "# The keywords can be replaced with other words.\n",
    "model.wv.most_similar(positive=['KMT', '綠吱'], negative=['DPP'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-MYpP-jlv6G-"
   },
   "source": [
    "## clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true,
    "id": "3AoozOTOv6G_"
   },
   "outputs": [],
   "source": [
    "# Build a dictionary with words as keys and their counts as values.\n",
    "word_counts = {word: model.wv.get_vecattr(word, \"count\") for word in model.wv.index_to_key}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PNlJH1fiv6HA"
   },
   "outputs": [],
   "source": [
    "# Sort by count (descending) and keep the TOP_N_WORDS most frequent words.\n",
    "# A separate name (word_counts -> top_words) keeps this cell re-runnable.\n",
    "TOP_N_WORDS = 10000\n",
    "top_words = sorted(word_counts, key=word_counts.get, reverse=True)[:TOP_N_WORDS]\n",
    "top_words[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2u7DZB3rv6HA"
   },
   "outputs": [],
   "source": [
    "# Extract the word vectors of the selected words.\n",
    "vecs = model.wv[top_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "z7bOrbd0v6HB"
   },
   "outputs": [],
   "source": [
    "# Run the clustering algorithm.\n",
    "# random_state makes the cluster assignment reproducible across runs;\n",
    "# n_init is set explicitly because its default differs between scikit-learn versions.\n",
    "N_CLUSTERS = 50\n",
    "SEED = 42\n",
    "kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED)\n",
    "cluster = kmeans.fit_predict(vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "onbvWYxxv6HB"
   },
   "outputs": [],
   "source": [
    "# Show the result: one row per word with the cluster it was assigned to.\n",
    "df = pd.DataFrame({'words': top_words, 'no. cluster': cluster})\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Z4berA4Tv6HC"
   },
   "outputs": [],
   "source": [
    "# Show the words of every cluster: one column per cluster number.\n",
    "# Clusters differ in size, so shorter columns are padded with NaN by the concat.\n",
    "data = pd.concat(\n",
    "    [d[['words']].reset_index(drop=True).rename(columns={'words': k})\n",
    "     for k, d in df.groupby('no. cluster')],\n",
    "    axis=1)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JtDvXsKbv6HD"
   },
   "outputs": [],
   "source": [
    "# Try tuning the KMeans parameters, e.g. what happens with N_CLUSTERS = 100?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "colab": {
   "provenance": [],
   "include_colab_link": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}