{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "792c17a7",
   "metadata": {
    "id": "792c17a7"
   },
   "source": [
    "# **ConvNeXt**\n",
    "此份程式碼會介紹如何使用 tf.keras 的方式建構 ConvNeXt 的模型架構。"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5168150",
   "metadata": {
    "id": "e5168150"
   },
   "source": [
    "![image](https://hackmd.io/_uploads/r1UV1gH_a.png)\n",
    "\n",
    "- [source paper](https://arxiv.org/abs/2201.03545)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e093fb85",
   "metadata": {
    "id": "e093fb85"
   },
   "source": [
    "## 匯入套件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d0b571e",
   "metadata": {
    "id": "2d0b571e"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# TensorFlow-related packages\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras import datasets, layers, Model, Sequential, losses"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbdf4c9b",
   "metadata": {
    "id": "fbdf4c9b"
   },
   "source": [
    "## 載入資料集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3af4a308",
   "metadata": {
    "id": "3af4a308"
   },
   "outputs": [],
   "source": [
    "(x_train, y_train), (x_test, y_test) = datasets.mnist.load_data()\n",
    "\n",
    "# Expand dimensions: append a channel axis as the 4th dimension\n",
    "x_train = tf.expand_dims(x_train, axis=3)\n",
    "x_test = tf.expand_dims(x_test, axis=3)\n",
    "print(f'x_train shape: {x_train.shape}')\n",
    "print(f'x_test shape: {x_test.shape}')\n",
    "print('----------')\n",
    "\n",
    "# Grayscale to RGB: repeat the single channel three times\n",
    "x_train = tf.repeat(x_train, 3, axis=3)\n",
    "x_test = tf.repeat(x_test, 3, axis=3)\n",
    "print(f'x_train shape: {x_train.shape}')\n",
    "print(f'x_test shape: {x_test.shape}')\n",
    "print('----------')\n",
    "\n",
    "# Split dataset into training (first 80%) and validation (last 20%) data.\n",
    "# The split index is computed once so images and labels are cut at the same place.\n",
    "split_idx = int(x_train.shape[0] * 0.8)\n",
    "x_train, x_val = x_train[:split_idx], x_train[split_idx:]\n",
    "y_train, y_val = y_train[:split_idx], y_train[split_idx:]\n",
    "print(f'x_train shape: {x_train.shape}, x_val shape: {x_val.shape}')\n",
    "print(f'y_train shape: {y_train.shape}, y_val shape: {y_val.shape}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be847e45",
   "metadata": {
    "id": "be847e45"
   },
   "source": [
    "## ConvNeXt Architecture"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e02381e",
   "metadata": {
    "id": "6e02381e"
   },
   "source": [
    "![image](https://hackmd.io/_uploads/rksNyeru6.png)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "420691fd",
   "metadata": {
    "id": "420691fd"
   },
   "outputs": [],
   "source": [
    "# Number of units in the final softmax layer (one per class)\n",
    "labels_num = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbecd35b",
   "metadata": {
    "id": "bbecd35b"
   },
   "outputs": [],
   "source": [
    "def ConvNeXtBlock(x, filter_num, block_num):\n",
    "    \"\"\"Apply `block_num` residual ConvNeXt blocks to `x`.\n",
    "\n",
    "    Each block is: 7x7 depthwise conv -> LayerNormalization -> 1x1 conv\n",
    "    expanding to 4 * filter_num channels with GELU -> 1x1 conv back to\n",
    "    filter_num channels -> residual add with the block input.\n",
    "\n",
    "    Args:\n",
    "        x: Input feature map. It must already have `filter_num` channels,\n",
    "            otherwise the residual `Add` cannot combine the two tensors.\n",
    "        filter_num: Number of output channels of every block.\n",
    "        block_num: Number of blocks to stack; 0 returns `x` unchanged.\n",
    "\n",
    "    Returns:\n",
    "        The output tensor of the last block.\n",
    "    \"\"\"\n",
    "    for _ in range(block_num):\n",
    "        # depthwise conv\n",
    "        depthwise = layers.DepthwiseConv2D((7, 7), padding='same')(x)\n",
    "        depthwise = layers.LayerNormalization(epsilon=1e-6)(depthwise)\n",
    "\n",
    "        # pointwise conv: expand to 4x the channels, then project back\n",
    "        pointwise = layers.Conv2D(4 * filter_num, (1, 1),\n",
    "                                  strides=(1, 1),\n",
    "                                  padding='same',\n",
    "                                  activation='gelu')(depthwise)\n",
    "        pointwise = layers.Conv2D(filter_num, (1, 1),\n",
    "                                  strides=(1, 1),\n",
    "                                  padding='same')(pointwise)\n",
    "\n",
    "        # skip connection; the sum becomes the input of the next block\n",
    "        x = layers.Add()([x, pointwise])\n",
    "    # Return `x` rather than a loop-local name so block_num == 0 is well-defined\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2518b8bc",
   "metadata": {
    "id": "2518b8bc"
   },
   "outputs": [],
   "source": [
    "def Downsample(x, filter_num):\n",
    "    \"\"\"Normalize `x`, then reduce its spatial size with a strided conv.\n",
    "\n",
    "    Applies LayerNormalization followed by a 2x2 convolution with stride 2\n",
    "    that outputs `filter_num` channels.\n",
    "\n",
    "    Args:\n",
    "        x: Input feature map.\n",
    "        filter_num: Number of output channels.\n",
    "\n",
    "    Returns:\n",
    "        The downsampled feature map.\n",
    "    \"\"\"\n",
    "    x = layers.LayerNormalization(epsilon=1e-6)(x)\n",
    "    output = layers.Conv2D(filter_num, (2, 2),\n",
    "                           strides=(2, 2),\n",
    "                           padding='same')(x)\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7fc1e5b",
   "metadata": {
    "id": "a7fc1e5b"
   },
   "outputs": [],
   "source": [
    "# Channels and number of blocks for each of the four stages\n",
    "filter_list = [96, 192, 384, 768]\n",
    "block_list = [3, 3, 9, 3]\n",
    "\n",
    "tf.keras.backend.clear_session()\n",
    "inputs = layers.Input(shape=x_train.shape[1:])\n",
    "# Resize to 224x224. The input shape is already declared by `Input` above,\n",
    "# so it is not repeated on this layer.\n",
    "x = layers.Resizing(224, 224, interpolation=\"bilinear\")(inputs)\n",
    "\n",
    "# Stem: 4x4 conv with stride 4, followed by LayerNormalization\n",
    "x = layers.Conv2D(filter_list[0], (4, 4),\n",
    "                  strides=(4, 4),\n",
    "                  padding='same')(x)\n",
    "x = layers.LayerNormalization(epsilon=1e-6)(x)\n",
    "x = ConvNeXtBlock(x, filter_list[0], block_list[0])\n",
    "\n",
    "# Remaining stages: downsample first, then stack the blocks\n",
    "for filter_num, block_num in zip(filter_list[1:], block_list[1:]):\n",
    "    x = Downsample(x, filter_num)\n",
    "    x = ConvNeXtBlock(x, filter_num, block_num)\n",
    "\n",
    "# Classification head\n",
    "x = layers.GlobalAveragePooling2D()(x)\n",
    "x = layers.LayerNormalization(epsilon=1e-6)(x)\n",
    "outputs = layers.Dense(labels_num, activation='softmax')(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84af9190",
   "metadata": {
    "id": "84af9190"
   },
   "outputs": [],
   "source": [
    "ConvNeXt_model = Model(inputs=inputs, outputs=outputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "613215ac",
   "metadata": {
    "id": "613215ac"
   },
   "outputs": [],
   "source": [
    "ConvNeXt_model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a97ebae0",
   "metadata": {
    "id": "a97ebae0"
   },
   "outputs": [],
   "source": [
    "# Smoke test: run a dummy batch through the model.\n",
    "# A separate name is used so the Keras `inputs` tensor is not overwritten,\n",
    "# which keeps the `Model(inputs=inputs, ...)` cell re-runnable.\n",
    "batch_size = 4\n",
    "dummy_batch = np.ones((batch_size, *ConvNeXt_model.input_shape[1:]),\n",
    "                      dtype=np.float32)\n",
    "preds = ConvNeXt_model(dummy_batch)\n",
    "assert tuple(preds.shape) == (batch_size, labels_num)\n",
    "preds"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}