diff --git a/Next_Word_Prediction.ipynb b/Next_Word_Prediction.ipynb
new file mode 100644
index 00000000..91517d58
--- /dev/null
+++ b/Next_Word_Prediction.ipynb
@@ -0,0 +1,2117 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Yt8X6as3vrSN"
+      },
+      "source": [
+        "### Import the required libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "wCwrGkBONxCx",
+        "outputId": "aa91faf6-8259-4ced-9777-56d9115cb462"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "510.5/510.5 kB 5.0 MB/s eta 0:00:00\n",
+            "116.3/116.3 kB 4.2 MB/s eta 0:00:00\n",
+            "194.1/194.1 kB 5.7 MB/s eta 0:00:00\n",
+            "134.8/134.8 kB 8.8 MB/s eta 0:00:00\n"
+          ]
+        }
+      ],
+      "source": [
+        "%pip install -q datasets"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "KUv8N5MCmPMq"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import re\n",
+        "\n",
+        "# The Hugging Face `datasets` library provides easy access to many ready-to-use datasets.\n",
+        "from datasets import load_dataset\n",
+        "\n",
+        "# Tokenizer converts raw text into integer token sequences.\n",
+        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+        "# pad_sequences pads all sequences to a common length.\n",
+        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+        "# to_categorical converts integer labels into one-hot vectors.\n",
+        "from tensorflow.keras.utils import to_categorical\n",
+        "\n",
+        "# Model container, layers, and callbacks for the LSTM model.\n",
+        "from tensorflow.keras.models import Sequential, load_model\n",
+        "from tensorflow.keras.layers import Embedding, Dense, LSTM\n",
+        "from tensorflow.keras.callbacks import EarlyStopping"
+      ]
+    },
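+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "*Illustrative sketch (added for clarity, not executed): how `Tokenizer`, `pad_sequences`, and `to_categorical` combine to turn raw text into next-word training pairs. The toy corpus below is a made-up placeholder, not the dataset loaded in the next section.*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Toy corpus (hypothetical placeholder; the real dataset is loaded below).\n",
+        "corpus = [\"deep learning models predict the next word\",\n",
+        "          \"language models learn word order from text\"]\n",
+        "\n",
+        "tokenizer = Tokenizer()\n",
+        "tokenizer.fit_on_texts(corpus)\n",
+        "vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding\n",
+        "\n",
+        "# Every prefix of length >= 2 becomes one example; its last token is the label.\n",
+        "sequences = []\n",
+        "for line in corpus:\n",
+        "    tokens = tokenizer.texts_to_sequences([line])[0]\n",
+        "    for i in range(2, len(tokens) + 1):\n",
+        "        sequences.append(tokens[:i])\n",
+        "\n",
+        "max_len = max(len(s) for s in sequences)\n",
+        "padded = pad_sequences(sequences, maxlen=max_len, padding='pre')\n",
+        "\n",
+        "X, y = padded[:, :-1], padded[:, -1]           # inputs: all tokens but the last\n",
+        "y = to_categorical(y, num_classes=vocab_size)  # one-hot labels for a softmax output"
+      ]
+    },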
"50a9c2e2e2ee4e46b7d0a6b5e78b044d", + "f3d1e9525c4a42648a7d1f6034655042", + "149599d7b84840a5a7668e895ac65b0c", + "83221cdfc77e4a7dafca3f0215055bc3", + "69115578238e4d9fa253bfcdca235ef6", + "8d2c41d5808c4f98bd3262cfb217a46f", + "c484aec4e4894f888454ad88f7efe1f1", + "2b6793e54bf945daa81fc0c4333e37ef", + "482c6f5769114eaa95c23fcf3036276b", + "4c5cfe9d69c64777a96fb9c1a86da12b", + "0fb8a1d979e2415181446159a681bab2", + "d15a57c81ac0438f9e211d82182afa39", + "2cc17df57d40480aa1855a621550e242", + "5d9385206cf941a1aa4cb4b056e65adb", + "0b2d126430704faca4a7422312b1e106", + "b7f70b526a71428ca70563ceb269bbaf", + "e27ab08a34394281881aac7f0645aa34", + "6bf1a0875e3f4ca6abc08e320d03db96", + "efaa0bf323374773af9307ab80b8eb9d", + "c4b34f4908ea4e879a3e0172c2f84e61", + "f7e0198f106442e29ad95e851817bfe9", + "4cad34139e6e46b8b09b0a54e1a58776" + ] + }, + "id": "Uz41WgekmPMs", + "outputId": "21d8d4b8-77bb-4467-a11c-89ef8bff7bc7" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ce79aefa8d2249e0860993c04bac6c0d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading readme: 0%| | 0.00/327 [00:00