diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 74e8097bb..fcd544598 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -27,14 +27,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpt2 optimized for Cloud AI 100 \n", + " QEFFAutoModelForCausalLM\n", + "QEffGPT2LMHeadModel(\n", + " (transformer): QEffGPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x QEffGPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): QEffGPT2Attention(\n", + " (c_attn): Conv1D(nf=2304, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=768)\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D(nf=3072, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=3072)\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "# Initiate the Original Transformer model\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", "\n", + "# Initiate the tokenizer for transformers library\n", + "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", @@ -58,10 +108,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0b293196-ba44-460e-94fb-4378283bc196", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/GPT2LMHeadModel.onnx')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# We can now export the modified models to ONNX framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", @@ -84,19 +145,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/qpc-46bd7fd6377ab8fb/qpc')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(\n", - " num_cores=14,\n", - " mxfp6=True,\n", - " device_group=[0],\n", - ")" + "qeff_model.compile(num_cores=14, mxfp6_matmul=True)" ] }, { @@ -109,21 +177,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'AutoTokenizer' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n\u001b[1;32m 4\u001b[0m qeff_model\u001b[38;5;241m.\u001b[39mgenerate(prompts\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy name is\u001b[39m\u001b[38;5;124m\"\u001b[39m], tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n", + "\u001b[0;31mNameError\u001b[0m: name 'AutoTokenizer' is not defined" + ] + } + ], "source": [ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "\n", - "qeff_model.generate(prompts=[\"My name is\"])" + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bab713e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "qeff_env", "language": "python", "name": "python3" }, @@ -137,7 +225,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.19" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index d1a1f3c5f..9fcb75ecf 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -29,12 +29,27 @@ "execution_count": null, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "Fetching 2 files: 0%| | 0/2 [00:00