|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "## RULE-BASED RETRIEVAL" |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "markdown", |
| 12 | + "metadata": {}, |
| 13 | + "source": [ |
| 14 | + "### aka why won't my LLM do what I tell it to when I tell it to?" |
| 15 | + ] |
| 16 | + }, |
| 17 | + { |
| 18 | + "cell_type": "markdown", |
| 19 | + "metadata": {}, |
| 20 | + "source": [ |
| 21 | + "### SETUP" |
| 22 | + ] |
| 23 | + }, |
| 24 | + { |
| 25 | + "cell_type": "code", |
| 26 | + "execution_count": 1, |
| 27 | + "metadata": {}, |
| 28 | + "outputs": [ |
| 29 | + { |
| 30 | + "name": "stderr", |
| 31 | + "output_type": "stream", |
| 32 | + "text": [ |
| 33 | + "/Users/tomsmoker/Projects/whyhow/rule-based-retrieval/venv/lib/python3.10/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
| 34 | + " from tqdm.autonotebook import tqdm\n" |
| 35 | + ] |
| 36 | + } |
| 37 | + ], |
| 38 | + "source": [ |
| 39 | + "import logging\n", |
| 40 | + "\n", |
| 41 | + "from pinecone import PodSpec\n", |
| 42 | + "\n", |
| 43 | + "from whyhow_rbr import Client, Rule, IndexNotFoundException" |
| 44 | + ] |
| 45 | + }, |
| 46 | + { |
| 47 | + "cell_type": "code", |
| 48 | + "execution_count": 2, |
| 49 | + "metadata": {}, |
| 50 | + "outputs": [], |
| 51 | + "source": [ |
| 52 | + "# Configure parameters\n", |
| 53 | + "index_name = \"whyhow-demo\"\n", |
| 54 | + "namespace = \"BC-CS688\"\n", |
| 55 | + "pdfs = [\"../data/full_book_one.pdf\"]" |
| 56 | + ] |
| 57 | + }, |
| 58 | + { |
| 59 | + "cell_type": "code", |
| 60 | + "execution_count": 3, |
| 61 | + "metadata": {}, |
| 62 | + "outputs": [], |
| 63 | + "source": [ |
| 64 | + "# Logging\n", |
| 65 | + "logging_level = logging.INFO\n", |
| 66 | + "\n", |
| 67 | + "logging.basicConfig(\n", |
| 68 | + " level=logging.WARNING,\n", |
| 69 | + " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", |
| 70 | + ")\n", |
| 71 | + "logger = logging.getLogger(\"create_index\")\n", |
| 72 | + "logger.setLevel(logging_level)" |
| 73 | + ] |
| 74 | + }, |
| 75 | + { |
| 76 | + "cell_type": "code", |
| 77 | + "execution_count": 4, |
| 78 | + "metadata": {}, |
| 79 | + "outputs": [], |
| 80 | + "source": [ |
| 81 | + "# Initialize client\n", |
| 82 | + "client = Client()" |
| 83 | + ] |
| 84 | + }, |
| 85 | + { |
| 86 | + "cell_type": "code", |
| 87 | + "execution_count": 5, |
| 88 | + "metadata": {}, |
| 89 | + "outputs": [ |
| 90 | + { |
| 91 | + "name": "stderr", |
| 92 | + "output_type": "stream", |
| 93 | + "text": [ |
| 94 | + "2024-04-01 16:10:36,135 - INFO - create_index - Index whyhow-demo already exists, reusing it\n" |
| 95 | + ] |
| 96 | + } |
| 97 | + ], |
| 98 | + "source": [ |
| 99 | + "try:\n", |
| 100 | + " index = client.get_index(index_name)\n", |
| 101 | + " logger.info(f\"Index {index_name} already exists, reusing it\")\n", |
| 102 | + "except IndexNotFoundException:\n", |
| 103 | + " spec = PodSpec(environment=\"gcp-starter\")\n", |
| 104 | + " index = client.create_index(index_name, spec=spec)\n", |
| 105 | + " logger.info(f\"Index {index_name} created\")" |
| 106 | + ] |
| 107 | + }, |
| 108 | + { |
| 109 | + "cell_type": "code", |
| 110 | + "execution_count": 6, |
| 111 | + "metadata": {}, |
| 112 | + "outputs": [ |
| 113 | + { |
| 114 | + "name": "stderr", |
| 115 | + "output_type": "stream", |
| 116 | + "text": [ |
| 117 | + "Upserted vectors: 100%|██████████| 1156/1156 [00:08<00:00, 133.76it/s]\n" |
| 118 | + ] |
| 119 | + } |
| 120 | + ], |
| 121 | + "source": [ |
| 122 | + "# Upload, split, chunk, and vectorize documents in Pinecone\n", |
| 123 | + "client.upload_documents(index=index, documents=pdfs, namespace=namespace)" |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "markdown", |
| 128 | + "metadata": {}, |
| 129 | + "source": [ |
| 130 | + "### RULES" |
| 131 | + ] |
| 132 | + }, |
| 133 | + { |
| 134 | + "cell_type": "code", |
| 135 | + "execution_count": 29, |
| 136 | + "metadata": {}, |
| 137 | + "outputs": [], |
| 138 | + "source": [ |
| 139 | + "rules = [\n", |
| 140 | + " Rule(\n", |
| 141 | + " # Replace with your filename\n", |
| 142 | + " filename=\"full_book_one.pdf\",\n", |
| 143 | + " page_numbers=[40],\n", |
| 144 | + " keywords=['friends']\n", |
| 145 | + " ),\n", |
| 146 | + " Rule(\n", |
| 147 | + " # Replace with your filename\n", |
| 148 | + " filename=\"doc2.pdf\",\n", |
| 149 | + " page_numbers=[2],\n", |
| 150 | + " keywords=[],\n", |
| 151 | + " )\n", |
| 152 | + "]" |
| 153 | + ] |
| 154 | + }, |
| 155 | + { |
| 156 | + "cell_type": "code", |
| 157 | + "execution_count": 35, |
| 158 | + "metadata": {}, |
| 159 | + "outputs": [], |
| 160 | + "source": [ |
| 161 | + "question = \"Who does Harry know? Like who are his friends?\"\n", |
| 162 | + "top_k = 5" |
| 163 | + ] |
| 164 | + }, |
| 165 | + { |
| 166 | + "cell_type": "code", |
| 167 | + "execution_count": 36, |
| 168 | + "metadata": {}, |
| 169 | + "outputs": [], |
| 170 | + "source": [ |
| 171 | + "result = client.query(\n", |
| 172 | + " question=question,\n", |
| 173 | + " index=index,\n", |
| 174 | + " namespace=namespace,\n", |
| 175 | + " rules=rules,\n", |
| 176 | + " top_k=top_k,\n", |
| 177 | + " process_rules_separately=False,\n", |
| 178 | + " keyword_trigger=False\n", |
| 179 | + ")" |
| 180 | + ] |
| 181 | + }, |
| 182 | + { |
| 183 | + "cell_type": "code", |
| 184 | + "execution_count": 37, |
| 185 | + "metadata": {}, |
| 186 | + "outputs": [ |
| 187 | + { |
| 188 | + "name": "stderr", |
| 189 | + "output_type": "stream", |
| 190 | + "text": [ |
| 191 | + "2024-04-01 16:18:22,626 - INFO - create_index - Answer: I don't have the context documents to answer who Harry's friends are. Please provide the relevant context or specify which Harry you are referring to.\n" |
| 192 | + ] |
| 193 | + } |
| 194 | + ], |
| 195 | + "source": [ |
| 196 | + "answer = result[\"answer\"]\n", |
| 197 | + "\n", |
| 198 | + "logger.info(f\"Answer: {answer}\")" |
| 199 | + ] |
| 200 | + }, |
| 201 | + { |
| 202 | + "cell_type": "markdown", |
| 203 | + "metadata": {}, |
| 204 | + "source": [ |
| 205 | + "### WHAT IF I WANT IT TO FIND KEYWORDS?" |
| 206 | + ] |
| 207 | + }, |
| 208 | + { |
| 209 | + "cell_type": "code", |
| 210 | + "execution_count": 38, |
| 211 | + "metadata": {}, |
| 212 | + "outputs": [], |
| 213 | + "source": [ |
| 214 | + "question = \"What does Harry Potter like to eat?\"" |
| 215 | + ] |
| 216 | + }, |
| 217 | + { |
| 218 | + "cell_type": "code", |
| 219 | + "execution_count": 39, |
| 220 | + "metadata": {}, |
| 221 | + "outputs": [], |
| 222 | + "source": [ |
| 223 | + "rule = Rule(\n", |
| 224 | + " filename=\"../data/full_book_one.pdf\",\n", |
| 225 | + " page_numbers=[15, 30, 45],\n", |
| 226 | + " keywords=[\"food\", \"favorite\"]\n", |
| 227 | + ")" |
| 228 | + ] |
| 229 | + }, |
| 230 | + { |
| 231 | + "cell_type": "code", |
| 232 | + "execution_count": 40, |
| 233 | + "metadata": {}, |
| 234 | + "outputs": [ |
| 235 | + { |
| 236 | + "name": "stdout", |
| 237 | + "output_type": "stream", |
| 238 | + "text": [ |
| 239 | + "Harry Potter likes to eat roast beef, roast chicken, pork chops, lamb chops, sausages, bacon, steak, boiled potatoes, roast potatoes, chips, Yorkshire pudding, peas, carrots, gravy, ketchup, chocolate éclairs, jam doughnuts, trifle, strawberries, jelly, rice pudding, treacle tart, and Bertie Bott's Every-Flavour Beans, but not mint humbugs.\n", |
| 240 | + "[{'id': '../data/full_book_one.pdf-85-1', 'score': 0.6526559, 'metadata': {'text': 'piled with food. He had never seen so many things he liked to eat on one table: roast beef, roast chicken, pork chops and lamb chops, sausages, bacon and steak, boiled potatoes, roast potatoes, chips, Yorkshire pudding, peas, carrots, gravy , ketchup and, for some strange reason, mint humbugs. The Dursleys had never exactly starved Harry , but he’d never been allowed to eat as much as he liked. Dudley had always taken anything that Harry really wanted, even if it made him sick. Harry', 'page_number': 85, 'chunk_number': 1, 'filename': '../data/full_book_one.pdf', 'uuid': '5361d3da-7ea6-457c-80d4-54f599976ffe'}}, {'id': '../data/full_book_one.pdf-85-2', 'score': 0.599429369, 'metadata': {'text': 'anything that Harry really wanted, even if it made him sick. Harry piled his plate with a bit of everything except the humbugs and began to eat. It was all delicious. ‘That does look good,’ said the ghost in the ruff sadly , watching Harry cut up his steak. ‘Can’t you –?’ ‘I haven’t eaten for nearly five hundred years,’ said the ghost. ‘I don’t need to, of course, but one does miss it. I don’t think I’ve introduced myself? Sir Nicholas de Mimsy-Porpington at your', 'page_number': 85, 'chunk_number': 2, 'filename': '../data/full_book_one.pdf', 'uuid': 'e90dff94-e1f5-4d02-9cbb-8d2474cd44b0'}}, {'id': '../data/full_book_one.pdf-86-2', 'score': 0.585694432, 'metadata': {'text': 'chocolate éclairs and jam doughnuts, trifle, strawberries, jelly , rice pudding ... As Harry helped himself to a treacle tart, the talk turned to their families. ‘I’m half and half,’ said Seamus. ‘Me dad’s a Muggle. Mam didn’t tell him she was a witch ’til after they were married. Bit of a nasty shock for him.’ The others laughed. ‘What about you, Neville?’ said Ron. ‘Well, my gran brought me up and she’s a witch,’ said Neville, ‘but the family thought I was all Muggle for ages. 
My great-uncle', 'page_number': 86, 'chunk_number': 2, 'filename': '../data/full_book_one.pdf', 'uuid': '56c894ab-3d76-4e5c-a8bf-c5eba50d4e3c'}}, {'id': '../data/full_book_one.pdf-71-0', 'score': 0.58425, 'metadata': {'text': '78 H ARRY POTTER eating the frogs than looking at the Famous Witches and Wizards cards, but Harry couldn’t keep his eyes off them. Soon he had not only Dumbledore and Morgana, but Hengist of Woodcraft, Alberic Grunnion, Circe, Paracelsus and Merlin. He finally tore his eyes away from the druidess Cliodna, who was scratching her nose, to open a bag of Bertie Bott’s Every-Flavour Beans. ‘You want to be careful with those,’ Ron warned Harry . ‘When', 'page_number': 71, 'chunk_number': 0, 'filename': '../data/full_book_one.pdf', 'uuid': '81533a65-f364-46ce-ac84-8624d4e55c83'}}, {'id': '../data/full_book_one.pdf-85-0', 'score': 0.575736284, 'metadata': {'text': '92 H ARRY POTTER here they are: Nitwit! Blubber! Oddment! T weak! ‘Thank you!’ He sat back down. Everybody clapped and cheered. Harry didn’t know whether to laugh or not. ‘Is he – a bit mad?’ he asked Percy uncertainly . ‘Mad?’ said Percy airily . ‘He’s a genius! Best wizard in the world! But he is a bit mad, yes. Potatoes, Harry?’ Harry’s mouth fell open. The dishes in front of him were now piled with food. He had never seen so many things he liked to eat', 'page_number': 85, 'chunk_number': 0, 'filename': '../data/full_book_one.pdf', 'uuid': '0087d8d3-2e7e-4a0d-ba33-fe9bf4d73161'}}]\n", |
| 241 | + "[0, 1, 2, 3, 4]\n" |
| 242 | + ] |
| 243 | + } |
| 244 | + ], |
| 245 | + "source": [ |
| 246 | + "result = client.query(\n", |
| 247 | + " question=question,\n", |
| 248 | + " index=index,\n", |
| 249 | + " namespace=namespace,\n", |
| 250 | + " rules=[rule],\n", |
| 251 | + " keyword_trigger=True\n", |
| 252 | + ")\n", |
| 253 | + "\n", |
| 254 | + "print(result[\"answer\"])\n", |
| 255 | + "print(result[\"matches\"])\n", |
| 256 | + "print(result[\"used_contexts\"])" |
| 257 | + ] |
| 258 | + }, |
| 259 | + { |
| 260 | + "cell_type": "markdown", |
| 261 | + "metadata": {}, |
| 262 | + "source": [ |
| 263 | + "### WHAT IF WE WANT IT TO RUN EACH RULE SEPARATELY?" |
| 264 | + ] |
| 265 | + }, |
| 266 | + { |
| 267 | + "cell_type": "code", |
| 268 | + "execution_count": 41, |
| 269 | + "metadata": {}, |
| 270 | + "outputs": [], |
| 271 | + "source": [ |
| 272 | + "question = \"What is Harry Potter's favorite food?\"" |
| 273 | + ] |
| 274 | + }, |
| 275 | + { |
| 276 | + "cell_type": "code", |
| 277 | + "execution_count": 43, |
| 278 | + "metadata": {}, |
| 279 | + "outputs": [], |
| 280 | + "source": [ |
| 281 | + "rule_1 = Rule(\n", |
| 282 | + " filename=\"../data/full_book_one.pdf\",\n", |
| 283 | + " page_numbers=[120, 121, 150]\n", |
| 284 | + ")\n", |
| 285 | + "\n", |
| 286 | + "rule_2 = Rule(\n", |
| 287 | + " filename=\"../data/full_book_one.pdf\",\n", |
| 288 | + " page_numbers=[80, 81, 82]\n", |
| 289 | + ")\n", |
| 290 | + "\n", |
| 291 | + "result = client.query(\n", |
| 292 | + " question=question,\n", |
| 293 | + " index=index,\n", |
| 294 | + " namespace=namespace,\n", |
| 295 | + " rules=[rule_1, rule_2],\n", |
| 296 | + " process_rules_separately=True\n", |
| 297 | + ")" |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": null, |
| 303 | + "metadata": {}, |
| 304 | + "outputs": [], |
| 305 | + "source": [] |
| 306 | + } |
| 307 | + ], |
| 308 | + "metadata": { |
| 309 | + "kernelspec": { |
| 310 | + "display_name": "venv", |
| 311 | + "language": "python", |
| 312 | + "name": "python3" |
| 313 | + }, |
| 314 | + "language_info": { |
| 315 | + "codemirror_mode": { |
| 316 | + "name": "ipython", |
| 317 | + "version": 3 |
| 318 | + }, |
| 319 | + "file_extension": ".py", |
| 320 | + "mimetype": "text/x-python", |
| 321 | + "name": "python", |
| 322 | + "nbconvert_exporter": "python", |
| 323 | + "pygments_lexer": "ipython3", |
| 324 | + "version": "3.10.13" |
| 325 | + } |
| 326 | + }, |
| 327 | + "nbformat": 4, |
| 328 | + "nbformat_minor": 2 |
| 329 | +} |
0 commit comments