From 476781a00d7b7872ea08d3e8fdd19d9cf0f0ca88 Mon Sep 17 00:00:00 2001 From: eren23 Date: Sun, 23 Apr 2023 22:19:04 +0200 Subject: [PATCH 1/5] stupid comment for interim commit --- .gitignore | 3 +- knowledgegpt/utils/utils_distance.py | 86 +++++++++++++++++---------- knowledgegpt/utils/utils_embedding.py | 32 ++++++---- 3 files changed, 75 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 9beaf89..aab923c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ examples/calculated_indexes/* static_files/*_test.* examples/example_config.py output_images/ -.chroma \ No newline at end of file +.chroma +settings.json \ No newline at end of file diff --git a/knowledgegpt/utils/utils_distance.py b/knowledgegpt/utils/utils_distance.py index 19e39bd..b4af013 100644 --- a/knowledgegpt/utils/utils_distance.py +++ b/knowledgegpt/utils/utils_distance.py @@ -1,12 +1,16 @@ # https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb sourced from here import numpy as np -from knowledgegpt.utils.utils_embedding import get_hf_embeddings, get_embedding +from knowledgegpt.utils.utils_embedding import ( + get_hf_embeddings, + get_embedding, + lang_embedding_dim_map, +) def vector_similarity(x: list[float], y: list[float]) -> float: """ Returns the similarity between two vectors. - + Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product. :param x: The first vector. :param y: The second vector. @@ -15,13 +19,18 @@ def vector_similarity(x: list[float], y: list[float]) -> float: return np.dot(np.array(x), np.array(y)) -def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array], verbose=False, - embedding_type: str = "hf", model_lang: str = 'en', - index_type: str = "basic") -> list[(float, (str, str))]: +def order_document_sections_by_query_similarity( + query: str, + contexts: dict[(str, str), np.array], + verbose=False, + embedding_type: str = "hf", + model_lang: str = "en", + index_type: str = "basic", +) -> list[(float, (str, str))]: """ Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings - to find the most relevant sections. - + to find the most relevant sections. + Return the list of document sections, sorted by relevance in descending order. :param query: The query to answer. :param contexts: The embeddings of the document sections. @@ -37,39 +46,47 @@ def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, query_embedding = get_embedding(query) if index_type == "basic": + document_similarities = sorted( + [ + (vector_similarity(query_embedding, doc_embedding), doc_index) + for doc_index, doc_embedding in contexts.items() + ], + reverse=True, + ) - document_similarities = sorted([ - (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in - contexts.items() - ], reverse=True) - elif index_type == "basic_svm": # pass from sklearn import svm import numpy as np - x = np.concatenate([query_embedding[None,...], np.array(list(contexts.values()))], axis=0) - y= np.zeros(len(x)) - y[0]=1 - - clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=1.0) - clf.fit(x, y) # train - + x = np.concatenate( + [query_embedding[None, ...], np.array(list(contexts.values()))], axis=0 + ) + y = np.zeros(len(x)) + y[0] = 1 + + clf = svm.LinearSVC( + class_weight="balanced", verbose=False, max_iter=10000, tol=1e-6, C=1.0 + ) + clf.fit(x, y) # train + similarities = clf.decision_function(x) sorted_ix = np.argsort(-similarities) - - n_neighbors = len(contexts)//4 - - document_similarities = [(similarities[i], list(contexts.keys())[i]) for i in sorted_ix[:min(n_neighbors, len(sorted_ix))]] - + n_neighbors = len(contexts) // 4 + + document_similarities = [ + (similarities[i], list(contexts.keys())[i]) + for i in sorted_ix[: min(n_neighbors, len(sorted_ix))] + ] + return document_similarities - + elif index_type == "faiss": import faiss if embedding_type == "hf": - dim = 384 + dim = lang_embedding_dim_map[model_lang] else: dim = 1536 @@ -87,7 +104,9 @@ def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, D, I = index.search(query_embedding, len(contexts)) # actual search - document_similarities = [(D[0][i], list(contexts.keys())[I[0][i]]) for i in range(len(I[0]))] + document_similarities = [ + (D[0][i], list(contexts.keys())[I[0][i]]) for i in range(len(I[0])) + ] # print("document_similarities", document_similarities) if not verbose: print("DONE, FAISS") @@ -97,18 +116,21 @@ def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, client = chromadb.Client() collection = client.create_collection("chroma_collection") - + collection.add( embeddings=list(contexts.values()), - ids=[str(i) for i in list(contexts.keys())] + ids=[str(i) for i in list(contexts.keys())], ) - + query_result = collection.query( query_embeddings=[query_embedding], n_results=len(contexts), ) - - document_similarities = [(query_result["distances"][0][i], int(query_result["ids"][0][i])) for i in range(len(query_result["ids"][0]))] + + document_similarities = [ + (query_result["distances"][0][i], int(query_result["ids"][0][i])) + for i in range(len(query_result["ids"][0])) + ] if not verbose: print("DONE, CHROMA") return document_similarities diff --git a/knowledgegpt/utils/utils_embedding.py b/knowledgegpt/utils/utils_embedding.py index 028f767..9d455ca 100644 --- a/knowledgegpt/utils/utils_embedding.py +++ b/knowledgegpt/utils/utils_embedding.py @@ -9,11 +9,14 @@ model_language_map = { "en": "sentence-transformers/all-MiniLM-L6-v2", - "tr": "emrecan/bert-base-turkish-cased-mean-nli-stsb-tr" + "tr": "emrecan/bert-base-turkish-cased-mean-nli-stsb-tr", + "it": "efederici/sentence-it5-base", } +lang_embedding_dim_map = {"en": 384, "tr": 768, "it": 512} -def get_hf_embeddings(text: str, model_lang='en') -> np.ndarray: + +def get_hf_embeddings(text: str, model_lang="en") -> np.ndarray: """ Returns the embeddings for the supplied text using the specified model. Uses the Hugging Face library. :param text: The text to embed. @@ -31,7 +34,9 @@ def get_hf_embeddings(text: str, model_lang='en') -> np.ndarray: return sentence_embeddings[0] -def compute_doc_embeddings_hf(df: pd.DataFrame, model_lang='en') -> dict[tuple[str, str], np.ndarray]: +def compute_doc_embeddings_hf( + df: pd.DataFrame, model_lang="en" +) -> dict[tuple[str, str], np.ndarray]: """ Computes the embeddings for the document sections. :param df: The dataframe containing the document sections. @@ -40,9 +45,7 @@ def compute_doc_embeddings_hf(df: pd.DataFrame, model_lang='en') -> dict[tuple[s """ - return { - idx: get_hf_embeddings(r.content, model_lang) for idx, r in df.iterrows() - } + return {idx: get_hf_embeddings(r.content, model_lang) for idx, r in df.iterrows()} # below sourced from here https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb @@ -53,10 +56,7 @@ def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]: :param model: The model to use. :return: The embeddings for the text. """ - result = openai.Embedding.create( - model=model, - input=text - ) + result = openai.Embedding.create(model=model, input=text) time.sleep(5) return result["data"][0]["embedding"] @@ -67,6 +67,12 @@ def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float :param df: The dataframe containing the document sections. :return: The embeddings for the document sections. """ - return { - idx: get_embedding(r.content) for idx, r in df.iterrows() - } + return {idx: get_embedding(r.content) for idx, r in df.iterrows()} + + +def get_dimensions(lang: str) -> int: + """ + Returns the dimensionality of the embeddings for the specified language. + :param lang: The language of the embeddings. + :return: The dimensionality of the embeddings. + """ From 8956d20d5c5bd34ba64e7cfc55b25e64bc8a8d08 Mon Sep 17 00:00:00 2001 From: eren23 Date: Sun, 23 Apr 2023 22:19:57 +0200 Subject: [PATCH 2/5] forgotten func --- knowledgegpt/utils/utils_embedding.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/knowledgegpt/utils/utils_embedding.py b/knowledgegpt/utils/utils_embedding.py index 9d455ca..fc29fda 100644 --- a/knowledgegpt/utils/utils_embedding.py +++ b/knowledgegpt/utils/utils_embedding.py @@ -68,11 +68,3 @@ def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float :return: The embeddings for the document sections. """ return {idx: get_embedding(r.content) for idx, r in df.iterrows()} - - -def get_dimensions(lang: str) -> int: - """ - Returns the dimensionality of the embeddings for the specified language. - :param lang: The language of the embeddings. - :return: The dimensionality of the embeddings. - """ From d134c0eedd580b625c0182b497563a73d2571915 Mon Sep 17 00:00:00 2001 From: eren23 Date: Mon, 24 Apr 2023 22:42:22 +0200 Subject: [PATCH 3/5] basic check fix --- knowledgegpt/extractors/helpers.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/knowledgegpt/extractors/helpers.py b/knowledgegpt/extractors/helpers.py index 391d0ee..02be859 100644 --- a/knowledgegpt/extractors/helpers.py +++ b/knowledgegpt/extractors/helpers.py @@ -1,4 +1,6 @@ -def check_embedding_extractor(embedding_extractor, embedding_extractor_acceptable_list=None): +def check_embedding_extractor( + embedding_extractor, embedding_extractor_acceptable_list=None +): if embedding_extractor_acceptable_list is None: embedding_extractor_acceptable_list = ["hf", "openai"] @@ -6,29 +8,35 @@ def check_embedding_extractor(embedding_extractor, embedding_extractor_acceptabl raise Exception("Embedding Extractor must be a string") if embedding_extractor not in embedding_extractor_acceptable_list: - raise Exception(f"Embedding Extractor is not allowed. " - f"Please choose one of : {embedding_extractor_acceptable_list}") + raise Exception( + f"Embedding Extractor is not allowed. " + f"Please choose one of : {embedding_extractor_acceptable_list}" + ) def check_model_lang(model_lang, model_lang_acceptable_list=None): if model_lang_acceptable_list is None: - model_lang_acceptable_list = ["en", "tr"] + model_lang_acceptable_list = ["en", "tr", "it"] if not isinstance(model_lang, str): raise Exception("Model Lang must be a string") if model_lang not in model_lang_acceptable_list: - raise Exception(f"Model Lang is not allowed. " - f"Please choose one of : {model_lang_acceptable_list}") + raise Exception( + f"Model Lang is not allowed. " + f"Please choose one of : {model_lang_acceptable_list}" + ) def check_index_type(index_type, index_type_acceptable_list=None): if index_type_acceptable_list is None: - index_type_acceptable_list = ["basic","basic_svm", "faiss", "chroma"] + index_type_acceptable_list = ["basic", "basic_svm", "faiss", "chroma"] if not isinstance(index_type, str): raise Exception("Index Type must be a string") if index_type not in index_type_acceptable_list: - raise Exception(f"Index Type is not allowed. " - f"Please choose one of : {index_type_acceptable_list}") + raise Exception( + f"Index Type is not allowed. " + f"Please choose one of : {index_type_acceptable_list}" + ) From c014f3d6cdf4ea18ba8da6b5f74e99e17ed3bce7 Mon Sep 17 00:00:00 2001 From: eren23 Date: Mon, 24 Apr 2023 22:55:07 +0200 Subject: [PATCH 4/5] model lookup --- knowledgegpt/utils/utils_prompt.py | 70 ++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/knowledgegpt/utils/utils_prompt.py b/knowledgegpt/utils/utils_prompt.py index 37d7ded..e585fe6 100644 --- a/knowledgegpt/utils/utils_prompt.py +++ b/knowledgegpt/utils/utils_prompt.py @@ -1,5 +1,7 @@ # https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb sourced from here -from knowledgegpt.utils.utils_distance import order_document_sections_by_query_similarity +from knowledgegpt.utils.utils_distance import ( + order_document_sections_by_query_similarity, +) import pandas as pd import tiktoken @@ -10,17 +12,33 @@ encoding = tiktoken.get_encoding(ENCODING) separator_len = len(encoding.encode(SEPARATOR)) -relevancy_template = ''' +relevancy_template = """ You duty is to check if the question given and the context part given are relevant to each other. If they are relevant, please write "yes" or "y" or "1" or "true" or "t". If they are not relevant, please write "no" or "n" or "0" or "false" or "f". If you are not sure, please write "unsure" or "u" or "2" or "maybe" or "m". You don't have to be super strict, a basic relevancy check is enough we are trying to hunt down stuff like references to other documents, or other stuff that is not relevant to the question. Question: {question} Context: {context} Answer: -''' +""" +header_lookup = { + "tr": """Cümleyi doğru bir şekilde cevaplayın ve cevap metin içinde yoksa "bilmiyorum" diyin.\n\nMetin:\n""", + "en": """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n""", + "it": """Rispondi alla domanda con la massima veridicità possibile utilizzando il contesto fornito, e se la risposta non è contenuta nel testo qui sotto, digita "Non lo so".\n\nContesto:\n""", +} -def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, embedding_type: str = "hf", - verbose=False, model_lang: str = "en", max_tokens=1000, index_type="basic", prompt_template=None, strict_context=False) -> str: + +def construct_prompt( + question: str, + context_embeddings: dict, + df: pd.DataFrame, + embedding_type: str = "hf", + verbose=False, + model_lang: str = "en", + max_tokens=1000, + index_type="basic", + prompt_template=None, + strict_context=False, +) -> str: """ Construct the prompt to be used for completion. :param question: The question to answer. @@ -39,34 +57,41 @@ def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, embedding_type=embedding_type, model_lang=model_lang, verbose=verbose, - index_type=index_type + index_type=index_type, ) chosen_sections = [] chosen_sections_len = 0 chosen_sections_indexes = [] if strict_context: - print("STRICT MODE IS ON, THIS IS GOING TO TAKE A WHILE AND IS AN EXPERIMENTAL FEATURE") + print( + "STRICT MODE IS ON, THIS IS GOING TO TAKE A WHILE AND IS AN EXPERIMENTAL FEATURE" + ) for _, section_index in most_relevant_document_sections: document_section = df.loc[section_index] - + if strict_context: if len(document_section.content) < 10: continue - + import openai from knowledgegpt.utils.utils_completion import model_types - - prompt = relevancy_template.format(question=question, context=document_section.content) - - response = openai.Completion.create( - prompt = prompt, - ** model_types["davinci"] + + prompt = relevancy_template.format( + question=question, context=document_section.content ) - - if response["choices"][0]["text"].strip(" \n").lower() in ["no", "n", "0", "false", "f"]: + + response = openai.Completion.create(prompt=prompt, **model_types["davinci"]) + + if response["choices"][0]["text"].strip(" \n").lower() in [ + "no", + "n", + "0", + "false", + "f", + ]: continue - + document_tokens = len(encoding.encode(document_section.content)) chosen_sections_len += document_tokens + separator_len if chosen_sections_len > MAX_SECTION_LEN: @@ -80,11 +105,10 @@ def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, print("\n".join(chosen_sections_indexes)) if prompt_template is None: - if model_lang == "tr": - header = """Cümleyi doğru bir şekilde cevaplayın ve cevap metin içinde yoksa "bilmiyorum" diyin.\n\nMetin:\n""" - else: - header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n""" + header = header_lookup[model_lang] return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:" else: - return prompt_template.format(question=question, sections="".join(chosen_sections)) + return prompt_template.format( + question=question, sections="".join(chosen_sections) + ) From 4f7248f32cbe819a7867b5b29cf86c45a39ca67c Mon Sep 17 00:00:00 2001 From: eren23 Date: Tue, 25 Apr 2023 08:54:44 +0200 Subject: [PATCH 5/5] example --- examples/languages/basic_example_it.ipynb | 535 ++++++++++++++++++++++ 1 file changed, 535 insertions(+) create mode 100644 examples/languages/basic_example_it.ipynb diff --git a/examples/languages/basic_example_it.ipynb b/examples/languages/basic_example_it.ipynb new file mode 100644 index 0000000..5da01eb --- /dev/null +++ b/examples/languages/basic_example_it.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# !python3 -m spacy download en_core_web_sm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/pydub/utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\n", + " warn(\"Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\", RuntimeWarning)\n" + ] + } + ], + "source": [ + "from knowledgegpt.extractors.base_extractor import BaseExtractor\n", + "from knowledgegpt.utils.utils_scrape import scrape_content" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "from example_config import SECRET_KEY\n", + "openai.api_key = SECRET_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# # Scrape content from a webpage\n", + "# any other dataframes can be used as well, the only requirement is that the column name is \"content\"\n", + "df = scrape_content(\"https://it.wikipedia.org/wiki/Bombarda_(arma)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content
0Bombarda (arma) - Wikipedia
1Bombarda (arma)
2Da Wikipedia, l'enciclopedia libera.
3Jump to navigation\\nJump to search\\nBombardaPi...
4La bombarda era un pezzo d'artiglieria a tiro ...
5Indice
61 Storia\\n2 Caratteristiche tecniche\\n3 Galler...
7Storia[modifica | modifica wikitesto]\\nNella s...
8In primo piano la bombarda 58 A, sullo sfondo ...
9Galleria d'immagini[modifica | modifica wikite...
10Uno schioppo datato al 1390–1400.
11Una bombarda in ferro battuto del 1450 circa, ...
12Il cannone dei Dardanelli detto anche Grande B...
13Il Mons Meg (datato al XV secolo) esposto al c...
14Palle di cannone del Mons Meg.
15Due bombardate inglesi abbandonate da Thomas S...
16Una bombarda utilizzata nel XV secolo dall'Ord...
17Note[modifica | modifica wikitesto]
18^ a b c BOMBARDA in \"Enciclopedia Italiana\", ...
19Bibliografia[modifica | modifica wikitesto]\\nM...
20Estratto da \"https://it.wikipedia.org/w/index....
21Menu di navigazione
22Strumenti personali
23Accesso non effettuatodiscussionicontributireg...
24Namespace
25VoceDiscussione
26italiano
27Visite
28LeggiModificaModifica wikitestoCronologia
29Altro
30Navigazione
31Pagina principaleUltime modificheUna voce a ca...
32Comunità
33Portale ComunitàBarIl WikipedianoFai una donaz...
34Strumenti
35Puntano quiModifiche correlatePagine specialiL...
36Stampa/esporta
37Crea un libroScarica come PDFVersione stampabile
38In altri progetti
39Wikimedia Commons
40In altre lingue
41БеларускаяБеларуская (тарашкевіца)БългарскиCat...
42Questa pagina è stata modificata per l'ultima ...
43Informativa sulla privacy\\nInformazioni su Wik...
\n", + "
" + ], + "text/plain": [ + " content\n", + "0 Bombarda (arma) - Wikipedia\n", + "1 Bombarda (arma)\n", + "2 Da Wikipedia, l'enciclopedia libera.\n", + "3 Jump to navigation\\nJump to search\\nBombardaPi...\n", + "4 La bombarda era un pezzo d'artiglieria a tiro ...\n", + "5 Indice\n", + "6 1 Storia\\n2 Caratteristiche tecniche\\n3 Galler...\n", + "7 Storia[modifica | modifica wikitesto]\\nNella s...\n", + "8 In primo piano la bombarda 58 A, sullo sfondo ...\n", + "9 Galleria d'immagini[modifica | modifica wikite...\n", + "10 Uno schioppo datato al 1390–1400.\n", + "11 Una bombarda in ferro battuto del 1450 circa, ...\n", + "12 Il cannone dei Dardanelli detto anche Grande B...\n", + "13 Il Mons Meg (datato al XV secolo) esposto al c...\n", + "14 Palle di cannone del Mons Meg.\n", + "15 Due bombardate inglesi abbandonate da Thomas S...\n", + "16 Una bombarda utilizzata nel XV secolo dall'Ord...\n", + "17 Note[modifica | modifica wikitesto]\n", + "18 ^ a b c BOMBARDA in \"Enciclopedia Italiana\", ...\n", + "19 Bibliografia[modifica | modifica wikitesto]\\nM...\n", + "20 Estratto da \"https://it.wikipedia.org/w/index....\n", + "21 Menu di navigazione\n", + "22 Strumenti personali\n", + "23 Accesso non effettuatodiscussionicontributireg...\n", + "24 Namespace\n", + "25 VoceDiscussione\n", + "26 italiano\n", + "27 Visite\n", + "28 LeggiModificaModifica wikitestoCronologia\n", + "29 Altro\n", + "30 Navigazione\n", + "31 Pagina principaleUltime modificheUna voce a ca...\n", + "32 Comunità\n", + "33 Portale ComunitàBarIl WikipedianoFai una donaz...\n", + "34 Strumenti\n", + "35 Puntano quiModifiche correlatePagine specialiL...\n", + "36 Stampa/esporta\n", + "37 Crea un libroScarica come PDFVersione stampabile\n", + "38 In altri progetti\n", + "39 Wikimedia Commons\n", + "40 In altre lingue\n", + "41 БеларускаяБеларуская (тарашкевіца)БългарскиCat...\n", + "42 Questa pagina è stata modificata per l'ultima ...\n", + "43 Informativa sulla privacy\\nInformazioni su Wik..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing embeddings...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model_lang it\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 1 document sections:\n", + "1\n", + "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", + "\n", + "Context:\n", + "\n", + "* Bombarda (arma)\n", + "\n", + " Q: cos'è un cannone bombarda\n", + " A:\n", + "all_done!\n" + ] + } + ], + "source": [ + "basic_extractor = BaseExtractor(dataframe=df, embedding_extractor=\"hf\", model_lang=\"it\", is_turbo=True)\n", + "answer, prompt, messages = basic_extractor.extract(\"cos'è un cannone bombarda\", max_tokens=300)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Un cannone bombarda è un'arma d'artiglieria che utilizza la polvere da sparo per sparare proiettili esplosivi.\"" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "quanto pesa un cannone da bombardamento?\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages = basic_extractor.extract(\"quanto pesa un cannone da bombardamento?\", max_tokens=300)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Il peso di un cannone da bombardamento può variare notevolmente a seconda delle dimensioni e del tipo di arma. Tuttavia, i cannoni da bombardamento storici potevano pesare diverse tonnellate. Ad esempio, il cannone da bombardamento \"Mons Meg\" del XV secolo, esposto al castello di Edimburgo, pesa circa 6 tonnellate.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'role': 'system', 'content': 'you are a helpful assistant'},\n", + " {'role': 'user',\n", + " 'content': 'Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don\\'t know.\"\\n\\nContext:\\n\\n* Bombarda (arma)\\n\\n Q: cos\\'è un cannone bombarda\\n A:'},\n", + " {'role': 'assistant',\n", + " 'content': \"Un cannone bombarda è un'arma d'artiglieria che utilizza la polvere da sparo per sparare proiettili esplosivi.\"},\n", + " {'role': 'user', 'content': 'quanto pesa un cannone da bombardamento?'},\n", + " {'role': 'assistant',\n", + " 'content': 'Il peso di un cannone da bombardamento può variare notevolmente a seconda delle dimensioni e del tipo di arma. Tuttavia, i cannoni da bombardamento storici potevano pesare diverse tonnellate. Ad esempio, il cannone da bombardamento \"Mons Meg\" del XV secolo, esposto al castello di Edimburgo, pesa circa 6 tonnellate.'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "knowledgegpt-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}