Commits (25)
622c71b
Added an entry from the Stanford Encyclopedia of Philosophy to test t…
elioth-frutos Jun 5, 2023
7f2aa45
Modified the script so that it now works with JSON documents
elioth-frutos Jun 23, 2023
5b56816
Added a script to load, embed and upload documents to a Pinecone
elioth-frutos Jun 26, 2023
24defb0
Dividing the app's functionality into modules.
elioth-frutos Jun 26, 2023
59b15d6
Added a lifespan function to initialize the vector database before the
elioth-frutos Jun 26, 2023
9096237
Fixed a bug.
elioth-frutos Jun 26, 2023
30fe51a
Added a second endpoint to the API for the Q&A functionality.
elioth-frutos Jun 26, 2023
df50b8f
Added a Dockerfile for the API script.
elioth-frutos Jun 26, 2023
32a1cbf
Added some comments.
elioth-frutos Jun 27, 2023
61b7353
Added an entry from the Stanford Encyclopedia of Philosophy to test t…
elioth-frutos Jun 5, 2023
0d6a3db
Modified the script so that it now works with JSON documents
elioth-frutos Jun 23, 2023
b4208e1
Added a script to load, embed and upload documents to a Pinecone
elioth-frutos Jun 26, 2023
2a1d25a
Dividing the app's functionality into modules.
elioth-frutos Jun 26, 2023
3a476a3
Added a lifespan function to initialize the vector database before the
elioth-frutos Jun 26, 2023
400cc2f
Fixed a bug.
elioth-frutos Jun 26, 2023
e532d1a
Added a second endpoint to the API for the Q&A functionality.
elioth-frutos Jun 26, 2023
da23d9e
Added a Dockerfile for the API script.
elioth-frutos Jun 26, 2023
08e4223
Added some comments.
elioth-frutos Jun 27, 2023
a0d0b14
Divided the project into two microservices, one for loading the
elioth-frutos Jun 28, 2023
27e9759
Merge branch 'retrieval-augmentation' of github.com:erwaen/RemoteDevG…
elioth-frutos Jun 28, 2023
6c03f2f
Fixed the bug related to the wrong information on the "source" field
elioth-frutos Jun 28, 2023
7cb2e22
Integrated the retrieval augmentation API into the Docker Compose file.
elioth-frutos Jun 28, 2023
466defe
Added README files for both microservices.
elioth-frutos Jun 28, 2023
bfff057
Added a README file for the API.
elioth-frutos Jun 28, 2023
38b4818
Added the requirements.txt files.
elioth-frutos Jun 28, 2023
15 changes: 15 additions & 0 deletions docker-compose.yaml
@@ -97,5 +97,20 @@ services:
      - 3000:3000
    depends_on:
      - db

  retrieval_augmentation_api:
    container_name: retrieval_augmentation_api
    build:
      dockerfile: Dockerfile
      context: ./retrieval_augmentation/api
    expose:
      - 8000
    ports:
      - 8000:8000
    environment:
      - PINECONE_API_KEY=${PINECONE_API_KEY}
      - PINECONE_ENV=${PINECONE_ENV}
      - OPENAI_API_KEY=${OPENAI_API_KEY}

volumes:
  postgresql_db_data:
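With this service added, the API can be brought up on its own as a quick check (a sketch, assuming the three keys are defined in an `.env` file next to `docker-compose.yaml`):

```
docker compose up -d retrieval_augmentation_api
```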
9 changes: 9 additions & 0 deletions retrieval_augmentation/api/Dockerfile
@@ -0,0 +1,9 @@
FROM python:latest

COPY [ "api_script.py", "requirements.txt", "./" ]

RUN pip3 install -r requirements.txt

EXPOSE 8000

CMD [ "uvicorn", "api_script:app" ]
65 changes: 65 additions & 0 deletions retrieval_augmentation/api/README.md
@@ -0,0 +1,65 @@
# **Retrieval augmentation API**

This API performs similarity searches and retrieval-augmented
generative question answering over information stored in a Pinecone
database.

## **Similarity searches**

### **Request**

`POST /similarity_search`

```
{
"query_content": "data entry clerk"
}
```

### **Response**

```
{
  "results": [
    {
      "page_content": "Job Tittle- Data Entry Clerk - Remote Job location- Remote Salary Depending on Candidate Experience Opening Required 10 Data entry operator. Fresher can also apply. Enter data into system accurately and efficiently Organize and maintain files and records Perform other duties as assigned by the supervisor. Compiling, verifying the accuracy, and sorting information to prepare source data for computer entry. Reviewing data for deficiencies or errors, correcting any incompatibilities, and checking output. Required Candidate profile. GOOD KNOWLEDGE OF COMPUTER MUST HAVE LAPTOP OR SYSTEM MUST HAVE BASIC TYPING SPEED MUST HAVE TYPING ACCURACY NO WORK PRESSURE & NO TIME BOUNDATION",
      "metadata": {
        "date_of_scrapping": "2023-06-17T23:10:43",
        "job_title": "Data Entry Clerk - Remote",
        "seq_num": 1,
        "source": "https://py.linkedin.com/jobs/view/data-entry-clerk-remote-at-estaffing-inc-3549533163?refId=Rl4dJdGCnUXHvz5znJDy%2Fg%3D%3D&trackingId=GiDE%2BABmZCzZnciUJJOuqA%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card"
      }
    }
  ]
}
```

## **Retrieval augmented generative question answering**

### **Request**

`POST /qa`

```
{
"query_content": "What are the dangers of cyberattacks?"
}
```

### **Response**

The chain returns the question, the generated answer, and the sources it drew on:

```
{
  "question": "What are the dangers of cyberattacks?",
  "answer": "Cyberattacks can bring a business to a complete halt. With remote work, employees operate on personal networks, outside VPNs and beyond firewalls, which exposes them to increasingly cunning cybercriminals. Only 46% of small and medium-sized businesses have an active, up-to-date cybersecurity strategy, so a massive ransomware attack and the payout to cybercriminals could be devastating to cash flow, especially for the 33% of SMBs with five months or less of cash on hand.\n",
  "sources": "http://www.vistage.com.py/ciberseguridad-los-riesgos-del-trabajo-en-remoto/"
}
```
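As a quick smoke test (assuming the API is running locally on port 8000, as configured above), both endpoints can be exercised with `curl`:

```
curl -X POST http://localhost:8000/similarity_search \
    -H "Content-Type: application/json" \
    -d '{"query_content": "data entry clerk"}'

curl -X POST http://localhost:8000/qa \
    -H "Content-Type: application/json" \
    -d '{"query_content": "What are the dangers of cyberattacks?"}'
```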
60 changes: 60 additions & 0 deletions retrieval_augmentation/api/api_script.py
@@ -0,0 +1,60 @@
import pinecone
import os
from fastapi import FastAPI
from pydantic import BaseModel
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

class Query(BaseModel):
    query_content: str


# Initialize a Python "instance" of Pinecone.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)

# Set the text embedding model. We will use one from OpenAI.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# Set the name of the existing Pinecone index.
index_name = 'remote-dev-guru'

# Initialize a vectorstore from an existing Pinecone index.
vectorstore = Pinecone.from_existing_index(
    index_name, embeddings, text_key='text')

# Completion LLM, for the Q&A functionality.
llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model_name='gpt-3.5-turbo',
    temperature=0.0
)


app = FastAPI()


# This endpoint does a similarity search with whatever is passed
# as a query. It returns the 4 most relevant documents from the
# vector database.
@app.post("/similarity_search")
async def process_query(query: Query):
    return {
        'results': vectorstore.similarity_search(query.query_content)
    }


# This endpoint answers questions based on information from the
# vector database. It also returns the sources of the information.
@app.post("/qa")
async def process_qa(query: Query):
    qa = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    return qa(query.query_content)
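For local development outside Docker (assuming the three environment variables are exported in the shell), the same app can be served with auto-reload:

```
uvicorn api_script:app --reload
```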
4 changes: 4 additions & 0 deletions retrieval_augmentation/api/requirements.txt
@@ -0,0 +1,4 @@
"fastapi[all]"
langchain
pinecone-client
openai
7 changes: 7 additions & 0 deletions retrieval_augmentation/vector_db/Dockerfile
@@ -0,0 +1,7 @@
FROM python:latest

COPY [ "load_documents.py", "aux_methods.py", "requirements.txt", "./" ]

RUN pip3 install -r requirements.txt

ENTRYPOINT [ "python3", "load_documents.py" ]
21 changes: 21 additions & 0 deletions retrieval_augmentation/vector_db/README.md
@@ -0,0 +1,21 @@
# **Document loader**

This Python script takes the JSON documents generated by another
microservice, which contain information scraped from LinkedIn and
other websites, then vectorizes that information and stores it in a
Pinecone database (a vector database).

## **Usage**

To build the Docker image, run the following command:

```
docker build -t vector_db .
```

To run the image as a container, place the `.env` file in the root
directory and execute the following command, replacing `file_name.json`
with the document to load:

```
docker run -ti --env-file .env vector_db file_name.json
```
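The scripts read their credentials from the environment; a minimal `.env` file would look like this (the values below are placeholders):

```
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENV=your-pinecone-environment
OPENAI_API_KEY=your-openai-api-key
```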
27 changes: 27 additions & 0 deletions retrieval_augmentation/vector_db/aux_methods.py
@@ -0,0 +1,27 @@

# Takes a string input from the user and converts it to
# a boolean value.
def user_input_to_bool(prompt: str) -> bool:
    user_input = input(prompt).lower()
    return user_input in ('y', 'yes')


# This function accepts the default metadata generated by the JSONLoader
# extraction function from Langchain and allows us to modify the
# metadata to our convenience.
#
# This version is for the data extracted from LinkedIn.
def metadata_fun_linkedin(record: dict, metadata: dict) -> dict:
    metadata["source"] = record.get("url_detalle_trabajo")
    metadata["date_of_scrapping"] = record.get("fecha_de_scrapeo")
    metadata["job_title"] = record.get("titulo_del_trabajo")
    return metadata


# This version is for the data extracted from other websites.
def metadata_fun_web(record: dict, metadata: dict) -> dict:
    metadata["source"] = record.get("link")
    metadata["date_of_scrapping"] = record.get("scrape_date")
    metadata["article_title"] = record.get("title")
    return metadata
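As a quick illustration of how JSONLoader uses these callbacks (the record below is a trimmed, hypothetical version of an entry from linked_in_data.json):

```
record = {
    "titulo_del_trabajo": "Data Entry Clerk - Remote",
    "url_detalle_trabajo": "https://py.linkedin.com/jobs/view/...",
    "fecha_de_scrapeo": "2023-06-17 23:10:43",
}

# JSONLoader passes each parsed record plus its default metadata
# (e.g. seq_num) through the callback before building a Document.
print(metadata_fun_linkedin(record, {"seq_num": 1}))
# {'seq_num': 1, 'source': 'https://py.linkedin.com/jobs/view/...',
#  'date_of_scrapping': '2023-06-17 23:10:43',
#  'job_title': 'Data Entry Clerk - Remote'}
```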
16 changes: 16 additions & 0 deletions retrieval_augmentation/vector_db/linked_in_data.json
@@ -0,0 +1,16 @@
[
    {
        "titulo_del_trabajo": "Data Entry Clerk - Remote",
        "url_detalle_trabajo": "https://py.linkedin.com/jobs/view/data-entry-clerk-remote-at-estaffing-inc-3549533163?refId=Rl4dJdGCnUXHvz5znJDy%2Fg%3D%3D&trackingId=GiDE%2BABmZCzZnciUJJOuqA%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card",
        "fue_publicado": "2 months ago",
        "nombre_empresa": "eStaffing Inc.",
        "url_empresa": "https://www.linkedin.com/company/estaffinginc?trk=public_jobs_jserp-result_job-search-card-subtitle",
        "ubicacion": "Paraguay",
        "detalle_del_puesto": "\n Job Tittle- Data Entry Clerk - Remote Job location- Remote Salary Depending on Candidate Experience Opening Required 10 Data entry operator. Fresher can also apply. Enter data into system accurately and efficiently Organize and maintain files and records Perform other duties as assigned by the supervisor. Compiling, verifying the accuracy, and sorting information to prepare source data for computer entry. Reviewing data for deficiencies or errors, correcting any incompatibilities, and checking output. Required Candidate profile. GOOD KNOWLEDGE OF COMPUTER MUST HAVE LAPTOP OR SYSTEM MUST HAVE BASIC TYPING SPEED MUST HAVE TYPING ACCURACY NO WORK PRESSURE & NO TIME BOUNDATION\n ",
        "antiguedad": "Entry level",
        "tipo_empleo": "Full-time",
        "funcion_laboral": "Administrative",
        "sectores": "Staffing and Recruiting",
        "fecha_de_scrapeo": "2023-06-17 23:10:43"
    }
]
87 changes: 87 additions & 0 deletions retrieval_augmentation/vector_db/load_documents.py
@@ -0,0 +1,87 @@
import sys
import os
import pinecone
import tiktoken
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from aux_methods import (
    user_input_to_bool,
    metadata_fun_linkedin,
    metadata_fun_web
)

# Make sure that the user inputs a file name as an argument.
if len(sys.argv) < 2:
    print("You must provide a file from which to extract the data.")
    sys.exit(1)

# Get the name of the JSON file passed as an argument to the script.
json_doc = sys.argv[1]

# If the data was scraped from LinkedIn, we have to embed the
# "detalle_del_puesto" field. If it is from a website, we embed
# the "content" field.
linked_in = user_input_to_bool(
    "Is the data from LinkedIn? Enter Y (yes) or N (no): ")
content_key = 'detalle_del_puesto' if linked_in else 'content'

# Load the JSON document stored in the file system.
loader = JSONLoader(
    file_path=json_doc,
    jq_schema='.[]',
    content_key=content_key,
    # The scraper for LinkedIn is different from the one used for
    # scraping data from other websites.
    metadata_func=metadata_fun_linkedin if linked_in else metadata_fun_web
)
documents = loader.load()

# Set the tokenizer.
tokenizer = tiktoken.get_encoding('cl100k_base')


# Create a length function to count the number of tokens in a given
# chunk of text.
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)


# Set the text splitter to separate large documents into smaller
# chunks, because LLMs accept a limited number of tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,  # number of tokens of overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)

# Split the documents into smaller chunks of text.
docs = text_splitter.split_documents(documents)

# Initialize a Python "instance" of Pinecone.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)

# Set the text embedding model. We will use one from OpenAI.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# Set the name for the Pinecone index.
index_name = 'remote-dev-guru'

# Vectorize the documents and upsert the vectors to the Pinecone index.
Pinecone.from_documents(docs, embeddings, index_name=index_name)

# Print a success message for the user.
print(
    f"All of the documents from the file {json_doc}"
    f" have been embedded and added to the Pinecone"
    f" index {index_name}.")
5 changes: 5 additions & 0 deletions retrieval_augmentation/vector_db/requirements.txt
@@ -0,0 +1,5 @@
langchain
pinecone-client
tiktoken
jq
openai