Add missing type hints (#210)

strickvl · Copilot · claude · web-flow · commit b727eaabaafb · 2025-05-17T23:18:48.000+02:00
* Add type hints to various functions

* Update zencoder/steps/deployment.py

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Fix dead link in PR template

  Corrected the path to CONTRIBUTING.md in the PR template by changing it from a relative path (CONTRIBUTING.md) to a path that goes up one directory (../CONTRIBUTING.md).

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -3,7 +3,7 @@
 Please provide a short summary explaining the motivation behind these changes.
 
 # Checklist
-- [ ] I have read the [contributing guidelines](CONTRIBUTING.md).
+- [ ] I have read the [contributing guidelines](../CONTRIBUTING.md).
 - [ ] I have run the necessary tests and linters.
 - [ ] I have updated relevant documentation where applicable.
 
diff --git a/end-to-end-computer-vision/utils/dataset_utils.py b/end-to-end-computer-vision/utils/dataset_utils.py
@@ -31,8 +31,8 @@
 logger = get_logger(__name__)
 
 
-def load_images_from_folder(folder):
-    images = []
+def load_images_from_folder(folder: str) -> List[Image.Image]:
+    images: List[Image.Image] = []
     for filename in os.listdir(folder):
         if (
             filename.endswith(".png")
@@ -45,7 +45,9 @@ def load_images_from_folder(folder):
     return images
 
 
-def load_images_from_source(data_source, download_dir, filenames):
+def load_images_from_source(
+    data_source: str, download_dir: str, filenames: List[str]
+) -> None:
     total_images = len(filenames)
     for index, filename in enumerate(filenames):
         src_path = f"{data_source}/{filename}.png"
diff --git a/llm-complete-guide/most_basic_rag_pipeline.py b/llm-complete-guide/most_basic_rag_pipeline.py
@@ -17,23 +17,26 @@
 
 import re
 import string
+from typing import List
 
 from openai import OpenAI
 from utils.openai_utils import get_openai_api_key
 
 
-def preprocess_text(text):
+def preprocess_text(text: str) -> str:
     text = text.lower()
     text = text.translate(str.maketrans("", "", string.punctuation))
     text = re.sub(r"\s+", " ", text).strip()
     return text
 
 
-def tokenize(text):
+def tokenize(text: str) -> List[str]:
     return preprocess_text(text).split()
 
 
-def retrieve_relevant_chunks(query, corpus, top_n=2):
+def retrieve_relevant_chunks(
+    query: str, corpus: List[str], top_n: int = 2
+) -> List[str]:
     query_tokens = set(tokenize(query))
     similarities = []
     for chunk in corpus:
@@ -46,7 +49,7 @@ def retrieve_relevant_chunks(query, corpus, top_n=2):
     return [chunk for chunk, _ in similarities[:top_n]]
 
 
-def answer_question(query, corpus, top_n=2):
+def answer_question(query: str, corpus: List[str], top_n: int = 2) -> str:
     relevant_chunks = retrieve_relevant_chunks(query, corpus, top_n)
     if not relevant_chunks:
         return "I don't have enough information to answer the question."
diff --git a/nightwatch-ai/src/pipelines/supabase_summary.py b/nightwatch-ai/src/pipelines/supabase_summary.py
@@ -13,13 +13,19 @@
 #  permissions and limitations under the License.
 
 
+from typing import Any, Callable
+
 from zenml.pipelines import pipeline
 
 pipeline_name = "daily_supabase_summary"
 
 
 @pipeline(name=pipeline_name)
-def daily_supabase_summary(get_latest_data, generate_summary, report_summary):
+def daily_supabase_summary(
+    get_latest_data: Callable[[], Any],
+    generate_summary: Callable[[Any], Any],
+    report_summary: Callable[[Any], Any],
+) -> None:
     """Generates a summary of the latest data.
 
     Args:
diff --git a/nightwatch-ai/src/run.py b/nightwatch-ai/src/run.py
@@ -21,7 +21,7 @@
 from zenml.client import Client
 
 
-def main():
+def main() -> None:
     if Client().active_stack.alerter is None:
         # we use a print alerter
         alerter = print_alerter()
diff --git a/zencoder/steps/deployment.py b/zencoder/steps/deployment.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, cast
+from typing import Dict, Optional, Tuple, cast
 
 from zenml import get_step_context, step
 from zenml.client import Client
@@ -14,7 +14,23 @@
 logger = get_logger(__name__)
 
 
-def parse_huggingface_url(url):
+def parse_huggingface_url(url: str) -> Tuple[str, str, str]:
+    """
+    Parses a Hugging Face Hub URL to extract the namespace, repository, and revision.
+
+    Args:
+        url: The Hugging Face Hub URL to parse. Expected format:
+             "https://huggingface.co/{namespace}/{repository}/tree/{revision}".
+
+    Returns:
+        A tuple containing:
+        - namespace: The owner or organization of the repository.
+        - repository: The name of the repository.
+        - revision: The specific commit hash or branch name.
+
+    Raises:
+        ValueError: If the URL does not match the expected format.
+    """
     # Split the URL into parts
     parts = url.split("/")
 
diff --git a/zencoder/steps/trainer.py b/zencoder/steps/trainer.py
@@ -8,7 +8,7 @@
 import functools
 import os
 import random
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -66,16 +66,16 @@ def get_fim_token_ids(tokenizer):
 
 ## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
 def permute(
-    sample,
-    np_rng,
-    suffix_tok_id,
-    prefix_tok_id,
-    middle_tok_id,
-    pad_tok_id,
-    fim_rate=0.5,
-    fim_spm_rate=0.5,
-    truncate_or_pad=False,
-):
+    sample: List[int],
+    np_rng: np.random.RandomState,
+    suffix_tok_id: Optional[int],
+    prefix_tok_id: Optional[int],
+    middle_tok_id: Optional[int],
+    pad_tok_id: Optional[int],
+    fim_rate: float = 0.5,
+    fim_spm_rate: float = 0.5,
+    truncate_or_pad: bool = False,
+) -> Tuple[List[int], np.random.RandomState]:
     """
     Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
     PSM and SPM (with a probability of fim_spm_rate).