huggingface
diff --git a/‎benches/causal_conv1d/impls/hf_kernels_causal_conv1d.md‎
Lines changed: 48 additions & 0 deletions b/‎benches/causal_conv1d/impls/hf_kernels_causal_conv1d.md‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎benches/causal_conv1d/impls/torch_causal_conv1d.md‎
Lines changed: 57 additions & 0 deletions b/‎benches/causal_conv1d/impls/torch_causal_conv1d.md‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎benches/causal_conv1d/results/combined_results.md‎
Lines changed: 45 additions & 0 deletions b/‎benches/causal_conv1d/results/combined_results.md‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎benches/rotary/impls/hf_kernels_rotary.md‎
Lines changed: 64 additions & 0 deletions b/‎benches/rotary/impls/hf_kernels_rotary.md‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎benches/rotary/impls/torch_rotary.md‎
Lines changed: 74 additions & 0 deletions b/‎benches/rotary/impls/torch_rotary.md‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎benches/rotary/results/combined_results.md‎
Lines changed: 45 additions & 0 deletions b/‎benches/rotary/results/combined_results.md‎
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,48 @@
+---
+on_github: huggingface/kernels-uvnotes
+---
+
+# HF Kernels - Causal Conv1D
+
+## GPU Info
+
+```python id=nv
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+```
+
+## Causal Conv1D Benchmark
+
+```python id=benchmark outputs=causal_conv1d.jsonl
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Fetch kernels-community/causal-conv1d via get_kernel; the wrapper below forwards to its causal_conv1d_fn
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
+
+
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+    return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+    impl_name="hf_kernels_causal_conv1d",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_causal_conv1d,
+)
+```
@@ -0,0 +1,57 @@
+---
+on_github: huggingface/kernels-uvnotes
+---
+
+# PyTorch Native - Causal Conv1D
+
+## GPU Info
+
+```python id=nv
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+```
+
+## Causal Conv1D Benchmark (PyTorch Native)
+
+```python id=benchmark outputs=causal_conv1d.jsonl
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import torch.nn.functional as F
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_causal_conv1d(input_tensor, weight, bias):
+    # Compute in the weight's dtype; the result is cast back to the input's dtype on return
+    x = input_tensor.to(weight.dtype)
+    dim = weight.shape[0]
+    width = weight.shape[1]
+    seqlen = input_tensor.shape[-1]
+
+    # groups=dim with the (dim, 1, width) weight gives one filter per channel; padding is applied on both ends
+    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+    # Keep only the first seqlen positions, dropping the width - 1 extra outputs caused by the trailing padding
+    out = out[..., :seqlen]
+
+    # Hand the result back in the dtype the caller passed in
+    return out.to(input_tensor.dtype)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_causal_conv1d,
+)
+```
@@ -0,0 +1,45 @@
+---
+title: "Causal Conv1D Benchmark - Combined Results"
+author: "uvnote"
+theme: "dark"
+syntax_theme: "monokai"
+show_line_numbers: true
+collapse_code: false
+---
+
+# Causal Conv1D Benchmarks - Aggregated Results
+
+This document combines benchmark results from multiple Causal Conv1D implementations.
+
+## Combined Summary and Visualization
+
+![artifact:latency.svg]
+
+```python id=combine collapse-code=true needs=../impls/hf_kernels_causal_conv1d.md:benchmark,../impls/torch_causal_conv1d.md:benchmark outputs=latency.svg
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Display label -> uvnote env var name; the names mirror the two benchmark cells listed in needs=
+cache_env_map = {
+    "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
+    "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
+}
+
+# Combine the mapped results; svg_filename matches the artifact this cell declares in outputs=
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="causal_conv1d.jsonl",
+    svg_filename="latency.svg"
+)
+```
@@ -0,0 +1,64 @@
+---
+on_github: huggingface/kernels-uvnotes
+---
+
+# HF Kernels - Rotary Position Embeddings
+
+## GPU Info
+
+```python id=nv
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+```
+
+## Rotary Embeddings Benchmark
+
+```python id=benchmark outputs=rotary.jsonl
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Fetch kernels-community/rotary via get_kernel; the wrapper below calls its apply_rotary
+rotary = get_kernel("kernels-community/rotary")
+
+
+def hf_kernels_rotary(query, key, cos, sin, conj=False):
+    rotary_dim = cos.shape[-1]
+
+    # Clone first: apply_rotary's return value is unused, so results are expected to land in views of these clones
+    q_out = query.clone()
+    k_out = key.clone()
+
+    # Query: the first two rotary_dim-wide slices of the last axis form the pair handed to the kernel
+    q1 = q_out[..., :rotary_dim]
+    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+    # Key: same slicing and the same cos/sin/conj as for the query
+    k1 = k_out[..., :rotary_dim]
+    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+    return q_out, k_out
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ROTARY,
+    impl_name="hf_kernels_rotary",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_rotary,
+)
+```
@@ -0,0 +1,74 @@
+---
+on_github: huggingface/kernels-uvnotes
+---
+
+# PyTorch Native - Rotary Position Embeddings
+
+## GPU Info
+
+```python id=nv
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+```
+
+## Rotary Embeddings Benchmark (PyTorch Native)
+
+```python id=benchmark outputs=rotary.jsonl
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def apply_rotary_torch(x1, x2, cos, sin, conj=False):
+    """Rotate the pair (x1, x2) by the cos/sin angle; conj=True rotates the opposite way."""
+    if not conj:
+        out1 = x1 * cos - x2 * sin
+        out2 = x1 * sin + x2 * cos
+    else:
+        out1 = x1 * cos + x2 * sin
+        out2 = -x1 * sin + x2 * cos
+    return out1, out2
+
+
+def torch_rotary(query, key, cos, sin, conj=False):
+    rotary_dim = cos.shape[-1]
+
+    # The rotated slices are written into these clones, so query and key themselves are left untouched
+    q_out = query.clone()
+    k_out = key.clone()
+
+    # Query: rotate last-axis slices [0, rotary_dim) and [rotary_dim, 2 * rotary_dim), then write both back
+    q1 = q_out[..., :rotary_dim]
+    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+    q_out[..., :rotary_dim] = q_out_1
+    q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
+
+    # Key: same slices, same cos/sin/conj, written back the same way
+    k1 = k_out[..., :rotary_dim]
+    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+    k_out[..., :rotary_dim] = k_out_1
+    k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
+
+    return q_out, k_out
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ROTARY,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_rotary,
+)
+```
@@ -0,0 +1,45 @@
+---
+title: "Rotary Position Embeddings Benchmark - Combined Results"
+author: "uvnote"
+theme: "dark"
+syntax_theme: "monokai"
+show_line_numbers: true
+collapse_code: false
+---
+
+# Rotary Position Embeddings Benchmarks - Aggregated Results
+
+This document combines benchmark results from multiple Rotary Position Embeddings implementations.
+
+## Combined Summary and Visualization
+
+![artifact:latency.svg]
+
+```python id=combine collapse-code=true needs=../impls/hf_kernels_rotary.md:benchmark,../impls/torch_rotary.md:benchmark outputs=latency.svg
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Display label -> uvnote env var name; the names mirror the two benchmark cells listed in needs=
+cache_env_map = {
+    "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
+    "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
+}
+
+# Combine the mapped results; svg_filename matches the artifact this cell declares in outputs=
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="rotary.jsonl",
+    svg_filename="latency.svg"
+)
+```