Skip to content

Commit 941661a

Browse files
committed
feat: add causal conv 1d and rotary
1 parent bc4cf44 commit 941661a

File tree

12 files changed

+646
-259
lines changed

12 files changed

+646
-259
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
---
2+
on_github: huggingface/kernels-uvnotes
3+
---
4+
5+
# HF Kernels - Causal Conv1D
6+
7+
## GPU Info
8+
9+
```python id=nv
10+
import subprocess
11+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
12+
```
13+
14+
## Causal Conv1D Benchmark
15+
16+
```python id=benchmark outputs=causal_conv1d.jsonl
17+
# /// script
18+
# requires-python = ">=3.10"
19+
# dependencies = [
20+
# "numpy",
21+
# "torch==2.8.0",
22+
# "kernels-benchmark-tools",
23+
# "kernels",
24+
# ]
25+
#
26+
# [tool.uv.sources]
27+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
28+
# ///
29+
import torch
30+
import sys
31+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
32+
from kernels import get_kernel
33+
34+
# Load the causal conv1d kernel
35+
causal_conv1d = get_kernel("kernels-community/causal-conv1d")
36+
37+
38+
def hf_kernels_causal_conv1d(input_tensor, weight, bias):
    """Benchmark entry point that forwards to the Hub-loaded causal conv1d kernel.

    The three arguments are handed to ``causal_conv1d.causal_conv1d_fn``
    positionally and unchanged, and whatever the kernel returns is returned
    to the caller as-is.
    """
    kernel_fn = causal_conv1d.causal_conv1d_fn
    result = kernel_fn(input_tensor, weight, bias)
    return result
40+
41+
42+
run_benchmark(
43+
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
44+
impl_name="hf_kernels_causal_conv1d",
45+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
46+
impl_func=hf_kernels_causal_conv1d,
47+
)
48+
```
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
---
2+
on_github: huggingface/kernels-uvnotes
3+
---
4+
5+
# PyTorch Native - Causal Conv1D
6+
7+
## GPU Info
8+
9+
```python id=nv
10+
import subprocess
11+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
12+
```
13+
14+
## Causal Conv1D Benchmark (PyTorch Native)
15+
16+
```python id=benchmark outputs=causal_conv1d.jsonl
17+
# /// script
18+
# requires-python = ">=3.10"
19+
# dependencies = [
20+
# "numpy",
21+
# "torch==2.8.0",
22+
# "kernels-benchmark-tools",
23+
# ]
24+
#
25+
# [tool.uv.sources]
26+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
27+
# ///
28+
import torch
29+
import torch.nn.functional as F
30+
import sys
31+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
32+
33+
34+
def torch_causal_conv1d(input_tensor, weight, bias):
    """Eager PyTorch causal depthwise 1-D convolution used as the baseline.

    Args:
        input_tensor: Input whose last axis is the sequence axis
            (presumably ``(batch, dim, seqlen)`` -- confirm against the
            benchmark harness).
        weight: 2-D filter bank indexed as ``(dim, width)``; one filter per
            channel.
        bias: Passed straight through to ``F.conv1d``; may be ``None``.

    Returns:
        Tensor with the same sequence length and dtype as ``input_tensor``.
    """
    in_dtype = input_tensor.dtype
    seqlen = input_tensor.shape[-1]
    channels = weight.size(0)
    kernel_width = weight.size(1)

    # Compute in the weight's dtype. ``groups=channels`` makes the conv
    # depthwise, and padding by ``kernel_width - 1`` means the first
    # ``seqlen`` outputs only ever see current and earlier positions.
    conv_out = F.conv1d(
        input_tensor.to(weight.dtype),
        weight.unsqueeze(1),
        bias,
        padding=kernel_width - 1,
        groups=channels,
    )

    # Discard the trailing outputs produced by the right-hand padding, then
    # restore the caller's dtype.
    return conv_out[..., :seqlen].to(in_dtype)
49+
50+
51+
run_benchmark(
52+
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
53+
impl_name="torch_eager",
54+
impl_tags={"family": "pytorch", "backend": "eager"},
55+
impl_func=torch_causal_conv1d,
56+
)
57+
```
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
---
2+
title: "Causal Conv1D Benchmark - Combined Results"
3+
author: "uvnote"
4+
theme: "dark"
5+
syntax_theme: "monokai"
6+
show_line_numbers: true
7+
collapse_code: false
8+
---
9+
10+
# Causal Conv1D Benchmarks - Aggregated Results
11+
12+
This document combines benchmark results from multiple Causal Conv1D implementations.
13+
14+
## Combined Summary and Visualization
15+
16+
![artifact:latency.svg]
17+
18+
```python id=combine collapse-code=true needs=../impls/hf_kernels_causal_conv1d.md:benchmark,../impls/torch_causal_conv1d.md:benchmark outputs=latency.svg
19+
# /// script
20+
# requires-python = ">=3.10"
21+
# dependencies = [
22+
# "numpy",
23+
# "torch==2.8.0",
24+
# "kernels-benchmark-tools",
25+
# "matplotlib",
26+
# ]
27+
#
28+
# [tool.uv.sources]
29+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
30+
# ///
31+
from kernels_benchmark_tools.core.visuals import generate_combined_results
32+
33+
# Map display names to uvnote environment variables
34+
cache_env_map = {
35+
"HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
36+
"PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
37+
}
38+
39+
# Generate combined results with visualization
40+
generate_combined_results(
41+
cache_env_map=cache_env_map,
42+
output_filename="causal_conv1d.jsonl",
43+
svg_filename="latency.svg"
44+
)
45+
```
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
---
2+
on_github: huggingface/kernels-uvnotes
3+
---
4+
5+
# HF Kernels - Rotary Position Embeddings
6+
7+
## GPU Info
8+
9+
```python id=nv
10+
import subprocess
11+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
12+
```
13+
14+
## Rotary Embeddings Benchmark
15+
16+
```python id=benchmark outputs=rotary.jsonl
17+
# /// script
18+
# requires-python = ">=3.10"
19+
# dependencies = [
20+
# "numpy",
21+
# "torch==2.8.0",
22+
# "kernels-benchmark-tools",
23+
# "kernels",
24+
# ]
25+
#
26+
# [tool.uv.sources]
27+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
28+
# ///
29+
import torch
30+
import sys
31+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
32+
from kernels import get_kernel
33+
34+
# Load the rotary kernel
35+
rotary = get_kernel("kernels-community/rotary")
36+
37+
38+
def hf_kernels_rotary(query, key, cos, sin, conj=False):
    """Apply the Hub-loaded rotary kernel to copies of ``query`` and ``key``.

    The rotary width is taken from the last axis of ``cos``; the first
    ``2 * width`` features of each tensor are split into two halves that are
    handed to ``rotary.apply_rotary``. Returns ``(q_out, k_out)``.
    """
    half = cos.shape[-1]
    lower = slice(None, half)
    upper = slice(half, 2 * half)

    rotated = []
    for source in (query, key):
        # Work on a clone so the caller's tensor is never touched.
        work = source.clone()
        first = work[..., lower]
        second = work[..., upper]
        # The same views are supplied as both the inputs and the trailing
        # tensor arguments (presumably the kernel's outputs -- confirm
        # against the kernel signature), so the result lands in ``work``.
        rotary.apply_rotary(first, second, cos, sin, first, second, conj)
        rotated.append(work)

    q_out, k_out = rotated
    return q_out, k_out
56+
57+
58+
run_benchmark(
59+
kernel_type=KernelTypeEnum.ROTARY,
60+
impl_name="hf_kernels_rotary",
61+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
62+
impl_func=hf_kernels_rotary,
63+
)
64+
```
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
---
2+
on_github: huggingface/kernels-uvnotes
3+
---
4+
5+
# PyTorch Native - Rotary Position Embeddings
6+
7+
## GPU Info
8+
9+
```python id=nv
10+
import subprocess
11+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
12+
```
13+
14+
## Rotary Embeddings Benchmark (PyTorch Native)
15+
16+
```python id=benchmark outputs=rotary.jsonl
17+
# /// script
18+
# requires-python = ">=3.10"
19+
# dependencies = [
20+
# "numpy",
21+
# "torch==2.8.0",
22+
# "kernels-benchmark-tools",
23+
# ]
24+
#
25+
# [tool.uv.sources]
26+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
27+
# ///
28+
import torch
29+
import sys
30+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
31+
32+
33+
def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Rotate the pair ``(x1, x2)`` by the angle described by ``cos``/``sin``.

    With ``conj=True`` the sign of the ``sin`` terms is flipped, i.e. the
    rotation is applied in the opposite direction. Returns ``(out1, out2)``.
    """
    if conj:
        # Conjugate form: opposite-direction rotation.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos
    return x1 * cos - x2 * sin, x1 * sin + x2 * cos
42+
43+
44+
def torch_rotary(query, key, cos, sin, conj=False):
    """Eager PyTorch rotary embedding applied to copies of ``query`` and ``key``.

    The rotary width is taken from the last axis of ``cos``. For each tensor
    the first ``2 * width`` features are split into two halves, rotated with
    ``apply_rotary_torch`` and written back; any remaining features are left
    as cloned. Returns ``(q_out, k_out)``.
    """
    half = cos.shape[-1]
    lower = slice(None, half)
    upper = slice(half, 2 * half)

    rotated = []
    for source in (query, key):
        # Work on a clone so the caller's tensor is never touched.
        work = source.clone()
        new_first, new_second = apply_rotary_torch(
            work[..., lower], work[..., upper], cos, sin, conj
        )
        # Both halves are fully computed before either is written back, so
        # the second half is never derived from an already-rotated first half.
        work[..., lower] = new_first
        work[..., upper] = new_second
        rotated.append(work)

    q_out, k_out = rotated
    return q_out, k_out
66+
67+
68+
run_benchmark(
69+
kernel_type=KernelTypeEnum.ROTARY,
70+
impl_name="torch_eager",
71+
impl_tags={"family": "pytorch", "backend": "eager"},
72+
impl_func=torch_rotary,
73+
)
74+
```
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
---
2+
title: "Rotary Position Embeddings Benchmark - Combined Results"
3+
author: "uvnote"
4+
theme: "dark"
5+
syntax_theme: "monokai"
6+
show_line_numbers: true
7+
collapse_code: false
8+
---
9+
10+
# Rotary Position Embeddings Benchmarks - Aggregated Results
11+
12+
This document combines benchmark results from multiple Rotary Position Embeddings implementations.
13+
14+
## Combined Summary and Visualization
15+
16+
![artifact:latency.svg]
17+
18+
```python id=combine collapse-code=true needs=../impls/hf_kernels_rotary.md:benchmark,../impls/torch_rotary.md:benchmark outputs=latency.svg
19+
# /// script
20+
# requires-python = ">=3.10"
21+
# dependencies = [
22+
# "numpy",
23+
# "torch==2.8.0",
24+
# "kernels-benchmark-tools",
25+
# "matplotlib",
26+
# ]
27+
#
28+
# [tool.uv.sources]
29+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
30+
# ///
31+
from kernels_benchmark_tools.core.visuals import generate_combined_results
32+
33+
# Map display names to uvnote environment variables
34+
cache_env_map = {
35+
"HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
36+
"PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
37+
}
38+
39+
# Generate combined results with visualization
40+
generate_combined_results(
41+
cache_env_map=cache_env_map,
42+
output_filename="rotary.jsonl",
43+
svg_filename="latency.svg"
44+
)
45+
```

0 commit comments

Comments
 (0)