Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
3880c1b
Added starting point files
nitrazek May 24, 2025
9986f50
Added basic implementation of operation
nitrazek May 24, 2025
608acc7
Parsing dataframe
nitrazek May 24, 2025
4b8c6af
Add notebook for analyzing base content in FASTQ sequences
nitrazek May 24, 2025
86cd1ac
Refactor base_content notebook to streamline FASTQ file reading and D…
nitrazek May 24, 2025
f58ccc3
Testing SQL in DataFusion returning map
sh4dqwx May 24, 2025
298a7cf
Fixed conflicts
sh4dqwx May 24, 2025
a623b63
Implementing UDAF for Apache DataFusion
sh4dqwx May 24, 2025
9871c86
Added test quality control algorithm in Rust to return mocked value
sh4dqwx May 25, 2025
45a9c49
Working on UDAF impl in Rust
sh4dqwx Jun 7, 2025
ef8b7d4
No errors version
nitrazek Jun 7, 2025
6854155
Refactor quality control functions and update notebook for base seque…
nitrazek Jun 7, 2025
3d1ee9c
Refactor base content analysis notebook and added processing real FAS…
nitrazek Jun 9, 2025
695613b
fixes
sh4dqwx Jun 10, 2025
039952f
Added UDAF implementation, test getting table from DataFusion
sh4dqwx Jun 11, 2025
6982394
Fixed conflicts
sh4dqwx Jun 11, 2025
9fa0d1b
fixing errors
sh4dqwx Jun 11, 2025
5cc9c1e
Added support various input types including FASTQ file paths, LazyFra…
nitrazek Jun 11, 2025
c901501
Implemented UDAF
sh4dqwx Jun 11, 2025
a39e638
Merge branch 'base_content' of https://github.com/nitrazek/Polars-UDF…
sh4dqwx Jun 11, 2025
5f6ce04
removed comments
sh4dqwx Jun 11, 2025
734b849
Tests on real data
nitrazek Jun 11, 2025
b821dd3
Added script to measuring execution times
nitrazek Jun 14, 2025
b5e1e48
Add quality control tests and target data for base sequence content
nitrazek Jun 15, 2025
ffcf526
Add support for PARQUET file input in quality control tests
nitrazek Jun 15, 2025
801956d
Fixing problem with parallel
sh4dqwx Jun 15, 2025
77698c0
gitignore
sh4dqwx Jun 15, 2025
6cc5541
fixed conflicts
sh4dqwx Jun 15, 2025
0b55a3e
Refactor test assertions to use class-level target DataFrame for cons…
nitrazek Jun 15, 2025
8813750
Fixed parallelism, removed unnecessary files
sh4dqwx Jun 15, 2025
250d875
Merge branch 'base_content' of https://github.com/nitrazek/Polars-UDF…
sh4dqwx Jun 15, 2025
79a856b
Added function to measure performance
nitrazek Jun 15, 2025
13980a5
notebook fixes
sh4dqwx Jun 15, 2025
52b555d
Merge branch 'master' into master
nitrazek Jun 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,9 @@ benchmark/src/results
benchmark/src/results/overlap
mprofile*dat
*csv
it/data/polars-bio-it.json
*fastq
*.parquet
rust_logs.txt
notebooks/.ipynb_checkpoints
notebooks/tmp
it/data/polars-bio-it.json
98 changes: 98 additions & 0 deletions docs/notebooks/base_content.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analiza zawartości zasad w sekwencjach FASTQ\n",
"\n",
"Ten notebook pokazuje jak:\n",
"- sparsować plik FASTQ do DataFrame Polars,\n",
"- wywołać funkcję Rust analizującą zawartość zasad na każdej pozycji,\n",
"- zwizualizować wynik."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Importy\n",
"import polars as pl\n",
"from polars_bio import base_sequence_content, plot_base_content"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wczytaj przykładowy plik FASTQ i przygotuj DataFrame\n",
"\n",
"Zakładamy, że plik `example.fastq` znajduje się w katalogu wyżej (`../example.fastq`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from polars_bio.io import read_fastq\n",
"\n",
"fastq_path = \"../example.fastq\"\n",
"df = read_fastq(fastq_path)\n",
"df.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analiza zawartości zasad na każdej pozycji\n",
"\n",
"Wywołujemy funkcję Rust przez Python, która zwraca proporcje zasad A, T, G, C na każdej pozycji."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"base_content_df = base_sequence_content(df)\n",
"base_content_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wizualizacja zawartości zasad\n",
"\n",
"Wykres pokazuje proporcje zasad na każdej pozycji w odczytach."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_base_content(base_content_df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
719 changes: 719 additions & 0 deletions notebooks/base_content_analysis.ipynb

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions polars_bio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@
from .range_utils import Utils as utils
from .sql import SQL as data_processing

from .quality_control_op import base_sequence_content
from .quality_control_viz import plot_base_content

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"


Expand All @@ -64,4 +67,6 @@
"VcfReadOptions",
"ObjectStorageOptions",
"set_option",
"base_sequence_content",
"plot_base_content"
]
43 changes: 43 additions & 0 deletions polars_bio/quality_control_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import polars as pl
import pandas as pd
from pathlib import Path
from typing import Union
from polars_bio import read_fastq, overlap
from polars_bio.polars_bio import py_base_sequence_content_frame, py_base_sequence_content_scan
from .context import ctx

def base_sequence_content(
    data: Union[pl.DataFrame, pl.LazyFrame, pd.DataFrame, str, Path]
) -> pl.DataFrame:
    """Run the native base-sequence-content computation on ``data``.

    Args:
        data: The reads to analyse. Accepted forms:

            * ``str`` or ``pathlib.Path`` pointing at an existing file. Files
              whose suffix is ``.fastq`` or ``.fq`` (case-insensitive) are
              loaded with ``read_fastq`` and collected; any other existing
              path is passed unchanged to the native scan entry point.
            * ``polars.LazyFrame`` - collected before processing.
            * ``pandas.DataFrame`` - converted with ``polars.from_pandas``.
            * ``polars.DataFrame`` - used as is.

    Returns:
        The result of the native call, converted with ``to_polars()``.

    Raises:
        FileNotFoundError: ``data`` is a path that does not exist.
        TypeError: ``data`` is none of the supported input types.
        ValueError: the in-memory frame has no ``sequence`` column.
    """
    if isinstance(data, (str, Path)):
        path = Path(data)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {data}")
        # The native bindings and read_fastq are called with plain strings,
        # so Path inputs are normalised here (a no-op for str inputs).
        location = str(data)
        if path.suffix.lower() not in (".fastq", ".fq"):
            # Non-FASTQ paths skip the in-memory route; the column check below
            # is therefore not applied to them.
            return py_base_sequence_content_scan(ctx, location).to_polars()
        df = read_fastq(location).collect()
    elif isinstance(data, pl.LazyFrame):
        df = data.collect()
    elif isinstance(data, pd.DataFrame):
        df = pl.from_pandas(data)
    elif isinstance(data, pl.DataFrame):
        df = data
    else:
        raise TypeError(f"Unsupported input type: {type(data)}")

    if "sequence" not in df.columns:
        raise ValueError("Input data must have a 'sequence' column")

    # The frame is handed to the native side as an Arrow record-batch reader.
    return py_base_sequence_content_frame(
        ctx,
        df.to_arrow().to_reader(),
    ).to_polars()
45 changes: 45 additions & 0 deletions polars_bio/quality_control_viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import matplotlib.pyplot as plt
import numpy as np
import polars as pl

def plot_base_content(df, figsize=(10, 6), title='Base per pos quantity',
                      colors=None, save_path=None, dpi=100):
    """Draw one line per base from the ``<base>_count`` columns of ``df``.

    Args:
        df: Frame-like object supporting ``len()``, ``.columns`` and
            ``df[column]``. Columns named ``a_count``, ``c_count``,
            ``g_count``, ``t_count`` and ``n_count`` are plotted when present;
            missing ones are skipped.
        figsize: Figure size forwarded to ``plt.subplots``.
        title: Axes title.
        colors: Mapping from base letter (``'a'``, ``'c'``, ``'g'``, ``'t'``,
            ``'n'``) to a line colour. A built-in palette is used when ``None``.
        save_path: If truthy, the figure is written to this path.
        dpi: Resolution used when saving.

    Returns:
        The figure. It is closed via ``plt.close`` before being returned, so it
        is no longer tracked by pyplot.
    """
    if colors is None:
        colors = {'a': 'blue', 'c': 'orange', 'g': 'red', 't': 'green', 'n': 'lightblue'}

    fig, ax = plt.subplots(figsize=figsize)

    # The x axis is simply the row index of the frame (0-based).
    positions = np.arange(len(df))

    for base in ['a', 'c', 'g', 't', 'n']:
        column = f"{base}_count"
        if column in df.columns:
            ax.plot(positions, df[column], label=base, color=colors[base], linewidth=1.5)

    ax.set_xlabel('Position in read (bp)')
    ax.set_ylabel('Base content')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend(title='base', loc='upper right', bbox_to_anchor=(1.15, 1))
    ax.set_ylim(bottom=0)

    fig.tight_layout()

    # Save through the figure itself and do it BEFORE closing: once the figure
    # is closed, pyplot-level savefig would target a new, empty current figure
    # and write a blank image.
    if save_path:
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight')

    plt.close(fig)

    return fig
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ maturin
ruff
pytest
mypy
ipykernel
39 changes: 39 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod context;
mod operation;
mod option;
mod quality_control;
mod query;
mod scan;
mod streaming;
Expand Down Expand Up @@ -30,6 +31,7 @@ use crate::option::{
FastqReadOptions, FilterOp, GffReadOptions, InputFormat, PyObjectStorageOptions, RangeOp,
RangeOptions, ReadOptions, VcfReadOptions,
};
use crate::quality_control::{do_base_sequence_content, register_base_sequence_content};
use crate::scan::{maybe_register_table, register_frame, register_table};
use crate::streaming::RangeOperationScan;
use crate::utils::convert_arrow_rb_schema_to_polars_df_schema;
Expand Down Expand Up @@ -416,6 +418,40 @@ fn py_from_polars(
})
}

/// Python entry point: base sequence content for a frame streamed in as Arrow.
///
/// The incoming Arrow stream is registered under `LEFT_TABLE` with
/// `register_frame`, `register_base_sequence_content` is invoked on the session
/// context, and `do_base_sequence_content` is then driven to completion on a
/// runtime created for this call. Its result is wrapped in a `PyDataFrame`.
///
/// # Errors
///
/// Returns a Python exception if the runtime cannot be created.
#[pyfunction]
#[pyo3(signature = (py_ctx, df))]
fn py_base_sequence_content_frame(
    py_ctx: &PyBioSessionContext,
    df: PyArrowType<ArrowArrayStreamReader>,
) -> PyResult<PyDataFrame> {
    // Surface runtime construction failures to Python instead of panicking.
    let rt = Runtime::new()?;
    let ctx = &py_ctx.ctx;
    register_frame(py_ctx, df, LEFT_TABLE.to_string());
    register_base_sequence_content(ctx);

    let result = rt.block_on(do_base_sequence_content(ctx, LEFT_TABLE.to_string()));
    Ok(PyDataFrame::new(result))
}

/// Python entry point: base sequence content for a file path or table name.
///
/// `df_path_or_table` is passed to `maybe_register_table` together with the
/// `LEFT_TABLE` name, `register_base_sequence_content` is invoked on the session
/// context, and `do_base_sequence_content` is then driven to completion on a
/// runtime created for this call. Its result is wrapped in a `PyDataFrame`.
///
/// # Errors
///
/// Returns a Python exception if the runtime cannot be created.
#[pyfunction]
#[pyo3(signature = (py_ctx, df_path_or_table))]
fn py_base_sequence_content_scan(
    py_ctx: &PyBioSessionContext,
    df_path_or_table: String,
) -> PyResult<PyDataFrame> {
    // Surface runtime construction failures to Python instead of panicking.
    let rt = Runtime::new()?;
    let ctx = &py_ctx.ctx;
    maybe_register_table(df_path_or_table, &LEFT_TABLE.to_string(), None, ctx, &rt);
    register_base_sequence_content(ctx);

    let result = rt.block_on(do_base_sequence_content(ctx, LEFT_TABLE.to_string()));
    Ok(PyDataFrame::new(result))
}

#[pymodule]
fn polars_bio(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
pyo3_log::init();
Expand All @@ -430,6 +466,9 @@ fn polars_bio(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(py_describe_vcf, m)?)?;
m.add_function(wrap_pyfunction!(py_register_view, m)?)?;
m.add_function(wrap_pyfunction!(py_from_polars, m)?)?;
m.add_function(wrap_pyfunction!(py_base_sequence_content_frame, m)?)?;
m.add_function(wrap_pyfunction!(py_base_sequence_content_scan, m)?)?;
// m.add_function(wrap_pyfunction!(unary_operation_scan, m)?)?;
m.add_class::<PyBioSessionContext>()?;
m.add_class::<FilterOp>()?;
m.add_class::<RangeOp>()?;
Expand Down
Loading