biodatageeks · nitrazek · May 24, 2025 · May 24, 2025 · May 24, 2025 · May 24, 2025
diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,9 @@ benchmark/src/results
 benchmark/src/results/overlap
 mprofile*dat
 *csv
-it/data/polars-bio-it.json
+*fastq
+*.parquet
+rust_logs.txt
+notebooks/.ipynb_checkpoints
+notebooks/tmp
+it/data/polars-bio-it.json
diff --git a/docs/notebooks/base_content.ipynb b/docs/notebooks/base_content.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analiza zawartości zasad w sekwencjach FASTQ\n",
+    "\n",
+    "Ten notebook pokazuje jak:\n",
+    "- sparsować plik FASTQ do DataFrame Polars,\n",
+    "- wywołać funkcję Rust analizującą zawartość zasad na każdej pozycji,\n",
+    "- zwizualizować wynik."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Importy\n",
+    "import polars as pl\n",
+    "from polars_bio.base_content import base_sequence_content, plot_base_content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Wczytaj przykładowy plik FASTQ i przygotuj DataFrame\n",
+    "\n",
+    "Zakładamy, że plik `example.fastq` znajduje się w katalogu wyżej (`../example.fastq`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from polars_bio.io import read_fastq\n",
+    "\n",
+    "fastq_path = \"../example.fastq\"\n",
+    "df = read_fastq(fastq_path)\n",
+    "df.head()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analiza zawartości zasad na każdej pozycji\n",
+    "\n",
+    "Wywołujemy funkcję Rust przez Python, która zwraca proporcje zasad A, T, G, C na każdej pozycji."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_content_df = base_sequence_content(df)\n",
+    "base_content_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Wizualizacja zawartości zasad\n",
+    "\n",
+    "Wykres pokazuje proporcje zasad na każdej pozycji w odczytach."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_base_content(base_content_df)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/base_content_analysis.ipynb b/notebooks/base_content_analysis.ipynb
diff --git a/polars_bio/__init__.py b/polars_bio/__init__.py
@@ -47,6 +47,9 @@
 from .range_utils import Utils as utils
 from .sql import SQL as data_processing
 
+from .quality_control_op import base_sequence_content
+from .quality_control_viz import plot_base_content
+
 POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
 
 
@@ -64,4 +67,6 @@
     "VcfReadOptions",
     "ObjectStorageOptions",
     "set_option",
+    "base_sequence_content",
+    "plot_base_content"
 ]
diff --git a/polars_bio/quality_control_op.py b/polars_bio/quality_control_op.py
@@ -0,0 +1,43 @@
+import polars as pl
+import pandas as pd
+from pathlib import Path
+from typing import Union
+from polars_bio import read_fastq, overlap
+from polars_bio.polars_bio import py_base_sequence_content_frame, py_base_sequence_content_scan
+from .context import ctx
+
+def base_sequence_content(
+    data: Union[pl.DataFrame, pl.LazyFrame, pd.DataFrame, str]
+) -> pl.DataFrame:
+    table_path = None
+    df = None
+
+    if isinstance(data, str):
+        path = Path(data)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {data}") 
+        if path.suffix.lower() in ['.fastq', '.fq']:
+            df = read_fastq(data).collect()
+        else:
+            table_path = data
+    elif isinstance(data, pl.LazyFrame):
+        df = data.collect()
+    elif isinstance(data, pd.DataFrame):
+        df = pl.from_pandas(data)
+    elif isinstance(data, pl.DataFrame):
+        df = data
+    else:
+        raise TypeError(f"Unsupported input type: {type(data)}")
+
+    if df is not None:
+        if "sequence" not in df.columns:
+            raise ValueError("Input data must have a 'sequence' column")
+        return py_base_sequence_content_frame(
+            ctx,
+            df.to_arrow().to_reader()
+        ).to_polars()
+    else:
+        return py_base_sequence_content_scan(
+            ctx,
+            table_path
+        ).to_polars()
diff --git a/polars_bio/quality_control_viz.py b/polars_bio/quality_control_viz.py
@@ -0,0 +1,45 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import polars as pl
+
+def plot_base_content(df, figsize=(10, 6), title='Base per pos quantity', 
+                      colors=None, save_path=None, dpi=100):
+    if colors is None:
+        colors = {'a': 'blue', 'c': 'orange', 'g': 'red', 't': 'green', 'n': 'lightblue'}
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=figsize)
+
+    # Get positions (assuming they are 0-based indices)
+    positions = np.arange(len(df))
+
+    # Plot each base content
+    for base in ['a', 'c', 'g', 't', 'n']:
+        column = f"{base}_count"
+        if column in df.columns:
+            ax.plot(positions, df[column], label=base, color=colors[base], linewidth=1.5)
+
+    # Add labels and title
+    ax.set_xlabel('Position in read (bp)')
+    ax.set_ylabel('Base content')
+    ax.set_title(title)
+
+    # Add grid
+    ax.grid(True, alpha=0.3)
+
+    # Add legend
+    ax.legend(title='base', loc='upper right', bbox_to_anchor=(1.15, 1))
+
+    # Set y-axis to start at 0
+    ax.set_ylim(bottom=0)
+
+    # Tight layout
+    plt.tight_layout()
+
+    plt.close(fig)
+
+    # Save if requested
+    if save_path:
+        plt.savefig(save_path, dpi=dpi, bbox_inches='tight')
+
+    return fig
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ maturin
 ruff
 pytest
 mypy
+ipykernel
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,6 +1,7 @@
 mod context;
 mod operation;
 mod option;
+mod quality_control;
 mod query;
 mod scan;
 mod streaming;
@@ -30,6 +31,7 @@ use crate::option::{
     FastqReadOptions, FilterOp, GffReadOptions, InputFormat, PyObjectStorageOptions, RangeOp,
     RangeOptions, ReadOptions, VcfReadOptions,
 };
+use crate::quality_control::{do_base_sequence_content, register_base_sequence_content};
 use crate::scan::{maybe_register_table, register_frame, register_table};
 use crate::streaming::RangeOperationScan;
 use crate::utils::convert_arrow_rb_schema_to_polars_df_schema;
@@ -416,6 +418,40 @@ fn py_from_polars(
     })
 }
 
+/// Run the base-sequence-content computation on an Arrow stream passed from Python.
+///
+/// The incoming stream is registered in the session context under the
+/// `LEFT_TABLE` name, `register_base_sequence_content` is invoked on the
+/// context, and `do_base_sequence_content` is then driven to completion on a
+/// freshly created runtime. Its result is wrapped in a `PyDataFrame`.
+///
+/// # Errors
+///
+/// Returns a Python exception if the runtime cannot be created.
+#[pyfunction]
+#[pyo3(signature = (py_ctx, df))]
+fn py_base_sequence_content_frame(
+    py_ctx: &PyBioSessionContext,
+    df: PyArrowType<ArrowArrayStreamReader>,
+) -> PyResult<PyDataFrame> {
+    // Surface runtime-creation failure to Python as an exception rather than
+    // panicking across the FFI boundary (io::Error converts into PyErr).
+    let rt = Runtime::new()?;
+    let ctx = &py_ctx.ctx;
+    register_frame(py_ctx, df, LEFT_TABLE.to_string());
+    register_base_sequence_content(ctx);
+
+    let result = rt.block_on(do_base_sequence_content(ctx, LEFT_TABLE.to_string()));
+    Ok(PyDataFrame::new(result))
+}
+
+/// Run the base-sequence-content computation on a file path or table name.
+///
+/// `df_path_or_table` is handed to `maybe_register_table` together with the
+/// `LEFT_TABLE` name, `register_base_sequence_content` is invoked on the
+/// context, and `do_base_sequence_content` is then driven to completion on a
+/// freshly created runtime. Its result is wrapped in a `PyDataFrame`.
+///
+/// # Errors
+///
+/// Returns a Python exception if the runtime cannot be created.
+#[pyfunction]
+#[pyo3(signature = (py_ctx, df_path_or_table))]
+fn py_base_sequence_content_scan(
+    py_ctx: &PyBioSessionContext,
+    df_path_or_table: String,
+) -> PyResult<PyDataFrame> {
+    // Surface runtime-creation failure to Python as an exception rather than
+    // panicking across the FFI boundary (io::Error converts into PyErr).
+    let rt = Runtime::new()?;
+    let ctx = &py_ctx.ctx;
+    maybe_register_table(df_path_or_table, &LEFT_TABLE.to_string(), None, ctx, &rt);
+    register_base_sequence_content(ctx);
+
+    let result = rt.block_on(do_base_sequence_content(ctx, LEFT_TABLE.to_string()));
+    Ok(PyDataFrame::new(result))
+}
+
 #[pymodule]
 fn polars_bio(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
     pyo3_log::init();
@@ -430,6 +466,9 @@ fn polars_bio(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(py_describe_vcf, m)?)?;
     m.add_function(wrap_pyfunction!(py_register_view, m)?)?;
     m.add_function(wrap_pyfunction!(py_from_polars, m)?)?;
+    m.add_function(wrap_pyfunction!(py_base_sequence_content_frame, m)?)?;
+    m.add_function(wrap_pyfunction!(py_base_sequence_content_scan, m)?)?;
+    // m.add_function(wrap_pyfunction!(unary_operation_scan, m)?)?;
     m.add_class::<PyBioSessionContext>()?;
     m.add_class::<FilterOp>()?;
     m.add_class::<RangeOp>()?;
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,4 @@ maturin @@
     ruff
     pytest
     mypy
+    ipykernel