Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
934c4f2
Create wormbase module
Bradley-Buchner Jan 27, 2025
46aa210
Make wormbase agents
Bradley-Buchner Jan 30, 2025
305ad9a
Modify _make_agent
Bradley-Buchner Jan 30, 2025
99e051d
Modify _download_wormbase_data and _read_wormbase_csv
Bradley-Buchner Jan 30, 2025
e9f1b49
Add test_print_agent_data_in_init
Bradley-Buchner Jan 30, 2025
9e9b0e8
Added test for agent info extraction (name, db_refs)
Bradley-Buchner Jan 30, 2025
fc6974d
Modified test_download_wormbase_data and test for agent info extraction
Bradley-Buchner Jan 30, 2025
18aaf12
Fix url usage
Bradley-Buchner Jan 31, 2025
2e11451
Modify _download_wormbase_data and _read_wormbase_data
Bradley-Buchner Jan 31, 2025
93bed0c
Delete parse_wormbase.py
Bradley-Buchner Jan 31, 2025
cdb7dd5
Add description
Bradley-Buchner Jan 31, 2025
511d788
Add "Make statements" section
Bradley-Buchner Feb 3, 2025
b01befb
Add assertions for statements, Remove test for _download_wormbase_data
Bradley-Buchner Feb 3, 2025
40d6dbc
Merge remote-tracking branch 'upstream/master'
Bradley-Buchner Feb 5, 2025
b4c84d3
Update __init__.py
Bradley-Buchner Feb 5, 2025
fac5e19
Update processor.py
Bradley-Buchner Feb 5, 2025
2a09e33
Modify statement creation to handle molecular interaction data
Bradley-Buchner Feb 8, 2025
4c5364d
Update test file
Bradley-Buchner Feb 8, 2025
94c04b7
Incorporate wormbase-to-entrez ID mappings to ensure agents have an e…
Bradley-Buchner Feb 9, 2025
230d8bb
Update Statement creation test
Bradley-Buchner Feb 9, 2025
bc79efd
Move data loading into api
Bradley-Buchner Feb 11, 2025
89e9da0
Decouple data loading and WormBase processor
Bradley-Buchner Feb 11, 2025
09ed482
Clean up some of the API and processor
bgyori Feb 11, 2025
e3e19ee
Refactor processor for processing rows
bgyori Feb 11, 2025
b2606a1
Improve code styling
bgyori Feb 11, 2025
c474930
Refactor processor into reusable functions
bgyori Feb 11, 2025
7e0d05d
Improve processor source_ids
bgyori Feb 12, 2025
5056484
Remove NA handling from pd.read_csv()
Bradley-Buchner Feb 13, 2025
97f75e2
Add get_agent_role_info(), replace Activation/Inhibition with Increas…
Bradley-Buchner Feb 13, 2025
c69faa6
Update test file to use process_from_files() instead of WormBaseProce…
Bradley-Buchner Feb 13, 2025
e8e90a9
Misc. updates
Bradley-Buchner Feb 16, 2025
26547df
Clean up test_statement_creation
Bradley-Buchner Feb 16, 2025
5d0294f
Add test files
Bradley-Buchner Feb 16, 2025
9cefe12
Add functionality for new data source
Bradley-Buchner May 29, 2025
d396e63
Create c_elegans_all_genes.txt
Bradley-Buchner May 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209,515 changes: 209,515 additions & 0 deletions Miniforge3-MacOSX-arm64.sh

Large diffs are not rendered by default.

Empty file added __init__.py
Empty file.
2 changes: 1 addition & 1 deletion indra/assemblers/indranet/net.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def from_df(cls, df):
edge data for one edge. Indices are used to distinguish
multiedges between a pair of nodes. Any columns not part of the
below mentioned mandatory columns are considered extra attributes.
Columns starting with 'agA\_' or 'agB\_' (excluding the agA/B_name)
Columns starting with 'agA_' or 'agB_' (excluding the agA/B_name)
will be added to its respective nodes as node attributes. Any other
columns will be added as edge attributes.

Expand Down
14,691 changes: 14,691 additions & 0 deletions indra/resources/c_elegans_all_genes.txt

Large diffs are not rendered by default.

273 changes: 245 additions & 28 deletions indra/sources/wormbase/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,33 @@
from .processor import WormBaseProcessor
from collections import namedtuple
import pandas as pd
import os

# Url for all C. elegans molecular interactions data file
wormbase_mol_file_url = ('https://fms.alliancegenome.org/download/'
alliance_mol_int_file_url = ('https://fms.alliancegenome.org/download/'
'INTERACTION-MOL_WB.tsv.gz')
# Url for all C. elegans genetic interactions data file
wormbase_gen_file_url = ('https://fms.alliancegenome.org/download/'
alliance_gen_int_file_url = ('https://fms.alliancegenome.org/download/'
'INTERACTION-GEN_WB.tsv.gz')
# Url for wormbase-to-entrez ID mapping
wormbase_entrez_mappings_file_url = ('https://ftp.ncbi.nih.gov/gene/'
'DATA/GENE_INFO/Invertebrates/'
'Caenorhabditis_elegans.gene_info.gz')

wormbase_int_file_url = ('https://downloads.wormbase.org/species/c_elegans/'
'annotation/interactions/'
'c_elegans.PRJNA13758.current.interactions.txt.gz')

# wormbase_all_genes_file_path = ("/Users/bradleybuchner/Desktop/Grad School/Research"
# "/Aging Project/indra/indra/resources/c_elegans_all_genes.txt")
dir = os.path.dirname(os.path.abspath(__file__))
resources_dir = os.path.dirname(os.path.dirname(dir))
wormbase_all_genes_file_path = os.path.join(resources_dir, 'resources/c_elegans_all_genes.txt')


# An explanation for each column of the interaction files are here:
# https://github.com/HUPO-PSI/miTab/blob/master/PSI-MITAB27Format.md
columns = ['ids_interactor_a', 'ids_interactor_b',
alliance_int_columns = ['ids_interactor_a', 'ids_interactor_b',
'alt_ids_interactor_a', 'alt_ids_interactor_b',
'aliases_interactor_a', 'aliases_interactor_b',
'interaction_detection_methods', 'publication_first_authors',
Expand All @@ -42,6 +54,49 @@
'identification_method_participant_a',
'identification_method_participant_b']

# Maximum number of interactors listed per row in the raw WormBase file.
# NOTE(review): if the dataset is updated with interactions having more
# than 32 interactors, raise this constant accordingly.
_MAX_WB_INTERACTORS = 32

# Column names for the raw WormBase interactions file: five columns
# describing the interaction itself, followed by repeating
# (Interactor, Common_name, Role) triples for each participant slot.
wormbase_int_columns_raw = [
    "WBInteractionID", "Interaction_type", "Interaction_subtype", "Summary",
    "Citation",
] + [
    f"{prefix}{idx}"
    for idx in range(1, _MAX_WB_INTERACTORS + 1)
    for prefix in ("Interactor", "Common_name", "Role")
]

# Column names of the processed (pairwise) interaction table produced by
# _process_wormbase_interactions().
wormbase_int_columns = ["WBInteractionID", "Type", "Subtype",
                        "Effector_ID", "Effector", "Effector_Role",
                        "Affected_ID", "Affected", "Affected_Role",
                        "Direction"]

mapping_columns = ['tax_id', 'GeneID', 'Symbol',
'LocusTag', 'Synonyms', 'dbXrefs', 'chromosome',
'map_location', 'description', 'type_of_gene',
Expand All @@ -50,19 +105,28 @@
'Nomenclature_status', 'Other_designations',
'Modification_date', 'Feature_type']

# Columns of the bundled resource listing all C. elegans genes
# (indra/resources/c_elegans_all_genes.txt).
wormbase_all_genes_columns = ['source', 'bioentity_internal_id', 'bioentity_label',
                              'synonym', 'database_xref', 'type', 'taxon',
                              'taxon_label']

# Row wrappers for the two interaction data sources; fields mirror the
# corresponding column lists so rows can be accessed by attribute.
# (A stale duplicate `_WormBaseRow = namedtuple('WormBaseRow', columns)`
# referencing the removed `columns` name was deleted here.)
_AllianceGenomeRow = namedtuple('AllianceGenomeRow', alliance_int_columns)
_WormBaseRow = namedtuple('WormBaseRow', wormbase_int_columns)

def process_from_files(alliance_gen_data_file, alliance_mol_data_file,
                       wb_all_int_data_file, wb_to_entrez_mappings_file):
    """Process WormBase interaction data from TSV files.

    Parameters
    ----------
    alliance_gen_data_file : str
        Path to the Alliance Genome dataset of C. elegans genetic
        interactions in TSV format.
    alliance_mol_data_file : str
        Path to the Alliance Genome dataset of C. elegans molecular
        interactions in TSV format.
    wb_all_int_data_file : str
        Path to the WormBase dataset of all C. elegans interactions
        (gzipped TSV).
    wb_to_entrez_mappings_file : str
        Path to the WormBase-to-Entrez ID mapping file in TSV format.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.
    """
    gen_int_iter = pd.read_csv(alliance_gen_data_file, sep='\t', comment='#',
                               dtype=str).values.tolist()
    mol_int_iter = pd.read_csv(alliance_mol_data_file, sep='\t', comment='#',
                               dtype=str).values.tolist()

    all_wb_int = pd.read_csv(wb_all_int_data_file, sep='\t', comment='#',
                             header=None, names=wormbase_int_columns_raw,
                             dtype=str, compression='gzip')
    all_wb_int_iter = _process_wormbase_interactions(all_wb_int).values.tolist()

    mappings_df = pd.read_csv(wb_to_entrez_mappings_file, sep='\t',
                              comment='#', dtype=str, names=mapping_columns)

    # Load the bundled all-genes resource, mirroring process_from_web().
    all_genes_df = pd.read_csv(wormbase_all_genes_file_path, sep='\t',
                               header=None, dtype=str,
                               names=wormbase_all_genes_columns)

    # Bug fix: _processor_from_data requires five arguments; previously
    # all_genes_df was not passed here, raising TypeError at call time.
    return _processor_from_data(gen_int_iter, mol_int_iter, all_wb_int_iter,
                                mappings_df, all_genes_df)


def process_from_web():
    """Process WormBase interaction data downloaded from the web.

    Downloads the Alliance Genome genetic and molecular interaction
    datasets, the raw WormBase all-interactions dataset, and the
    WormBase-to-Entrez ID mapping; the all-genes table is read from the
    bundled INDRA resources directory.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.
    """
    gen_int_iter = pd.read_csv(alliance_gen_int_file_url, sep='\t',
                               comment='#', dtype=str).values.tolist()
    mol_int_iter = pd.read_csv(alliance_mol_int_file_url, sep='\t',
                               comment='#', dtype=str).values.tolist()

    all_wb_int = pd.read_csv(wormbase_int_file_url, sep='\t', comment='#',
                             header=None, names=wormbase_int_columns_raw,
                             dtype=str, compression='gzip')
    all_wb_int_iter = _process_wormbase_interactions(all_wb_int).values.tolist()

    entrez_mappings_df = pd.read_csv(wormbase_entrez_mappings_file_url,
                                     sep='\t', comment='#', dtype=str,
                                     names=mapping_columns)

    all_genes_df = pd.read_csv(wormbase_all_genes_file_path, sep='\t',
                               header=None, dtype=str,
                               names=wormbase_all_genes_columns)

    return _processor_from_data(gen_int_iter, mol_int_iter, all_wb_int_iter,
                                entrez_mappings_df, all_genes_df)


def _process_wormbase_interactions(df):
"""Process raw interaction data file from WormBase. Expand all interactions
into two-way interactions.

Parameters
----------
df : pandas.DataFrame
Raw interaction data file from WormBase.

Returns
-------
processed_df : pandas.DataFrame
Processed (pairwise) interaction data file from WormBase.
"""

from itertools import product

# Get relevant columns
interactor_cols = [col for col in df.columns if col.startswith('Interactor')]
common_name_cols = [col for col in df.columns if col.startswith('Common_name')]
role_cols = [col for col in df.columns if col.startswith('Role')]

# Create one row per effector-affected pair
seen = set()
table_rows = []
for _, row in df.iterrows():
participants = []
effectors = []
affected = []
non_directional = []
gene_ids = {}
effector_roles = {}
affected_roles = {}

for i in range(len(interactor_cols)):
gene_id = row.get(interactor_cols[i])
gene_name = row.get(common_name_cols[i])
role_raw = row.get(role_cols[i])

if pd.isnull(gene_id) or pd.isnull(role_raw):
continue # skip incomplete entries

role = str(role_raw).strip().lower()
gene_display = gene_name if pd.notnull(gene_name) else gene_id
gene_display = str(gene_display).strip()

if not gene_display:
continue # skip blank names

gene_ids[gene_display] = gene_id
participants.append(gene_display)

if role in ["effector", "target", "trans_regulator"]:
effectors.append(gene_display)
effector_roles[gene_display] = role
elif role in ["affected", "bait", "trans_regulated"]:
affected.append(gene_display)
affected_roles[gene_display] = role
elif role in ["non_directional"]:
non_directional.append(gene_display)

interaction_id = row.get("WBInteractionID", "-")
interaction_type = row.get("Interaction_type", "-").lower()
interaction_subtype = row.get("Interaction_subtype", "-").lower()

def _processor_from_data(gen_iter, mol_iter, mappings_df):
if len(affected) > 0 and len(effectors) < 1:
continue

if len(effectors) > 0 and len(affected) < 1:
continue

if len(participants) < 2:
continue

# For directional edges
for eff, aff in product(effectors, affected):
key = (eff, aff, "Effector->Affected")
if key not in seen:
seen.add(key)
table_rows.append({
"WBInteractionID": interaction_id,
"Type": interaction_type,
"Subtype": interaction_subtype,
"Effector_ID": gene_ids.get(eff, "-"),
"Effector": eff,
"Effector_Role": effector_roles.get(eff, "-"),
"Affected_ID": gene_ids.get(aff, "-"),
"Affected": aff,
"Affected_Role": affected_roles.get(aff, "-"),
"Direction": "Effector->Affected"
})

# For non-directional edges
if not effectors and not affected and len(non_directional) >= 2:
for i in range(len(non_directional)):
for j in range(i + 1, len(non_directional)):
eff, aff = non_directional[i], non_directional[j]
key = tuple(sorted([eff, aff])) + ("non-directional",)
if key not in seen and eff != aff:
seen.add(key)
table_rows.append({
"WBInteractionID": interaction_id,
"Type": interaction_type,
"Subtype": interaction_subtype,
"Effector_ID": gene_ids.get(eff, "-"),
"Effector": eff,
"Effector_Role": "non-directional",
"Affected_ID": gene_ids.get(aff, "-"),
"Affected": aff,
"Affected_Role": "non-directional",
"Direction": "non-directional"
})

# Convert to DataFrame
processed_df = pd.DataFrame(table_rows)

return processed_df


def _processor_from_data(gen_int_iter, mol_int_iter, all_wb_int_iter,
                         entrez_mappings_df, all_genes_df):
    """Create a WormBaseProcessor from the interaction data and ID mappings.

    Parameters
    ----------
    gen_int_iter : list
        Iterable of rows in the Alliance Genome genetic interactions
        data file.
    mol_int_iter : list
        Iterable of rows in the Alliance Genome molecular interactions
        data file.
    all_wb_int_iter : list
        Iterable of rows in the WormBase interactions data file.
    entrez_mappings_df : pd.DataFrame
        DataFrame containing associated WormBase and Entrez IDs.
    all_genes_df : pd.DataFrame
        DataFrame of all WormBase genes and their WormBase IDs,
        symbols, and synonyms.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.
    """
    def _to_rows(row_cls, rows, ncols):
        # '-' is the raw files' missing-value placeholder; map it to None
        # and truncate each row to the expected number of columns.
        return [row_cls(*[None if item == '-' else item
                          for item in row][:ncols])
                for row in rows]

    alliance_data = _to_rows(_AllianceGenomeRow, gen_int_iter + mol_int_iter,
                             len(alliance_int_columns))
    wormbase_data = _to_rows(_WormBaseRow, all_wb_int_iter,
                             len(wormbase_int_columns))

    return WormBaseProcessor(alliance_data, wormbase_data,
                             entrez_mappings_df, all_genes_df)

Loading