diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 3c3590a9..0edfb5fa 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -99,11 +99,18 @@ CREATE TABLE path_edges ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, path_id INTEGER NOT NULL, index_in_path INTEGER NOT NULL, - edge_id INTEGER NOT NULL, - FOREIGN KEY(edge_id) REFERENCES edges(id), + block_group_edge_id INTEGER NOT NULL, + FOREIGN KEY(block_group_edge_id) REFERENCES block_group_edges(id), FOREIGN KEY(path_id) REFERENCES paths(id) ) STRICT; -CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, block_group_edge_id, index_in_path); + +CREATE TABLE phase_layers ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + chromosome_index INTEGER NOT NULL, + is_reference INTEGER NOT NULL DEFAULT 0 +) STRICT; +CREATE UNIQUE INDEX phase_layer_uidx ON phase_layers(chromosome_index, is_reference) WHERE is_reference = 1; CREATE TABLE block_group_edges ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, @@ -111,6 +118,8 @@ CREATE TABLE block_group_edges ( edge_id INTEGER NOT NULL, chromosome_index INTEGER, phased INTEGER NOT NULL, + source_phase_layer_id INTEGER DEFAULT 0, + target_phase_layer_id INTEGER DEFAULT 0, FOREIGN KEY(block_group_id) REFERENCES block_groups(id), FOREIGN KEY(edge_id) REFERENCES edges(id) ) STRICT; diff --git a/src/diffs/gfa.rs b/src/diffs/gfa.rs index 475c66c3..8e3e3a54 100644 --- a/src/diffs/gfa.rs +++ b/src/diffs/gfa.rs @@ -304,6 +304,11 @@ mod tests { Strand::Forward, ); + println!("here1"); + println!("block group id: {}", block_group.id); + println!("edge1 id: {}", edge1.id); + println!("edge2 id: {}", edge2.id); + println!("edge3 id: {}", edge3.id); let edge_ids = [edge1.id, edge2.id, edge3.id]; let block_group_edges = edge_ids .iter() @@ -312,11 +317,18 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 
0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path1 = Path::create(&conn, "parent", block_group.id, &edge_ids); + println!("here2"); + for bgei in block_group_edge_ids.iter() { + println!("block group edge id: {}", bgei); + } + println!("block group id: {}", block_group.id); + let _path1 = Path::create(&conn, "parent", block_group.id, &block_group_edge_ids); // Set up child let _child_sample = Sample::get_or_create_child(&conn, collection_name, "child", None); @@ -354,11 +366,20 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &child_block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &child_block_group_edges); let original_child_path = BlockGroup::get_current_path(&conn, child_block_group.id); - let _child_path = original_child_path.new_path_with(&conn, 2, 6, &edge4, &edge5); + let _child_path = original_child_path.new_path_with( + &conn, + 2, + 6, + block_group_edge_ids[0], + block_group_edge_ids[1], + node3_id, + ); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("parent-child-diff.gfa"); @@ -419,13 +440,22 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &grandchild_block_group_edges); + let block_group_edge_ids = + BlockGroupEdge::bulk_create(&conn, &grandchild_block_group_edges); let original_grandchild_path = BlockGroup::get_current_path(&conn, grandchild_block_group.id); - let _grandchild_path = - original_grandchild_path.new_path_with(&conn, 10, 14, &edge6, &edge7); + let _grandchild_path = original_grandchild_path.new_path_with( + &conn, + 10, + 14, + block_group_edge_ids[0], + block_group_edge_ids[1], + node4_id, + ); 
let gfa_path = temp_dir.path().join("parent-grandchild-diff.gfa"); gfa_sample_diff(&conn, collection_name, &gfa_path, None, Some("grandchild")); @@ -539,11 +569,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path1 = Path::create(&conn, "test path", block_group.id, &edge_ids); + let _path1 = Path::create(&conn, "test path", block_group.id, &block_group_edge_ids); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("diff-against-nothing.gfa"); @@ -626,11 +658,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path1 = Path::create(&conn, "test path", block_group.id, &edge_ids); + let _path1 = Path::create(&conn, "test path", block_group.id, &block_group_edge_ids); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("self-diff.gfa"); @@ -715,11 +749,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path1 = Path::create(&conn, "parent", block_group.id, &edge_ids); + let _path1 = Path::create(&conn, "parent", block_group.id, &block_group_edge_ids); let _sample2 = Sample::get_or_create(&conn, "sample2"); let block_group2 = BlockGroup::create( @@ -775,11 +811,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + 
let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path2 = Path::create(&conn, "parent", block_group2.id, &edge_ids); + let _path2 = Path::create(&conn, "parent", block_group2.id, &block_group_edge_ids); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("unrelated-diff.gfa"); @@ -865,11 +903,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path1 = Path::create(&conn, "parent", block_group.id, &edge_ids); + let _path1 = Path::create(&conn, "parent", block_group.id, &block_group_edge_ids); let _sample2 = Sample::get_or_create(&conn, "sample2"); let block_group2 = @@ -921,11 +961,13 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - let _path2 = Path::create(&conn, "parent", block_group2.id, &edge_ids); + let _path2 = Path::create(&conn, "parent", block_group2.id, &block_group_edge_ids); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("unrelated-diff.gfa"); @@ -996,11 +1038,15 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); + let bge1_id = block_group_edge_ids[0]; + let bge2_id = block_group_edge_ids[1]; - let _path1 = Path::create(&conn, "parent", block_group.id, &[edge1.id, edge2.id]); + let _path1 = Path::create(&conn, "parent", block_group.id, &[bge1_id, bge2_id]); // Set up child let _child_sample = 
Sample::get_or_create_child(&conn, collection_name, "child", None); @@ -1038,11 +1084,20 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &child_block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &child_block_group_edges); let original_child_path = BlockGroup::get_current_path(&conn, child_block_group.id); - let _child_path = original_child_path.new_path_with(&conn, 2, 6, &edge3, &edge4); + let _child_path = original_child_path.new_path_with( + &conn, + 2, + 6, + block_group_edge_ids[0], + block_group_edge_ids[1], + node2_id, + ); let temp_dir = tempdir().unwrap(); let gfa_path = temp_dir.path().join("parent-child-diff.gfa"); @@ -1103,12 +1158,22 @@ mod tests { edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(&conn, &grandchild_block_group_edges); + let block_group_edge_ids = + BlockGroupEdge::bulk_create(&conn, &grandchild_block_group_edges); let original_grandchild_path = BlockGroup::get_current_path(&conn, grandchild_block_group.id); - let _grandchild_path = original_grandchild_path.new_path_with(&conn, 4, 10, &edge5, &edge6); + let _grandchild_path = original_grandchild_path.new_path_with( + &conn, + 4, + 10, + block_group_edge_ids[0], + block_group_edge_ids[1], + node3_id, + ); let gfa_path = temp_dir.path().join("parent-grandchild-diff.gfa"); gfa_sample_diff(&conn, collection_name, &gfa_path, None, Some("grandchild")); diff --git a/src/exports/genbank.rs b/src/exports/genbank.rs index 232a3cfe..55bdf910 100644 --- a/src/exports/genbank.rs +++ b/src/exports/genbank.rs @@ -678,6 +678,7 @@ mod tests { path_start: 0, path_end: 30, strand: Forward, + phase_layer_id: 0, }, PathBlock { id: 0, @@ -688,6 +689,7 @@ mod tests { path_start: 30, path_end: 60, strand: Forward, + phase_layer_id: 0, }, ]; assert_eq!( @@ -802,6 
+804,7 @@ mod tests { path_start: 0, path_end: 8302, strand: Forward, + phase_layer_id: 0, }]; assert_eq!( get_path_nodes(&graph, &path_blocks), diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index db7b0b1b..ccb1fd00 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -330,46 +330,51 @@ mod tests { Strand::Forward, ); - let new_block_group_edges = vec![ + let block_group_edges = vec![ BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge1.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge2.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge3.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge4.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge5.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ]; - BlockGroupEdge::bulk_create(&conn, &new_block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(&conn, &block_group_edges); - Path::create( - &conn, - "1234", - block_group.id, - &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], - ); + Path::create(&conn, "1234", block_group.id, &block_group_edge_ids); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id, false); @@ -495,6 +500,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, diff --git a/src/graph.rs b/src/graph.rs index 7233a9d2..41ea78aa 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -295,6 +295,7 @@ pub fn flatten_to_interval_tree( sequence_start: source_node.sequence_start, 
sequence_end: source_node.sequence_end, strand: edge.source_strand, + phase_layer_id: 0, }); spans.insert(NodeIntervalBlock { block_id: target_node.block_id, @@ -304,6 +305,7 @@ pub fn flatten_to_interval_tree( sequence_start: target_node.sequence_start, sequence_end: target_node.sequence_end, strand: edge.target_strand, + phase_layer_id: 0, }); if remove_ambiguous_positions { for (node_id, node_range) in [ diff --git a/src/graph_operators.rs b/src/graph_operators.rs index 74a8c92b..d150fe04 100644 --- a/src/graph_operators.rs +++ b/src/graph_operators.rs @@ -12,7 +12,7 @@ use crate::models::{ use crate::operation_management; use core::ops::Range; use rusqlite::Connection; -use std::collections::HashSet; +use std::collections::HashMap; use std::io; #[allow(clippy::too_many_arguments)] @@ -80,7 +80,7 @@ pub fn derive_subgraph( new_block_group_id, ); - let current_edges = PathEdge::edges_for_path(conn, current_path.id); + let current_edges = PathEdge::block_group_edges_for_path(conn, current_path.id); let child_block_group_edges = BlockGroupEdge::edges_for_block_group(conn, child_block_group_id); let new_start_edge = child_block_group_edges .iter() @@ -90,24 +90,25 @@ pub fn derive_subgraph( .iter() .find(|x| x.edge.target_node_id == PATH_END_NODE_ID) .unwrap(); - let new_edge_id_set = child_block_group_edges + let child_edges_by_id = child_block_group_edges .iter() - .map(|x| x.edge.id) - .collect::>(); + .map(|x| (x.edge.id, x)) + .collect::>(); - let mut new_path_edge_ids = vec![]; - new_path_edge_ids.push(new_start_edge.edge.id); + let mut new_path_block_group_edge_ids = vec![]; + new_path_block_group_edge_ids.push(new_start_edge.block_group_edge_id); for current_edge in current_edges { - if new_edge_id_set.contains(&current_edge.id) { - new_path_edge_ids.push(current_edge.id); + let child_edge = child_edges_by_id.get(&current_edge.edge_id); + if let Some(child_edge) = child_edge { + new_path_block_group_edge_ids.push(child_edge.block_group_edge_id); } } - 
new_path_edge_ids.push(new_end_edge.edge.id); + new_path_block_group_edge_ids.push(new_end_edge.block_group_edge_id); Path::create( conn, &current_path.name, child_block_group_id, - &new_path_edge_ids, + &new_path_block_group_edge_ids, ); let summary_str = format!(" {}: 1 new derived block group", new_sample_name); @@ -142,6 +143,7 @@ mod tests { use crate::test_helpers::{ get_connection, get_operation_connection, setup_block_group, setup_gen_dir, }; + use std::collections::HashSet; #[test] fn test_derive_subgraph_one_insertion() { @@ -195,12 +197,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert_path = - original_path.new_path_with(conn, 16, 24, &edge_into_insert, &edge_out_of_insert); + let insert_path = original_path.new_path_with( + conn, + 16, + 24, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert_node_id, + ); assert_eq!( insert_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCCCCCGGGGGGGGGG" diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index dd52b78a..30206ee9 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -106,23 +106,27 @@ pub fn import_fasta<'a>( Strand::Forward, ); - let new_block_group_edges = vec![ + let block_group_edges = vec![ BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge_into.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge_out_of.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ]; - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); - let path = Path::create(conn, &name, block_group.id, &[edge_into.id, edge_out_of.id]); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, 
&block_group_edges); + let path = Path::create(conn, &name, block_group.id, &block_group_edge_ids); summary.entry(path.name).or_insert(sequence_length); bar.inc(1); } diff --git a/src/imports/genbank.rs b/src/imports/genbank.rs index bcd7db61..e5155000 100644 --- a/src/imports/genbank.rs +++ b/src/imports/genbank.rs @@ -84,7 +84,7 @@ where 0, Strand::Forward, ); - BlockGroupEdge::bulk_create( + let block_group_edge_ids = BlockGroupEdge::bulk_create( conn, &[ BlockGroupEdgeData { @@ -92,21 +92,20 @@ where edge_id: edge_into.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge_out_of.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ], ); - let path = Path::create( - conn, - &locus.name, - block_group.id, - &[edge_into.id, edge_out_of.id], - ); + let path = Path::create(conn, &locus.name, block_group.id, &block_group_edge_ids); for edit in locus.changes_to_wt() { let start = edit.start; @@ -145,6 +144,7 @@ where path_start: start, path_end: end + change_seq.length, strand: Strand::Forward, + phase_layer_id: 0, }, chromosome_index: 0, phased: 0, @@ -165,6 +165,7 @@ where path_start: start, path_end: end, strand: Strand::Forward, + phase_layer_id: 0, }, chromosome_index: 0, phased: 0, diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 0c791a7b..17cf04ce 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -172,17 +172,24 @@ pub fn import_gfa<'a>( let bar = progress_bar.add(get_time_elapsed_bar()); bar.set_message("Creating Gen Objects"); let edge_ids = Edge::bulk_create(conn, &edges.into_iter().collect::>()); - let new_block_group_edges = edge_ids + let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { block_group_id: block_group.id, edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - 
BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids_by_edge_id = block_group_edge_ids + .iter() + .enumerate() + .map(|(index, block_group_edge_id)| (edge_ids[index], block_group_edge_id)) + .collect::>(); let saved_edges = Edge::bulk_load(conn, &edge_ids); let mut edge_ids_by_data = HashMap::new(); @@ -229,7 +236,11 @@ pub fn import_gfa<'a>( ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - Path::create(conn, path_name, block_group.id, &path_edge_ids); + let path_block_group_edge_ids = path_edge_ids + .iter() + .map(|edge_id| **block_group_edge_ids_by_edge_id.get(edge_id).unwrap()) + .collect::>(); + Path::create(conn, path_name, block_group.id, &path_block_group_edge_ids); } for input_walk in &gfa.walk { @@ -264,7 +275,11 @@ pub fn import_gfa<'a>( ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - Path::create(conn, path_name, block_group.id, &path_edge_ids); + let path_block_group_edge_ids = path_edge_ids + .iter() + .map(|edge_id| **block_group_edge_ids_by_edge_id.get(edge_id).unwrap()) + .collect::>(); + Path::create(conn, path_name, block_group.id, &path_block_group_edge_ids); } bar.finish(); } diff --git a/src/imports/library.rs b/src/imports/library.rs index 21610a38..bc4576f7 100644 --- a/src/imports/library.rs +++ b/src/imports/library.rs @@ -156,6 +156,8 @@ pub fn import_library<'a>( edge_id: *edge_id, chromosome_index: *edge_id, // TODO: This is a hack, clean it up with phase layers phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); BlockGroupEdge::bulk_create(conn, &new_block_group_edges); @@ -167,10 +169,15 @@ pub fn import_library<'a>( } path_node_ids.push(PATH_END_NODE_ID); - let new_edges = Edge::bulk_load(conn, &new_edge_ids); + let new_edges = BlockGroupEdge::edges_for_block_group(conn, new_block_group.id); let 
new_edge_ids_by_source_and_target_node = new_edges .iter() - .map(|edge| ((edge.source_node_id, edge.target_node_id), edge.id)) + .map(|edge| { + ( + (edge.edge.source_node_id, edge.edge.target_node_id), + edge.block_group_edge_id, + ) + }) .collect::>(); let path_edge_ids = path_node_ids .iter() diff --git a/src/models.rs b/src/models.rs index 84fc4d62..a7a4b859 100644 --- a/src/models.rs +++ b/src/models.rs @@ -9,6 +9,7 @@ pub mod node; pub mod operations; pub mod path; pub mod path_edge; +pub mod phase_layer; pub mod sample; pub mod sequence; pub mod strand; diff --git a/src/models/accession.rs b/src/models/accession.rs index b9f1a381..51ec0970 100644 --- a/src/models/accession.rs +++ b/src/models/accession.rs @@ -1,4 +1,4 @@ -use crate::models::block_group_edge::AugmentedEdgeData; +use crate::models::block_group_edge::NewAugmentedEdgeData; use crate::models::strand::Strand; use crate::models::traits::*; use rusqlite::types::Value; @@ -60,8 +60,8 @@ impl From<&AccessionEdge> for AccessionEdgeData { } } -impl From<&AugmentedEdgeData> for AccessionEdgeData { - fn from(item: &AugmentedEdgeData) -> Self { +impl From<&NewAugmentedEdgeData> for AccessionEdgeData { + fn from(item: &NewAugmentedEdgeData) -> Self { AccessionEdgeData { source_node_id: item.edge_data.source_node_id, source_coordinate: item.edge_data.source_coordinate, diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 0c1f3dc9..475fa349 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -12,11 +12,12 @@ use crate::graph::{ GraphEdge, GraphNode, }; use crate::models::accession::{Accession, AccessionEdge, AccessionEdgeData, AccessionPath}; -use crate::models::block_group_edge::{AugmentedEdgeData, BlockGroupEdge, BlockGroupEdgeData}; +use crate::models::block_group_edge::{BlockGroupEdge, BlockGroupEdgeData, NewAugmentedEdgeData}; use crate::models::edge::{Edge, EdgeData, GroupBlock}; use crate::models::node::{PATH_END_NODE_ID, PATH_START_NODE_ID}; use 
crate::models::path::{Path, PathBlock, PathData}; use crate::models::path_edge::PathEdge; +use crate::models::phase_layer::UNPHASED_CHROMOSOME_INDEX; use crate::models::strand::Strand; use crate::models::traits::*; @@ -102,6 +103,7 @@ pub struct NodeIntervalBlock { pub sequence_start: i64, pub sequence_end: i64, pub strand: Strand, + pub phase_layer_id: i64, } impl BlockGroup { pub fn create( @@ -188,7 +190,7 @@ impl BlockGroup { .iter() .map(|edge| edge.edge.id) .collect::>(); - let new_block_group_edges = edge_ids + let block_group_edges = edge_ids .iter() .enumerate() .map(|(i, edge_id)| BlockGroupEdgeData { @@ -196,18 +198,34 @@ impl BlockGroup { edge_id: *edge_id, chromosome_index: augmented_edges[i].chromosome_index, phased: augmented_edges[i].phased, + source_phase_layer_id: augmented_edges[i].source_phase_layer_id, + target_phase_layer_id: augmented_edges[i].target_phase_layer_id, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + let new_block_group_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let new_block_group_ids_by_old = augmented_edges + .iter() + .zip(new_block_group_ids.iter()) + .map(|(augmented_edge, new_id)| (augmented_edge.block_group_edge_id, *new_id)) + .collect::>(); let mut path_map = HashMap::new(); for path in existing_paths.iter() { - let edge_ids = PathEdge::edges_for_path(conn, path.id) + let old_block_group_edge_ids = PathEdge::block_group_edges_for_path(conn, path.id) .into_iter() - .map(|edge| edge.id) + .map(|block_group_edge| block_group_edge.id) + .collect::>(); + let block_group_edge_ids = old_block_group_edge_ids + .iter() + .map(|old_id| new_block_group_ids_by_old[old_id]) .collect::>(); - let new_path = Path::create(conn, &path.name, target_block_group_id, &edge_ids); + let new_path = Path::create( + conn, + &path.name, + target_block_group_id, + &block_group_edge_ids, + ); path_map.insert(path.id, new_path.id); } @@ -535,7 +553,8 @@ impl BlockGroup { cache: &mut PathCache, 
modify_blockgroup: bool, ) { - let mut new_augmented_edges_by_block_group = HashMap::>::new(); + let mut new_augmented_edges_by_block_group = + HashMap::>::new(); let mut new_accession_edges = HashMap::new(); let mut tree_map = HashMap::new(); for change in changes { @@ -554,7 +573,7 @@ impl BlockGroup { if let Some(accession) = &change.path_accession { new_accession_edges .entry((&change.path, accession)) - .and_modify(|new_edge_data: &mut Vec| { + .and_modify(|new_edge_data: &mut Vec| { new_edge_data.extend(new_augmented_edges.clone()) }) .or_insert_with(|| new_augmented_edges.clone()); @@ -562,7 +581,6 @@ impl BlockGroup { } let mut edge_data_map = HashMap::new(); - for (block_group_id, new_augmented_edges) in new_augmented_edges_by_block_group { let new_edges = new_augmented_edges .iter() @@ -572,7 +590,7 @@ impl BlockGroup { for (i, edge_data) in new_edges.iter().enumerate() { edge_data_map.insert(edge_data.clone(), edge_ids[i]); } - let new_block_group_edges = edge_ids + let block_group_edges = edge_ids .iter() .enumerate() .map(|(i, edge_id)| BlockGroupEdgeData { @@ -580,9 +598,11 @@ impl BlockGroup { edge_id: *edge_id, chromosome_index: new_augmented_edges[i].chromosome_index, phased: new_augmented_edges[i].phased, + source_phase_layer_id: new_augmented_edges[i].source_phase_layer_id, + target_phase_layer_id: new_augmented_edges[i].target_phase_layer_id, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + BlockGroupEdge::bulk_create(conn, &block_group_edges); } for ((path, accession_name), path_edges) in new_accession_edges { @@ -616,14 +636,14 @@ impl BlockGroup { conn: &Connection, change: &PathChange, tree: &IntervalTree, - ) { + ) -> Vec { let new_augmented_edges = BlockGroup::set_up_new_edges(change, tree); let new_edges = new_augmented_edges .iter() .map(|augmented_edge| augmented_edge.edge_data.clone()) .collect::>(); let edge_ids = Edge::bulk_create(conn, &new_edges); - let new_block_group_edges = edge_ids + let 
block_group_edges = edge_ids .iter() .enumerate() .map(|(i, edge_id)| BlockGroupEdgeData { @@ -631,15 +651,17 @@ impl BlockGroup { edge_id: *edge_id, chromosome_index: new_augmented_edges[i].chromosome_index, phased: new_augmented_edges[i].phased, + source_phase_layer_id: new_augmented_edges[i].source_phase_layer_id, + target_phase_layer_id: new_augmented_edges[i].target_phase_layer_id, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + BlockGroupEdge::bulk_create(conn, &block_group_edges) } fn set_up_new_edges( change: &PathChange, tree: &IntervalTree, - ) -> Vec { + ) -> Vec { let start_blocks: Vec<&NodeIntervalBlock> = tree.query_point(change.start).map(|x| &x.value).collect(); assert_eq!(start_blocks.len(), 1); @@ -675,10 +697,12 @@ impl BlockGroup { target_coordinate: change.end - end_block.start + end_block.sequence_start, target_strand: Strand::Forward, }; - let new_augmented_edge = AugmentedEdgeData { + let new_augmented_edge = NewAugmentedEdgeData { edge_data: new_edge, chromosome_index: change.chromosome_index, phased: change.phased, + source_phase_layer_id: start_block.phase_layer_id, + target_phase_layer_id: end_block.phase_layer_id, }; new_edges.push(new_augmented_edge); @@ -694,10 +718,12 @@ impl BlockGroup { target_coordinate: change.end - end_block.start + end_block.sequence_start, target_strand: Strand::Forward, }; - let new_augmented_edge = AugmentedEdgeData { + let new_augmented_edge = NewAugmentedEdgeData { edge_data: new_beginning_edge, chromosome_index: change.chromosome_index, phased: change.phased, + source_phase_layer_id: UNPHASED_CHROMOSOME_INDEX, + target_phase_layer_id: end_block.phase_layer_id, }; new_edges.push(new_augmented_edge); } @@ -714,10 +740,12 @@ impl BlockGroup { target_coordinate: change.block.sequence_start, target_strand: Strand::Forward, }; - let new_augmented_start_edge = AugmentedEdgeData { + let new_augmented_start_edge = NewAugmentedEdgeData { edge_data: new_start_edge, chromosome_index: 
change.chromosome_index, phased: change.phased, + source_phase_layer_id: start_block.phase_layer_id, + target_phase_layer_id: change.block.phase_layer_id, }; let new_end_edge = EdgeData { source_node_id: change.block.node_id, @@ -727,10 +755,12 @@ impl BlockGroup { target_coordinate: change.end - end_block.start + end_block.sequence_start, target_strand: Strand::Forward, }; - let new_augmented_end_edge = AugmentedEdgeData { + let new_augmented_end_edge = NewAugmentedEdgeData { edge_data: new_end_edge, chromosome_index: change.chromosome_index, phased: change.phased, + source_phase_layer_id: change.block.phase_layer_id, + target_phase_layer_id: end_block.phase_layer_id, }; new_edges.push(new_augmented_start_edge); new_edges.push(new_augmented_end_edge); @@ -823,6 +853,8 @@ impl BlockGroup { edge_id: edge.edge.id, chromosome_index: block_group_edge.chromosome_index, phased: block_group_edge.phased, + source_phase_layer_id: block_group_edge.source_phase_layer_id, + target_phase_layer_id: block_group_edge.target_phase_layer_id, } }) .collect::>(); @@ -841,6 +873,8 @@ impl BlockGroup { edge_id: new_start_edge.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: UNPHASED_CHROMOSOME_INDEX, + target_phase_layer_id: start_block.phase_layer_id, }; let new_end_edge = Edge::create( conn, @@ -856,6 +890,8 @@ impl BlockGroup { edge_id: new_end_edge.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: end_block.phase_layer_id, + target_phase_layer_id: UNPHASED_CHROMOSOME_INDEX, }; let mut all_edges = subgraph_edge_inputs.clone(); all_edges.push(new_start_edge_data); @@ -983,6 +1019,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1020,6 +1057,7 @@ mod tests { path_start: 19, path_end: 31, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { @@ -1065,6 +1103,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; 
let change = PathChange { block_group_id, @@ -1107,6 +1146,7 @@ mod tests { path_start: 15, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1149,6 +1189,7 @@ mod tests { path_start: 12, path_end: 17, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1191,6 +1232,7 @@ mod tests { path_start: 10, path_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1233,6 +1275,7 @@ mod tests { path_start: 9, path_end: 9, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1275,6 +1318,7 @@ mod tests { path_start: 10, path_end: 20, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1317,6 +1361,7 @@ mod tests { path_start: 15, path_end: 25, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1359,6 +1404,7 @@ mod tests { path_start: 5, path_end: 35, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1401,6 +1447,7 @@ mod tests { path_start: 19, path_end: 31, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { @@ -1445,6 +1492,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1499,6 +1547,7 @@ mod tests { path_start: 0, path_end: 0, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1542,6 +1591,7 @@ mod tests { path_start: 40, path_end: 40, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1584,6 +1634,7 @@ mod tests { path_start: 10, path_end: 11, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1626,6 +1677,7 @@ mod tests { path_start: 19, path_end: 20, strand: Strand::Forward, + phase_layer_id: 0, }; let 
change = PathChange { block_group_id, @@ -1668,6 +1720,7 @@ mod tests { path_start: 0, path_end: 1, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1710,6 +1763,7 @@ mod tests { path_start: 35, path_end: 40, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1752,6 +1806,7 @@ mod tests { path_start: 10, path_end: 12, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1794,6 +1849,7 @@ mod tests { path_start: 18, path_end: 20, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, @@ -1845,6 +1901,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: new_bg_id, @@ -1872,6 +1929,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }], ); interval_tree_verify( @@ -1885,6 +1943,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }], ); interval_tree_verify( @@ -1898,6 +1957,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }], ); interval_tree_verify( @@ -1911,6 +1971,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }], ); @@ -1928,6 +1989,7 @@ mod tests { sequence_start: 0, sequence_end: 7, strand: Strand::Forward, + phase_layer_id: 0, }], ); interval_tree_verify( @@ -1941,6 +2003,7 @@ mod tests { sequence_start: 0, sequence_end: 7, strand: Strand::Forward, + phase_layer_id: 0, }], ); interval_tree_verify( @@ -1955,6 +2018,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 0, @@ -1964,6 +2028,7 @@ mod tests { sequence_start: 0, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }, ], ); @@ -1982,6 +2047,7 @@ mod tests { 
sequence_start: 0, sequence_end: 4, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 6, @@ -1991,6 +2057,7 @@ mod tests { sequence_start: 7, sequence_end: 10, strand: Strand::Forward, + phase_layer_id: 0, }, ], ); @@ -2023,6 +2090,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: new_bg_id, @@ -2068,6 +2136,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: gc_bg_id, @@ -2118,6 +2187,7 @@ mod tests { path_start: 7, path_end: 11, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: new_bg_id, @@ -2173,6 +2243,7 @@ mod tests { path_start: 20, path_end: 24, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: gc_bg_id, @@ -2230,6 +2301,7 @@ mod tests { path_start: 7, path_end: 12, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: new_bg_id, @@ -2285,6 +2357,7 @@ mod tests { path_start: 20, path_end: 24, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id: gc_bg_id, @@ -2354,12 +2427,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert_path = - original_path.new_path_with(conn, 16, 24, &edge_into_insert, &edge_out_of_insert); + let insert_path = original_path.new_path_with( + conn, + 16, + 24, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert_node_id, + ); assert_eq!( insert_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCCCCCGGGGGGGGGG" @@ -2448,12 +2529,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + 
target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert_path = - original_path.new_path_with(conn, 16, 24, &edge_into_insert, &edge_out_of_insert); + let insert_path = original_path.new_path_with( + conn, + 16, + 24, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert_node_id, + ); assert_eq!( insert_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCCCCCGGGGGGGGGG" @@ -2494,12 +2583,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert2_path = - insert_path.new_path_with(conn, 28, 32, &edge_into_insert2, &edge_out_of_insert2); + let insert2_path = insert_path.new_path_with( + conn, + 28, + 32, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert2_node_id, + ); assert_eq!( insert2_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCTTTTTTTTGGGGGG" @@ -2598,12 +2695,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert_path = - original_path.new_path_with(conn, 16, 24, &edge_into_insert, &edge_out_of_insert); + let insert_path = original_path.new_path_with( + conn, + 16, + 24, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert_node_id, + ); assert_eq!( insert_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCCCCCGGGGGGGGGG" @@ -2644,12 +2749,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, 
&block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let insert2_path = - insert_path.new_path_with(conn, 28, 32, &edge_into_insert2, &edge_out_of_insert2); + let insert2_path = insert_path.new_path_with( + conn, + 28, + 32, + block_group_edge_ids[0], + block_group_edge_ids[1], + insert2_node_id, + ); assert_eq!( insert2_path.sequence(conn), "AAAAAAAAAATTTTTTAAAAAAAACCTTTTTTTTGGGGGG" @@ -2670,6 +2783,8 @@ mod tests { edge_id: deletion_edge.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }; BlockGroupEdge::bulk_create(conn, &[block_group_edge]); diff --git a/src/models/block_group_edge.rs b/src/models/block_group_edge.rs index e9cece75..99c9d82a 100644 --- a/src/models/block_group_edge.rs +++ b/src/models/block_group_edge.rs @@ -2,17 +2,21 @@ use crate::models::edge::{Edge, EdgeData}; use crate::models::traits::*; use rusqlite; use rusqlite::types::Value; -use rusqlite::{Connection, Row}; -use std::collections::HashMap; +use rusqlite::{Connection, Result as SQLResult, Row}; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::hash::RandomState; use std::rc::Rc; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] pub struct BlockGroupEdge { pub id: i64, pub block_group_id: i64, pub edge_id: i64, pub chromosome_index: i64, pub phased: i64, + pub source_phase_layer_id: i64, + pub target_phase_layer_id: i64, } #[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -21,20 +25,40 @@ pub struct BlockGroupEdgeData { pub edge_id: i64, pub chromosome_index: i64, pub phased: i64, + pub source_phase_layer_id: i64, + pub target_phase_layer_id: i64, +} + +impl From<&BlockGroupEdge> for BlockGroupEdgeData { + fn from(item: &BlockGroupEdge) -> Self { + BlockGroupEdgeData { + block_group_id: item.block_group_id, + edge_id: item.edge_id, + chromosome_index: item.chromosome_index, + phased: item.phased, + 
source_phase_layer_id: item.source_phase_layer_id, + target_phase_layer_id: item.target_phase_layer_id, + } + } } #[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct AugmentedEdge { +pub struct NewAugmentedEdge { pub edge: Edge, + pub block_group_edge_id: i64, pub chromosome_index: i64, pub phased: i64, + pub source_phase_layer_id: i64, + pub target_phase_layer_id: i64, } #[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct AugmentedEdgeData { +pub struct NewAugmentedEdgeData { pub edge_data: EdgeData, pub chromosome_index: i64, pub phased: i64, + pub source_phase_layer_id: i64, + pub target_phase_layer_id: i64, } impl Query for BlockGroupEdge { @@ -46,41 +70,124 @@ impl Query for BlockGroupEdge { edge_id: row.get(2).unwrap(), chromosome_index: row.get(3).unwrap(), phased: row.get(4).unwrap(), + source_phase_layer_id: row.get(5).unwrap(), + target_phase_layer_id: row.get(6).unwrap(), } } } impl BlockGroupEdge { - pub fn bulk_create(conn: &Connection, block_group_edges: &[BlockGroupEdgeData]) { - for chunk in block_group_edges.chunks(100000) { - let mut rows_to_insert = vec![]; - for block_group_edge in chunk { - let row = format!( - "({0}, {1}, {2}, {3})", - block_group_edge.block_group_id, - block_group_edge.edge_id, - block_group_edge.chromosome_index, - block_group_edge.phased, - ); - rows_to_insert.push(row); - } + pub fn bulk_create(conn: &Connection, block_group_edges: &[BlockGroupEdgeData]) -> Vec { + let mut block_group_edge_rows = vec![]; + let mut block_group_edge_map: HashMap = HashMap::new(); + for block_group_edge in block_group_edges { + let block_group_edge_row = format!( + "({0}, {1}, {2}, {3}, {4}, {5})", + block_group_edge.block_group_id, + block_group_edge.edge_id, + block_group_edge.chromosome_index, + block_group_edge.phased, + block_group_edge.source_phase_layer_id, + block_group_edge.target_phase_layer_id, + ); + block_group_edge_rows.push(block_group_edge_row); + } + let formatted_block_group_edge_rows = 
block_group_edge_rows.join(", "); - let formatted_rows_to_insert = rows_to_insert.join(", "); + let select_statement = format!("SELECT * FROM block_group_edges WHERE (block_group_id, edge_id, chromosome_index, phased, source_phase_layer_id, target_phase_layer_id) in ({0});", formatted_block_group_edge_rows); + let existing_block_group_edges = + BlockGroupEdge::query(conn, &select_statement, rusqlite::params!()); + for block_group_edge in existing_block_group_edges.iter() { + block_group_edge_map.insert( + BlockGroupEdgeData::from(block_group_edge), + block_group_edge.id, + ); + } + + let existing_block_group_edge_set = HashSet::::from_iter( + existing_block_group_edges + .into_iter() + .map(BlockGroupEdge::to_data), + ); + let mut block_group_edges_to_insert = HashSet::new(); + for block_group_edge in block_group_edges { + if !existing_block_group_edge_set.contains(block_group_edge) { + block_group_edges_to_insert.insert(block_group_edge); + } + } - let insert_statement = format!( - "INSERT OR IGNORE INTO block_group_edges (block_group_id, edge_id, chromosome_index, phased) VALUES {0};", - formatted_rows_to_insert + let mut block_group_edge_rows_to_insert = vec![]; + for block_group_edge in block_group_edges_to_insert { + let block_group_edge_row = format!( + "({0}, {1}, {2}, {3}, {4}, {5})", + block_group_edge.block_group_id, + block_group_edge.edge_id, + block_group_edge.chromosome_index, + block_group_edge.phased, + block_group_edge.source_phase_layer_id, + block_group_edge.target_phase_layer_id, ); - let _ = conn.execute(&insert_statement, ()); + block_group_edge_rows_to_insert.push(block_group_edge_row); + } + + if !block_group_edge_rows_to_insert.is_empty() { + for chunk in block_group_edge_rows_to_insert.chunks(100000) { + let formatted_block_group_edge_rows_to_insert = chunk.join(", "); + + let insert_statement = format!("INSERT INTO block_group_edges (block_group_id, edge_id, chromosome_index, phased, source_phase_layer_id, target_phase_layer_id) VALUES 
{0} RETURNING *;", formatted_block_group_edge_rows_to_insert); + let mut stmt = conn.prepare(&insert_statement).unwrap(); + let rows = stmt + .query_map([], BlockGroupEdge::block_group_edge_from_row) + .unwrap(); + for row in rows { + let block_group_edge = row.unwrap(); + block_group_edge_map.insert( + BlockGroupEdgeData::from(&block_group_edge), + block_group_edge.id, + ); + } + } + } + block_group_edges + .iter() + .map(|block_group_edge| *block_group_edge_map.get(block_group_edge).unwrap()) + .collect::>() + } + + fn block_group_edge_from_row(row: &Row) -> SQLResult { + Ok(BlockGroupEdge { + id: row.get(0)?, + block_group_id: row.get(1)?, + edge_id: row.get(2)?, + chromosome_index: row.get(3)?, + phased: row.get(4)?, + source_phase_layer_id: row.get(5)?, + target_phase_layer_id: row.get(6)?, + }) + } + + pub fn to_data(block_group_edge: BlockGroupEdge) -> BlockGroupEdgeData { + BlockGroupEdgeData { + block_group_id: block_group_edge.block_group_id, + edge_id: block_group_edge.edge_id, + chromosome_index: block_group_edge.chromosome_index, + phased: block_group_edge.phased, + source_phase_layer_id: block_group_edge.source_phase_layer_id, + target_phase_layer_id: block_group_edge.target_phase_layer_id, } } - pub fn edges_for_block_group(conn: &Connection, block_group_id: i64) -> Vec { + pub fn edges_for_block_group(conn: &Connection, block_group_id: i64) -> Vec { let block_group_edges = BlockGroupEdge::query( conn, "select * from block_group_edges where block_group_id = ?1;", rusqlite::params!(Value::from(block_group_id)), ); + let block_group_edge_ids = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| block_group_edge.id) + .collect::>(); let edge_ids = block_group_edges .clone() .into_iter() @@ -92,16 +199,71 @@ impl BlockGroupEdge { .map(|block_group_edge| (block_group_edge.edge_id, block_group_edge.chromosome_index)) .collect::>(); let phased_by_edge_id = block_group_edges + .clone() .into_iter() .map(|block_group_edge| 
(block_group_edge.edge_id, block_group_edge.phased)) .collect::>(); + let source_phase_layer_id_by_edge_id = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| { + ( + block_group_edge.edge_id, + block_group_edge.source_phase_layer_id, + ) + }) + .collect::>(); + let target_phase_layer_id_by_edge_id = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| { + ( + block_group_edge.edge_id, + block_group_edge.target_phase_layer_id, + ) + }) + .collect::>(); let edges = Edge::bulk_load(conn, &edge_ids); edges .into_iter() - .map(|edge| AugmentedEdge { + .enumerate() + .map(|(i, edge)| NewAugmentedEdge { edge: edge.clone(), + block_group_edge_id: block_group_edge_ids[i], chromosome_index: *chromosome_index_by_edge_id.get(&edge.id).unwrap(), phased: *phased_by_edge_id.get(&edge.id).unwrap(), + source_phase_layer_id: *source_phase_layer_id_by_edge_id.get(&edge.id).unwrap(), + target_phase_layer_id: *target_phase_layer_id_by_edge_id.get(&edge.id).unwrap(), + }) + .collect() + } + + pub fn load_block_group_edges( + conn: &Connection, + block_group_edge_ids: &[i64], + ) -> Vec { + let query_block_group_edge_ids: Vec = block_group_edge_ids + .iter() + .map(|block_group_edge_id| Value::from(*block_group_edge_id)) + .collect(); + let query = "select id, block_group_id, edge_id, chromosome_index, phased, source_phase_layer_id, target_phase_layer_id from block_group_edges where id in rarray(?1);"; + let block_group_edges = BlockGroupEdge::query( + conn, + query, + rusqlite::params!(Rc::new(query_block_group_edge_ids)), + ); + let block_group_edges_by_id = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| (block_group_edge.id, block_group_edge)) + .collect::>(); + block_group_edge_ids + .iter() + .map(|block_group_edge_id| { + block_group_edges_by_id + .get(block_group_edge_id) + .unwrap() + .clone() }) .collect() } @@ -110,7 +272,7 @@ impl BlockGroupEdge { conn: &Connection, block_group_id: i64, edge_ids: &[i64], - ) -> Vec { 
+ ) -> Vec { let block_group_edges = BlockGroupEdge::query( conn, "SELECT * FROM block_group_edges WHERE block_group_id = ?1 AND edge_id in rarray(?2);", @@ -129,22 +291,51 @@ impl BlockGroupEdge { .into_iter() .map(|block_group_edge| block_group_edge.edge_id) .collect::>(); + let block_group_edge_ids = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| block_group_edge.id) + .collect::>(); let chromosome_index_by_edge_id = block_group_edges .clone() .into_iter() .map(|block_group_edge| (block_group_edge.edge_id, block_group_edge.chromosome_index)) .collect::>(); let phased_by_edge_id = block_group_edges + .clone() .into_iter() .map(|block_group_edge| (block_group_edge.edge_id, block_group_edge.phased)) .collect::>(); + let source_phase_layer_id_by_edge_id = block_group_edges + .clone() + .into_iter() + .map(|block_group_edge| { + ( + block_group_edge.edge_id, + block_group_edge.source_phase_layer_id, + ) + }) + .collect::>(); + let target_phase_layer_id_by_edge_id = block_group_edges + .into_iter() + .map(|block_group_edge| { + ( + block_group_edge.edge_id, + block_group_edge.target_phase_layer_id, + ) + }) + .collect::>(); let edges = Edge::bulk_load(conn, &edge_ids); edges .into_iter() - .map(|edge| AugmentedEdge { + .enumerate() + .map(|(i, edge)| NewAugmentedEdge { edge: edge.clone(), + block_group_edge_id: block_group_edge_ids[i], chromosome_index: *chromosome_index_by_edge_id.get(&edge.id).unwrap(), phased: *phased_by_edge_id.get(&edge.id).unwrap(), + source_phase_layer_id: *source_phase_layer_id_by_edge_id.get(&edge.id).unwrap(), + target_phase_layer_id: *target_phase_layer_id_by_edge_id.get(&edge.id).unwrap(), }) .collect() } diff --git a/src/models/edge.rs b/src/models/edge.rs index afee73a1..913c6544 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -8,7 +8,7 @@ use std::hash::{Hash, RandomState}; use std::rc::Rc; use crate::graph::{GraphEdge, GraphNode}; -use crate::models::block_group_edge::AugmentedEdge; +use 
crate::models::block_group_edge::NewAugmentedEdge; use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::sequence::{cached_sequence, Sequence}; use crate::models::strand::Strand; @@ -294,7 +294,7 @@ impl Edge { .collect::>() } - pub fn blocks_from_edges(conn: &Connection, edges: &[AugmentedEdge]) -> Vec { + pub fn blocks_from_edges(conn: &Connection, edges: &[NewAugmentedEdge]) -> Vec { let mut node_ids = HashSet::new(); let mut edges_by_source_node_id: HashMap> = HashMap::new(); let mut edges_by_target_node_id: HashMap> = HashMap::new(); @@ -374,7 +374,7 @@ impl Edge { } pub fn build_graph( - edges: &Vec, + edges: &Vec, blocks: &Vec, ) -> (DiGraphMap, HashMap<(i64, i64), Edge>) { let blocks_by_start = blocks @@ -463,7 +463,7 @@ impl Edge { (graph, edges_by_node_pair) } - pub fn boundary_edges_from_sequences(blocks: &[GroupBlock]) -> Vec { + pub fn boundary_edges_from_sequences(blocks: &[GroupBlock]) -> Vec { let node_blocks_by_id: HashMap> = blocks.iter().fold(HashMap::new(), |mut acc, block| { acc.entry(block.node_id) @@ -476,7 +476,7 @@ impl Edge { for (previous_block, next_block) in node_blocks.iter().tuple_windows() { // NOTE: Most of this data is bogus, the Edge struct is just a convenient wrapper // for the data we need to set up boundary edges in the block group graph - boundary_edges.push(AugmentedEdge { + boundary_edges.push(NewAugmentedEdge { edge: Edge { id: -1, source_node_id: previous_block.node_id, @@ -486,8 +486,11 @@ impl Edge { target_coordinate: next_block.start, target_strand: Strand::Forward, }, + block_group_edge_id: -1, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }); } } @@ -750,6 +753,7 @@ mod tests { path_start: 7, path_end: 15, strand: Strand::Forward, + phase_layer_id: 0, }; let change = PathChange { block_group_id, diff --git a/src/models/path.rs b/src/models/path.rs index 3f6830d1..cfb102df 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -13,6 
+13,7 @@ use crate::models::{ edge::Edge, node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, path_edge::PathEdge, + phase_layer::UNPHASED_CHROMOSOME_INDEX, sequence::Sequence, strand::Strand, traits::*, @@ -75,6 +76,7 @@ pub struct PathBlock { pub path_start: i64, pub path_end: i64, pub strand: Strand, + pub phase_layer_id: i64, } #[derive(Clone, Debug)] @@ -85,7 +87,30 @@ pub struct Annotation { } impl Path { - pub fn validate_edges(conn: &Connection, edge_ids: &[i64], block_group_id: i64) { + pub fn validate_block_group_edges(block_group_edges: &[BlockGroupEdge], block_group_id: i64) { + for block_group_edge in block_group_edges.iter() { + assert!( + block_group_edge.block_group_id == block_group_id, + "Block group edge {} doesn't belong to block group {}", + block_group_edge.id, + block_group_id + ); + } + + // Two consecutive block group edges must go into and out of a node on the same phase layer + for (block_group_edge1, block_group_edge2) in block_group_edges.iter().tuple_windows() { + assert!( + block_group_edge1.target_phase_layer_id == block_group_edge2.source_phase_layer_id, + "Block group edges {} and {} don't share the same phase layer ({} vs. 
{})", + block_group_edge1.id, + block_group_edge2.id, + block_group_edge1.target_phase_layer_id, + block_group_edge2.source_phase_layer_id + ); + } + } + + pub fn validate_edges(conn: &Connection, edge_ids: &[i64]) { let edge_id_set = edge_ids.iter().collect::>(); // No duplicate edges allowed @@ -93,21 +118,10 @@ impl Path { println!("Duplicate edge IDs detected in path creation"); } - // All path edges must be in the path's block group - let augmented_edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); - let bg_edge_ids = augmented_edges + let edges = Edge::bulk_load(conn, edge_ids); + let edges_by_id = edges .iter() - .map(|augmented_edge| &augmented_edge.edge.id) - .collect::>(); - assert!( - edge_id_set.is_subset(&bg_edge_ids), - "Not all edges are in the block group ({})", - block_group_id - ); - - let edges_by_id = augmented_edges - .iter() - .map(|augmented_edge| (augmented_edge.edge.id, augmented_edge.edge.clone())) + .map(|edge| (edge.id, edge.clone())) .collect::>(); // Two consecutive edges must share a node @@ -153,8 +167,19 @@ impl Path { } } - pub fn create(conn: &Connection, name: &str, block_group_id: i64, edge_ids: &[i64]) -> Path { - Path::validate_edges(conn, edge_ids, block_group_id); + pub fn create( + conn: &Connection, + name: &str, + block_group_id: i64, + block_group_edge_ids: &[i64], + ) -> Path { + let block_group_edges = BlockGroupEdge::load_block_group_edges(conn, block_group_edge_ids); + Path::validate_block_group_edges(&block_group_edges, block_group_id); + let edge_ids = block_group_edges + .iter() + .map(|block_group_edge| block_group_edge.edge_id) + .collect::>(); + Path::validate_edges(conn, &edge_ids); // TODO: Should we do something if edge_ids don't match here? Suppose we have a path // for a block group with edges 1,2,3. 
And then the same path is added again with edges @@ -199,7 +224,7 @@ impl Path { } }; - PathEdge::bulk_create(conn, path.id, edge_ids); + PathEdge::bulk_create(conn, path.id, block_group_edge_ids); path } @@ -267,6 +292,7 @@ impl Path { block_id: i64, into: Edge, out_of: Edge, + phase_layer_id: i64, sequences_by_node_id: &HashMap, current_path_length: i64, ) -> PathBlock { @@ -292,11 +318,30 @@ impl Path { path_start: current_path_length, path_end: current_path_length + block_sequence_length, strand, + phase_layer_id, } } pub fn blocks(&self, conn: &Connection) -> Vec { - let edges = PathEdge::edges_for_path(conn, self.id); + let block_group_edges = PathEdge::block_group_edges_for_path(conn, self.id); + let edge_ids = block_group_edges + .iter() + .map(|block_group_edge| block_group_edge.edge_id) + .collect::>(); + let edges = Edge::bulk_load(conn, &edge_ids); + let edges_by_id = edges + .iter() + .map(|edge| (edge.id, edge.clone())) + .collect::>(); + let edges_by_block_group_edge_id = block_group_edges + .iter() + .map(|block_group_edge| { + ( + block_group_edge.id, + edges_by_id[&block_group_edge.edge_id].clone(), + ) + }) + .collect::>(); let mut sequence_node_ids = HashSet::new(); for edge in &edges { @@ -327,13 +372,17 @@ impl Path { path_start: i64::MIN + 1, path_end: 0, strand: Strand::Forward, + phase_layer_id: UNPHASED_CHROMOSOME_INDEX, }); - for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { + for (index, (into, out_of)) in block_group_edges.into_iter().tuple_windows().enumerate() { + let edge_in = edges_by_block_group_edge_id[&into.id].clone(); + let edge_out = edges_by_block_group_edge_id[&out_of.id].clone(); let block = self.edge_pairs_to_block( index as i64, - into, - out_of, + edge_in, + edge_out, + into.target_phase_layer_id, &sequences_by_node_id, path_length, ); @@ -353,6 +402,7 @@ impl Path { path_start: path_length, path_end: i64::MAX - 1, strand: Strand::Forward, + phase_layer_id: UNPHASED_CHROMOSOME_INDEX, }); 
blocks @@ -373,6 +423,7 @@ impl Path { sequence_start: block.sequence_start, sequence_end: block.sequence_end, strand: block.strand, + phase_layer_id: 0, }, ) }) @@ -613,8 +664,9 @@ impl Path { conn: &Connection, path_start: i64, path_end: i64, - edge_to_new_node: &Edge, - edge_from_new_node: &Edge, + block_group_edge_to_new_node_id: i64, + block_group_edge_from_new_node_id: i64, + new_node_id: i64, ) -> Path { // Creates a new path from the current one by replacing all edges between path_start and // path_end with the input edges that are to and from a new node @@ -622,46 +674,77 @@ impl Path { let block_with_start = tree.query_point(path_start).next().unwrap().value; let block_with_end = tree.query_point(path_end).next().unwrap().value; - let edges = PathEdge::edges_for_path(conn, self.id); - let edges_by_source = edges + let block_group_edges = PathEdge::block_group_edges_for_path(conn, self.id); + let edge_ids = block_group_edges + .iter() + .map(|block_group_edge| block_group_edge.edge_id) + .collect::>(); + let edges = Edge::bulk_load(conn, &edge_ids); + let edges_by_id = edges + .iter() + .map(|edge| (edge.id, edge.clone())) + .collect::>(); + let edges_by_block_group_edge_id = block_group_edges .iter() - .map(|edge| ((edge.source_node_id, edge.source_coordinate), edge)) - .collect::>(); - let edges_by_target = edges + .map(|block_group_edge| { + ( + block_group_edge.id, + edges_by_id[&block_group_edge.edge_id].clone(), + ) + }) + .collect::>(); + + let block_group_edges_by_source = block_group_edges .iter() - .map(|edge| ((edge.target_node_id, edge.target_coordinate), edge)) - .collect::>(); - let edge_before_new_node = edges_by_target + .map(|block_group_edge| { + let edge = &edges_by_block_group_edge_id[&block_group_edge.id]; + ( + (edge.source_node_id, edge.source_coordinate), + block_group_edge, + ) + }) + .collect::>(); + let block_group_edges_by_target = block_group_edges + .iter() + .map(|block_group_edge| { + let edge = 
&edges_by_block_group_edge_id[&block_group_edge.id]; + ( + (edge.target_node_id, edge.target_coordinate), + block_group_edge, + ) + }) + .collect::>(); + let block_group_edge_before_new_node = block_group_edges_by_target .get(&(block_with_start.node_id, block_with_start.sequence_start)) .unwrap(); - let edge_after_new_node = edges_by_source + let block_group_edge_after_new_node = block_group_edges_by_source .get(&(block_with_end.node_id, block_with_end.sequence_end)) .unwrap(); - let mut new_edge_ids = vec![]; + let mut block_group_edge_ids = vec![]; let mut before_new_node = true; let mut after_new_node = false; - for edge in &edges { + for block_group_edge in &block_group_edges { if before_new_node { - new_edge_ids.push(edge.id); - if edge.id == edge_before_new_node.id { + block_group_edge_ids.push(block_group_edge.id); + if block_group_edge.id == block_group_edge_before_new_node.id { before_new_node = false; - new_edge_ids.push(edge_to_new_node.id); - new_edge_ids.push(edge_from_new_node.id); + block_group_edge_ids.push(block_group_edge_to_new_node_id); + block_group_edge_ids.push(block_group_edge_from_new_node_id); } } else if after_new_node { - new_edge_ids.push(edge.id); - } else if edge.id == edge_after_new_node.id { + block_group_edge_ids.push(block_group_edge.id); + } else if block_group_edge.id == block_group_edge_after_new_node.id { after_new_node = true; - new_edge_ids.push(edge.id); + block_group_edge_ids.push(block_group_edge.id); } } let new_name = format!( "{}-start-{}-end-{}-node-{}", - self.name, path_start, path_end, edge_to_new_node.target_node_id + self.name, path_start, path_end, new_node_id, ); - Path::create(conn, &new_name, self.block_group_id, &new_edge_ids) + Path::create(conn, &new_name, self.block_group_id, &block_group_edge_ids) } fn node_blocks_for_range( @@ -696,6 +779,7 @@ impl Path { sequence_start: node_blocks[0].sequence_start + start_offset, sequence_end: node_blocks[0].sequence_end, strand: node_blocks[0].strand, + 
phase_layer_id: 0, }; for block in &node_blocks[1..] { @@ -711,6 +795,7 @@ impl Path { sequence_start: consolidated_block.sequence_start, sequence_end: block.sequence_end, strand: consolidated_block.strand, + phase_layer_id: 0, }; } else { result_node_blocks.push(consolidated_block); @@ -732,6 +817,7 @@ impl Path { sequence_start: consolidated_block.sequence_start, sequence_end: consolidated_block.sequence_end - end_offset, strand: consolidated_block.strand, + phase_layer_id: 0, }); result_node_blocks @@ -846,7 +932,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; + let edge_ids = &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -854,11 +940,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); assert_eq!(path.sequence(conn), "ATCGATCGAAAAAAACCCCCCCGGGGGGG"); } @@ -933,7 +1021,7 @@ mod tests { Strand::Reverse, ); - let edge_ids = vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; + let edge_ids = &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -941,11 +1029,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, 
&block_group_edge_ids); assert_eq!(path.sequence(conn), "CCCCCCCGGGGGGGTTTTTTTCGATCGAT"); } @@ -1027,7 +1117,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; + let edge_ids = &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1035,11 +1125,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let tree = path.intervaltree(conn); let blocks1: Vec = tree.query_point(2).map(|x| x.value).collect(); assert_eq!(blocks1.len(), 1); @@ -1146,7 +1238,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; + let edge_ids = [edge1.id, edge2.id, edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1154,11 +1246,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let tree = path.intervaltree(conn); let blocks1: Vec = tree.query_point(2).map(|x| x.value).collect(); assert_eq!(blocks1.len(), 1); @@ -1229,7 +1323,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| 
BlockGroupEdgeData { @@ -1237,11 +1331,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let mappings = path.find_block_mappings(conn, &path); assert_eq!(mappings.len(), 1); @@ -1291,7 +1387,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1299,10 +1395,12 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -1328,7 +1426,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge3.id, edge4.id]; + let edge_ids = &[edge3.id, edge4.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1336,11 +1434,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path2 = Path::create(conn, "chr2", block_group.id, &edge_ids); + let path2 = Path::create(conn, "chr2", block_group.id, &block_group_edge_ids); let mappings = path1.find_block_mappings(conn, &path2); 
assert_eq!(mappings.len(), 0); @@ -1388,7 +1488,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1396,11 +1496,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -1435,7 +1537,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge3.id, edge4.id, edge5.id]; + let edge_ids = &[edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1443,11 +1545,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path2 = Path::create(conn, "chr2", block_group.id, &edge_ids); + let path2 = Path::create(conn, "chr2", block_group.id, &block_group_edge_ids); assert_eq!(path2.sequence(conn), "ATCGTTTTTTTT"); @@ -1503,7 +1607,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -1511,11 +1615,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, 
&block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -1549,15 +1655,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge2_id = block_group_edge_ids1[1]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge2.id], + &[bge1_id, bge4_id, bge5_id, bge2_id], ); assert_eq!(path2.sequence(conn), "ATCGTTTTTTTTATCG"); @@ -1628,11 +1740,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -1666,15 +1780,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge2_id = block_group_edge_ids1[1]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge2.id], + &[bge1_id, bge4_id, 
bge5_id, bge2_id], ); assert_eq!(path2.sequence(conn), "ATTTTTTTTTCG"); @@ -1744,11 +1864,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let edge4 = Edge::create( conn, @@ -1768,16 +1890,16 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge2_id = block_group_edge_ids1[1]; - let path2 = Path::create( - conn, - "chr2", - block_group.id, - &[edge1.id, edge4.id, edge2.id], - ); + let path2 = Path::create(conn, "chr2", block_group.id, &[bge1_id, bge4_id, bge2_id]); assert_eq!(path2.sequence(conn), "ATCG"); @@ -1862,11 +1984,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence3 = Sequence::new() .sequence_type("DNA") @@ -1900,15 +2024,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = 
BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge3_id = block_group_edge_ids1[2]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge3.id], + &[bge1_id, bge4_id, bge5_id, bge3_id], ); assert_eq!(path2.sequence(conn), "ATCGATCGAAAAAAAATTTTTTTT"); @@ -1994,11 +2124,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence3 = Sequence::new() .sequence_type("DNA") @@ -2032,15 +2164,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge3_id = block_group_edge_ids1[2]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge3.id], + &[bge1_id, bge4_id, bge5_id, bge3_id], ); assert_eq!(path2.sequence(conn), "ATCGAAAAAAAATTTT"); @@ -2125,11 +2263,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = 
Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let edge4 = Edge::create( conn, @@ -2149,16 +2289,16 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge3_id = block_group_edge_ids1[2]; - let path2 = Path::create( - conn, - "chr2", - block_group.id, - &[edge1.id, edge4.id, edge3.id], - ); + let path2 = Path::create(conn, "chr2", block_group.id, &[bge1_id, bge4_id, bge3_id]); assert_eq!(path2.sequence(conn), "ATCGTTTT"); @@ -2222,11 +2362,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let annotation = Annotation { name: "foo".to_string(), @@ -2287,11 +2429,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -2325,11 +2469,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + 
let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path2 = Path::create(conn, "chr2", block_group.id, &edge_ids); + let path2 = Path::create(conn, "chr2", block_group.id, &block_group_edge_ids); let annotation = Annotation { name: "foo".to_string(), @@ -2382,7 +2528,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -2390,11 +2536,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -2429,7 +2577,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge3.id, edge4.id, edge5.id]; + let edge_ids = &[edge3.id, edge4.id, edge5.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -2437,11 +2585,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path2 = Path::create(conn, "chr2", block_group.id, &edge_ids); + let path2 = Path::create(conn, "chr2", block_group.id, &block_group_edge_ids); assert_eq!(path2.sequence(conn), "ATCGTTTTTTTT"); @@ -2500,7 +2650,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -2508,11 +2658,13 @@ mod tests { 
edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -2546,15 +2698,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge2_id = block_group_edge_ids1[1]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge2.id], + &[bge1_id, bge4_id, bge5_id, bge2_id], ); assert_eq!(path2.sequence(conn), "ATCGTTTTTTTTATCG"); @@ -2626,11 +2784,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence2 = Sequence::new() .sequence_type("DNA") @@ -2664,15 +2824,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + 
let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge2_id = block_group_edge_ids1[1]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge2.id], + &[bge1_id, bge4_id, bge5_id, bge2_id], ); assert_eq!(path2.sequence(conn), "ATTTTTTTTTCG"); @@ -2758,11 +2924,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let sequence3 = Sequence::new() .sequence_type("DNA") @@ -2796,15 +2964,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge5_id = block_group_edge_ids2[1]; + let bge3_id = block_group_edge_ids1[2]; let path2 = Path::create( conn, "chr2", block_group.id, - &[edge1.id, edge4.id, edge5.id, edge3.id], + &[bge1_id, bge4_id, bge5_id, bge3_id], ); assert_eq!(path2.sequence(conn), "ATCGATCGAAAAAAAATTTTTTTT"); @@ -2890,11 +3064,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids1 = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids1); let edge4 = Edge::create( conn, @@ 
-2914,16 +3090,16 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids2 = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids1[0]; + let bge4_id = block_group_edge_ids2[0]; + let bge3_id = block_group_edge_ids1[2]; - let path2 = Path::create( - conn, - "chr2", - block_group.id, - &[edge1.id, edge4.id, edge3.id], - ); + let path2 = Path::create(conn, "chr2", block_group.id, &[bge1_id, bge4_id, bge3_id]); assert_eq!(path2.sequence(conn), "ATCGTTTT"); @@ -2994,11 +3170,13 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path1 = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path1 = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); assert_eq!(path1.sequence(conn), "ATCGATCGAAAAAAAA"); let sequence3 = Sequence::new() @@ -3033,11 +3211,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path2 = path1.new_path_with(conn, 4, 11, &edge4, &edge5); + let path2 = path1.new_path_with( + conn, + 4, + 11, + block_group_edge_ids[0], + block_group_edge_ids[1], + node3_id, + ); assert_eq!(path2.sequence(conn), "ATCGCCCCCCCCAAAAA"); let edge6 = Edge::create( @@ -3067,11 +3254,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let 
block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path3 = path1.new_path_with(conn, 4, 7, &edge6, &edge7); + let path3 = path1.new_path_with( + conn, + 4, + 7, + block_group_edge_ids[0], + block_group_edge_ids[1], + node3_id, + ); assert_eq!(path3.sequence(conn), "ATCGCCCCCCCCGAAAAAAAA"); } @@ -3104,7 +3300,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -3112,17 +3308,20 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); // Should print a warning that there are duplicate edges, but continue - let _path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let _path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); } #[test] - #[should_panic(expected = "Not all edges are in the block group")] - fn test_edges_must_be_in_path_block_group() { + #[should_panic] + // Panic message is something like "Block group edge 1 doesn't belong to block group 2" + fn test_block_group_edges_must_be_in_path_block_group() { let conn = &mut get_connection(None); Collection::create(conn, "test collection"); let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); @@ -3150,8 +3349,23 @@ mod tests { Strand::Forward, ); + Collection::create(conn, "test collection 2"); + let block_group2 = + BlockGroup::create(conn, "test collection 2", None, "test block group 2"); let edge_ids = [edge1.id, edge2.id]; - let _path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let block_group_edges = edge_ids + .iter() + .map(|edge_id| BlockGroupEdgeData { + block_group_id: block_group.id, + edge_id: *edge_id, + chromosome_index: 0, 
+ phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, + }) + .collect::>(); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let _path = Path::create(conn, "chr1", block_group2.id, &block_group_edge_ids); } #[test] @@ -3190,25 +3404,27 @@ mod tests { Strand::Forward, ); - let block_group_edges = vec![ + let block_group_edges = &[ BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge1.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge2.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ]; - BlockGroupEdge::bulk_create(conn, &block_group_edges); - - let edge_ids = vec![edge1.id, edge2.id]; + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, block_group_edges); - let _path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let _path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); } #[test] @@ -3247,7 +3463,7 @@ mod tests { Strand::Reverse, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -3255,12 +3471,14 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let _path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let _path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); } #[test] @@ -3300,7 +3518,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id]; + let edge_ids = &[edge1.id, edge2.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -3308,11 +3526,13 @@ mod tests { edge_id: 
*edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let _path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let _path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); } #[test] @@ -3337,15 +3557,17 @@ mod tests { Strand::Forward, ); - let block_group_edges = vec![BlockGroupEdgeData { + let block_group_edges = &[BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge1.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }]; - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, block_group_edges); - let _path = Path::create(conn, "chr1", block_group.id, &[edge1.id]); + let _path = Path::create(conn, "chr1", block_group.id, &[block_group_edge_ids[0]]); } #[test] @@ -3391,7 +3613,7 @@ mod tests { Strand::Forward, ); - let edge_ids = vec![edge1.id, edge2.id, edge3.id]; + let edge_ids = &[edge1.id, edge2.id, edge3.id]; let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { @@ -3399,16 +3621,18 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create(conn, "chr1", block_group.id, &edge_ids); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); let intervaltree = path.intervaltree(conn); let node_blocks1 = path.node_blocks_for_range(&intervaltree, 0, 8); - let expected_node_blocks1 = vec![NodeIntervalBlock { + let expected_node_blocks1 = &[NodeIntervalBlock { block_id: 0, node_id: node1_id, start: 0, @@ -3416,11 +3640,12 @@ mod 
tests { sequence_start: 0, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks1, expected_node_blocks1); let node_blocks2 = path.node_blocks_for_range(&intervaltree, 0, 4); - let expected_node_blocks2 = vec![NodeIntervalBlock { + let expected_node_blocks2 = &[NodeIntervalBlock { block_id: 0, node_id: node1_id, start: 0, @@ -3428,11 +3653,12 @@ mod tests { sequence_start: 0, sequence_end: 4, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks2, expected_node_blocks2); let node_blocks3 = path.node_blocks_for_range(&intervaltree, 2, 6); - let expected_node_blocks3 = vec![NodeIntervalBlock { + let expected_node_blocks3 = &[NodeIntervalBlock { block_id: 0, node_id: node1_id, start: 2, @@ -3440,11 +3666,12 @@ mod tests { sequence_start: 2, sequence_end: 6, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks3, expected_node_blocks3); let node_blocks4 = path.node_blocks_for_range(&intervaltree, 3, 8); - let expected_node_blocks4 = vec![NodeIntervalBlock { + let expected_node_blocks4 = &[NodeIntervalBlock { block_id: 0, node_id: node1_id, start: 3, @@ -3452,11 +3679,12 @@ mod tests { sequence_start: 3, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks4, expected_node_blocks4); let node_blocks5 = path.node_blocks_for_range(&intervaltree, 6, 10); - let expected_node_blocks5 = vec![ + let expected_node_blocks5 = &[ NodeIntervalBlock { block_id: 0, node_id: node1_id, @@ -3465,6 +3693,7 @@ mod tests { sequence_start: 6, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 1, @@ -3474,12 +3703,13 @@ mod tests { sequence_start: 0, sequence_end: 2, strand: Strand::Forward, + phase_layer_id: 0, }, ]; assert_eq!(node_blocks5, expected_node_blocks5); let node_blocks6 = path.node_blocks_for_range(&intervaltree, 12, 16); - let expected_node_blocks6 = vec![NodeIntervalBlock { + let expected_node_blocks6 = &[NodeIntervalBlock { 
block_id: 0, node_id: node2_id, start: 12, @@ -3487,6 +3717,7 @@ mod tests { sequence_start: 4, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks6, expected_node_blocks6); } @@ -3551,17 +3782,21 @@ mod tests { edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let bge1_id = block_group_edge_ids[0]; + let bge4_id = block_group_edge_ids[3]; - let path1 = Path::create(conn, "chr1.1", block_group.id, &[edge1.id, edge4.id]); - let path2 = Path::create(conn, "chr1.2", block_group.id, edge_ids); + let path1 = Path::create(conn, "chr1.1", block_group.id, &[bge1_id, bge4_id]); + let path2 = Path::create(conn, "chr1.2", block_group.id, &block_group_edge_ids); let intervaltree1 = path1.intervaltree(conn); let node_blocks1 = path1.node_blocks_for_range(&intervaltree1, 0, 8); - let expected_node_blocks1 = vec![NodeIntervalBlock { + let expected_node_blocks1 = &[NodeIntervalBlock { block_id: 0, node_id: node1_id, start: 0, @@ -3569,13 +3804,14 @@ mod tests { sequence_start: 0, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }]; assert_eq!(node_blocks1, expected_node_blocks1); let intervaltree2 = path2.intervaltree(conn); let node_blocks2 = path2.node_blocks_for_range(&intervaltree2, 0, 8); - let expected_node_blocks2 = vec![ + let expected_node_blocks2 = &[ NodeIntervalBlock { block_id: 0, node_id: node1_id, @@ -3584,6 +3820,7 @@ mod tests { sequence_start: 0, sequence_end: 5, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 1, @@ -3593,12 +3830,13 @@ mod tests { sequence_start: 0, sequence_end: 3, strand: Strand::Forward, + phase_layer_id: 0, }, ]; assert_eq!(node_blocks2, expected_node_blocks2); let node_blocks3 = path2.node_blocks_for_range(&intervaltree2, 4, 14); - let 
expected_node_blocks3 = vec![ + let expected_node_blocks3 = &[ NodeIntervalBlock { block_id: 0, node_id: node1_id, @@ -3607,6 +3845,7 @@ mod tests { sequence_start: 4, sequence_end: 5, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 1, @@ -3616,6 +3855,7 @@ mod tests { sequence_start: 0, sequence_end: 8, strand: Strand::Forward, + phase_layer_id: 0, }, NodeIntervalBlock { block_id: 2, @@ -3625,6 +3865,7 @@ mod tests { sequence_start: 6, sequence_end: 7, strand: Strand::Forward, + phase_layer_id: 0, }, ]; assert_eq!(node_blocks3, expected_node_blocks3); diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index 728680dd..9d73601d 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -5,6 +5,7 @@ use rusqlite::{params_from_iter, Connection, Row}; use std::collections::HashMap; use std::rc::Rc; +use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::Edge; use crate::models::traits::*; @@ -13,7 +14,7 @@ pub struct PathEdge { pub id: i64, pub path_id: i64, pub index_in_path: i64, - pub edge_id: i64, + pub block_group_edge_id: i64, } impl Query for PathEdge { @@ -23,23 +24,28 @@ impl Query for PathEdge { id: row.get(0).unwrap(), path_id: row.get(1).unwrap(), index_in_path: row.get(2).unwrap(), - edge_id: row.get(3).unwrap(), + block_group_edge_id: row.get(3).unwrap(), } } } impl PathEdge { - pub fn create(conn: &Connection, path_id: i64, index_in_path: i64, edge_id: i64) -> PathEdge { + pub fn create( + conn: &Connection, + path_id: i64, + index_in_path: i64, + block_group_edge_id: i64, + ) -> PathEdge { let query = - "INSERT INTO path_edges (path_id, index_in_path, edge_id) VALUES (?1, ?2, ?3) RETURNING (id)"; + "INSERT INTO path_edges (path_id, index_in_path, block_group_edge_id) VALUES (?1, ?2, ?3) RETURNING (id)"; let mut stmt = conn.prepare(query).unwrap(); let mut rows = stmt - .query_map((path_id, index_in_path, edge_id), |row| { + .query_map((path_id, index_in_path, 
block_group_edge_id), |row| { Ok(PathEdge { id: row.get(0)?, path_id, index_in_path, - edge_id, + block_group_edge_id, }) }) .unwrap(); @@ -48,8 +54,8 @@ impl PathEdge { Err(rusqlite::Error::SqliteFailure(err, _details)) => { if err.code == rusqlite::ErrorCode::ConstraintViolation { let mut placeholders = vec![path_id]; - let query = "SELECT id from path_edges where path_id = ?1 AND edge_id = ?2;"; - placeholders.push(edge_id); + let query = "SELECT id from path_edges where path_id = ?1 AND block_group_edge_id = ?2;"; + placeholders.push(block_group_edge_id); println!("{query} {placeholders:?}"); PathEdge { id: conn @@ -57,7 +63,7 @@ impl PathEdge { .unwrap(), path_id, index_in_path, - edge_id, + block_group_edge_id, } } else { panic!("something bad happened querying the database") @@ -69,14 +75,14 @@ impl PathEdge { } } - pub fn bulk_create(conn: &Connection, path_id: i64, edge_ids: &[i64]) { - for (index1, chunk) in edge_ids.chunks(100000).enumerate() { + pub fn bulk_create(conn: &Connection, path_id: i64, block_group_edge_ids: &[i64]) { + for (index1, chunk) in block_group_edge_ids.chunks(100000).enumerate() { let mut rows_to_insert = vec![]; - for (index2, edge_id) in chunk.iter().enumerate() { + for (index2, block_group_edge_id) in chunk.iter().enumerate() { let row = format!( "({0}, {1}, {2})", path_id, - edge_id, + block_group_edge_id, index1 * 100000 + index2 ); rows_to_insert.push(row); @@ -85,7 +91,7 @@ impl PathEdge { let formatted_rows_to_insert = rows_to_insert.join(", "); let insert_statement = format!( - "INSERT OR IGNORE INTO path_edges (path_id, edge_id, index_in_path) VALUES {0};", + "INSERT OR IGNORE INTO path_edges (path_id, block_group_edge_id, index_in_path) VALUES {0};", formatted_rows_to_insert ); let _ = conn.execute(&insert_statement, ()); @@ -98,9 +104,14 @@ impl PathEdge { "select * from path_edges where path_id = ?1 order by index_in_path ASC", rusqlite::params!(Value::from(path_id)), ); - let edge_ids = path_edges + let 
block_group_edge_ids = path_edges .into_iter() - .map(|path_edge| path_edge.edge_id) + .map(|path_edge| path_edge.block_group_edge_id) + .collect::>(); + let block_group_edges = BlockGroupEdge::load_block_group_edges(conn, &block_group_edge_ids); + let edge_ids = block_group_edges + .into_iter() + .map(|block_group_edge| block_group_edge.edge_id) .collect::>(); let edges = Edge::bulk_load(conn, &edge_ids); let edges_by_id = edges @@ -113,6 +124,27 @@ impl PathEdge { .collect::>() } + pub fn block_group_edges_for_path(conn: &Connection, path_id: i64) -> Vec { + let path_edges = PathEdge::query( + conn, + "select * from path_edges where path_id = ?1 order by index_in_path ASC", + rusqlite::params!(Value::from(path_id)), + ); + let block_group_edge_ids = path_edges + .into_iter() + .map(|path_edge| path_edge.block_group_edge_id) + .collect::>(); + let block_group_edges = BlockGroupEdge::load_block_group_edges(conn, &block_group_edge_ids); + let block_group_edges_by_id = block_group_edges + .into_iter() + .map(|block_group_edge| (block_group_edge.id, block_group_edge)) + .collect::>(); + block_group_edge_ids + .into_iter() + .map(|block_group_edge_id| block_group_edges_by_id[&block_group_edge_id].clone()) + .collect::>() + } + pub fn edges_for_paths(conn: &Connection, path_ids: Vec) -> HashMap> { let query_path_ids = path_ids .iter() @@ -123,10 +155,20 @@ impl PathEdge { "select * from path_edges where path_id in rarray(?1) ORDER BY path_id, index_in_path", rusqlite::params!(Rc::new(query_path_ids)), ); - let edge_ids = path_edges + let block_group_edge_ids = path_edges + .clone() + .into_iter() + .map(|path_edge| path_edge.block_group_edge_id) + .collect::>(); + let block_group_edges = BlockGroupEdge::load_block_group_edges(conn, &block_group_edge_ids); + let edge_ids_by_block_group_edge_id = block_group_edges .clone() .into_iter() - .map(|path_edge| path_edge.edge_id) + .map(|block_group_edge| (block_group_edge.id, block_group_edge.edge_id)) + .collect::>(); + let 
edge_ids = block_group_edges + .into_iter() + .map(|block_group_edge| block_group_edge.edge_id) .collect::>(); let edges = Edge::bulk_load(conn, &edge_ids); let edges_by_id = edges @@ -135,7 +177,12 @@ impl PathEdge { .collect::>(); let path_edges_by_path_id = path_edges .into_iter() - .map(|path_edge| (path_edge.path_id, path_edge.edge_id)) + .map(|path_edge| { + ( + path_edge.path_id, + edge_ids_by_block_group_edge_id[&path_edge.block_group_edge_id], + ) + }) .into_group_map(); path_edges_by_path_id .into_iter() diff --git a/src/models/phase_layer.rs b/src/models/phase_layer.rs new file mode 100644 index 00000000..157b6116 --- /dev/null +++ b/src/models/phase_layer.rs @@ -0,0 +1,62 @@ +use rusqlite::Connection; + +#[derive(Clone, Debug)] +pub struct PhaseLayer { + pub id: i64, + pub chromosome_index: i64, + pub is_reference: i64, +} + +pub const UNPHASED_CHROMOSOME_INDEX: i64 = -1; + +impl PhaseLayer { + pub fn get_or_create( + conn: &Connection, + chromosome_index: i64, + is_reference: i64, + ) -> Result { + let phase_layer_id: i64 = match conn.query_row( + "select id from phase_layers where chromosome_index = ?1 AND is_reference = ?2", + (chromosome_index, is_reference), + |row| row.get(0), + ) { + Ok(res) => res, + Err(rusqlite::Error::QueryReturnedNoRows) => 0, + Err(_e) => { + panic!("Error querying the database: {_e}"); + } + }; + if phase_layer_id != 0 { + return Ok(phase_layer_id); + } + + let new_phase_layer_id = PhaseLayer::create(conn, chromosome_index, is_reference); + + Ok(new_phase_layer_id) + } + + pub fn create(conn: &Connection, chromosome_index: i64, is_reference: i64) -> i64 { + let query = "INSERT INTO phase_layers (chromosome_index, is_reference) VALUES (?1, ?2) RETURNING (id)"; + let mut stmt = conn.prepare(query).unwrap(); + match stmt.query_row((chromosome_index, is_reference), |row| row.get(0)) { + Ok(res) => res, + Err(rusqlite::Error::SqliteFailure(err, details)) => { + if err.code == rusqlite::ErrorCode::ConstraintViolation { + 
println!("{err:?} {details:?}"); + conn + .query_row( + "select id from phase_layers where chromosome_index = ?1 and is_reference = ?2", + (chromosome_index, is_reference), + |row| row.get(0), + ) + .unwrap() + } else { + panic!("something bad happened querying the database") + } + } + Err(_) => { + panic!("something bad happened querying the database") + } + } + } +} diff --git a/src/operation_management.rs b/src/operation_management.rs index 8c21e6d6..b6ed02c6 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -51,6 +51,29 @@ pub enum FileMode { Write, } +// NOTE: The ordering of these entries is important, because it is the order in which tables are +// processed in get_changeset_dependencies. If model Foo depends on model Bar, then Foo must come +// first in this list. For instance, an edge depends on a source node and a target node, so the +// edges table must come before the nodes table here. This is because in +// get_changeset_dependencies, if we process edges first, we correctly find any nodes that were +// created that the edge must be associated with. If we instead processed nodes first, some nodes +// associated with edges would get counted as new nodes, and if we recreate the edge it might not be +// associated with the correct recreated nodes.
+const CHANGE_TABLES: &[&str; 12] = &[ + "collections", + "samples", + "accessions", + "paths", + "path_edges", + "block_group_edges", + "edges", + "accession_paths", + "accession_edges", + "nodes", + "sequences", + "block_groups", +]; + #[derive(Deserialize, Serialize, Debug)] pub struct DependencyModels { pub sequences: Vec, @@ -60,6 +83,7 @@ pub struct DependencyModels { pub paths: Vec, pub accessions: Vec, pub accession_edges: Vec, + pub block_group_edges: Vec, } #[derive(Debug)] @@ -101,6 +125,7 @@ pub fn get_changeset_dependencies(conn: &Connection, mut changes: &[u8]) -> Vec< let mut previous_nodes = HashSet::new(); let mut previous_sequences = HashSet::new(); let mut previous_accession_edges = HashSet::new(); + let mut previous_block_group_edges = HashSet::new(); let mut created_block_groups = HashSet::new(); let mut created_paths = HashSet::new(); let mut created_accessions = HashSet::new(); @@ -108,6 +133,7 @@ pub fn get_changeset_dependencies(conn: &Connection, mut changes: &[u8]) -> Vec< let mut created_accession_edges = HashSet::new(); let mut created_nodes = HashSet::new(); let mut created_sequences: HashSet = HashSet::new(); + let mut created_block_group_edges = HashSet::new(); while let Some(item) = iter.next().unwrap() { let op = item.op().unwrap(); @@ -164,16 +190,18 @@ pub fn get_changeset_dependencies(conn: &Connection, mut changes: &[u8]) -> Vec< } "path_edges" => { let path_id = item.new_value(1).unwrap().as_i64().unwrap(); - let edge_id = item.new_value(3).unwrap().as_i64().unwrap(); + let block_group_edge_id = item.new_value(3).unwrap().as_i64().unwrap(); if !created_paths.contains(&path_id) { previous_paths.insert(path_id); } - if !created_edges.contains(&edge_id) { - previous_edges.insert(edge_id); + if !created_block_group_edges.contains(&block_group_edge_id) { + previous_block_group_edges.insert(block_group_edge_id); } } "block_group_edges" => { // make sure blockgroup_map has blockgroups for bg ids made in external changes. 
+ let block_group_edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap(); + created_block_group_edges.insert(block_group_edge_pk); let bg_id = item.new_value(1).unwrap().as_i64().unwrap(); let edge_id = item.new_value(2).unwrap().as_i64().unwrap(); if !created_edges.contains(&edge_id) { @@ -276,6 +304,14 @@ pub fn get_changeset_dependencies(conn: &Connection, mut changes: &[u8]) -> Vec< ), rusqlite::params!(), ), + block_group_edges: BlockGroupEdge::query( + conn, + &format!( + "select * from block_group_edges where id in ({ids})", + ids = previous_block_group_edges.iter().join(",") + ), + rusqlite::params!(), + ), }; serde_json::to_vec(&s).unwrap() } @@ -392,6 +428,8 @@ pub fn load_changeset_models(changeset: &mut ChangesetIter) -> ChangesetModels { edge_id: parse_number(item, 2), chromosome_index: parse_number(item, 3), phased: parse_number(item, 4), + source_phase_layer_id: parse_number(item, 5), + target_phase_layer_id: parse_number(item, 6), }), _ => {} } @@ -414,6 +452,7 @@ pub fn apply_changeset( for sequence in dependencies.sequences.iter() { NewSequence::from(sequence).save(conn); } + for node in dependencies.nodes.iter() { if !Node::is_terminal(node.id) { assert!(Sequence::sequence_from_hash(conn, &node.sequence_hash).is_some()); @@ -423,6 +462,12 @@ pub fn apply_changeset( let mut dep_bg_map = HashMap::new(); for bg in dependencies.block_group.iter() { let sample_name = bg.sample_name.as_ref().map(|v| v as &str); + if !Collection::exists(conn, &bg.collection_name) { + Collection::create(conn, &bg.collection_name); + } + if let Some(sample_name) = sample_name { + Sample::get_or_create(conn, sample_name); + } let new_bg = BlockGroup::create(conn, &bg.collection_name, sample_name, &bg.name); dep_bg_map.insert(&bg.id, new_bg.id); } @@ -434,14 +479,52 @@ pub fn apply_changeset( } let mut dep_edge_map = HashMap::new(); - let new_edges = Edge::bulk_create( - conn, - &dependencies.edges.iter().map(EdgeData::from).collect(), - ); + let updated_edges = 
dependencies + .edges + .iter() + .map(|edge| EdgeData { + source_node_id: *dep_node_map + .get(&edge.source_node_id) + .unwrap_or(&edge.source_node_id), + source_coordinate: edge.source_coordinate, + source_strand: edge.source_strand, + target_node_id: *dep_node_map + .get(&edge.target_node_id) + .unwrap_or(&edge.target_node_id), + target_coordinate: edge.target_coordinate, + target_strand: edge.target_strand, + }) + .collect::>(); + let new_edges = Edge::bulk_create(conn, &updated_edges); for (index, edge_id) in new_edges.iter().enumerate() { dep_edge_map.insert(&dependencies.edges[index].id, *edge_id); } + let mut dep_block_group_edge_map = HashMap::new(); + let updated_block_group_edges = &dependencies + .block_group_edges + .iter() + .map(|bge| BlockGroupEdgeData { + block_group_id: *dep_bg_map + .get(&bge.block_group_id) + .unwrap_or(&bge.block_group_id), + edge_id: *dep_edge_map.get(&bge.edge_id).unwrap_or(&bge.edge_id), + chromosome_index: bge.chromosome_index, + phased: bge.phased, + source_phase_layer_id: bge.source_phase_layer_id, + target_phase_layer_id: bge.target_phase_layer_id, + }) + .collect::>(); + + let new_block_group_edge_ids = BlockGroupEdge::bulk_create(conn, updated_block_group_edges); + + for (index, block_group_edge_id) in new_block_group_edge_ids.iter().enumerate() { + dep_block_group_edge_map.insert( + &dependencies.block_group_edges[index].id, + *block_group_edge_id, + ); + } + let mut dep_path_map = HashMap::new(); for path in dependencies.paths.iter() { let new_path = Path::create( @@ -456,14 +539,24 @@ pub fn apply_changeset( } let mut dep_accession_edge_map = HashMap::new(); - let new_accession_edges = AccessionEdge::bulk_create( - conn, - &dependencies - .accession_edges - .iter() - .map(AccessionEdgeData::from) - .collect(), - ); + let updated_accession_edges = dependencies + .accession_edges + .iter() + .map(|edge| AccessionEdgeData { + source_node_id: *dep_node_map + .get(&edge.source_node_id) + .unwrap_or(&edge.source_node_id), 
+ source_coordinate: edge.source_coordinate, + source_strand: edge.source_strand, + target_node_id: *dep_node_map + .get(&edge.target_node_id) + .unwrap_or(&edge.target_node_id), + target_coordinate: edge.target_coordinate, + target_strand: edge.target_strand, + chromosome_index: edge.chromosome_index, + }) + .collect::>(); + let new_accession_edges = AccessionEdge::bulk_create(conn, &updated_accession_edges); for (index, edge_id) in new_accession_edges.iter().enumerate() { dep_accession_edge_map.insert(&dependencies.accession_edges[index].id, *edge_id); } @@ -579,6 +672,23 @@ pub fn apply_changeset( }, ); } + "block_group_edges" => { + // make sure blockgroup_map has blockgroups for bg ids made in external changes. + let bg_id = item.new_value(1).unwrap().as_i64().unwrap(); + let edge_id = item.new_value(2).unwrap().as_i64().unwrap(); + let chromosome_index = item.new_value(3).unwrap().as_i64().unwrap(); + let phased = item.new_value(4).unwrap().as_i64().unwrap(); + let source_phase_layer_id = item.new_value(5).unwrap().as_i64().unwrap(); + let target_phase_layer_id = item.new_value(6).unwrap().as_i64().unwrap(); + insert_block_group_edges.push(BlockGroupEdgeData { + block_group_id: bg_id, + edge_id, + chromosome_index, + phased, + source_phase_layer_id, + target_phase_layer_id, + }); + } "path_edges" => { let path_id = item.new_value(1).unwrap().as_i64().unwrap(); let path_index = item.new_value(2).unwrap().as_i64().unwrap(); @@ -589,14 +699,6 @@ pub fn apply_changeset( .or_default() .push((path_index, edge_id)); } - "block_group_edges" => { - // make sure blockgroup_map has blockgroups for bg ids made in external changes. 
- let bg_id = item.new_value(1).unwrap().as_i64().unwrap(); - let edge_id = item.new_value(2).unwrap().as_i64().unwrap(); - let chromosome_index = item.new_value(3).unwrap().as_i64().unwrap(); - let phased = item.new_value(4).unwrap().as_i64().unwrap(); - insert_block_group_edges.push((bg_id, edge_id, chromosome_index, phased)); - } "collections" => { Collection::create( conn, @@ -695,56 +797,52 @@ pub fn apply_changeset( edge_id_map.insert(sorted_edge_ids[index], *edge_id); } - let mut block_group_edges: HashMap> = HashMap::new(); + let mut block_group_edges = vec![]; - for (bg_id, edge_id, chromosome_index, phased) in insert_block_group_edges { + for block_group_edge_data in insert_block_group_edges { let bg_id = *dep_bg_map - .get(&bg_id) - .or(blockgroup_map.get(&bg_id).or(Some(&bg_id))) + .get(&block_group_edge_data.block_group_id) + .or(blockgroup_map + .get(&block_group_edge_data.block_group_id) + .or(Some(&block_group_edge_data.block_group_id))) .unwrap(); let edge_id = dep_edge_map - .get(&edge_id) - .or(edge_id_map.get(&edge_id).or(Some(&edge_id))) + .get(&block_group_edge_data.edge_id) + .or(edge_id_map + .get(&block_group_edge_data.edge_id) + .or(Some(&block_group_edge_data.edge_id))) .unwrap(); - block_group_edges - .entry(bg_id) - .or_default() - .push((*edge_id, chromosome_index, phased)); + block_group_edges.push(BlockGroupEdgeData { + block_group_id: bg_id, + edge_id: *edge_id, + chromosome_index: block_group_edge_data.chromosome_index, + phased: block_group_edge_data.phased, + source_phase_layer_id: block_group_edge_data.source_phase_layer_id, + target_phase_layer_id: block_group_edge_data.target_phase_layer_id, + }); } - for (bg_id, edges) in block_group_edges.iter() { - let new_block_group_edges = edges - .iter() - .map(|(edge_id, chromosome_index, phased)| BlockGroupEdgeData { - block_group_id: *bg_id, - edge_id: *edge_id, - chromosome_index: *chromosome_index, - phased: *phased, - }) - .collect::>(); - BlockGroupEdge::bulk_create(conn, 
&new_block_group_edges); - } + let _block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); for path in insert_paths { - let mut sorted_edges = vec![]; - for (_, edge_id) in path_edges + let mut sorted_block_group_edge_ids = vec![]; + for (_, block_group_edge_id) in path_edges .get(&path.id) .unwrap() .iter() .sorted_by(|(c1, _), (c2, _)| Ord::cmp(&c1, &c2)) { - let new_edge_id = dep_edge_map - .get(edge_id) - .unwrap_or(edge_id_map.get(edge_id).unwrap_or(edge_id)); - sorted_edges.push(*new_edge_id); + let new_block_group_edge_id = dep_block_group_edge_map + .get(block_group_edge_id) + .unwrap_or(block_group_edge_id); + sorted_block_group_edge_ids.push(*new_block_group_edge_id); } - let new_bg_id = *dep_bg_map - .get(&path.block_group_id) - .or(blockgroup_map + let new_bg_id = *dep_bg_map.get(&path.block_group_id).unwrap_or( + blockgroup_map .get(&path.block_group_id) - .or(Some(&path.block_group_id))) - .unwrap(); - Path::create(conn, &path.name, new_bg_id, &sorted_edges); + .unwrap_or(&path.block_group_id), + ); + Path::create(conn, &path.name, new_bg_id, &sorted_block_group_edge_ids); } let mut updated_accession_edge_map = HashMap::new(); @@ -1059,20 +1157,7 @@ pub fn end_operation<'a>( } pub fn attach_session(session: &mut session::Session) { - for table in [ - "collections", - "samples", - "sequences", - "block_groups", - "paths", - "nodes", - "edges", - "path_edges", - "block_group_edges", - "accessions", - "accession_edges", - "accession_paths", - ] { + for table in CHANGE_TABLES { session.attach(Some(table)).unwrap(); } } @@ -1534,6 +1619,8 @@ mod tests { edge_id: new_edge.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }; BlockGroupEdge::bulk_create(conn, &[block_group_edge]); let operation = end_operation( @@ -1556,7 +1643,7 @@ mod tests { get_changeset_path(&operation).join(format!("{op_id}.dep", op_id = operation.hash)); let dependencies: DependencyModels = 
serde_json::from_reader(fs::File::open(dependency_path).unwrap()).unwrap(); - assert_eq!(dependencies.sequences.len(), 1); + assert_eq!(dependencies.sequences.len(), 2); assert_eq!( dependencies.block_group[0].collection_name, dep_bg.collection_name diff --git a/src/test_helpers.rs b/src/test_helpers.rs index 6a5073d3..9826b501 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -144,40 +144,45 @@ pub fn setup_block_group(conn: &Connection) -> (i64, Path) { edge_id: edge0.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge1.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge2.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge3.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge4.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ]; - BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); - let path = Path::create( - conn, - "chr1", - block_group.id, - &[edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], - ); + let path = Path::create(conn, "chr1", block_group.id, &block_group_edge_ids); (block_group.id, path) } diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 1348dec7..103af2b3 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -1,19 +1,17 @@ use noodles::fasta; use rusqlite; -use rusqlite::{types::Value as SQLValue, Connection}; +use rusqlite::Connection; use std::{io, str}; use crate::models::operations::{OperationFile, OperationInfo}; use crate::models::{ 
block_group::{BlockGroup, PathChange}, - edge::Edge, file_types::FileTypes, node::Node, path::PathBlock, sample::Sample, sequence::Sequence, strand::Strand, - traits::*, }; use crate::{calculate_hash, operation_management}; @@ -90,6 +88,7 @@ pub fn update_with_fasta( path_start: start_coordinate, path_end: end_coordinate, strand: Strand::Forward, + phase_layer_id: 0, }; let path_change = PathChange { @@ -104,26 +103,15 @@ pub fn update_with_fasta( }; let interval_tree = path.intervaltree(conn); - BlockGroup::insert_change(conn, &path_change, &interval_tree); + let block_group_edge_ids = BlockGroup::insert_change(conn, &path_change, &interval_tree); - let edge_to_new_node = Edge::query( - conn, - "select * from edges where target_node_id = ?1", - rusqlite::params!(SQLValue::from(node_id)), - )[0] - .clone(); - let edge_from_new_node = Edge::query( - conn, - "select * from edges where source_node_id = ?1", - rusqlite::params!(SQLValue::from(node_id)), - )[0] - .clone(); let new_path = path.new_path_with( conn, start_coordinate, end_coordinate, - &edge_to_new_node, - &edge_from_new_node, + block_group_edge_ids[0], + block_group_edge_ids[1], + node_id, ); let summary_str = format!(" {}: 1 change", new_path.name); @@ -153,8 +141,9 @@ mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. 
use super::*; use crate::imports::fasta::import_fasta; - use crate::models::{metadata, operations::setup_db}; + use crate::models::{metadata, operations::setup_db, traits::*}; use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; + use rusqlite::types::Value as SQLValue; use std::collections::HashSet; use std::path::PathBuf; diff --git a/src/updates/gaf.rs b/src/updates/gaf.rs index f3ef3a2f..2b5864c1 100644 --- a/src/updates/gaf.rs +++ b/src/updates/gaf.rs @@ -338,16 +338,18 @@ pub fn update_with_gaf<'a, P>( BlockGroup::query(conn, "select distinct bg.* from block_groups bg left join block_group_edges bge on (bg.id = bge.block_group_id) left join edges e on (e.id = bge.edge_id and (e.source_node_id in rarray(?2) or e.target_node_id in rarray(?2))) where collection_name = ?1 and sample_name is null", params!(collection_name.to_string(), Rc::new(bg_nodes))) }; for bg in bgs.iter() { - let new_block_group_edges = edge_ids + let block_group_edges = edge_ids .iter() .map(|edge_id| BlockGroupEdgeData { block_group_id: bg.id, edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + BlockGroupEdge::bulk_create(conn, &block_group_edges); } } } diff --git a/src/updates/genbank.rs b/src/updates/genbank.rs index c40fecf1..231242bd 100644 --- a/src/updates/genbank.rs +++ b/src/updates/genbank.rs @@ -93,7 +93,7 @@ where 0, Strand::Forward, ); - BlockGroupEdge::bulk_create( + let block_group_edge_ids = BlockGroupEdge::bulk_create( conn, &[ BlockGroupEdgeData { @@ -101,21 +101,20 @@ where edge_id: edge_into.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, BlockGroupEdgeData { block_group_id: block_group.id, edge_id: edge_out_of.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }, ], ); - Path::create( - conn, - &locus.name, - 
block_group.id, - &[edge_into.id, edge_out_of.id], - ) + Path::create(conn, &locus.name, block_group.id, &block_group_edge_ids) }; for edit in locus.changes_to_wt() { let start = edit.start; @@ -154,6 +153,7 @@ where path_start: start, path_end: end + change_seq.length, strand: Strand::Forward, + phase_layer_id: 0, }, chromosome_index: 0, phased: 0, @@ -174,6 +174,7 @@ where path_start: start, path_end: end, strand: Strand::Forward, + phase_layer_id: 0, }, chromosome_index: 0, phased: 0, diff --git a/src/updates/gfa.rs b/src/updates/gfa.rs index e0c4bcdf..b5ec335c 100644 --- a/src/updates/gfa.rs +++ b/src/updates/gfa.rs @@ -365,10 +365,27 @@ fn create_new_path_from_existing( edge_id: *edge_id, chromosome_index: 0, phased: 0, + source_phase_layer_id: 0, + target_phase_layer_id: 0, }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &block_group_edges); - Path::create(conn, unmatched_path_name, block_group_id, &new_edge_ids); + let block_group_edge_ids = BlockGroupEdge::bulk_create(conn, &block_group_edges); + let block_group_edges = BlockGroupEdge::load_block_group_edges(conn, &block_group_edge_ids); + let block_group_edges_by_edge_id = block_group_edges + .iter() + .map(|edge| (edge.edge_id, edge.id)) + .collect::>(); + let mut new_block_group_edge_ids = vec![]; + for new_edge_id in new_edge_ids { + let block_group_edge_id = block_group_edges_by_edge_id.get(&new_edge_id).unwrap(); + new_block_group_edge_ids.push(*block_group_edge_id); + } + Path::create( + conn, + unmatched_path_name, + block_group_id, + &new_block_group_edge_ids, + ); } #[cfg(test)] diff --git a/src/updates/library.rs b/src/updates/library.rs index f392907e..5fbf59b4 100644 --- a/src/updates/library.rs +++ b/src/updates/library.rs @@ -13,6 +13,7 @@ use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; use crate::models::node::Node; use crate::models::operations::{OperationFile, OperationInfo}; +use crate::models::phase_layer::{PhaseLayer, 
UNPHASED_CHROMOSOME_INDEX}; use crate::models::sample::Sample; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -130,6 +131,17 @@ pub fn update_with_library( let node_end_coordinate = end_coordinate - end_block.start + end_block.sequence_start; let mut new_edges = HashSet::new(); + let mut phase_layers_by_node_id = HashMap::new(); + phase_layers_by_node_id.insert(start_block.node_id, start_block.phase_layer_id); + phase_layers_by_node_id.insert(end_block.node_id, end_block.phase_layer_id); + + for parts in parts_list.iter() { + for part in parts.iter() { + let phased_layer_id = PhaseLayer::create(conn, UNPHASED_CHROMOSOME_INDEX, 0); + phase_layers_by_node_id.insert(**part, phased_layer_id); + } + } + let start_parts = parts_list.first().unwrap(); for start_part in *start_parts { let edge = EdgeData { @@ -179,16 +191,19 @@ pub fn update_with_library( path_changes_count *= end_parts.len(); let new_edge_ids = Edge::bulk_create(conn, &new_edges.iter().cloned().collect()); - let new_block_group_edges = new_edge_ids + let new_edges = Edge::bulk_load(conn, &new_edge_ids); + let block_group_edges = new_edges .iter() - .map(|edge_id| BlockGroupEdgeData { + .map(|edge| BlockGroupEdgeData { block_group_id: path.block_group_id, - edge_id: *edge_id, + edge_id: edge.id, chromosome_index: 0, phased: 0, + source_phase_layer_id: *phase_layers_by_node_id.get(&edge.source_node_id).unwrap(), + target_phase_layer_id: *phase_layers_by_node_id.get(&edge.target_node_id).unwrap(), }) .collect::>(); - BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + BlockGroupEdge::bulk_create(conn, &block_group_edges); let summary_str = format!("{region_name}: {path_changes_count} changes.\n"); operation_management::end_operation( diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 6c4b7f2e..24c24383 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -144,6 +144,7 @@ fn prepare_change( path_start: ref_start, path_end: ref_end, strand: Strand::Forward, 
+ phase_layer_id: 0, }; PathChange { block_group_id: sample_bg_id,