-
Couldn't load subscription status.
- Fork 2
Add combine_tables function and sakila dataset #72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
syroBx
wants to merge
13
commits into
dev
Choose a base branch
from
44-create-single-cohort-table
base: dev
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
4c121ad
Add combine_tables function and sakila dataset
syroBx f18d2b9
Update combine_tales function
syroBx 847c5e0
Merge branch '44-create-single-cohort-table' of https://github.com/r-…
syroBx fa5600c
Add plot_binding_keys function
syroBx 8053800
Add checking of subsets of binding keys
syroBx de368ed
Change function name
syroBx 74da7e4
Remove main_key argument from combine_tables and create functions to …
syroBx c611444
Add function select_keys to select only necessary keys
syroBx 0fd8ac6
Correct style in single_cohort_table
syroBx 39e7867
Replace base R with tidyverse approach
syroBx 1890ff6
Replace base R with tidyverse approach
syroBx 9789035
Merge branch '44-create-single-cohort-table' of https://github.com/r-…
syroBx 7c8bde4
Merge branch 'dev' into 44-create-single-cohort-table
syroBx File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,295 @@ | ||
| #' @param x Cohort | ||
| #' @param tables Vector of tables to be combined. | ||
| #' The first is the main one on the basis of which the connection is created | ||
| combine_tables <- function(x, tables, use_nest = TRUE) { | ||
| if (!all(tables %in% names(get_data(x)))) { | ||
| stop("one or more tables do not appear in this cohort") | ||
| } | ||
|
|
||
| source_cohort <- x$get_source() | ||
| prim_keys <- source_cohort$primary_keys | ||
| keys <- source_cohort$binding_keys | ||
| tables_keys <- list() | ||
| update_ds <- keys[[1]]$update$dataset | ||
|
|
||
| subset_tables <- create_subset_list(keys) | ||
| tables_in_subset <- get_tables_from_subsets(tables, subset_tables) | ||
|
|
||
| if (is.null(tables_in_subset)) { | ||
| stop("At this moment we do not support joining 2 tables from different table subsets") | ||
| } | ||
|
|
||
| keys <- keys %>% | ||
| purrr::keep(~ .x$update$dataset %in% tables_in_subset & | ||
| .x$data_key[[1]]$dataset %in% tables_in_subset) | ||
|
|
||
| main_key <- prim_keys %>% | ||
| purrr::keep(~.x$dataset == tables[1]) %>% | ||
| purrr::map_chr("key") %>% | ||
| dplyr::first() | ||
|
|
||
| keys <- select_keys(tables, keys) | ||
|
|
||
| final_table <- join_tables(x, keys, use_nest, main_key, tables) | ||
|
|
||
| return(final_table) | ||
| } | ||
|
|
||
| #' @param keys List of binding keys | ||
| create_subset_list <- function(keys) { | ||
| subset_list <- list("subset_1" = keys[[1]]$update$dataset) | ||
| id_subset_list <- 2 | ||
|
|
||
| purrr::walk(keys, function(key){ | ||
| found <- FALSE | ||
| up_table <- key$update$dataset | ||
| data_table <- key$data_keys[[1]]$dataset | ||
|
|
||
| matching_subsets <- subset_list %>% | ||
| purrr::keep(~ up_table %in% .x) | ||
| if (!(rlang::is_empty(matching_subsets))) { | ||
| subset_list[names(matching_subsets)] <- matching_subsets %>% | ||
| purrr::map(~ if (!(data_table %in% .x)){ | ||
| append(.x, data_table) | ||
| }else .x) | ||
| found <- TRUE | ||
| } | ||
|
|
||
| matching_subsets <- subset_list %>% | ||
| purrr::keep(~ data_table %in% .x) | ||
| if (!(rlang::is_empty(matching_subsets))) { | ||
| subset_list[names(matching_subsets)] <- matching_subsets %>% | ||
| purrr::map(~ if (!(up_table %in% .x)){ | ||
| append(.x, up_table) | ||
| }else .x) | ||
| found <- TRUE | ||
| } | ||
|
|
||
| if (!found) { | ||
| subset_list[[paste0("subset_", id_subset_list)]] <- c(up_table, data_table) | ||
| id_subset_list <- id_subset_list + 1 | ||
| } | ||
| }) | ||
|
|
||
| return(subset_list) | ||
| } | ||
|
|
||
| #' @param tables vector of tables to check in all subset and merged subsets | ||
| #' @param subset_tables list of subsets | ||
| #' @return vector of tables from subset where all tables from input are located | ||
| get_tables_from_subsets <- function(tables, subset_tables) { | ||
| subset <- subset_tables %>% | ||
| purrr::keep(~all(tables %in% .x)) | ||
|
|
||
| if (!rlang::is_empty(subset)) { | ||
| return(subset) | ||
| } | ||
|
|
||
| if (length(subset_tables) == 1) { | ||
| return(NULL) | ||
| } | ||
|
|
||
| change <- FALSE | ||
| start_id <- 1 | ||
| while (TRUE) { | ||
| end_id <- length(subset_tables) | ||
|
|
||
| if (change) { | ||
| if (all(tables %in% subset_tables[[start_id]])) { | ||
| return(subset_tables[[start_id]]) | ||
| } | ||
| change <- FALSE | ||
| } | ||
|
|
||
| purrr::walk((start_id + 1):end_id, function(id) { | ||
| common_tables <- intersect(subset_tables[[start_id]], subset_tables[[id]]) | ||
| if (length(common_tables) > 0) { | ||
| subset_tables[[start_id]] <- union(subset_tables[[start_id]], subset_tables[[id]]) | ||
| subset_tables[[id]] <- NULL | ||
| change <<- TRUE | ||
| return(NULL) | ||
| } | ||
| }) | ||
|
|
||
|
|
||
| if (!change) { | ||
| start_id <- start_id + 1 | ||
| } | ||
| if (start_id >= end_id - 1) { | ||
| return(NULL) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| join_tables <- function(x, keys, use_nest, main_key, tables) { | ||
| tables_keys <- list() | ||
|
|
||
| update_ds <- keys[[1]]$update$dataset | ||
|
|
||
| if (update_ds %in% tables[-1]) { | ||
| tables_keys <- append(tables_keys, list(c( | ||
| dataset = update_ds, | ||
| key = paste0(update_ds, "_", keys[[1]]$update$key) | ||
| ))) | ||
| } | ||
|
|
||
| # Rename columns "first_name" -> "actor_first_name" | ||
| result <- get_data(x)[[update_ds]] %>% | ||
| dplyr::rename_with( ~ paste0(update_ds, "_", .), everything()) | ||
|
|
||
| combined_tables <- update_ds | ||
|
|
||
| for (i in keys) { | ||
| update_ds <- i$update$dataset | ||
| data_keys_ds <- i$data_keys[[1]]$dataset | ||
|
|
||
| # Save name of key to create one table using a nested join | ||
| if ((update_ds %in% tables[-1]) && | ||
| !(update_ds %in% unlist(tables_keys))) { | ||
| tables_keys <- tables_keys %>% | ||
| append(list(c( | ||
| dataset = update_ds, | ||
| key = paste0(update_ds, "_", i$update$key) | ||
| ))) | ||
| } else if ((data_keys_ds %in% tables[-1]) && | ||
| !(data_keys_ds %in% unlist(tables_keys))) { | ||
| tables_keys <- tables_keys %>% | ||
| append(list(c( | ||
| dataset = data_keys_ds, | ||
| key = paste0(data_keys_ds, "_", i$data_keys[[1]]$key) | ||
| ))) | ||
| } | ||
|
|
||
| if ((update_ds %in% combined_tables) && | ||
| !(data_keys_ds %in% combined_tables)) { | ||
| right_table <- get_data(x)[[data_keys_ds]] %>% | ||
| dplyr::rename_with( ~ paste0(data_keys_ds, "_", .), everything()) | ||
|
|
||
| result <- result %>% | ||
| dplyr::left_join( | ||
| right_table, | ||
| by = rlang::set_names( | ||
| paste0(data_keys_ds, "_", i$data_keys[[1]]$key), | ||
| paste0(update_ds, "_", i$update$key) | ||
| ), | ||
| keep = TRUE, | ||
| relationship = "many-to-many" | ||
| ) | ||
|
|
||
| combined_tables <- append(combined_tables, data_keys_ds) | ||
| } else if ((data_keys_ds %in% combined_tables) && | ||
| !(update_ds %in% combined_tables)) { | ||
| right_table <- get_data(x)[[update_ds]] %>% | ||
| dplyr::rename_with( ~ paste0(update_ds, "_", .), everything()) | ||
|
|
||
| result <- result %>% | ||
| dplyr::left_join( | ||
| right_table, | ||
| by = rlang::set_names( | ||
| paste0(update_ds, "_", i$update$key), | ||
| paste0(data_keys_ds, "_", i$data_keys[[1]]$key) | ||
| ), | ||
| keep = TRUE, | ||
| relationship = "many-to-many" | ||
| ) | ||
|
|
||
| combined_tables <- append(combined_tables, update_ds) | ||
| } | ||
| } | ||
| if (use_nest) { | ||
| name_of_new_col <- paste0(tables[1], "_", tables_keys[[1]][[1]]) | ||
| main_key_result <- paste0(tables[1], "_", main_key) | ||
| # Name of table to combine | ||
| table_name <- tables_keys[[1]][[1]] | ||
| # Names of columns in table to combine | ||
| col_names <- names(get_data(x)[[table_name]]) | ||
|
|
||
| final_table <- get_data(x)[[tables[[1]]]] %>% | ||
| dplyr::nest_join( | ||
| result %>% dplyr::select(paste0(table_name, "_", col_names), main_key_result), | ||
| by = rlang::set_names(main_key_result, main_key), | ||
| name = name_of_new_col | ||
| ) | ||
|
|
||
| final_table[[name_of_new_col]] <- lapply(final_table[[name_of_new_col]], function(df) { | ||
| df %>% dplyr::distinct(across(all_of(tables_keys[[1]][[2]])), .keep_all = TRUE) | ||
| }) | ||
|
|
||
| if (!(length(tables_keys) < 2)) { | ||
| for (i in 2:length(tables_keys)) { | ||
| name_of_new_col <- paste0(tables[1], "_", tables_keys[[i]][[1]]) | ||
| # Name of table to combine | ||
| table_name <- tables_keys[[i]][[1]] | ||
| # Names of columns in table to combine | ||
| col_names <- names(get_data(x)[[table_name]]) | ||
|
|
||
| final_table <- final_table %>% | ||
| dplyr::nest_join( | ||
| result %>% dplyr::select(paste0(table_name, "_", col_names), main_key_result), | ||
| by = rlang::set_names(main_key_result, main_key), | ||
| name = name_of_new_col | ||
| ) | ||
|
|
||
| final_table[[name_of_new_col]] <- lapply(final_table[[name_of_new_col]], function(df) { | ||
| df %>% dplyr::distinct(across(all_of(tables_keys[[i]][[2]])), .keep_all = TRUE) | ||
| }) | ||
| } | ||
| } | ||
| } else { | ||
| regex_pattern <- paste0("^(", paste(tables, collapse = "|"), ")") | ||
| final_table <- dplyr::select(result, matches(regex_pattern)) | ||
| } | ||
|
|
||
| return(final_table) | ||
| } | ||
|
|
||
| select_keys <- function(tables, keys) { | ||
| id <- 1 | ||
| keys_test <- keys | ||
| continue <- TRUE | ||
| while (continue) { | ||
| keys_test <- keys[-id] | ||
| subset_tables <- create_subset_list(keys_test) | ||
| tables_in_subset <- get_tables_from_subsets(tables, subset_tables) | ||
|
|
||
| if (!is.null(tables_in_subset)) { | ||
| keys <- keys %>% | ||
| purrr::keep(~ .x$update$dataset %in% tables_in_subset & | ||
| .x$data_key[[1]]$dataset %in% tables_in_subset) | ||
| } else { | ||
| id <- id + 1 | ||
| } | ||
| if (id > length(keys)) { | ||
| continue <- FALSE | ||
| } | ||
| } | ||
|
|
||
| return(keys) | ||
| } | ||
|
|
||
| #' @description | ||
| #' Create plot to draw connection between binding keys | ||
| #' | ||
| #' @param x Cohort | ||
| plot_binding_keys <- function(x) { | ||
| source_cohort <- x$get_source() | ||
|
|
||
| edges <- do.call(rbind, lapply(source_cohort$binding_keys, function(bind) { | ||
| source <- bind$update$dataset | ||
| targets <- purrr::map_vec(bind$data_keys, function(dk) | ||
| dk$dataset) | ||
| data.frame(source = source, | ||
| target = targets, | ||
| stringsAsFactors = FALSE) | ||
| })) | ||
|
|
||
| graph <- igraph::graph_from_data_frame(edges, directed = TRUE) | ||
|
|
||
| ggraph::ggraph(graph, layout = "fr") + | ||
| ggraph::geom_edge_link(linewidth = 0.8) + | ||
| ggraph::geom_node_point(size = 6, color = "blue") + | ||
| ggraph::geom_node_text(ggplot2::aes(label = name), | ||
| vjust = 1.5, | ||
| size = 5) + | ||
| ggplot2::theme_minimal() | ||
| } | ||
Binary file not shown.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function to draw connection between binding_keys.
To work only need Cohort with defined binding_keys.